From 0d446e6103c7d746f6076e3191d89ee3392c6017 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Mon, 12 Jun 2023 10:28:43 +0200 Subject: [PATCH 001/583] Develop is now preparing 1.7.0. --- CMakeLists.txt | 3 ++- examples/adaptiveprecision-blockjacobi/CMakeLists.txt | 2 +- examples/build-setup.sh | 2 +- examples/cb-gmres/CMakeLists.txt | 2 +- examples/custom-logger/CMakeLists.txt | 2 +- examples/custom-matrix-format/CMakeLists.txt | 2 +- examples/custom-stopping-criterion/CMakeLists.txt | 2 +- examples/ginkgo-overhead/CMakeLists.txt | 2 +- examples/ginkgo-ranges/CMakeLists.txt | 2 +- examples/heat-equation/CMakeLists.txt | 2 +- examples/ilu-preconditioned-solver/CMakeLists.txt | 2 +- examples/inverse-iteration/CMakeLists.txt | 2 +- examples/ir-ilu-preconditioned-solver/CMakeLists.txt | 2 +- examples/iterative-refinement/CMakeLists.txt | 2 +- examples/kokkos_assembly/CMakeLists.txt | 2 +- examples/minimal-cuda-solver/CMakeLists.txt | 2 +- examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt | 2 +- examples/mixed-multigrid-solver/CMakeLists.txt | 2 +- examples/mixed-precision-ir/CMakeLists.txt | 2 +- examples/mixed-spmv/CMakeLists.txt | 2 +- .../multigrid-preconditioned-solver-customized/CMakeLists.txt | 2 +- examples/multigrid-preconditioned-solver/CMakeLists.txt | 2 +- examples/nine-pt-stencil-solver/CMakeLists.txt | 2 +- examples/papi-logging/CMakeLists.txt | 2 +- examples/par-ilu-convergence/CMakeLists.txt | 2 +- examples/performance-debugging/CMakeLists.txt | 2 +- examples/poisson-solver/CMakeLists.txt | 2 +- examples/preconditioned-solver/CMakeLists.txt | 2 +- examples/preconditioner-export/CMakeLists.txt | 2 +- examples/schroedinger-splitting/CMakeLists.txt | 2 +- examples/simple-solver-logging/CMakeLists.txt | 2 +- examples/simple-solver/CMakeLists.txt | 2 +- examples/three-pt-stencil-solver/CMakeLists.txt | 2 +- 33 files changed, 34 insertions(+), 33 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7940e7f40b..df6f0ffb89a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) cmake_policy(SET CMP0104 OLD) endif() -project(Ginkgo LANGUAGES C CXX VERSION 1.6.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") + +project(Ginkgo LANGUAGES C CXX VERSION 1.7.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "master") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt index b121e201c77..744df84a74b 100644 --- a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt +++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt @@ -3,7 +3,7 @@ project(adaptiveprecision-blockjacobi) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(adaptiveprecision-blockjacobi adaptiveprecision-blockjacobi.cpp) diff --git a/examples/build-setup.sh b/examples/build-setup.sh index f7a14a0d0a6..a0c947e433b 100644 --- a/examples/build-setup.sh +++ b/examples/build-setup.sh @@ -3,7 +3,7 @@ # copy libraries LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip ginkgo_dpcpp ginkgo_device" SUFFIXES=".so .dylib .dll d.so d.dylib d.dll" -VERSION="1.6.0" +VERSION="1.7.0" for name in ${LIBRARY_NAMES}; do for suffix in ${SUFFIXES}; do cp 
${BUILD_DIR}/lib/lib${name}${suffix}.${VERSION} \ diff --git a/examples/cb-gmres/CMakeLists.txt b/examples/cb-gmres/CMakeLists.txt index 97321c8ccbc..d616b16c882 100644 --- a/examples/cb-gmres/CMakeLists.txt +++ b/examples/cb-gmres/CMakeLists.txt @@ -3,7 +3,7 @@ project(cb-gmres) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(cb-gmres cb-gmres.cpp) diff --git a/examples/custom-logger/CMakeLists.txt b/examples/custom-logger/CMakeLists.txt index 1d0c8bcf9ad..f986dd52e76 100644 --- a/examples/custom-logger/CMakeLists.txt +++ b/examples/custom-logger/CMakeLists.txt @@ -3,7 +3,7 @@ project(custom-logger) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(custom-logger custom-logger.cpp) diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index c357572edea..47eeda0143c 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -3,7 +3,7 @@ project(custom-matrix-format CXX CUDA) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) find_package(OpenMP 3.0 REQUIRED) endif() diff --git a/examples/custom-stopping-criterion/CMakeLists.txt b/examples/custom-stopping-criterion/CMakeLists.txt index 79b7b9aaab5..811baa59a9c 100644 --- a/examples/custom-stopping-criterion/CMakeLists.txt +++ b/examples/custom-stopping-criterion/CMakeLists.txt @@ -3,7 +3,7 @@ project(custom-stopping-criterion) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) endif() diff --git a/examples/ginkgo-overhead/CMakeLists.txt b/examples/ginkgo-overhead/CMakeLists.txt index 5afbc22c731..fcd7a81c230 100644 --- a/examples/ginkgo-overhead/CMakeLists.txt +++ b/examples/ginkgo-overhead/CMakeLists.txt @@ -3,7 +3,7 @@ project(ginkgo-overhead) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(ginkgo-overhead ginkgo-overhead.cpp) diff --git a/examples/ginkgo-ranges/CMakeLists.txt b/examples/ginkgo-ranges/CMakeLists.txt index de86438d62b..6e30c4f9af4 100644 --- a/examples/ginkgo-ranges/CMakeLists.txt +++ b/examples/ginkgo-ranges/CMakeLists.txt @@ -3,7 +3,7 @@ project(ginkgo-ranges) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(ginkgo-ranges ginkgo-ranges.cpp) target_link_libraries(ginkgo-ranges Ginkgo::ginkgo) diff --git a/examples/heat-equation/CMakeLists.txt b/examples/heat-equation/CMakeLists.txt index 3b0cfc57cb0..f4790edaa8d 100644 --- a/examples/heat-equation/CMakeLists.txt +++ b/examples/heat-equation/CMakeLists.txt @@ -3,7 +3,7 @@ project(heat-equation) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() 
find_package(OpenCV REQUIRED) diff --git a/examples/ilu-preconditioned-solver/CMakeLists.txt b/examples/ilu-preconditioned-solver/CMakeLists.txt index 85daf54923a..e6c840f38f8 100644 --- a/examples/ilu-preconditioned-solver/CMakeLists.txt +++ b/examples/ilu-preconditioned-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(ilu-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(ilu-preconditioned-solver ilu-preconditioned-solver.cpp) diff --git a/examples/inverse-iteration/CMakeLists.txt b/examples/inverse-iteration/CMakeLists.txt index fa1d17e55c4..deb72accffd 100644 --- a/examples/inverse-iteration/CMakeLists.txt +++ b/examples/inverse-iteration/CMakeLists.txt @@ -3,7 +3,7 @@ project(inverse-iteration) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(inverse-iteration inverse-iteration.cpp) diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt index c1424429636..fc1205fbd0d 100644 --- a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt +++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(ir-ilu-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(ir-ilu-preconditioned-solver ir-ilu-preconditioned-solver.cpp) diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt index 39a2651a90d..fe94a94455b 100644 --- a/examples/iterative-refinement/CMakeLists.txt +++ b/examples/iterative-refinement/CMakeLists.txt @@ -3,7 +3,7 @@ project(iterative-refinement) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(iterative-refinement iterative-refinement.cpp) diff --git a/examples/kokkos_assembly/CMakeLists.txt b/examples/kokkos_assembly/CMakeLists.txt index e6f214e68e2..bfee201c91d 100644 --- a/examples/kokkos_assembly/CMakeLists.txt +++ b/examples/kokkos_assembly/CMakeLists.txt @@ -3,7 +3,7 @@ project(kokkos-assembly CXX) # We only need to find Ginkgo if we build this example stand-alone if(NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() find_package(Kokkos REQUIRED) diff --git a/examples/minimal-cuda-solver/CMakeLists.txt b/examples/minimal-cuda-solver/CMakeLists.txt index 52aa56b60fc..3add4bb30ad 100644 --- a/examples/minimal-cuda-solver/CMakeLists.txt +++ b/examples/minimal-cuda-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(minimal-cuda-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(minimal-cuda-solver minimal-cuda-solver.cpp) diff --git a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt index 54384f544b7..d710f10f146 100644 --- a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt +++ 
b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(mixed-multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(mixed-multigrid-preconditioned-solver mixed-multigrid-preconditioned-solver.cpp) diff --git a/examples/mixed-multigrid-solver/CMakeLists.txt b/examples/mixed-multigrid-solver/CMakeLists.txt index e4ee334e38f..17ec2fa398e 100644 --- a/examples/mixed-multigrid-solver/CMakeLists.txt +++ b/examples/mixed-multigrid-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(mixed-multigrid-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(mixed-multigrid-solver mixed-multigrid-solver.cpp) diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt index a0a46c0fd6e..01094a5376b 100644 --- a/examples/mixed-precision-ir/CMakeLists.txt +++ b/examples/mixed-precision-ir/CMakeLists.txt @@ -3,7 +3,7 @@ project(mixed-precision-ir) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(mixed-precision-ir mixed-precision-ir.cpp) diff --git a/examples/mixed-spmv/CMakeLists.txt b/examples/mixed-spmv/CMakeLists.txt index ad8e31aad3e..0e4378ca82f 100644 --- a/examples/mixed-spmv/CMakeLists.txt +++ b/examples/mixed-spmv/CMakeLists.txt @@ -3,7 +3,7 @@ project(mixed-spmv) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(mixed-spmv mixed-spmv.cpp) diff --git a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt index 4d2b0822d08..411b57b2c83 100644 --- a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt +++ b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt @@ -3,7 +3,7 @@ project(multigrid-preconditioned-solver-customized) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(multigrid-preconditioned-solver-customized multigrid-preconditioned-solver-customized.cpp) diff --git a/examples/multigrid-preconditioned-solver/CMakeLists.txt b/examples/multigrid-preconditioned-solver/CMakeLists.txt index af7c296b631..90277398b85 100644 --- a/examples/multigrid-preconditioned-solver/CMakeLists.txt +++ b/examples/multigrid-preconditioned-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(multigrid-preconditioned-solver multigrid-preconditioned-solver.cpp) diff --git a/examples/nine-pt-stencil-solver/CMakeLists.txt b/examples/nine-pt-stencil-solver/CMakeLists.txt index d2384129d47..35610ba758a 100644 --- a/examples/nine-pt-stencil-solver/CMakeLists.txt +++ b/examples/nine-pt-stencil-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(nine-pt-stencil-solver) # We only 
need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(nine-pt-stencil-solver nine-pt-stencil-solver.cpp) diff --git a/examples/papi-logging/CMakeLists.txt b/examples/papi-logging/CMakeLists.txt index ac2560f499d..6927675e2ec 100644 --- a/examples/papi-logging/CMakeLists.txt +++ b/examples/papi-logging/CMakeLists.txt @@ -3,7 +3,7 @@ project(papi-logging) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() if (NOT GINKGO_HAVE_PAPI_SDE) diff --git a/examples/par-ilu-convergence/CMakeLists.txt b/examples/par-ilu-convergence/CMakeLists.txt index bee08841173..23b7afd1e75 100644 --- a/examples/par-ilu-convergence/CMakeLists.txt +++ b/examples/par-ilu-convergence/CMakeLists.txt @@ -3,7 +3,7 @@ project(par-ilu-convergence) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(par-ilu-convergence par-ilu-convergence.cpp) diff --git a/examples/performance-debugging/CMakeLists.txt b/examples/performance-debugging/CMakeLists.txt index 4f095e4d1c6..715cd99fe1b 100644 --- a/examples/performance-debugging/CMakeLists.txt +++ b/examples/performance-debugging/CMakeLists.txt @@ -3,7 +3,7 @@ project(performance-debugging) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(performance-debugging performance-debugging.cpp) diff --git a/examples/poisson-solver/CMakeLists.txt b/examples/poisson-solver/CMakeLists.txt index 64e0633ee75..bd5383876d5 100644 --- a/examples/poisson-solver/CMakeLists.txt +++ b/examples/poisson-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(poisson-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(poisson-solver poisson-solver.cpp) diff --git a/examples/preconditioned-solver/CMakeLists.txt b/examples/preconditioned-solver/CMakeLists.txt index b046686243d..a412885f219 100644 --- a/examples/preconditioned-solver/CMakeLists.txt +++ b/examples/preconditioned-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(preconditioned-solver preconditioned-solver.cpp) target_link_libraries(preconditioned-solver Ginkgo::ginkgo) diff --git a/examples/preconditioner-export/CMakeLists.txt b/examples/preconditioner-export/CMakeLists.txt index 1d2156b9d5a..1cfd6d7ff84 100644 --- a/examples/preconditioner-export/CMakeLists.txt +++ b/examples/preconditioner-export/CMakeLists.txt @@ -3,7 +3,7 @@ project(preconditioner-export) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(preconditioner-export preconditioner-export.cpp) diff --git a/examples/schroedinger-splitting/CMakeLists.txt b/examples/schroedinger-splitting/CMakeLists.txt index b7bdece35e8..1e49a1f88b4 100644 
--- a/examples/schroedinger-splitting/CMakeLists.txt +++ b/examples/schroedinger-splitting/CMakeLists.txt @@ -3,7 +3,7 @@ project(schroedinger-splitting) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() find_package(OpenCV REQUIRED) diff --git a/examples/simple-solver-logging/CMakeLists.txt b/examples/simple-solver-logging/CMakeLists.txt index 4092445848a..befead38e7d 100644 --- a/examples/simple-solver-logging/CMakeLists.txt +++ b/examples/simple-solver-logging/CMakeLists.txt @@ -3,7 +3,7 @@ project(simple-solver-logging) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(simple-solver-logging simple-solver-logging.cpp) diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt index f505e19729e..dd0faec5f53 100644 --- a/examples/simple-solver/CMakeLists.txt +++ b/examples/simple-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(simple-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(simple-solver simple-solver.cpp) diff --git a/examples/three-pt-stencil-solver/CMakeLists.txt b/examples/three-pt-stencil-solver/CMakeLists.txt index d2941b12976..fc0691dd7c9 100644 --- a/examples/three-pt-stencil-solver/CMakeLists.txt +++ b/examples/three-pt-stencil-solver/CMakeLists.txt @@ -3,7 +3,7 @@ project(three-pt-stencil-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.6.0 REQUIRED) + find_package(Ginkgo 1.7.0 REQUIRED) endif() add_executable(three-pt-stencil-solver three-pt-stencil-solver.cpp) From a93239103afe92de477b9e82f0f2b34b3e088d62 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 23 Jun 2023 09:53:00 +0200 Subject: [PATCH 002/583] add reorderings to sparse_blas benchmark --- benchmark/sparse_blas/operations.cpp | 126 +++++++++++++++++++++++++- benchmark/sparse_blas/sparse_blas.cpp | 12 ++- 2 files changed, 130 insertions(+), 8 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 6a817a67c0d..dc96143ed6d 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -695,11 +695,111 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { }; +class ReorderRcmOperation : public BenchmarkOperation { + using reorder_type = gko::reorder::Rcm; + +public: + explicit ReorderRcmOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{reorder_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating RCM correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + +#if GKO_HAVE_METIS + + +class ReorderNestedDissectionOperation : public BenchmarkOperation { + using factory_type = + gko::experimental::reorder::NestedDissection; + using reorder_type = gko::matrix::Permutation; + +public: + explicit 
ReorderNestedDissectionOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{factory_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating ND correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + +#endif + + +class ReorderApproxMinDegOperation : public BenchmarkOperation { + using factory_type = gko::experimental::reorder::Amd; + using reorder_type = gko::matrix::Permutation; + +public: + explicit ReorderApproxMinDegOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{factory_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating AMD correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + const std::map(const Mtx*)>> - operation_map{ - {"spgemm", - [](const Mtx* mtx) { return std::make_unique(mtx); }}, + operation_map +{ + {"spgemm", + [](const Mtx* mtx) { return std::make_unique(mtx); }}, {"spgeam", [](const Mtx* mtx) { return std::make_unique(mtx); }}, {"transpose", @@ -726,9 +826,25 @@ const std::map(mtx, false); }}, - {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { + {"symbolic_cholesky_symmetric", + [](const Mtx* mtx) { return std::make_unique(mtx, true); - }}}; + }}, + {"reorder_rcm", + [](const Mtx* mtx) { + return std::make_unique(mtx); + }}, + {"reorder_amd", [](const Mtx* mtx) { + return std::make_unique(mtx); + }}, +#if GKO_HAVE_METIS + { + "reorder_nd", [](const Mtx* mtx) { + return std::make_unique(mtx); + } + } +#endif +}; std::unique_ptr get_operation(std::string name, diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 4fb06d2a4a0..3b0ce26db5f 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -57,11 +57,17 @@ const auto benchmark_name = "sparse_blas"; using mat_data = gko::matrix_data; -DEFINE_string( - operations, "spgemm,spgeam,transpose", +const char* operations_string = "Comma-separated list of operations to be benchmarked. Can be " "spgemm, spgeam, transpose, sort, is_sorted, generate_lookup, " - "lookup, symbolic_lu, symbolic_cholesky, symbolic_cholesky_symmetric"); + "lookup, symbolic_lu, symbolic_cholesky, " + "symbolic_cholesky_symmetric, reorder_rcm, " +#if GKO_HAVE_METIS + "reorder_nd, " +#endif + "reorder_amd"; + +DEFINE_string(operations, "spgemm,spgeam,transpose", operations_string); DEFINE_bool(validate, false, "Check for correct sparsity pattern and compute the L2 norm " From 201ee956747aa5328dd4f524813b7721f4c60de0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 23 Jun 2023 10:58:05 +0200 Subject: [PATCH 003/583] improve formatting Co-authored-by: Yuhsiang M. 
Tsai --- benchmark/sparse_blas/operations.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index dc96143ed6d..66e5707c559 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -796,10 +796,9 @@ class ReorderApproxMinDegOperation : public BenchmarkOperation { const std::map(const Mtx*)>> - operation_map -{ - {"spgemm", - [](const Mtx* mtx) { return std::make_unique(mtx); }}, + operation_map{ + {"spgemm", + [](const Mtx* mtx) { return std::make_unique(mtx); }}, {"spgeam", [](const Mtx* mtx) { return std::make_unique(mtx); }}, {"transpose", @@ -834,17 +833,18 @@ const std::map(mtx); }}, - {"reorder_amd", [](const Mtx* mtx) { + {"reorder_amd", + [](const Mtx* mtx) { return std::make_unique(mtx); }}, + {"reorder_nd", + [](const Mtx* mtx) -> std::unique_ptr { #if GKO_HAVE_METIS - { - "reorder_nd", [](const Mtx* mtx) { - return std::make_unique(mtx); - } - } + return std::make_unique(mtx); +#else + GKO_NOT_COMPILED(METIS); #endif -}; + }}}; std::unique_ptr get_operation(std::string name, From 42c9d00fa9f5b07474c5331ccb10aefd39f57c27 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Apr 2023 10:55:35 +0200 Subject: [PATCH 004/583] add SparsityCsr sorting kernels --- .../matrix/sparsity_csr_kernels.hpp.inc | 32 +++--- core/test/utils/unsort_matrix.hpp | 19 +--- cuda/matrix/sparsity_csr_kernels.cu | 65 ++++++++++++ dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 45 ++++++++- hip/matrix/sparsity_csr_kernels.hip.cpp | 63 ++++++++++++ test/matrix/sparsity_csr_kernels.cpp | 98 +++++++++++++++++++ 6 files changed, 287 insertions(+), 35 deletions(-) diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc index dddd7946a04..2d2ca9a5183 100644 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc @@ -121,19 +121,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void sort_by_column_index(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); - - -template -void is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::SparsityCsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); +void fallback_sort(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + const auto nnz = to_sort->get_num_nonzeros(); + const auto num_rows = to_sort->get_size()[0]; + array row_idx_array(exec, nnz); + const auto row_idxs = row_idx_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); + // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort + thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz, + row_idxs); + thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, + col_idxs); +} diff --git a/core/test/utils/unsort_matrix.hpp b/core/test/utils/unsort_matrix.hpp index 04ece71d346..1af40352bd2 100644 --- a/core/test/utils/unsort_matrix.hpp +++ b/core/test/utils/unsort_matrix.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include @@ -55,24 +56,10 @@ namespace test { template void unsort_matrix(MtxPtr&& mtx, RandomEngine&& engine) { - using value_type = gko::detail::pointeeget_values())>; - using index_type = gko::detail::pointeeget_col_idxs())>; - auto nnz = mtx->get_num_stored_elements(); - if (nnz <= 0) { - return; - } - + using value_type = typename gko::detail::pointee::value_type; + using index_type = typename gko::detail::pointee::index_type; const auto exec = mtx->get_executor(); const auto master = exec->get_master(); - - // If exec is not the master/host, extract the master and perform the - // unsorting there, followed by copying it back - if (exec != master) { - auto h_mtx = mtx->clone(master); - unsort_matrix(h_mtx, engine); - mtx->copy_from(h_mtx); - return; - } matrix_data data; mtx->write(data); auto& nonzeros = data.nonzeros; diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 73e1fd9cb76..ab367c80b20 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -33,15 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" +#include + + #include #include "accessor/cuda_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" +#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" +#include "cuda/base/thrust.cuh" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" @@ -61,6 +68,7 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; +constexpr int default_block_size = 512; constexpr int spmv_block_size = 128; constexpr int warps_in_block = 4; @@ -68,6 +76,7 @@ constexpr int warps_in_block = 4; using classical_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" @@ -178,6 +187,62 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void sort_by_column_index(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto nnz = static_cast(to_sort->get_num_nonzeros()); + const auto num_rows = static_cast(to_sort->get_size()[0]); + const auto num_cols = static_cast(to_sort->get_size()[1]); + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + if (cusparse::is_supported::value) { + const auto handle = exec->get_cusparse_handle(); + auto descr = cusparse::create_mat_descr(); + array permutation_array(exec, to_sort->get_num_nonzeros()); + auto permutation = permutation_array.get_data(); + components::fill_seq_array(exec, permutation, + to_sort->get_num_nonzeros()); + size_type buffer_size{}; + cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs, + col_idxs, buffer_size); + array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + col_idxs, permutation, buffer); + cusparse::destroy(descr); + } else { + fallback_sort(exec, to_sort); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); + + +template +void 
is_sorted_by_column_index( + std::shared_ptr exec, + const matrix::SparsityCsr* to_check, bool* is_sorted) +{ + *is_sorted = true; + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); + auto gpu_array = array{exec, cpu_array}; + const auto num_rows = static_cast(to_check->get_size()[0]); + auto num_blocks = ceildiv(num_rows, default_block_size); + if (num_blocks > 0) { + kernel::check_unsorted<<get_stream()>>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + } + cpu_array = gpu_array; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); + + } // namespace sparsity_csr } // namespace cuda } // namespace kernels diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 2cebac00c5f..f12d15175b7 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -303,7 +303,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void sort_by_column_index(std::shared_ptr exec, matrix::SparsityCsr* to_sort) - GKO_NOT_IMPLEMENTED; +{ + const auto num_rows = to_sort->get_size()[0]; + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto cols = to_sort->get_const_col_idxs(); + auto queue = exec->get_queue(); + // build sorted postorder node list for each row + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) { + const auto row = idx_id[0]; + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + auto lower_end = row_begin; + // heap-sort the elements + std::make_heap(cols + row_begin, cols + lower_end); + std::sort_heap(cols + row_begin, cols + lower_end); + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); @@ -312,8 +329,30 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::SparsityCsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::SparsityCsr* to_check, bool* is_sorted) +{ + array is_sorted_device_array{exec, {true}}; + const auto num_rows = to_check->get_size()[0]; + const auto row_ptrs = to_check->get_const_row_ptrs(); + const auto cols = to_check->get_const_col_idxs(); + auto is_sorted_device = is_sorted_device_array.get_data(); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto begin = row_ptrs[row]; + const auto end = row_ptrs[row + 1]; + if (*is_sorted_device) { + for (auto i = begin; i < end - 1; i++) { + if (cols[i] > cols[i + 1]) { + *is_sorted_device = false; + break; + } + } + } + }); + }); + *is_sorted = exec->copy_val_to_host(is_sorted_device); +}; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index bc9cd0a31db..2084aa5656f 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -42,9 +43,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "accessor/hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" +#include "hip/base/thrust.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" @@ -64,6 +69,7 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; +constexpr int default_block_size = 512; constexpr int spmv_block_size = 256; constexpr int warps_in_block = 4; @@ -71,6 +77,7 @@ constexpr int warps_in_block = 4; using classical_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" @@ -181,6 +188,62 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); +template +void sort_by_column_index(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto nnz = static_cast(to_sort->get_num_nonzeros()); + const auto num_rows = static_cast(to_sort->get_size()[0]); + const auto num_cols = static_cast(to_sort->get_size()[1]); + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + if (hipsparse::is_supported::value) { + const auto handle = exec->get_hipsparse_handle(); + auto descr = hipsparse::create_mat_descr(); + array permutation_array(exec, to_sort->get_num_nonzeros()); + auto permutation = permutation_array.get_data(); + components::fill_seq_array(exec, permutation, + to_sort->get_num_nonzeros()); + size_type buffer_size{}; + hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, + row_ptrs, col_idxs, buffer_size); + array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + col_idxs, permutation, buffer); + hipsparse::destroy(descr); + } else { + fallback_sort(exec, to_sort); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); + + +template +void is_sorted_by_column_index( + std::shared_ptr exec, + const matrix::SparsityCsr* to_check, bool* is_sorted) +{ + *is_sorted = true; + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); + auto gpu_array = array{exec, cpu_array}; + const auto num_rows = static_cast(to_check->get_size()[0]); + auto num_blocks = ceildiv(num_rows, default_block_size); + if (num_blocks > 0) { + kernel::check_unsorted<<get_stream()>>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + } + cpu_array = gpu_array; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); + + } // namespace sparsity_csr } // namespace hip } // namespace kernels diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index b137ce72ca8..af1e6ca1401 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/matrix_generator.hpp" +#include "core/test/utils/unsort_matrix.hpp" #include "test/utils/executor.hpp" @@ -59,6 +60,7 @@ namespace { class SparsityCsr : public CommonTestFixture { protected: using Mtx = gko::matrix::SparsityCsr; + using Mtx64 = gko::matrix::SparsityCsr; SparsityCsr() : rng{9312} { @@ -145,4 +147,100 @@ TEST_F(SparsityCsr, ConvertToDenseIsEquivalentToRef) } +TEST_F(SparsityCsr, SortSortedMatrixIsEquivalentToRef) +{ + mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + auto cols_view = + gko::make_array_view(ref, mtx->get_num_nonzeros(), mtx->get_col_idxs()); + auto dcols_view = gko::make_array_view(exec, dmtx->get_num_nonzeros(), + dmtx->get_col_idxs()); + GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view); +} + + +TEST_F(SparsityCsr, SortSortedMatrix64IsEquivalentToRef) +{ + auto mtx64 = Mtx64::create(ref); + auto dmtx64 = Mtx64::create(exec); + gko::matrix_data data; + gko::matrix_data data64; + mtx->sort_by_column_index(); + mtx->write(data); + data64.size = data.size; + for (auto entry : data.nonzeros) { + data64.nonzeros.emplace_back(entry.row, entry.column, entry.value); + } + mtx64->read(data64); + dmtx64->read(data64); + + mtx64->sort_by_column_index(); + dmtx64->sort_by_column_index(); + + auto cols_view = gko::make_array_view(ref, mtx64->get_num_nonzeros(), + mtx64->get_col_idxs()); + auto dcols_view = gko::make_array_view(exec, dmtx64->get_num_nonzeros(), + dmtx64->get_col_idxs()); + GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view); +} + + +TEST_F(SparsityCsr, SortUnsortedMatrixIsEquivalentToRef) +{ + gko::test::unsort_matrix(mtx, rng); + dmtx->copy_from(mtx); + + mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + auto cols_view = + gko::make_array_view(ref, mtx->get_num_nonzeros(), mtx->get_col_idxs()); + auto dcols_view = gko::make_array_view(exec, dmtx->get_num_nonzeros(), + dmtx->get_col_idxs()); + GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view); +} + + +TEST_F(SparsityCsr, SortUnsortedMatrix64IsEquivalentToRef) +{ + gko::test::unsort_matrix(mtx, rng); + auto mtx64 = Mtx64::create(ref); + auto dmtx64 = Mtx64::create(exec); + gko::matrix_data data; + gko::matrix_data data64; + mtx->sort_by_column_index(); + mtx->write(data); + data64.size = data.size; + for (auto entry : data.nonzeros) { + data64.nonzeros.emplace_back(entry.row, entry.column, entry.value); + } + mtx64->read(data64); + dmtx64->read(data64); + + mtx64->sort_by_column_index(); + dmtx64->sort_by_column_index(); + + auto cols_view = gko::make_array_view(ref, mtx64->get_num_nonzeros(), + mtx64->get_col_idxs()); + auto dcols_view = gko::make_array_view(exec, dmtx64->get_num_nonzeros(), + dmtx64->get_col_idxs()); + GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view); +} + + +TEST_F(SparsityCsr, RecognizesUnsortedMatrix) +{ + gko::test::unsort_matrix(dmtx, rng); + + ASSERT_FALSE(dmtx->is_sorted_by_column_index()); +} + + +TEST_F(SparsityCsr, RecognizesSortedMatrix) +{ + ASSERT_TRUE(dmtx->is_sorted_by_column_index()); +} + + } // namespace From ce1e0f73b7e1af9a518555c5650cd044cd7c9d2d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Apr 2023 15:23:58 +0200 Subject: [PATCH 005/583] fix dpcpp compilation issues --- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index f12d15175b7..6001e687dca 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ 
b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -306,7 +306,7 @@ void sort_by_column_index(std::shared_ptr exec, { const auto num_rows = to_sort->get_size()[0]; const auto row_ptrs = to_sort->get_const_row_ptrs(); - const auto cols = to_sort->get_const_col_idxs(); + const auto cols = to_sort->get_col_idxs(); auto queue = exec->get_queue(); // build sorted postorder node list for each row queue->submit([&](sycl::handler& cgh) { @@ -314,10 +314,9 @@ void sort_by_column_index(std::shared_ptr exec, const auto row = idx_id[0]; const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; - auto lower_end = row_begin; // heap-sort the elements - std::make_heap(cols + row_begin, cols + lower_end); - std::sort_heap(cols + row_begin, cols + lower_end); + std::make_heap(cols + row_begin, cols + row_end); + std::sort_heap(cols + row_begin, cols + row_end); }); }); } From 3af2e6caa3e8b4278163f2ea440a1e8c2a5d7007 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 5 Jun 2023 21:39:10 +0200 Subject: [PATCH 006/583] fix unsorted test --- test/matrix/sparsity_csr_kernels.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index af1e6ca1401..d865570b6d0 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -209,7 +209,6 @@ TEST_F(SparsityCsr, SortUnsortedMatrix64IsEquivalentToRef) auto dmtx64 = Mtx64::create(exec); gko::matrix_data data; gko::matrix_data data64; - mtx->sort_by_column_index(); mtx->write(data); data64.size = data.size; for (auto entry : data.nonzeros) { From 6ef07edb9abb35da530198a6c298ed7963e342ba Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 7 Jun 2023 12:11:41 +0200 Subject: [PATCH 007/583] review updates Co-authored-by: Marcel Koch --- core/test/utils/unsort_matrix.hpp | 2 -- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/core/test/utils/unsort_matrix.hpp b/core/test/utils/unsort_matrix.hpp index 1af40352bd2..e22d86b326e 100644 --- a/core/test/utils/unsort_matrix.hpp +++ b/core/test/utils/unsort_matrix.hpp @@ -58,8 +58,6 @@ void unsort_matrix(MtxPtr&& mtx, RandomEngine&& engine) { using value_type = typename gko::detail::pointee::value_type; using index_type = typename gko::detail::pointee::index_type; - const auto exec = mtx->get_executor(); - const auto master = exec->get_master(); matrix_data data; mtx->write(data); auto& nonzeros = data.nonzeros; diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 6001e687dca..133e5f41478 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -330,11 +330,12 @@ void is_sorted_by_column_index( std::shared_ptr exec, const matrix::SparsityCsr* to_check, bool* is_sorted) { - array is_sorted_device_array{exec, {true}}; + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); + auto gpu_array = array{exec, cpu_array}; const auto num_rows = to_check->get_size()[0]; const auto row_ptrs = to_check->get_const_row_ptrs(); const auto cols = to_check->get_const_col_idxs(); - auto is_sorted_device = is_sorted_device_array.get_data(); + auto is_sorted_device = gpu_array.get_data(); exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { const auto row = static_cast(idx[0]); @@ -350,7 +351,7 @@ void is_sorted_by_column_index( } }); }); - *is_sorted = 
exec->copy_val_to_host(is_sorted_device); + cpu_array = gpu_array; }; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( From 6267a56c25dba1e77d2ca43d231738398803d19b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 20 Jun 2023 10:56:22 +0200 Subject: [PATCH 008/583] fix is_sorted initialization --- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 133e5f41478..1acc16d7026 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -330,6 +330,7 @@ void is_sorted_by_column_index( std::shared_ptr exec, const matrix::SparsityCsr* to_check, bool* is_sorted) { + *is_sorted = true; auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = array{exec, cpu_array}; const auto num_rows = to_check->get_size()[0]; From 70c75b38d3a67223ee59c786351c3fa071fe8851 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 30 Jun 2023 09:45:04 +0200 Subject: [PATCH 009/583] disable ambiguous range constructor this should fix the bug reported by sonarcloud: https://sonarcloud.io/project/issues?open=AYTucKfs8qk247btl14g&id=ginkgo-project_ginkgo --- include/ginkgo/core/base/range.hpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index c9713f33572..ed8901075bd 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -188,6 +188,27 @@ GKO_ATTRIBUTES constexpr GKO_INLINE equal_dimensions(first, second); } +/** + * Helper that stores the first type of a parameter pack, if its length is + * greater 0. + */ +template +struct head; + +/** + * @copydoc head + */ +template +struct head { + using type = First; +}; + +/** + * @copydoc head + */ +template +using head_t = typename head::type; + } // namespace detail @@ -327,7 +348,12 @@ class range { * * @param params parameters forwarded to Accessor constructor. */ - template + template < + typename... AccessorParams, + typename = std::enable_if_t< + sizeof...(AccessorParams) != 1 || + !std::is_same< + range, std::decay>>::value>> GKO_ATTRIBUTES constexpr explicit range(AccessorParams&&... 
params) : accessor_{std::forward(params)...} {} From 73bad7245b1af3f9ef4a6bd5c0abf3bff9477d7e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Jul 2023 10:03:08 +0200 Subject: [PATCH 010/583] Guard against spaces in GinkgoConfig.cmake.in --- cmake/GinkgoConfig.cmake.in | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index a2857310183..44aaf34fc3f 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -61,27 +61,27 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@) set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) -set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@) +set(GINKGO_CUDA_ARCHITECTURES "@GINKGO_CUDA_ARCHITECTURES@") set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@) -set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@) -set(GINKGO_CUDA_ARCH_FLAGS @GINKGO_CUDA_ARCH_FLAGS@) +set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") +set(GINKGO_CUDA_ARCH_FLAGS "@GINKGO_CUDA_ARCH_FLAGS@") -set(GINKGO_HIP_COMPILER_FLAGS @GINKGO_HIP_COMPILER_FLAGS@) -set(GINKGO_HIP_HCC_COMPILER_FLAGS @GINKGO_HIP_HCC_COMPILER_FLAGS@) -set(GINKGO_HIP_NVCC_COMPILER_FLAGS @GINKGO_HIP_NVCC_COMPILER_FLAGS@) -set(GINKGO_HIP_CLANG_COMPILER_FLAGS @GINKGO_HIP_CLANG_COMPILER_FLAGS@) +set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") +set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@") +set(GINKGO_HIP_NVCC_COMPILER_FLAGS "@GINKGO_HIP_NVCC_COMPILER_FLAGS@") +set(GINKGO_HIP_CLANG_COMPILER_FLAGS "@GINKGO_HIP_CLANG_COMPILER_FLAGS@") set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@) -set(GINKGO_HIP_PLATFORM_AMD_REGEX @HIP_PLATFORM_AMD_REGEX@) -set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX @HIP_PLATFORM_NVIDIA_REGEX@) -set(GINKGO_HIP_AMDGPU @GINKGO_HIP_AMDGPU@) +set(GINKGO_HIP_PLATFORM_AMD_REGEX "@HIP_PLATFORM_AMD_REGEX@") +set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX "@HIP_PLATFORM_NVIDIA_REGEX@") +set(GINKGO_HIP_AMDGPU "@GINKGO_HIP_AMDGPU@") set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@) -set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) +set(GINKGO_AMD_ARCH_FLAGS "@GINKGO_AMD_ARCH_FLAGS@") set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@) -set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) -set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) -set(GINKGO_DPL_ROOT @GINKGO_DPL_ROOT@) +set(GINKGO_DPCPP_FLAGS "@GINKGO_DPCPP_FLAGS@") +set(GINKGO_MKL_ROOT "@GINKGO_MKL_ROOT@") +set(GINKGO_DPL_ROOT "@GINKGO_DPL_ROOT@") set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@) @@ -117,9 +117,9 @@ set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(GINKGO_CUDA_COMPILER_VERSION @CMAKE_CUDA_COMPILER_VERSION@) set(GINKGO_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@") -set(GINKGO_CUBLAS_LIBRARIES @CUBLAS@) -set(GINKGO_CUSPARSE_LIBRARIES @CUSPARSE@) -set(GINKGO_CUDA_LIBRARIES @CUDA_RUNTIME_LIBS@) +set(GINKGO_CUBLAS_LIBRARIES "@CUBLAS@") +set(GINKGO_CUSPARSE_LIBRARIES "@CUSPARSE@") +set(GINKGO_CUDA_LIBRARIES "@CUDA_RUNTIME_LIBS@") set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@") set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@") @@ -129,8 +129,8 @@ set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@") # OpenMP set(GINKGO_OPENMP_VERSION @OpenMP_CXX_VERSION@) -set(GINKGO_OPENMP_LIB_NAMES @OpenMP_CXX_LIB_NAMES@) -set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@) +set(GINKGO_OPENMP_LIB_NAMES "@OpenMP_CXX_LIB_NAMES@") 
+set(GINKGO_OPENMP_LIBRARIES "@OpenMP_CXX_LIBRARIES@") set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@") From ab93e8c40e9c64b1dfe0a12ae7fe06962bcc94f1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 6 Jul 2023 11:29:03 +0200 Subject: [PATCH 011/583] Remove CUDA_ARCH from GPU-enabled CI jobs These are all jobs running on amdci right now. We can specify the CUDA architecture from environment variables of the gitlab-runner, which makes the test runs a bit more flexible as well. --- .gitlab-ci.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f81e271288c..d15c25dc270 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -98,7 +98,6 @@ build/cuda92/nompi/gcc/all/release/shared: BUILD_CUDA: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Release" - CUDA_ARCH: 61 # cuda 10.1 and friends # Build CUDA NVIDIA without omp @@ -169,7 +168,6 @@ build/cuda101/nompi/clang/all/release/static: # MPI_AS_ROOT: "ON" # BUILD_HIP: "OFF" # BUILD_TYPE: "Release" -# CUDA_ARCH: 61 #build/clang-cuda101/nompi/clang/cuda/debug/static: @@ -187,7 +185,6 @@ build/cuda101/nompi/clang/all/release/static: # BUILD_TYPE: "Debug" # FAST_TESTS: "ON" # BUILD_SHARED_LIBS: "OFF" -# CUDA_ARCH: 61 # cuda 10.2 and friends @@ -358,7 +355,6 @@ build/cuda114/nompi/gcc/cuda/debug/shared: CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 # nvhpc and friends @@ -381,7 +377,6 @@ build/nvhpc233/cuda120/nompi/nvcpp/release/static: CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 build/nvhpc227/cuda117/nompi/nvcpp/debug/shared: extends: @@ -401,7 +396,6 @@ build/nvhpc227/cuda117/nompi/nvcpp/debug/shared: CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 # ROCm 4.5 and friends build/amd/nompi/gcc/rocm45/release/shared: From 193cbbf14946da39b5b60dd535805ef6f8e7ae4e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 May 2023 13:26:18 +0200 Subject: [PATCH 012/583] enable ensemble builds - remove SYCL-specific headers from general builds - disable failing tests for rocFFT - disable DPC++ distributed tests --- include/ginkgo/core/base/math.hpp | 7 ------- test/matrix/fft_kernels.cpp | 6 ++++++ test/mpi/distributed/CMakeLists.txt | 4 ++-- test/mpi/distributed/preconditioner/CMakeLists.txt | 2 +- test/mpi/solver/CMakeLists.txt | 2 +- test/solver/idr_kernels.cpp | 5 +++++ 6 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 3a6152c55d4..70e4db5bb2d 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -47,13 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0 -// when using dpcpp compiler without dpcpp module -#if GINKGO_DPCPP_MAJOR_VERSION -#include -#endif - - namespace gko { diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index 59d2d2de68e..fd9dda821c0 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -138,6 +138,9 @@ TYPED_TEST(Fft, Apply1DIsEqualToReference) TYPED_TEST(Fft, ApplyStrided1DIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT has a bug related to strided 1D FFT"; +#endif using T = typename TestFixture::value_type; this->fft->apply(this->data_strided, this->out_strided); @@ -160,6 +163,9 @@ TYPED_TEST(Fft, Apply1DInverseIsEqualToReference) TYPED_TEST(Fft, ApplyStrided1DInverseIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT has a bug related to strided 1D FFT"; +#endif using T = typename TestFixture::value_type; this->ifft->apply(this->data_strided, this->out_strided); diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt index a92e0ef4f70..b2368777589 100644 --- a/test/mpi/distributed/CMakeLists.txt +++ b/test/mpi/distributed/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3) -ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) +ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) add_subdirectory(preconditioner) diff --git a/test/mpi/distributed/preconditioner/CMakeLists.txt b/test/mpi/distributed/preconditioner/CMakeLists.txt index 681bbec3bc9..4f734d21df8 100644 --- a/test/mpi/distributed/preconditioner/CMakeLists.txt +++ b/test/mpi/distributed/preconditioner/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_common_and_reference_test(schwarz MPI_SIZE 3) +ginkgo_create_common_and_reference_test(schwarz MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) diff --git a/test/mpi/solver/CMakeLists.txt b/test/mpi/solver/CMakeLists.txt index 43a2d870d3f..bffd7b5ab10 100644 --- a/test/mpi/solver/CMakeLists.txt +++ b/test/mpi/solver/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_common_and_reference_test(solver MPI_SIZE 3) +ginkgo_create_common_and_reference_test(solver MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index f7191483615..959c857cb71 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -40,6 +40,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#ifdef GKO_COMPILING_DPCPP +#include +#endif + + #include #include #include From d954f6d3a56990859dcd0a26756f8673ee2f2804 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 May 2023 13:26:50 +0200 Subject: [PATCH 013/583] uniform distributed test folder structure --- test/mpi/CMakeLists.txt | 1 - test/mpi/distributed/CMakeLists.txt | 1 + test/mpi/{ => distributed}/solver/CMakeLists.txt | 0 test/mpi/{ => distributed}/solver/solver.cpp | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename test/mpi/{ => distributed}/solver/CMakeLists.txt (100%) rename test/mpi/{ => distributed}/solver/solver.cpp (100%) diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt index f715ea482ec..9066de66970 100644 --- a/test/mpi/CMakeLists.txt +++ b/test/mpi/CMakeLists.txt @@ -1,2 +1 @@ add_subdirectory(distributed) -add_subdirectory(solver) diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt index b2368777589..b02a57b9983 100644 --- a/test/mpi/distributed/CMakeLists.txt +++ b/test/mpi/distributed/CMakeLists.txt @@ -2,3 +2,4 @@ ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcp ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) add_subdirectory(preconditioner) +add_subdirectory(solver) diff --git a/test/mpi/solver/CMakeLists.txt b/test/mpi/distributed/solver/CMakeLists.txt similarity index 100% rename from test/mpi/solver/CMakeLists.txt rename to test/mpi/distributed/solver/CMakeLists.txt diff --git a/test/mpi/solver/solver.cpp b/test/mpi/distributed/solver/solver.cpp similarity index 100% rename from test/mpi/solver/solver.cpp rename to test/mpi/distributed/solver/solver.cpp From 0b49cd3743af03b509cfc6c78c661b2a7f7ecf0c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Jul 2023 11:05:01 +0200 Subject: [PATCH 014/583] reenable distributed vector test for dpcpp --- test/mpi/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt index b02a57b9983..3d5e3cadd58 100644 --- a/test/mpi/distributed/CMakeLists.txt +++ b/test/mpi/distributed/CMakeLists.txt @@ -1,5 +1,5 @@ ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) -ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) add_subdirectory(preconditioner) add_subdirectory(solver) From b9f80134d934b650820f3c09209e7360d192d418 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 6 Jul 2023 15:12:54 +0200 Subject: [PATCH 015/583] move test/mpi/distributed to test/mpi --- test/mpi/CMakeLists.txt | 6 +++++- test/mpi/distributed/CMakeLists.txt | 5 ----- test/mpi/{distributed => }/matrix.cpp | 0 test/mpi/{distributed => }/preconditioner/CMakeLists.txt | 0 test/mpi/{distributed => }/preconditioner/schwarz.cpp | 0 test/mpi/{distributed => }/solver/CMakeLists.txt | 0 test/mpi/{distributed => }/solver/solver.cpp | 0 test/mpi/{distributed => }/vector.cpp | 0 8 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 test/mpi/distributed/CMakeLists.txt rename test/mpi/{distributed => }/matrix.cpp (100%) rename test/mpi/{distributed => }/preconditioner/CMakeLists.txt (100%) rename test/mpi/{distributed => }/preconditioner/schwarz.cpp (100%) rename test/mpi/{distributed => }/solver/CMakeLists.txt (100%) rename test/mpi/{distributed => }/solver/solver.cpp (100%) rename test/mpi/{distributed => }/vector.cpp 
(100%) diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt index 9066de66970..3d5e3cadd58 100644 --- a/test/mpi/CMakeLists.txt +++ b/test/mpi/CMakeLists.txt @@ -1 +1,5 @@ -add_subdirectory(distributed) +ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) + +add_subdirectory(preconditioner) +add_subdirectory(solver) diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt deleted file mode 100644 index 3d5e3cadd58..00000000000 --- a/test/mpi/distributed/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) -ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) - -add_subdirectory(preconditioner) -add_subdirectory(solver) diff --git a/test/mpi/distributed/matrix.cpp b/test/mpi/matrix.cpp similarity index 100% rename from test/mpi/distributed/matrix.cpp rename to test/mpi/matrix.cpp diff --git a/test/mpi/distributed/preconditioner/CMakeLists.txt b/test/mpi/preconditioner/CMakeLists.txt similarity index 100% rename from test/mpi/distributed/preconditioner/CMakeLists.txt rename to test/mpi/preconditioner/CMakeLists.txt diff --git a/test/mpi/distributed/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp similarity index 100% rename from test/mpi/distributed/preconditioner/schwarz.cpp rename to test/mpi/preconditioner/schwarz.cpp diff --git a/test/mpi/distributed/solver/CMakeLists.txt b/test/mpi/solver/CMakeLists.txt similarity index 100% rename from test/mpi/distributed/solver/CMakeLists.txt rename to test/mpi/solver/CMakeLists.txt diff --git a/test/mpi/distributed/solver/solver.cpp b/test/mpi/solver/solver.cpp similarity index 100% rename from test/mpi/distributed/solver/solver.cpp rename to test/mpi/solver/solver.cpp diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/vector.cpp similarity index 100% rename from test/mpi/distributed/vector.cpp rename to test/mpi/vector.cpp From a07875294f8ce1edf35667c733ee807ad57286a6 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 10 Jul 2023 14:50:06 +0200 Subject: [PATCH 016/583] adds interruptible to gitlab CI --- .gitlab-ci.yml | 7 +++++++ .gitlab/add-interrupt.yml | 2 ++ 2 files changed, 9 insertions(+) create mode 100644 .gitlab/add-interrupt.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d15c25dc270..b9385ebb3cb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,6 +19,13 @@ include: - local: '.gitlab/rules.yml' - local: '.gitlab/scripts.yml' - local: '.gitlab/variables.yml' + # This is a workaround to conditionally make the branch pipelines + # interruptible, because the flag does not directly support rules [1]. 
+ # + # [1] https://gitlab.com/gitlab-org/gitlab/-/issues/194023#note_1225906002 + - local: '.gitlab/add-interrupt.yml' + rules: + - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" sync: stage: sync diff --git a/.gitlab/add-interrupt.yml b/.gitlab/add-interrupt.yml new file mode 100644 index 00000000000..cf6fd95fe1e --- /dev/null +++ b/.gitlab/add-interrupt.yml @@ -0,0 +1,2 @@ +default: + interruptible: true From ba2d3d055c192d7037639cc9890616228889c8fc Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 10 Jul 2023 15:42:38 +0200 Subject: [PATCH 017/583] don't interrupt gh-pages or new-issue --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b9385ebb3cb..d6124211222 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -819,6 +819,7 @@ sonarqube_cov: # Deploy documentation to github-pages gh-pages: stage: deploy + interruptible: false extends: - .default_variables - .deploy_condition @@ -922,6 +923,7 @@ cudamemcheck: new-issue-on-failure: stage: on-failure + interruptible: false extends: - .default_variables - .use_status-job-settings From 032711ec88156ba231db7016a69a0a41334e0191 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 10 Jul 2023 15:52:33 +0200 Subject: [PATCH 018/583] no interrupt on tag --- .gitlab-ci.yml | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d6124211222..d43040620bb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,7 @@ include: # [1] https://gitlab.com/gitlab-org/gitlab/-/issues/194023#note_1225906002 - local: '.gitlab/add-interrupt.yml' rules: - - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" + - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" && CI_COMMIT_TAG !~ /^v\d+\.\d+\.\d+/ sync: stage: sync @@ -779,15 +779,15 @@ sonarqube_cov_: - PR_ID=$(curl -s "https://api.github.com/search/issues?q=sha:${CI_COMMIT_SHA}" | jq '.items[0].number') - if [[ "${PR_ID}" != "null" ]]; then - target_branch=$(curl -s - "https://api.github.com/repos/ginkgo-project/ginkgo/pulls/${PR_ID}" | jq - '.base.ref' | sed 's/"//g'); - sonar_branching="-Dsonar.pullrequest.branch=${CI_COMMIT_REF_NAME} - -Dsonar.pullrequest.base=${target_branch} - -Dsonar.pullrequest.key=${PR_ID}"; + target_branch=$(curl -s + "https://api.github.com/repos/ginkgo-project/ginkgo/pulls/${PR_ID}" | jq + '.base.ref' | sed 's/"//g'); + sonar_branching="-Dsonar.pullrequest.branch=${CI_COMMIT_REF_NAME} + -Dsonar.pullrequest.base=${target_branch} + -Dsonar.pullrequest.key=${PR_ID}"; else - sonar_branching="-Dsonar.branch.name=${CI_COMMIT_REF_NAME} - -Dsonar.branch.target=develop"; + sonar_branching="-Dsonar.branch.name=${CI_COMMIT_REF_NAME} + -Dsonar.branch.target=develop"; fi - ctest -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=COVERAGE -DGINKGO_SONARQUBE_TEST=ON @@ -831,13 +831,13 @@ gh-pages: # build docs - mkdir -p ${CI_JOB_NAME} && pushd ${CI_JOB_NAME} - cmake ${CI_PROJECT_DIR} - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} - -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF - -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF - -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF - -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF - -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON + 
-DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF + -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF + -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF + -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF + -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON - make usr - make pdf - popd @@ -854,7 +854,7 @@ gh-pages: - git diff --quiet HEAD || (git commit -m "Update documentation from ginkgo-project/ginkgo@${CURRENT_SHA}" && git push) dependencies: null - needs: [] + needs: [ ] threadsanitizer: @@ -867,10 +867,10 @@ threadsanitizer: script: - LD_PRELOAD=/usr/local/lib/libomp.so CC=clang CXX=clang++ - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN - -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer - -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1 - --timeout 6000 + ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN + -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer + -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1 + --timeout 6000 leaksanitizer: stage: QoS_tools @@ -933,7 +933,7 @@ new-issue-on-failure: refs: - develop - master - dependencies: [] + dependencies: [ ] ## Benchmark SpMV From 564e5e480aab680b7f3cf591b7000a402ffbcdb1 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 10 Jul 2023 16:01:31 +0200 Subject: [PATCH 019/583] undo formatting --- .gitlab-ci.yml | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d43040620bb..9d374d81eef 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -779,15 +779,15 @@ sonarqube_cov_: - PR_ID=$(curl -s "https://api.github.com/search/issues?q=sha:${CI_COMMIT_SHA}" | jq '.items[0].number') - if [[ "${PR_ID}" != "null" ]]; then - target_branch=$(curl -s - "https://api.github.com/repos/ginkgo-project/ginkgo/pulls/${PR_ID}" | jq - '.base.ref' | sed 's/"//g'); - sonar_branching="-Dsonar.pullrequest.branch=${CI_COMMIT_REF_NAME} - -Dsonar.pullrequest.base=${target_branch} - -Dsonar.pullrequest.key=${PR_ID}"; + target_branch=$(curl -s + "https://api.github.com/repos/ginkgo-project/ginkgo/pulls/${PR_ID}" | jq + '.base.ref' | sed 's/"//g'); + sonar_branching="-Dsonar.pullrequest.branch=${CI_COMMIT_REF_NAME} + -Dsonar.pullrequest.base=${target_branch} + -Dsonar.pullrequest.key=${PR_ID}"; else - sonar_branching="-Dsonar.branch.name=${CI_COMMIT_REF_NAME} - -Dsonar.branch.target=develop"; + sonar_branching="-Dsonar.branch.name=${CI_COMMIT_REF_NAME} + -Dsonar.branch.target=develop"; fi - ctest -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=COVERAGE -DGINKGO_SONARQUBE_TEST=ON @@ -831,13 +831,13 @@ gh-pages: # build docs - mkdir -p ${CI_JOB_NAME} && pushd ${CI_JOB_NAME} - cmake ${CI_PROJECT_DIR} - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} - -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF - -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF - -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF - -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF - -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON + -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} 
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF + -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF + -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF + -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF + -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON - make usr - make pdf - popd @@ -854,7 +854,7 @@ gh-pages: - git diff --quiet HEAD || (git commit -m "Update documentation from ginkgo-project/ginkgo@${CURRENT_SHA}" && git push) dependencies: null - needs: [ ] + needs: [] threadsanitizer: @@ -867,10 +867,10 @@ threadsanitizer: script: - LD_PRELOAD=/usr/local/lib/libomp.so CC=clang CXX=clang++ - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN - -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer - -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1 - --timeout 6000 + ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=TSAN + -DCTEST_MEMORYCHECK_TYPE=ThreadSanitizer + -DCTEST_MEMORYCHECK_SANITIZER_OPTIONS=ignore_noninstrumented_modules=1 + --timeout 6000 leaksanitizer: stage: QoS_tools @@ -933,7 +933,7 @@ new-issue-on-failure: refs: - develop - master - dependencies: [ ] + dependencies: [] ## Benchmark SpMV From 986acd9ac682fcdea22b389bcf188c85a6bd2ad6 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 20 Jan 2023 12:48:36 +0100 Subject: [PATCH 020/583] adds test for reduction on zero size inputs --- test/base/kernel_launch_generic.cpp | 173 +++++++++++++++------------- 1 file changed, 95 insertions(+), 78 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index a90a5ea6c70..4e57904a9d2 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -321,44 +321,45 @@ TEST_F(KernelLaunch, Runs2DDense) void run1d_reduction(std::shared_ptr exec) { - gko::array output{exec, 1}; + gko::array output{exec, {-1l}}; + auto run_reduction = [&](int64 init, size_type size) { + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, init, output.get_data(), + size, output, move_only_val); + }; - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( - exec, - [] GKO_KERNEL(auto i, auto a, auto dummy) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - static_assert(is_same::value, "dummy"); - return i + 1; - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output, move_only_val); + { + SCOPED_TRACE("Size 0"); + run_reduction(int64{1}, size_type{0}); - // 2 * sum i=0...99999 (i+1) - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{1}); + } - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( - exec, - [] GKO_KERNEL(auto i, auto a, auto dummy) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - static_assert(is_same::value, "dummy"); - return i + 1; - }, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "a"); - static_assert(is_same::value, "b"); - return i + j; - }, - [] GKO_KERNEL(auto j) { - 
static_assert(is_same::value, "value"); - return j * 2; - }, - int64{}, output.get_data(), size_type{100}, output, move_only_val); + { + SCOPED_TRACE("Size 100000"); + run_reduction(int64{0}, size_type{100000}); - // 2 * sum i=0...99 (i+1) - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + int64{10000100000}); + } + + { + SCOPED_TRACE("Size 100"); + run_reduction(int64{0}, size_type{100}); + + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + int64{10100}); + } } TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } @@ -366,54 +367,70 @@ TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } void run2d_reduction(std::shared_ptr exec) { - gko::array output{exec, 1}; + gko::array output{exec, {-1l}}; + auto run_reduction = [&](int64 init, gko::dim<2> size) { + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + init, output.get_data(), size, output, move_only_val); + }; - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - static_assert(is_same::value, "dummy"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "a"); - static_assert(is_same::value, "b"); - return i + j; - }, - [] GKO_KERNEL(auto j) { - static_assert(is_same::value, "value"); - return j * 4; - }, - int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, - move_only_val); + { + SCOPED_TRACE("Dim 0x0"); + run_reduction(int64{0}, gko::dim<2>{0, 0}); - // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); + } - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - static_assert(is_same::value, "dummy"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "a"); - static_assert(is_same::value, "b"); - return i + j; - }, - [] GKO_KERNEL(auto j) { - static_assert(is_same::value, "value"); - return j * 4; - }, - int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); + { + SCOPED_TRACE("Dim 0x10"); + run_reduction(int64{0}, gko::dim<2>{0, 10}); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); + } + + { + SCOPED_TRACE("Dim 10x0"); + run_reduction(int64{0}, gko::dim<2>{10, 0}); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); + } + + { + SCOPED_TRACE("Dim 1000x100"); + 
run_reduction(int64{0}, gko::dim<2>{1000, 100}); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + int64{10110100000}); + } - // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); + { + SCOPED_TRACE("Dim 10x10"); + run_reduction(int64{0}, gko::dim<2>{10, 10}); + + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + int64{12100}); + } } TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } From acab03e7910f81e7c260afebbe6eb9471a4bfc76 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 6 Jul 2023 12:07:52 +0200 Subject: [PATCH 021/583] fix dpcpp reduction for size=0 inputs --- dpcpp/base/kernel_launch_reduction.dp.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 6cae0c72dcb..1cf7c1f774a 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -194,8 +194,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } else { queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( - cgh, static_cast(size), num_workgroups, fn, op, finalize, - identity, result, args...); + cgh, static_cast(size), 1, fn, op, finalize, identity, + result, args...); }); } } @@ -240,9 +240,9 @@ void run_kernel_reduction_impl(std::shared_ptr exec, }); } else { queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( - cgh, rows, cols, num_workgroups, fn, op, finalize, identity, - result, args...); + generic_kernel_reduction_2d(cgh, rows, cols, 1, fn, + op, finalize, identity, + result, args...); }); } } From 95f3cb87ea5504c8abd9a54dde5c9a74f2e162a2 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 7 Jul 2023 13:32:22 +0200 Subject: [PATCH 022/583] review updates: - use correct identity value - remove incorrect comments Co-authored-by: Tobias Ribizel --- test/base/kernel_launch_generic.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 4e57904a9d2..3dd1570c5f8 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -338,16 +338,15 @@ void run1d_reduction(std::shared_ptr exec) { SCOPED_TRACE("Size 0"); - run_reduction(int64{1}, size_type{0}); + run_reduction(int64{0}, size_type{0}); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{1}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); } { SCOPED_TRACE("Size 100000"); run_reduction(int64{0}, size_type{100000}); - // 2 * sum i=0...99999 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{10000100000}); } @@ -394,7 +393,6 @@ void run2d_reduction(std::shared_ptr exec) SCOPED_TRACE("Dim 0x0"); run_reduction(int64{0}, gko::dim<2>{0, 0}); - // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); } @@ -402,7 +400,6 @@ void run2d_reduction(std::shared_ptr exec) SCOPED_TRACE("Dim 0x10"); run_reduction(int64{0}, gko::dim<2>{0, 10}); - // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); } @@ -410,7 +407,6 @@ void run2d_reduction(std::shared_ptr exec) SCOPED_TRACE("Dim 10x0"); run_reduction(int64{0}, gko::dim<2>{10, 0}); - // 4 * sum i=0...999 sum j=0...99 of 
(i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0}); } From 8c3076aa4805394339960476282b8c453561fd3b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 15 Apr 2023 17:08:57 +0200 Subject: [PATCH 023/583] make output more easily parseable --- benchmark/blas/blas_common.hpp | 5 +++-- benchmark/conversions/conversions.cpp | 5 +++-- benchmark/matrix_statistics/matrix_statistics.cpp | 2 +- benchmark/preconditioner/preconditioner.cpp | 5 +++-- benchmark/solver/solver_common.hpp | 5 +++-- benchmark/sparse_blas/sparse_blas.cpp | 5 +++-- benchmark/spmv/spmv_common.hpp | 5 +++-- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp index f36b7649ffc..fe0110f82fb 100644 --- a/benchmark/blas/blas_common.hpp +++ b/benchmark/blas/blas_common.hpp @@ -509,7 +509,8 @@ void apply_blas(const char* operation_name, std::shared_ptr exec, add_or_set_member(test_case["blas"][operation_name], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << e.what() << std::endl; } } @@ -546,7 +547,7 @@ void run_blas_benchmarks(std::shared_ptr exec, continue; } if (do_print) { - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; } // annotate the test case auto test_case_range = annotate(describe(test_case)); diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp index ec7febf262f..b249293116b 100644 --- a/benchmark/conversions/conversions.cpp +++ b/benchmark/conversions/conversions.cpp @@ -103,7 +103,8 @@ void convert_matrix(const gko::LinOp* matrix_from, const char* format_to, add_or_set_member(test_case["conversions"][conversion_name], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << e.what() << std::endl; } } @@ -150,7 +151,7 @@ int main(int argc, char* argv[]) } auto& conversion_case = test_case["conversions"]; - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; gko::matrix_data data; try { data = generator.generate_matrix_data(test_case); diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 45f21ca1e35..09cae6a7554 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -197,7 +197,7 @@ int main(int argc, char* argv[]) } auto& problem = test_case["problem"]; - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; auto matrix = DefaultSystemGenerator::generate_matrix_data( diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 281e64ddd76..d125b46bb34 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -248,7 +248,8 @@ void run_preconditioner(const char* precond_name, add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << 
e.what() << std::endl; } } @@ -310,7 +311,7 @@ int main(int argc, char* argv[]) })) { continue; } - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; // annotate the test case auto test_case_range = diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 64190f8d968..ae9ae6dc1fb 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -587,7 +587,8 @@ void solve_system(const std::string& solver_name, add_or_set_member(test_case["solver"][precond_solver_name], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << e.what() << std::endl; } } @@ -638,7 +639,7 @@ void run_solver_benchmarks(std::shared_ptr exec, annotate(system_generator.describe_config(test_case)); if (do_print) { - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; } using Vec = typename SystemGenerator::Vec; diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 3b0ce26db5f..cfa56ef81fe 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -145,7 +145,8 @@ void apply_sparse_blas(const char* operation_name, add_or_set_member(test_case[operation_name], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << e.what() << std::endl; } } @@ -192,7 +193,7 @@ int main(int argc, char* argv[]) allocator); } auto& sp_blas_case = test_case[benchmark_name]; - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; auto data = generator.generate_matrix_data(test_case); data.ensure_row_major_order(); std::clog << "Matrix is of size (" << data.size[0] << ", " diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index 4c40f1b9a7b..3c8d886df3b 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -146,7 +146,8 @@ void apply_spmv(const char* format_name, std::shared_ptr exec, add_or_set_member(test_case["spmv"][format_name], "error", msg_value, allocator); } - std::cerr << "Error when processing test case " << test_case << "\n" + std::cerr << "Error when processing test case\n" + << test_case << "\n" << "what(): " << e.what() << std::endl; } } @@ -184,7 +185,7 @@ void run_spmv_benchmark(std::shared_ptr exec, continue; } if (do_print) { - std::clog << "Running test case: " << test_case << std::endl; + std::clog << "Running test case\n" << test_case << std::endl; } // annotate the test case auto test_case_range = From d82282bfc827508fc0220657e6c050cdab3d1a86 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 15 Apr 2023 17:15:08 +0200 Subject: [PATCH 024/583] simplify run_all_benchmarks copy --- benchmark/CMakeLists.txt | 13 +------------ benchmark/run_all_benchmarks.sh | 0 2 files changed, 1 insertion(+), 12 deletions(-) mode change 100644 => 100755 benchmark/run_all_benchmarks.sh diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 040356f1666..72647928185 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -161,21 +161,10 @@ add_subdirectory(sparse_blas) add_subdirectory(spmv) add_subdirectory(tools) 
-add_custom_target(make_run_all_benchmarks ALL) -file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh - DESTINATION ${CMAKE_CURRENT_BINARY_DIR} - FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE) - -add_custom_command( - TARGET make_run_all_benchmarks POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh - ${CMAKE_CURRENT_BINARY_DIR}/run_all_benchmarks.sh) +configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY) add_custom_target(benchmark) add_custom_command( TARGET benchmark POST_BUILD COMMAND bash run_all_benchmarks.sh >/dev/null - DEPENDS make_run_all_benchmarks WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh old mode 100644 new mode 100755 From d1a82974199396f8da494a9f24cfd0ef998de54a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 15 Apr 2023 14:25:29 +0200 Subject: [PATCH 025/583] add test framework for benchmarks --- .gitignore | 3 + benchmark/CMakeLists.txt | 3 + benchmark/blas/distributed/CMakeLists.txt | 2 +- benchmark/conversions/CMakeLists.txt | 2 +- benchmark/solver/distributed/CMakeLists.txt | 2 +- benchmark/spmv/distributed/CMakeLists.txt | 2 +- benchmark/test/CMakeLists.txt | 28 + benchmark/test/blas.py | 25 + benchmark/test/conversion.py | 28 + benchmark/test/input.blas.json | 5 + benchmark/test/input.distributed_mtx.json | 7 + benchmark/test/input.distributed_solver.json | 10 + benchmark/test/input.mtx.json | 6 + benchmark/test/input.solver.json | 9 + benchmark/test/matrix_statistics.py | 18 + benchmark/test/multi_vector_distributed.py | 30 + benchmark/test/preconditioner.py | 23 + benchmark/test/reference/blas.profile.stderr | 130 ++ benchmark/test/reference/blas.profile.stdout | 29 + benchmark/test/reference/blas.simple.stderr | 76 + benchmark/test/reference/blas.simple.stdout | 29 + .../test/reference/conversion.all.stderr | 26 + .../test/reference/conversion.all.stdout | 77 + .../test/reference/conversion.profile.stderr | 153 ++ .../test/reference/conversion.profile.stdout | 32 + .../test/reference/conversion.simple.stderr | 17 + .../test/reference/conversion.simple.stdout | 32 + .../distributed_solver.profile.stderr | 1845 +++++++++++++++++ .../distributed_solver.profile.stdout | 64 + .../distributed_solver.simple.stderr | 19 + .../distributed_solver.simple.stdout | 65 + .../reference/matrix_statistics.simple.stderr | 9 + .../reference/matrix_statistics.simple.stdout | 38 + .../multi_vector_distributed.profile.stderr | 0 .../multi_vector_distributed.profile.stdout | 29 + .../multi_vector_distributed.simple.stderr | 86 + .../multi_vector_distributed.simple.stdout | 29 + .../reference/preconditioner.profile.stderr | 137 ++ .../reference/preconditioner.profile.stdout | 30 + .../reference/preconditioner.simple.stderr | 43 + .../reference/preconditioner.simple.stdout | 30 + .../test/reference/solver.profile.stderr | 1336 ++++++++++++ .../test/reference/solver.profile.stdout | 59 + benchmark/test/reference/solver.simple.stderr | 18 + benchmark/test/reference/solver.simple.stdout | 60 + .../test/reference/sparse_blas.profile.stderr | 104 + .../test/reference/sparse_blas.profile.stdout | 26 + .../test/reference/sparse_blas.simple.stderr | 38 + .../test/reference/sparse_blas.simple.stdout | 26 + benchmark/test/reference/spmv.profile.stderr | 178 ++ benchmark/test/reference/spmv.profile.stdout | 20 + benchmark/test/reference/spmv.simple.stderr | 32 + 
benchmark/test/reference/spmv.simple.stdout | 20 + .../reference/spmv_distributed.profile.stderr | 0 .../reference/spmv_distributed.profile.stdout | 21 + .../reference/spmv_distributed.simple.stderr | 34 + .../reference/spmv_distributed.simple.stdout | 21 + benchmark/test/solver.py | 23 + benchmark/test/solver_distributed.py | 24 + benchmark/test/sparse_blas.py | 23 + benchmark/test/spmv.py | 23 + benchmark/test/spmv_distributed.py | 28 + benchmark/test/test_framework.py.in | 120 ++ 63 files changed, 5458 insertions(+), 4 deletions(-) create mode 100644 benchmark/test/CMakeLists.txt create mode 100755 benchmark/test/blas.py create mode 100755 benchmark/test/conversion.py create mode 100644 benchmark/test/input.blas.json create mode 100644 benchmark/test/input.distributed_mtx.json create mode 100644 benchmark/test/input.distributed_solver.json create mode 100644 benchmark/test/input.mtx.json create mode 100644 benchmark/test/input.solver.json create mode 100755 benchmark/test/matrix_statistics.py create mode 100644 benchmark/test/multi_vector_distributed.py create mode 100755 benchmark/test/preconditioner.py create mode 100644 benchmark/test/reference/blas.profile.stderr create mode 100644 benchmark/test/reference/blas.profile.stdout create mode 100644 benchmark/test/reference/blas.simple.stderr create mode 100644 benchmark/test/reference/blas.simple.stdout create mode 100644 benchmark/test/reference/conversion.all.stderr create mode 100644 benchmark/test/reference/conversion.all.stdout create mode 100644 benchmark/test/reference/conversion.profile.stderr create mode 100644 benchmark/test/reference/conversion.profile.stdout create mode 100644 benchmark/test/reference/conversion.simple.stderr create mode 100644 benchmark/test/reference/conversion.simple.stdout create mode 100644 benchmark/test/reference/distributed_solver.profile.stderr create mode 100644 benchmark/test/reference/distributed_solver.profile.stdout create mode 100644 benchmark/test/reference/distributed_solver.simple.stderr create mode 100644 benchmark/test/reference/distributed_solver.simple.stdout create mode 100644 benchmark/test/reference/matrix_statistics.simple.stderr create mode 100644 benchmark/test/reference/matrix_statistics.simple.stdout create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stderr create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stdout create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stderr create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stdout create mode 100644 benchmark/test/reference/preconditioner.profile.stderr create mode 100644 benchmark/test/reference/preconditioner.profile.stdout create mode 100644 benchmark/test/reference/preconditioner.simple.stderr create mode 100644 benchmark/test/reference/preconditioner.simple.stdout create mode 100644 benchmark/test/reference/solver.profile.stderr create mode 100644 benchmark/test/reference/solver.profile.stdout create mode 100644 benchmark/test/reference/solver.simple.stderr create mode 100644 benchmark/test/reference/solver.simple.stdout create mode 100644 benchmark/test/reference/sparse_blas.profile.stderr create mode 100644 benchmark/test/reference/sparse_blas.profile.stdout create mode 100644 benchmark/test/reference/sparse_blas.simple.stderr create mode 100644 benchmark/test/reference/sparse_blas.simple.stdout create mode 100644 benchmark/test/reference/spmv.profile.stderr create mode 100644 benchmark/test/reference/spmv.profile.stdout 
create mode 100644 benchmark/test/reference/spmv.simple.stderr create mode 100644 benchmark/test/reference/spmv.simple.stdout create mode 100644 benchmark/test/reference/spmv_distributed.profile.stderr create mode 100644 benchmark/test/reference/spmv_distributed.profile.stdout create mode 100644 benchmark/test/reference/spmv_distributed.simple.stderr create mode 100644 benchmark/test/reference/spmv_distributed.simple.stdout create mode 100755 benchmark/test/solver.py create mode 100644 benchmark/test/solver_distributed.py create mode 100755 benchmark/test/sparse_blas.py create mode 100755 benchmark/test/spmv.py create mode 100644 benchmark/test/spmv_distributed.py create mode 100644 benchmark/test/test_framework.py.in diff --git a/.gitignore b/.gitignore index af0a88ef513..827f4025a2e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ compile_commands.json CTestTestfile.cmake build +### Python +__pycache__ + ### IDE # Clion .idea diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 72647928185..434474fd336 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -160,6 +160,9 @@ add_subdirectory(solver) add_subdirectory(sparse_blas) add_subdirectory(spmv) add_subdirectory(tools) +if (GINKGO_BUILD_TESTS) + add_subdirectory(test) +endif() configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY) diff --git a/benchmark/blas/distributed/CMakeLists.txt b/benchmark/blas/distributed/CMakeLists.txt index 1371294efb8..a756b9c0071 100644 --- a/benchmark/blas/distributed/CMakeLists.txt +++ b/benchmark/blas/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(multi-vector-distributed "NO" multi_vector.cpp) +ginkgo_add_typed_benchmark_executables(multi_vector_distributed "NO" multi_vector.cpp) diff --git a/benchmark/conversions/CMakeLists.txt b/benchmark/conversions/CMakeLists.txt index 0e0893c3aec..21dd363d3c0 100644 --- a/benchmark/conversions/CMakeLists.txt +++ b/benchmark/conversions/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(conversions "NO" conversions.cpp) +ginkgo_add_typed_benchmark_executables(conversion "NO" conversions.cpp) diff --git a/benchmark/solver/distributed/CMakeLists.txt b/benchmark/solver/distributed/CMakeLists.txt index ca6586f1acf..5f6acd5a06c 100644 --- a/benchmark/solver/distributed/CMakeLists.txt +++ b/benchmark/solver/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(solver-distributed "YES" solver.cpp) +ginkgo_add_typed_benchmark_executables(solver_distributed "YES" solver.cpp) diff --git a/benchmark/spmv/distributed/CMakeLists.txt b/benchmark/spmv/distributed/CMakeLists.txt index cadde3eea34..4322dd70e90 100644 --- a/benchmark/spmv/distributed/CMakeLists.txt +++ b/benchmark/spmv/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(spmv-distributed "YES" spmv.cpp) +ginkgo_add_typed_benchmark_executables(spmv_distributed "YES" spmv.cpp) diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt new file mode 100644 index 00000000000..b3acaf3b709 --- /dev/null +++ b/benchmark/test/CMakeLists.txt @@ -0,0 +1,28 @@ +find_package(Python3 COMPONENTS Interpreter REQUIRED) +function(add_benchmark_test test_name) + configure_file(${test_name}.py ${test_name}.py COPYONLY) + add_test(NAME benchmark_${test_name} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py + WORKING_DIRECTORY "$") + set(regenerate_target benchmark_test_${test_name}_regenerate) + add_custom_target(${regenerate_target} + 
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py --generate + COMMENT "Regenerating reference output for ${test_name}" + WORKING_DIRECTORY "$") + add_dependencies(${regenerate_target} ${test_name}) + add_dependencies(benchmark_test_regenerate ${regenerate_target}) +endfunction() +add_custom_target(benchmark_test_regenerate) +configure_file(test_framework.py.in test_framework.py @ONLY) +add_benchmark_test(blas) +add_benchmark_test(conversion) +add_benchmark_test(matrix_statistics) +add_benchmark_test(preconditioner) +add_benchmark_test(solver) +add_benchmark_test(sparse_blas) +add_benchmark_test(spmv) +if (GINKGO_BUILD_MPI) + add_benchmark_test(multi_vector_distributed) + add_benchmark_test(spmv_distributed) + add_benchmark_test(solver_distributed) +endif() \ No newline at end of file diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py new file mode 100755 index 00000000000..16a423ba696 --- /dev/null +++ b/benchmark/test/blas.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["blas/blas", "-input", '[{"n": 100}]'], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr") + +# stdin +test_framework.compare_output(["blas/blas"], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr", + stdin='[{"n": 100}]') + +# file +test_framework.compare_output(["blas/blas", "-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr", + stdin='[{"n": 100}]') + +# profiler annotations +test_framework.compare_output(["blas/blas", "-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="blas.profile.stdout", + expected_stderr="blas.profile.stderr", + stdin='[{"n": 100}]') diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py new file mode 100755 index 00000000000..1ef41c4a8ea --- /dev/null +++ b/benchmark/test/conversion.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr") + +# stdin +test_framework.compare_output(["conversion/conversion", "-formats", "coo,csr"], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]') + +# input file +test_framework.compare_output(["conversion/conversion", "-input", str(test_framework.sourcepath / "input.mtx.json"), "-formats", "coo,csr"], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr") + +# check that all conversions work +test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr,ell,sellp,hybrid"], + expected_stdout="conversion.all.stdout", + expected_stderr="conversion.all.stderr") + +# profiler annotations +test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr", '-profile', '-profiler_hook', 'debug'], + expected_stdout="conversion.profile.stdout", + expected_stderr="conversion.profile.stderr") diff --git a/benchmark/test/input.blas.json b/benchmark/test/input.blas.json new file mode 100644 index 00000000000..fe366aa6fa0 --- /dev/null +++ 
b/benchmark/test/input.blas.json @@ -0,0 +1,5 @@ +[ + { + "n": 100 + } +] \ No newline at end of file diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json new file mode 100644 index 00000000000..aca115179e6 --- /dev/null +++ b/benchmark/test/input.distributed_mtx.json @@ -0,0 +1,7 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil" + } +] \ No newline at end of file diff --git a/benchmark/test/input.distributed_solver.json b/benchmark/test/input.distributed_solver.json new file mode 100644 index 00000000000..16efbf03fba --- /dev/null +++ b/benchmark/test/input.distributed_solver.json @@ -0,0 +1,10 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + } + } +] \ No newline at end of file diff --git a/benchmark/test/input.mtx.json b/benchmark/test/input.mtx.json new file mode 100644 index 00000000000..fdeb10c8eee --- /dev/null +++ b/benchmark/test/input.mtx.json @@ -0,0 +1,6 @@ +[ + { + "size": 100, + "stencil": "7pt" + } +] \ No newline at end of file diff --git a/benchmark/test/input.solver.json b/benchmark/test/input.solver.json new file mode 100644 index 00000000000..0183700dfe8 --- /dev/null +++ b/benchmark/test/input.solver.json @@ -0,0 +1,9 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + } + } +] \ No newline at end of file diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py new file mode 100755 index 00000000000..d350c94fae5 --- /dev/null +++ b/benchmark/test/matrix_statistics.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["matrix_statistics/matrix_statistics", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr") + +# stdin +test_framework.compare_output(["matrix_statistics/matrix_statistics"], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]') + +# input file +test_framework.compare_output(["matrix_statistics/matrix_statistics", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr") diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py new file mode 100644 index 00000000000..bc039a1b9fe --- /dev/null +++ b/benchmark/test/multi_vector_distributed.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +import test_framework +base_flags = ["blas/distributed/multi_vector_distributed"] +# check that all input modes work: +# parameter +test_framework.compare_output_distributed(base_flags + ["-input", '[{"n": 100}]'], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + num_procs=3) + +# stdin +test_framework.compare_output_distributed(base_flags, + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3) + +# file +test_framework.compare_output_distributed(base_flags + ["-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3) + +# profiler 
annotations +test_framework.compare_output_distributed(base_flags + ["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="multi_vector_distributed.profile.stdout", + expected_stderr="multi_vector_distributed.profile.stderr", + stdin='[{"n": 100}]', + num_procs=3) diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py new file mode 100755 index 00000000000..67266e78324 --- /dev/null +++ b/benchmark/test/preconditioner.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["preconditioner/preconditioner", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr") + +# stdin +test_framework.compare_output(["preconditioner/preconditioner"], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]') + +# input file +test_framework.compare_output(["preconditioner/preconditioner", "-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr") + +# profiler annotations +test_framework.compare_output(["preconditioner/preconditioner", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="preconditioner.profile.stdout", + expected_stderr="preconditioner.profile.stderr") diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr new file mode 100644 index 00000000000..1fb7d5b93bc --- /dev/null +++ b/benchmark/test/reference/blas.profile.stderr @@ -0,0 +1,130 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalRunning test case +{ + "n": 100, + "blas": {} +} +DEBUG: begin n = 100 +DEBUG: begin copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end copy +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] +DEBUG: begin axpy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end axpy +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] +DEBUG: begin scal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::scale 
+DEBUG: end dense::scale +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end scal +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] +DEBUG: end n = 100 diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout new file mode 100644 index 00000000000..3a2e7e54f80 --- /dev/null +++ b/benchmark/test/reference/blas.profile.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr new file mode 100644 index 00000000000..e9b186e1353 --- /dev/null +++ b/benchmark/test/reference/blas.simple.stderr @@ -0,0 +1,76 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalRunning test case +{ + "n": 100, + "blas": {} +} +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout new file mode 100644 index 00000000000..08e692727fe --- /dev/null +++ b/benchmark/test/reference/blas.simple.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr new file mode 100644 index 00000000000..dbc5720527c --- /dev/null +++ b/benchmark/test/reference/conversion.all.stderr @@ -0,0 +1,26 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are 
coo,csr,ell,sellp,hybrid +Running test case +{ + "size": 100, + "stencil": "7pt", + "conversion": {} +} +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo + Running conversion: csr-ell + Running conversion: csr-sellp + Running conversion: csr-hybrid + Running conversion: ell-read + Running conversion: ell-csr + Running conversion: sellp-read + Running conversion: sellp-csr + Running conversion: hybrid-read + Running conversion: hybrid-csr diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout new file mode 100644 index 00000000000..c4b657a42c4 --- /dev/null +++ b/benchmark/test/reference/conversion.all.stdout @@ -0,0 +1,77 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr new file mode 100644 index 00000000000..6fc5cde206e --- /dev/null +++ b/benchmark/test/reference/conversion.profile.stderr @@ -0,0 +1,153 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr +Running test case +{ + "size": 100, + "stencil": "7pt", + "conversion": {} +} +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100,7pt) + Running conversion: coo-read +DEBUG: begin coo-read +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end coo-read + Running conversion: coo-csr +DEBUG: begin coo-csr +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy(gko::matrix::Coo,gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate 
+DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: end copy(gko::matrix::Coo,gko::matrix::Csr) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end coo-csr + Running conversion: csr-read +DEBUG: begin csr-read +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end csr-read + Running conversion: csr-coo +DEBUG: begin csr-coo +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Coo) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::convert_ptrs_to_idxs +DEBUG: end components::convert_ptrs_to_idxs +DEBUG: end copy(gko::matrix::Csr,gko::matrix::Coo) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end csr-coo +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout new file mode 100644 index 00000000000..b29815f6c17 --- /dev/null +++ b/benchmark/test/reference/conversion.profile.stdout @@ -0,0 +1,32 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr new file mode 100644 index 00000000000..1e4dbc4bd51 --- /dev/null +++ b/benchmark/test/reference/conversion.simple.stderr @@ -0,0 +1,17 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are 
coo,csr +Running test case +{ + "size": 100, + "stencil": "7pt", + "conversion": {} +} +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout new file mode 100644 index 00000000000..856f1330eea --- /dev/null +++ b/benchmark/test/reference/conversion.simple.stdout @@ -0,0 +1,32 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr new file mode 100644 index 00000000000..64b09a754c3 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -0,0 +1,1845 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +DEBUG: begin stencil(100,7pt,stencil) +Running test case +{ + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + }, + "solver": {} +} +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: begin 
allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +Matrix is of size (125, 125) +DEBUG: begin cg + Running solver: cg +DEBUG: begin none +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end 
copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin generate(gko::solver::Cg::Factory) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin copy(gko::matrix::Identity,gko::matrix::Identity) +DEBUG: end copy(gko::matrix::Identity,gko::matrix::Identity) +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin 
apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch 
+DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin 
residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end 
apply(gko::solver::Cg) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin free +DEBUG: end free +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 
+DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end 
cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end 
advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy 
+DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end 
dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate 
+DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: 
end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end apply(gko::solver::Cg) +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin free +DEBUG: end free +DEBUG: begin generate(gko::solver::Cg::Factory) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin 
advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end 
apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end 
dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin 
check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end apply(gko::solver::Cg) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: 
begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end none +DEBUG: end cg +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt,stencil) diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout new file mode 100644 index 00000000000..16dc6741930 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -0,0 +1,64 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate(gko::solver::Cg::Factory)": 1.0, + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply(gko::solver::Cg)": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply(gko::experimental::distributed::Matrix)": 1.0, + "dense::row_gather": 1.0, + "advanced_apply(gko::matrix::Csr)": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_squared_norm2": 1.0, + "dense::compute_sqrt": 1.0, + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check(gko::stop::Combined)": 1.0, + "check(gko::stop::ResidualNorm)": 1.0, + "residual_norm::residual_norm": 1.0, + "check(gko::stop::Iteration)": 1.0, + "cg::step_1": 1.0, + "apply(gko::experimental::distributed::Matrix)": 1.0, + "apply(gko::matrix::Csr)": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr new file mode 100644 index 00000000000..7800bb0b97e --- /dev/null +++ b/benchmark/test/reference/distributed_solver.simple.stderr @@ -0,0 +1,19 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +{ + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + }, + "solver": {} +} +Matrix is of size (125, 125) + Running solver: cg diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout new file mode 100644 index 00000000000..96ef102f8b8 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -0,0 +1,65 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "comm_pattern": "stencil", + 
"optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate(gko::solver::Cg::Factory)": 1.0, + "generate(gko::matrix::IdentityFactory)": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply(gko::solver::Cg)": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply(gko::experimental::distributed::Matrix)": 1.0, + "dense::row_gather": 1.0, + "advanced_apply(gko::matrix::Csr)": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_squared_norm2": 1.0, + "dense::compute_sqrt": 1.0, + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check(gko::stop::Combined)": 1.0, + "check(gko::stop::ResidualNorm)": 1.0, + "residual_norm::residual_norm": 1.0, + "check(gko::stop::Iteration)": 1.0, + "cg::step_1": 1.0, + "apply(gko::experimental::distributed::Matrix)": 1.0, + "apply(gko::matrix::Csr)": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr new file mode 100644 index 00000000000..e77cd5d413a --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running test case +{ + "size": 100, + "stencil": "7pt", + "problem": {} +} +Matrix is of size (125, 125) diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout new file mode 100644 index 00000000000..4470784e7c5 --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.simple.stdout @@ -0,0 +1,38 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "problem": { + "rows": 125, + "columns": 125, + "nonzeros": 725, + "row_distribution": { + "min": 4, + "q1": 5.0, + "median": 6.0, + "q3": 6.0, + "max": 7, + "mean": 5.8, + "variance": 0.7199999999999992, + "skewness": -0.23570226039551892, + "kurtosis": 2.388888888888889, + "hyperskewness": -1.741577812922432, + "hyperflatness": 7.762345679012379 + }, + "col_distribution": { + "min": 4, + "q1": 5.0, + "median": 6.0, + "q3": 6.0, + "max": 7, + "mean": 5.8, + "variance": 0.7199999999999992, + "skewness": -0.23570226039551892, + "kurtosis": 2.388888888888889, + "hyperskewness": -1.741577812922432, + "hyperflatness": 7.762345679012379 + } + } + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr new file mode 100644 index 00000000000..e69de29bb2d diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout new file mode 100644 index 00000000000..3a2e7e54f80 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 
1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr new file mode 100644 index 00000000000..23f3554e9c4 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr @@ -0,0 +1,86 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalThis is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalThis is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalRunning test case +{ + "n": 100, + "blas": {} +} +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout new file mode 100644 index 00000000000..08e692727fe --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr new file mode 100644 index 00000000000..97341459e69 --- /dev/null +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -0,0 +1,137 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case +{ + "size": 100, + "stencil": "7pt", + "preconditioner": {} +} +DEBUG: begin stencil(100,7pt) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate 
+DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +Matrix is of size (125, 125) +DEBUG: begin none +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin free +DEBUG: end free +DEBUG: end none +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 1 + }, + "apply": { + "components": { + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 1 + }, + "completed": true + } + } + } +] +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout new file mode 100644 index 00000000000..c775fd61285 --- /dev/null +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -0,0 +1,30 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 1 + }, + "apply": { + "components": { + 
"apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 1 + }, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr new file mode 100644 index 00000000000..4a7ee9498d5 --- /dev/null +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -0,0 +1,43 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case +{ + "size": 100, + "stencil": "7pt", + "preconditioner": {} +} +Matrix is of size (125, 125) +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout new file mode 100644 index 00000000000..84100628d73 --- /dev/null +++ b/benchmark/test/reference/preconditioner.simple.stdout @@ -0,0 +1,30 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr new file mode 100644 index 00000000000..e50ab7f27b3 --- /dev/null +++ b/benchmark/test/reference/solver.profile.stderr @@ -0,0 +1,1336 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +DEBUG: begin stencil(100,7pt) +Running test case +{ + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": {} +} +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end 
dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +Matrix is of size (125, 125) +DEBUG: begin cg + Running solver: cg +DEBUG: begin none +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin generate(gko::solver::Cg::Factory) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin copy(gko::matrix::Identity,gko::matrix::Identity) +DEBUG: end copy(gko::matrix::Identity,gko::matrix::Identity) +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin 
copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin 
dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) 
+DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end apply(gko::solver::Cg) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin free +DEBUG: end free +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin 
copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin 
dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: 
end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end 
dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin 
residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end apply(gko::solver::Cg) +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin free +DEBUG: end free +DEBUG: begin generate(gko::solver::Cg::Factory) +DEBUG: begin generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::matrix::IdentityFactory) +DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin iteration +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end 
apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) 
+DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: begin check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Iteration) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply(gko::matrix::Csr) +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply(gko::matrix::Identity) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply(gko::matrix::Identity) +DEBUG: 
begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin check(gko::stop::Combined) +DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check(gko::stop::ResidualNorm) +DEBUG: end check(gko::stop::Combined) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end iteration +DEBUG: end apply(gko::solver::Cg) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end none +DEBUG: end cg +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout new file mode 100644 index 00000000000..a61b432ca0d --- /dev/null +++ b/benchmark/test/reference/solver.profile.stdout @@ -0,0 +1,59 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate(gko::solver::Cg::Factory)": 1.0, + "generate(gko::matrix::IdentityFactory)": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply(gko::solver::Cg)": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply(gko::matrix::Csr)": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check(gko::stop::Combined)": 1.0, + "check(gko::stop::ResidualNorm)": 1.0, + "residual_norm::residual_norm": 1.0, + "check(gko::stop::Iteration)": 1.0, + "cg::step_1": 1.0, + "apply(gko::matrix::Csr)": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + 
"preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr new file mode 100644 index 00000000000..dad85f1c921 --- /dev/null +++ b/benchmark/test/reference/solver.simple.stderr @@ -0,0 +1,18 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +{ + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": {} +} +Matrix is of size (125, 125) + Running solver: cg diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout new file mode 100644 index 00000000000..2e44c73fdfa --- /dev/null +++ b/benchmark/test/reference/solver.simple.stdout @@ -0,0 +1,60 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate(gko::solver::Cg::Factory)": 1.0, + "generate(gko::matrix::IdentityFactory)": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply(gko::solver::Cg)": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply(gko::matrix::Csr)": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "apply(gko::matrix::Identity)": 1.0, + "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check(gko::stop::Combined)": 1.0, + "check(gko::stop::ResidualNorm)": 1.0, + "residual_norm::residual_norm": 1.0, + "check(gko::stop::Iteration)": 1.0, + "cg::step_1": 1.0, + "apply(gko::matrix::Csr)": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr new file mode 100644 index 00000000000..02dfdfdacfd --- /dev/null +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -0,0 +1,104 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are transposeRunning test case +{ + "size": 100, + "stencil": "7pt", + "sparse_blas": {} +} +DEBUG: begin stencil(100,7pt) +Matrix is of size (125, 125), 725 +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free 
+DEBUG: end free +DEBUG: begin transpose +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin csr::transpose +DEBUG: end csr::transpose +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin csr::transpose +DEBUG: end csr::transpose +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end transpose +Current state: +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout new file mode 100644 index 00000000000..ba92c30298a --- /dev/null +++ b/benchmark/test/reference/sparse_blas.profile.stdout @@ -0,0 +1,26 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr new file mode 100644 index 00000000000..a813994e739 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -0,0 +1,38 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are transposeRunning test case +{ + "size": 100, + "stencil": "7pt", + "sparse_blas": {} +} +Matrix is of size (125, 125), 725 +Current state: +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout new file mode 100644 index 00000000000..f39300ca35b --- /dev/null +++ b/benchmark/test/reference/sparse_blas.simple.stdout @@ -0,0 +1,26 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } 
+] diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr new file mode 100644 index 00000000000..3ddabd987ad --- /dev/null +++ b/benchmark/test/reference/spmv.profile.stderr @@ -0,0 +1,178 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case +{ + "size": 100, + "stencil": "7pt", + "spmv": {} +} +DEBUG: begin stencil(100,7pt) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +Matrix is of size (125, 125) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin apply(gko::matrix::Coo) +DEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: end apply(gko::matrix::Coo) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin coo +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply(gko::matrix::Coo) +DEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: end apply(gko::matrix::Coo) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end 
copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply(gko::matrix::Coo) +DEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: end apply(gko::matrix::Coo) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end coo +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "nnz": 725, + "optimal": {} + } +] +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout new file mode 100644 index 00000000000..ec7309613b6 --- /dev/null +++ b/benchmark/test/reference/spmv.profile.stdout @@ -0,0 +1,20 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "nnz": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr new file mode 100644 index 00000000000..8a2ebe9fe15 --- /dev/null +++ b/benchmark/test/reference/spmv.simple.stderr @@ -0,0 +1,32 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case +{ + "size": 100, + "stencil": "7pt", + "spmv": {} +} +Matrix is of size (125, 125) +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "nnz": 725, + "optimal": {} + } +] diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout new file mode 100644 index 00000000000..90f8903a452 --- /dev/null +++ b/benchmark/test/reference/spmv.simple.stdout @@ -0,0 +1,20 @@ + +[ + { + "size": 125, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "nnz": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr new file mode 100644 index 00000000000..e69de29bb2d diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout new file mode 100644 index 00000000000..2aeeeb5b0d5 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -0,0 +1,21 @@ + +[ + { + "size": 81, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 2316, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 1, + "completed": true + 
} + }, + "nnz": 135, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr new file mode 100644 index 00000000000..57f31d44686 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -0,0 +1,34 @@ +This is Ginkgo 1.5.0 (develop) + running with core module 1.5.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +Running test case +{ + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": {} +} +Matrix is of size (81, 81) +Current state: +[ + { + "size": 81, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 2316, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "nnz": 135, + "optimal": {} + } +] diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout new file mode 100644 index 00000000000..d8cd32ba834 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -0,0 +1,21 @@ + +[ + { + "size": 81, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 2316, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "nnz": 135, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py new file mode 100755 index 00000000000..afcbfde1a44 --- /dev/null +++ b/benchmark/test/solver.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["solver/solver", "-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr") + +# stdin +test_framework.compare_output(["solver/solver"], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]') + +# input file +test_framework.compare_output(["solver/solver", "-input", str(test_framework.sourcepath / "input.solver.json")], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr") + +# profiler annotations +test_framework.compare_output(["solver/solver", "-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="solver.profile.stdout", + expected_stderr="solver.profile.stderr") diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py new file mode 100644 index 00000000000..c19e14718c2 --- /dev/null +++ b/benchmark/test/solver_distributed.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import test_framework +base_flags = ["solver/distributed/solver_distributed"] +# check that all input modes work: +# parameter +test_framework.compare_output(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]'], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr") + +# stdin +test_framework.compare_output(base_flags, + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", + stdin='[{"size": 100, 
"stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]') + +# input file +test_framework.compare_output(base_flags + ["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr") + +# profiler annotations +test_framework.compare_output(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="distributed_solver.profile.stdout", + expected_stderr="distributed_solver.profile.stderr") diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py new file mode 100755 index 00000000000..94b3041ff96 --- /dev/null +++ b/benchmark/test/sparse_blas.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr") + +# stdin +test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose"], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]') + +# input file +test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr") + +# profiler annotations (transpose has the smallest number of allocations) +test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="sparse_blas.profile.stdout", + expected_stderr="sparse_blas.profile.stderr") diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py new file mode 100755 index 00000000000..718b34a2290 --- /dev/null +++ b/benchmark/test/spmv.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +import test_framework +# check that all input modes work: +# parameter +test_framework.compare_output(["spmv/spmv", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr") + +# stdin +test_framework.compare_output(["spmv/spmv"], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]') + +# input file +test_framework.compare_output(["spmv/spmv", "-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr") + +# profiler annotations +test_framework.compare_output(["spmv/spmv", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="spmv.profile.stdout", + expected_stderr="spmv.profile.stderr") diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py new file mode 100644 index 00000000000..d74730d2f49 --- /dev/null +++ b/benchmark/test/spmv_distributed.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import test_framework +base_flags = ["spmv/distributed/spmv_distributed"] +# check that all input modes work: +# parameter +test_framework.compare_output_distributed(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", 
"comm_pattern": "stencil"}]'], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3) + +# stdin +test_framework.compare_output_distributed(base_flags, + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]') + +# input file +test_framework.compare_output_distributed(base_flags + ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3) + +# profiler annotations +test_framework.compare_output_distributed(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'], + expected_stdout="spmv_distributed.profile.stdout", + expected_stderr="spmv_distributed.profile.stderr", + num_procs=3) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in new file mode 100644 index 00000000000..2d42f3677e6 --- /dev/null +++ b/benchmark/test/test_framework.py.in @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +import subprocess +import difflib +import json +import typing +import re +import pathlib +import sys +sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") +binpath = pathlib.Path("@PROJECT_BINARY_DIR@") +generate = False +if len(sys.argv) > 1 and sys.argv[1] == "--generate": + generate = True + + +denumberify_paths = ["time", "bandwidth", "flops", "components", + "residual_norm", "rhs_norm", "max_relative_norm2"] +empty_array_paths = ["recurrent_residuals", "true_residuals", + "implicit_residuals", "iteration_timestamps"] + + +def sanitize_json_single(key, value, sanitize_all): + if key in denumberify_paths and isinstance(value, float): + return 1.0 + if key in denumberify_paths and isinstance(value, typing.Dict): + return sanitize_json(value, True) + if key in empty_array_paths and isinstance(value, typing.List): + return [] + return sanitize_json(value, sanitize_all) + + +def sanitize_json(parsed_input, sanitize_all=False): + if isinstance(parsed_input, typing.Dict): + return {key: sanitize_json_single(key, value, sanitize_all) for key, value in parsed_input.items()} + elif isinstance(parsed_input, typing.List): + return [sanitize_json(e, sanitize_all) for e in parsed_input] + elif sanitize_all and isinstance(parsed_input, float): + return 1.0 + else: + return parsed_input + + +def sanitize_text(lines): + json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]] + json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]] + json_pairs = list(zip(json_begins, json_ends)) + if (len(json_pairs) == 0): + return lines + assert (all(begin < end for begin, end in json_pairs)) + nonjson_pairs = [(0, json_begins[0])] + list(zip(json_ends[:-1], + json_begins[1:])) + [(json_ends[-1], len(lines))] + combined_pairs = sorted([(begin, end, False) for begin, end in nonjson_pairs] + [ + (begin, end, True) for begin, end in json_pairs]) + texts = [("\n".join(lines[begin:end]), do_sanitize) + for begin, end, do_sanitize in combined_pairs] + reconstructed = [json.dumps(sanitize_json(json.loads( + t)), indent=4) if do_sanitize else t for t, do_sanitize in texts] + return "\n".join(reconstructed).split("\n") + + +def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patterns=[]): + lines = input.split("\n") + output_lines = [] + patterns = [re.compile(pattern) for 
pattern in ignore_patterns] + for line in lines: + keep = True + for pattern in patterns: + if re.match(pattern, line): + keep = False + break + if keep: + output_lines.append(line) + return sanitize_text(output_lines) + + +def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_flags=[]): + args[0] = binpath / "benchmark" / args[0] + expected_stdout = sourcepath / "reference" / expected_stdout + expected_stderr = sourcepath / "reference" / expected_stderr + result = subprocess.run(args=launcher_flags + args, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, input=bytes(stdin, "utf-8")) + print("TEST: {}".format( + " ".join(["'{}'".format(arg) for arg in launcher_flags + args]))) + version_patterns = [ + " the .* module is", + ] + if generate: + open(expected_stdout, "w").write("\n".join(determinize_text( + result.stdout.decode()))) + open(expected_stderr, "w").write("\n".join(determinize_text(result.stderr.decode( + ), ignore_patterns=version_patterns))) + print("GENERATED") + return + result_stdout_processed = determinize_text( + result.stdout.decode()) + result_stderr_processed = determinize_text(result.stderr.decode( + ), ignore_patterns=version_patterns) + expected_stdout_processed = determinize_text( + open(expected_stdout).read()) + expected_stderr_processed = determinize_text(open(expected_stderr).read( + ), ignore_patterns=version_patterns) + failed = False + if result_stdout_processed != expected_stdout_processed: + print("FAIL: stdout differs") + print("\n".join(difflib.unified_diff( + expected_stdout_processed, result_stdout_processed))) + failed = True + if result_stderr_processed != expected_stderr_processed: + print("FAIL: stderr differs") + print("\n".join(difflib.unified_diff( + expected_stderr_processed, result_stderr_processed))) + failed = True + if failed: + exit(1) + print("PASS") + + +def compare_output_distributed(args, expected_stdout, expected_stderr, num_procs, stdin=""): + compare_output(args, expected_stdout, expected_stderr, stdin, [ + "@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)]) From 1a3af15487cce3dd95c000099a5407ea5740667a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 18 Apr 2023 15:25:22 +0200 Subject: [PATCH 026/583] fix pathlib issue --- benchmark/test/test_framework.py.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 2d42f3677e6..56ff9ccbbb8 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -74,9 +74,9 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_flags=[]): - args[0] = binpath / "benchmark" / args[0] - expected_stdout = sourcepath / "reference" / expected_stdout - expected_stderr = sourcepath / "reference" / expected_stderr + args[0] = str(binpath / "benchmark" / args[0]) + expected_stdout = str(sourcepath / "reference" / expected_stdout) + expected_stderr = str(sourcepath / "reference" / expected_stderr) result = subprocess.run(args=launcher_flags + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, input=bytes(stdin, "utf-8")) print("TEST: {}".format( From e605f2b397ecc17b3e584e0819a7b41ed5ca8be6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Apr 2023 09:58:22 +0200 Subject: [PATCH 027/583] fix benchmark tests for multi-config generators --- benchmark/test/CMakeLists.txt | 4 ++-- benchmark/test/blas.py | 8 ++++---- 
benchmark/test/conversion.py | 10 +++++----- benchmark/test/matrix_statistics.py | 6 +++--- benchmark/test/multi_vector_distributed.py | 9 ++++----- benchmark/test/preconditioner.py | 8 ++++---- benchmark/test/solver.py | 8 ++++---- benchmark/test/solver_distributed.py | 9 ++++----- benchmark/test/sparse_blas.py | 8 ++++---- benchmark/test/spmv.py | 8 ++++---- benchmark/test/spmv_distributed.py | 9 ++++----- benchmark/test/test_framework.py.in | 4 ++-- 12 files changed, 44 insertions(+), 47 deletions(-) diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt index b3acaf3b709..1cd589927fa 100644 --- a/benchmark/test/CMakeLists.txt +++ b/benchmark/test/CMakeLists.txt @@ -2,11 +2,11 @@ find_package(Python3 COMPONENTS Interpreter REQUIRED) function(add_benchmark_test test_name) configure_file(${test_name}.py ${test_name}.py COPYONLY) add_test(NAME benchmark_${test_name} - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $ WORKING_DIRECTORY "$") set(regenerate_target benchmark_test_${test_name}_regenerate) add_custom_target(${regenerate_target} - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py --generate + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $ --generate COMMENT "Regenerating reference output for ${test_name}" WORKING_DIRECTORY "$") add_dependencies(${regenerate_target} ${test_name}) diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py index 16a423ba696..e099718bae0 100755 --- a/benchmark/test/blas.py +++ b/benchmark/test/blas.py @@ -2,24 +2,24 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["blas/blas", "-input", '[{"n": 100}]'], +test_framework.compare_output(["-input", '[{"n": 100}]'], expected_stdout="blas.simple.stdout", expected_stderr="blas.simple.stderr") # stdin -test_framework.compare_output(["blas/blas"], +test_framework.compare_output([], expected_stdout="blas.simple.stdout", expected_stderr="blas.simple.stderr", stdin='[{"n": 100}]') # file -test_framework.compare_output(["blas/blas", "-input", str(test_framework.sourcepath / "input.blas.json")], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.blas.json")], expected_stdout="blas.simple.stdout", expected_stderr="blas.simple.stderr", stdin='[{"n": 100}]') # profiler annotations -test_framework.compare_output(["blas/blas", "-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="blas.profile.stdout", expected_stderr="blas.profile.stderr", stdin='[{"n": 100}]') diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py index 1ef41c4a8ea..91e71cc9e89 100755 --- a/benchmark/test/conversion.py +++ b/benchmark/test/conversion.py @@ -2,27 +2,27 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"], expected_stdout="conversion.simple.stdout", expected_stderr="conversion.simple.stderr") # stdin -test_framework.compare_output(["conversion/conversion", "-formats", "coo,csr"], +test_framework.compare_output(["-formats", "coo,csr"], expected_stdout="conversion.simple.stdout", 
expected_stderr="conversion.simple.stderr", stdin='[{"size": 100, "stencil": "7pt"}]') # input file -test_framework.compare_output(["conversion/conversion", "-input", str(test_framework.sourcepath / "input.mtx.json"), "-formats", "coo,csr"], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json"), "-formats", "coo,csr"], expected_stdout="conversion.simple.stdout", expected_stderr="conversion.simple.stderr") # check that all conversions work -test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr,ell,sellp,hybrid"], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr,ell,sellp,hybrid"], expected_stdout="conversion.all.stdout", expected_stderr="conversion.all.stderr") # profiler annotations -test_framework.compare_output(["conversion/conversion", "-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr", '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr", '-profile', '-profiler_hook', 'debug'], expected_stdout="conversion.profile.stdout", expected_stderr="conversion.profile.stderr") diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py index d350c94fae5..62547acfbeb 100755 --- a/benchmark/test/matrix_statistics.py +++ b/benchmark/test/matrix_statistics.py @@ -2,17 +2,17 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["matrix_statistics/matrix_statistics", "-input", '[{"size": 100, "stencil": "7pt"}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="matrix_statistics.simple.stdout", expected_stderr="matrix_statistics.simple.stderr") # stdin -test_framework.compare_output(["matrix_statistics/matrix_statistics"], +test_framework.compare_output([], expected_stdout="matrix_statistics.simple.stdout", expected_stderr="matrix_statistics.simple.stderr", stdin='[{"size": 100, "stencil": "7pt"}]') # input file -test_framework.compare_output(["matrix_statistics/matrix_statistics", "-input", '[{"size": 100, "stencil": "7pt"}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="matrix_statistics.simple.stdout", expected_stderr="matrix_statistics.simple.stderr") diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py index bc039a1b9fe..808a7c3e458 100644 --- a/benchmark/test/multi_vector_distributed.py +++ b/benchmark/test/multi_vector_distributed.py @@ -1,29 +1,28 @@ #!/usr/bin/env python3 import test_framework -base_flags = ["blas/distributed/multi_vector_distributed"] # check that all input modes work: # parameter -test_framework.compare_output_distributed(base_flags + ["-input", '[{"n": 100}]'], +test_framework.compare_output_distributed(["-input", '[{"n": 100}]'], expected_stdout="multi_vector_distributed.simple.stdout", expected_stderr="multi_vector_distributed.simple.stderr", num_procs=3) # stdin -test_framework.compare_output_distributed(base_flags, +test_framework.compare_output_distributed([], expected_stdout="multi_vector_distributed.simple.stdout", expected_stderr="multi_vector_distributed.simple.stderr", stdin='[{"n": 100}]', num_procs=3) # file -test_framework.compare_output_distributed(base_flags + ["-input", str(test_framework.sourcepath / "input.blas.json")], +test_framework.compare_output_distributed(["-input", 
str(test_framework.sourcepath / "input.blas.json")], expected_stdout="multi_vector_distributed.simple.stdout", expected_stderr="multi_vector_distributed.simple.stderr", stdin='[{"n": 100}]', num_procs=3) # profiler annotations -test_framework.compare_output_distributed(base_flags + ["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output_distributed(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="multi_vector_distributed.profile.stdout", expected_stderr="multi_vector_distributed.profile.stderr", stdin='[{"n": 100}]', diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py index 67266e78324..4a044cd25f5 100755 --- a/benchmark/test/preconditioner.py +++ b/benchmark/test/preconditioner.py @@ -2,22 +2,22 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["preconditioner/preconditioner", "-input", '[{"size": 100, "stencil": "7pt"}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="preconditioner.simple.stdout", expected_stderr="preconditioner.simple.stderr") # stdin -test_framework.compare_output(["preconditioner/preconditioner"], +test_framework.compare_output([], expected_stdout="preconditioner.simple.stdout", expected_stderr="preconditioner.simple.stderr", stdin='[{"size": 100, "stencil": "7pt"}]') # input file -test_framework.compare_output(["preconditioner/preconditioner", "-input", str(test_framework.sourcepath / "input.mtx.json")], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json")], expected_stdout="preconditioner.simple.stdout", expected_stderr="preconditioner.simple.stderr") # profiler annotations -test_framework.compare_output(["preconditioner/preconditioner", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="preconditioner.profile.stdout", expected_stderr="preconditioner.profile.stderr") diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py index afcbfde1a44..fd8130e0ae1 100755 --- a/benchmark/test/solver.py +++ b/benchmark/test/solver.py @@ -2,22 +2,22 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["solver/solver", "-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], expected_stdout="solver.simple.stdout", expected_stderr="solver.simple.stderr") # stdin -test_framework.compare_output(["solver/solver"], +test_framework.compare_output([], expected_stdout="solver.simple.stdout", expected_stderr="solver.simple.stderr", stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]') # input file -test_framework.compare_output(["solver/solver", "-input", str(test_framework.sourcepath / "input.solver.json")], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.solver.json")], expected_stdout="solver.simple.stdout", expected_stderr="solver.simple.stderr") # profiler annotations -test_framework.compare_output(["solver/solver", "-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', 
'-profile', '-profiler_hook', 'debug'], expected_stdout="solver.profile.stdout", expected_stderr="solver.profile.stderr") diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py index c19e14718c2..f8a02861e26 100644 --- a/benchmark/test/solver_distributed.py +++ b/benchmark/test/solver_distributed.py @@ -1,24 +1,23 @@ #!/usr/bin/env python3 import test_framework -base_flags = ["solver/distributed/solver_distributed"] # check that all input modes work: # parameter -test_framework.compare_output(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]'], expected_stdout="distributed_solver.simple.stdout", expected_stderr="distributed_solver.simple.stderr") # stdin -test_framework.compare_output(base_flags, +test_framework.compare_output([], expected_stdout="distributed_solver.simple.stdout", expected_stderr="distributed_solver.simple.stderr", stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]') # input file -test_framework.compare_output(base_flags + ["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], expected_stdout="distributed_solver.simple.stdout", expected_stderr="distributed_solver.simple.stderr") # profiler annotations -test_framework.compare_output(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="distributed_solver.profile.stdout", expected_stderr="distributed_solver.profile.stderr") diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py index 94b3041ff96..913aac94d07 100755 --- a/benchmark/test/sparse_blas.py +++ b/benchmark/test/sparse_blas.py @@ -2,22 +2,22 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], +test_framework.compare_output(["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="sparse_blas.simple.stdout", expected_stderr="sparse_blas.simple.stderr") # stdin -test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose"], +test_framework.compare_output(["-operations", "transpose"], expected_stdout="sparse_blas.simple.stdout", expected_stderr="sparse_blas.simple.stderr", stdin='[{"size": 100, "stencil": "7pt"}]') # input file -test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", str(test_framework.sourcepath / "input.mtx.json")], +test_framework.compare_output(["-operations", "transpose", "-input", str(test_framework.sourcepath / "input.mtx.json")], expected_stdout="sparse_blas.simple.stdout", expected_stderr="sparse_blas.simple.stderr") # profiler annotations (transpose has the smallest number of allocations) -test_framework.compare_output(["sparse_blas/sparse_blas", "-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], 
+test_framework.compare_output(["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="sparse_blas.profile.stdout", expected_stderr="sparse_blas.profile.stderr") diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py index 718b34a2290..d3f3015b9dd 100755 --- a/benchmark/test/spmv.py +++ b/benchmark/test/spmv.py @@ -2,22 +2,22 @@ import test_framework # check that all input modes work: # parameter -test_framework.compare_output(["spmv/spmv", "-input", '[{"size": 100, "stencil": "7pt"}]'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="spmv.simple.stdout", expected_stderr="spmv.simple.stderr") # stdin -test_framework.compare_output(["spmv/spmv"], +test_framework.compare_output([], expected_stdout="spmv.simple.stdout", expected_stderr="spmv.simple.stderr", stdin='[{"size": 100, "stencil": "7pt"}]') # input file -test_framework.compare_output(["spmv/spmv", "-input", str(test_framework.sourcepath / "input.mtx.json")], +test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json")], expected_stdout="spmv.simple.stdout", expected_stderr="spmv.simple.stderr") # profiler annotations -test_framework.compare_output(["spmv/spmv", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="spmv.profile.stdout", expected_stderr="spmv.profile.stderr") diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py index d74730d2f49..f6aa1accbe9 100644 --- a/benchmark/test/spmv_distributed.py +++ b/benchmark/test/spmv_distributed.py @@ -1,28 +1,27 @@ #!/usr/bin/env python3 import test_framework -base_flags = ["spmv/distributed/spmv_distributed"] # check that all input modes work: # parameter -test_framework.compare_output_distributed(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], +test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], expected_stdout="spmv_distributed.simple.stdout", expected_stderr="spmv_distributed.simple.stderr", num_procs=3) # stdin -test_framework.compare_output_distributed(base_flags, +test_framework.compare_output_distributed([], expected_stdout="spmv_distributed.simple.stdout", expected_stderr="spmv_distributed.simple.stderr", num_procs=3, stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]') # input file -test_framework.compare_output_distributed(base_flags + ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], +test_framework.compare_output_distributed(["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], expected_stdout="spmv_distributed.simple.stdout", expected_stderr="spmv_distributed.simple.stderr", num_procs=3) # profiler annotations -test_framework.compare_output_distributed(base_flags + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'], +test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'], expected_stdout="spmv_distributed.profile.stdout", expected_stderr="spmv_distributed.profile.stderr", num_procs=3) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 
56ff9ccbbb8..e53a35c30a8 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -9,7 +9,7 @@ import sys sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") generate = False -if len(sys.argv) > 1 and sys.argv[1] == "--generate": +if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True @@ -74,7 +74,7 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_flags=[]): - args[0] = str(binpath / "benchmark" / args[0]) + args = [sys.argv[1]] + args expected_stdout = str(sourcepath / "reference" / expected_stdout) expected_stderr = str(sourcepath / "reference" / expected_stderr) result = subprocess.run(args=launcher_flags + args, stdout=subprocess.PIPE, From 5a3f4d3aff51a74179dc64edc7fb4b775fab4dcd Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Apr 2023 13:13:29 +0200 Subject: [PATCH 028/583] handle windows newlines correctly --- benchmark/test/test_framework.py.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index e53a35c30a8..27424cc30b6 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -96,8 +96,8 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl result_stderr_processed = determinize_text(result.stderr.decode( ), ignore_patterns=version_patterns) expected_stdout_processed = determinize_text( - open(expected_stdout).read()) - expected_stderr_processed = determinize_text(open(expected_stderr).read( + open(expected_stdout, 'rU').read()) + expected_stderr_processed = determinize_text(open(expected_stderr, 'rU').read( ), ignore_patterns=version_patterns) failed = False if result_stdout_processed != expected_stdout_processed: From de15622efc78465b6acbf60a951910a67ce00d01 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Apr 2023 13:16:42 +0200 Subject: [PATCH 029/583] fix SYCL warnings in output --- .gitlab-ci.yml | 6 +++--- dpcpp/get_info.cmake | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9d374d81eef..85683fc100c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -615,7 +615,7 @@ build/dpcpp/2022-1/cpu/release/static: BUILD_DPCPP: "ON" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" - SYCL_DEVICE_TYPE: "CPU" + SYCL_DEVICE_FILTER: "CPU" SLURM_PARTITION: "cpu" SLURM_TIME: "2:00:00" # This job is not in exclusive mode @@ -634,7 +634,7 @@ build/dpcpp/igpu/release/shared: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_TYPE: "GPU" + SYCL_DEVICE_FILTER: "GPU" # TODO: Enable when debug shared library size issues are fixed # build/dpcpp/level_zero_igpu/debug/shared: @@ -666,7 +666,7 @@ build/dpcpp/dgpu/release/static: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OF" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_TYPE: "GPU" + SYCL_DEVICE_FILTER: "GPU" build/dpcpp/level_zero_dgpu/release/shared: extends: diff --git a/dpcpp/get_info.cmake b/dpcpp/get_info.cmake index 36918a3a8c6..ee9c0398f3e 100644 --- a/dpcpp/get_info.cmake +++ b/dpcpp/get_info.cmake @@ -3,6 +3,5 @@ ginkgo_print_module_footer(${detailed_log} "DPCPP variables:") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_FLAGS") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SINGLE_MODE") ginkgo_print_module_footer(${detailed_log} "DPCPP environment 
variables:") -ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_TYPE") -ginkgo_print_env_variable(${detailed_log} "SYCL_BE") +ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_FILTER") ginkgo_print_module_footer(${detailed_log} "") From f7051d1a2c23a142c03f04637d544093a2101590 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 20 Apr 2023 11:53:19 +0200 Subject: [PATCH 030/583] strip implementation-dependent demangled typenames --- .../test/reference/conversion.profile.stderr | 8 +- .../reference/preconditioner.profile.stderr | 32 +- .../reference/preconditioner.profile.stdout | 6 +- .../reference/preconditioner.simple.stdout | 6 +- .../test/reference/solver.profile.stderr | 688 +++++++++--------- .../test/reference/solver.profile.stdout | 15 +- benchmark/test/reference/solver.simple.stdout | 15 +- benchmark/test/reference/spmv.profile.stderr | 32 +- benchmark/test/test_framework.py.in | 28 +- 9 files changed, 414 insertions(+), 416 deletions(-) diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 6fc5cde206e..8ea580247d8 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -43,7 +43,7 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin copy(gko::matrix::Coo,gko::matrix::Csr) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free @@ -58,7 +58,7 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: end copy(gko::matrix::Coo,gko::matrix::Csr) +DEBUG: end copy() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -123,7 +123,7 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Coo) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin copy @@ -136,7 +136,7 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::convert_ptrs_to_idxs DEBUG: end components::convert_ptrs_to_idxs -DEBUG: end copy(gko::matrix::Csr,gko::matrix::Coo) +DEBUG: end copy() DEBUG: begin free DEBUG: end free DEBUG: begin free diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 97341459e69..86ec044eb40 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -61,36 +61,36 @@ DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data Matrix is of size (125, 125) DEBUG: begin none -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin free DEBUG: end free -DEBUG: begin 
copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin free DEBUG: end free DEBUG: end none diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout index c775fd61285..ba967989af4 100644 --- a/benchmark/test/reference/preconditioner.profile.stdout +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -7,7 +7,7 @@ "none": { "generate": { "components": { - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "overhead": 1.0 }, "time": 1.0, @@ -15,8 +15,8 @@ }, "apply": { "components": { - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "apply()": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "overhead": 1.0 }, diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout index 84100628d73..c47146a72e1 100644 --- a/benchmark/test/reference/preconditioner.simple.stdout +++ b/benchmark/test/reference/preconditioner.simple.stdout @@ -7,7 +7,7 @@ "none": { "generate": { "components": { - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "overhead": 1.0 }, "time": 1.0, @@ -15,8 +15,8 @@ }, "apply": { "components": { - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "apply()": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "overhead": 1.0 }, diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index e50ab7f27b3..8aa04832601 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -55,12 +55,12 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() Matrix is of size (125, 125) DEBUG: begin cg Running solver: cg @@ -73,19 +73,19 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin generate(gko::solver::Cg::Factory) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::solver::Cg::Factory) -DEBUG: begin copy(gko::matrix::Identity,gko::matrix::Identity) -DEBUG: end copy(gko::matrix::Identity,gko::matrix::Identity) -DEBUG: begin apply(gko::solver::Cg) +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() +DEBUG: begin copy() +DEBUG: end copy() 
+DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate DEBUG: end allocate @@ -115,10 +115,10 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -131,232 +131,232 @@ DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end 
apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin 
csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch 
DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -368,16 +368,16 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end apply() +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() DEBUG: begin free DEBUG: end free -DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin dense::fill DEBUG: end dense::fill @@ -385,10 +385,10 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -401,12 +401,12 @@ DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin allocate @@ -421,16 +421,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -447,32 +447,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin 
copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -489,16 +489,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -515,32 +515,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -557,16 +557,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -583,32 +583,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin 
check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -625,16 +625,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -651,32 +651,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -693,16 +693,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end 
advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -719,32 +719,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -761,16 +761,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -787,32 +787,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin 
dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -829,16 +829,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -855,32 +855,32 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration @@ -897,16 +897,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -923,14 +923,14 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -942,27 +942,27 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) +DEBUG: end apply() DEBUG: begin free DEBUG: end free 
-DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() DEBUG: begin free DEBUG: end free -DEBUG: begin generate(gko::solver::Cg::Factory) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -985,7 +985,7 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate DEBUG: end allocate @@ -1015,10 +1015,10 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -1031,232 +1031,232 @@ DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin 
dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin 
apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end 
check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: end iteration DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1268,21 +1268,21 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) +DEBUG: end apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout index a61b432ca0d..f66daea1f30 100644 --- a/benchmark/test/reference/solver.profile.stdout +++ b/benchmark/test/reference/solver.profile.stdout @@ -15,32 +15,27 @@ "rhs_norm": 1.0, "generate": { "components": { - "generate(gko::solver::Cg::Factory)": 1.0, - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "overhead": 1.0 }, "time": 1.0 }, "apply": { "components": { - "apply(gko::solver::Cg)": 1.0, + "apply()": 1.0, "iteration": 1.0, "allocate": 1.0, "dense::fill": 1.0, "cg::initialize": 1.0, - "advanced_apply(gko::matrix::Csr)": 1.0, + "advanced_apply()": 1.0, "csr::advanced_spmv": 1.0, "dense::compute_norm2_dispatch": 1.0, - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "dense::compute_conj_dot_dispatch": 1.0, - "check(gko::stop::Combined)": 1.0, - "check(gko::stop::ResidualNorm)": 1.0, + "check()": 1.0, "residual_norm::residual_norm": 1.0, - "check(gko::stop::Iteration)": 1.0, "cg::step_1": 1.0, - "apply(gko::matrix::Csr)": 1.0, "csr::spmv": 1.0, "cg::step_2": 1.0, "free": 1.0, diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout index 2e44c73fdfa..c6055339d67 100644 --- a/benchmark/test/reference/solver.simple.stdout +++ b/benchmark/test/reference/solver.simple.stdout @@ -15,8 +15,7 @@ 
"rhs_norm": 1.0, "generate": { "components": { - "generate(gko::solver::Cg::Factory)": 1.0, - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "free": 1.0, "overhead": 1.0 }, @@ -24,24 +23,20 @@ }, "apply": { "components": { - "apply(gko::solver::Cg)": 1.0, + "apply()": 1.0, "iteration": 1.0, "allocate": 1.0, "dense::fill": 1.0, "cg::initialize": 1.0, - "advanced_apply(gko::matrix::Csr)": 1.0, + "advanced_apply()": 1.0, "csr::advanced_spmv": 1.0, "dense::compute_norm2_dispatch": 1.0, - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "dense::compute_conj_dot_dispatch": 1.0, - "check(gko::stop::Combined)": 1.0, - "check(gko::stop::ResidualNorm)": 1.0, + "check()": 1.0, "residual_norm::residual_norm": 1.0, - "check(gko::stop::Iteration)": 1.0, "cg::step_1": 1.0, - "apply(gko::matrix::Csr)": 1.0, "csr::spmv": 1.0, "cg::step_2": 1.0, "free": 1.0, diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 3ddabd987ad..ea170aac1a8 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -53,12 +53,12 @@ DEBUG: end free DEBUG: begin free DEBUG: end free Matrix is of size (125, 125) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -67,10 +67,10 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin apply(gko::matrix::Coo) +DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv -DEBUG: end apply(gko::matrix::Coo) +DEBUG: end apply() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -86,16 +86,16 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin apply(gko::matrix::Coo) +DEBUG: end copy() +DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv -DEBUG: end apply(gko::matrix::Coo) +DEBUG: end apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -108,18 +108,18 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -132,16 +132,16 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end 
copy(gko::matrix::Dense,gko::matrix::Dense) -DEBUG: begin apply(gko::matrix::Coo) +DEBUG: end copy() +DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv -DEBUG: end apply(gko::matrix::Coo) +DEBUG: end apply() DEBUG: begin free DEBUG: end free DEBUG: begin free diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 27424cc30b6..e570458e4a4 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -58,11 +58,13 @@ def sanitize_text(lines): return "\n".join(reconstructed).split("\n") -def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patterns=[]): - lines = input.split("\n") +def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patterns=[], replace_patterns=[]): + lines = input.splitlines() output_lines = [] patterns = [re.compile(pattern) for pattern in ignore_patterns] for line in lines: + for pattern, replacement in replace_patterns: + line = re.sub(pattern, replacement, line) keep = True for pattern in patterns: if re.match(pattern, line): @@ -70,7 +72,10 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter break if keep: output_lines.append(line) - return sanitize_text(output_lines) + try: + return sanitize_text(output_lines) + except json.decoder.JSONDecodeError: + return output_lines def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_flags=[]): @@ -84,21 +89,24 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl version_patterns = [ " the .* module is", ] + profiler_hook_typename_patterns = [ + ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()") + ] if generate: open(expected_stdout, "w").write("\n".join(determinize_text( - result.stdout.decode()))) + result.stdout.decode(), replace_patterns=profiler_hook_typename_patterns))) open(expected_stderr, "w").write("\n".join(determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns))) + ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns))) print("GENERATED") return result_stdout_processed = determinize_text( - result.stdout.decode()) + result.stdout.decode(), replace_patterns=profiler_hook_typename_patterns) result_stderr_processed = determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns) + ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns) expected_stdout_processed = determinize_text( - open(expected_stdout, 'rU').read()) - expected_stderr_processed = determinize_text(open(expected_stderr, 'rU').read( - ), ignore_patterns=version_patterns) + open(expected_stdout).read(), replace_patterns=profiler_hook_typename_patterns) + expected_stderr_processed = determinize_text(open(expected_stderr).read( + ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns) failed = False if result_stdout_processed != expected_stdout_processed: print("FAIL: stdout differs") From c9a448a9ef8a036f26eee3c0ed881e02de948f5a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 21 May 2023 11:04:24 +0200 Subject: [PATCH 031/583] strip more path-depentent output in test framework --- benchmark/test/test_framework.py.in | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index e570458e4a4..a0a7757b043 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ 
-8,6 +8,7 @@ import pathlib import sys sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") +projectroot = "@PROJECT_SOURCE_DIR@" generate = False if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True @@ -15,11 +16,14 @@ if len(sys.argv) > 2 and sys.argv[2] == "--generate": denumberify_paths = ["time", "bandwidth", "flops", "components", "residual_norm", "rhs_norm", "max_relative_norm2"] +empty_string_paths = ["error"] empty_array_paths = ["recurrent_residuals", "true_residuals", "implicit_residuals", "iteration_timestamps"] def sanitize_json_single(key, value, sanitize_all): + if key in empty_string_paths and isinstance(value, str): + return "" if key in denumberify_paths and isinstance(value, float): return 1.0 if key in denumberify_paths and isinstance(value, typing.Dict): @@ -63,6 +67,7 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter output_lines = [] patterns = [re.compile(pattern) for pattern in ignore_patterns] for line in lines: + line = line.replace(projectroot, "ginkgo") for pattern, replacement in replace_patterns: line = re.sub(pattern, replacement, line) keep = True @@ -72,6 +77,8 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter break if keep: output_lines.append(line) + if output_lines[-1] != "": + output_lines.append("") try: return sanitize_text(output_lines) except json.decoder.JSONDecodeError: @@ -89,24 +96,25 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl version_patterns = [ " the .* module is", ] - profiler_hook_typename_patterns = [ - ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()") + typename_patterns = [ + ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), + ("Operation .* does not support [^\"]*", "Operation does not support") ] if generate: open(expected_stdout, "w").write("\n".join(determinize_text( - result.stdout.decode(), replace_patterns=profiler_hook_typename_patterns))) + result.stdout.decode(), replace_patterns=typename_patterns))) open(expected_stderr, "w").write("\n".join(determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns))) + ), ignore_patterns=version_patterns, replace_patterns=typename_patterns))) print("GENERATED") return result_stdout_processed = determinize_text( - result.stdout.decode(), replace_patterns=profiler_hook_typename_patterns) + result.stdout.decode(), replace_patterns=typename_patterns) result_stderr_processed = determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns) + ), ignore_patterns=version_patterns, replace_patterns=typename_patterns) expected_stdout_processed = determinize_text( - open(expected_stdout).read(), replace_patterns=profiler_hook_typename_patterns) + open(expected_stdout).read(), replace_patterns=typename_patterns) expected_stderr_processed = determinize_text(open(expected_stderr).read( - ), ignore_patterns=version_patterns, replace_patterns=profiler_hook_typename_patterns) + ), ignore_patterns=version_patterns, replace_patterns=typename_patterns) failed = False if result_stdout_processed != expected_stdout_processed: print("FAIL: stdout differs") From 84c4871b75c94d5b0aee718928b3bbf2a85eae6a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 21 May 2023 11:04:43 +0200 Subject: [PATCH 032/583] update benchmark test outputs --- benchmark/test/reference/blas.profile.stderr | 4 +- 
benchmark/test/reference/blas.simple.stderr | 4 +- .../test/reference/conversion.all.stderr | 1866 ++++++++++++++++- .../test/reference/conversion.all.stdout | 74 +- .../test/reference/conversion.profile.stderr | 102 +- .../test/reference/conversion.profile.stdout | 19 +- .../test/reference/conversion.simple.stderr | 46 +- .../test/reference/conversion.simple.stdout | 19 +- .../reference/matrix_statistics.simple.stderr | 4 +- .../reference/preconditioner.profile.stderr | 10 +- .../reference/preconditioner.simple.stderr | 10 +- .../test/reference/solver.profile.stderr | 214 +- benchmark/test/reference/solver.simple.stderr | 4 +- .../test/reference/sparse_blas.profile.stderr | 4 +- .../test/reference/sparse_blas.simple.stderr | 4 +- benchmark/test/reference/spmv.profile.stderr | 4 +- benchmark/test/reference/spmv.simple.stderr | 4 +- 17 files changed, 2130 insertions(+), 262 deletions(-) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index 1fb7d5b93bc..16a86bd4c94 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index e9b186e1353..72a2fbb9b90 100644 --- a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index dbc5720527c..a21a0254200 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -1,26 +1,1856 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr,ell,sellp,hybrid +Benchmarking conversions. 
Running test case { "size": 100, "stencil": "7pt", - "conversion": {} -} -Matrix is of size (125, 125), 725 - Running conversion: coo-read - Running conversion: coo-csr - Running conversion: csr-read - Running conversion: csr-coo - Running conversion: csr-ell - Running conversion: csr-sellp - Running conversion: csr-hybrid - Running conversion: ell-read - Running conversion: ell-csr - Running conversion: sellp-read - Running conversion: sellp-csr - Running conversion: hybrid-read - Running conversion: hybrid-csr + "conversions": {} +} +Matrix is of size (125, 125) +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + 
"repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Error when 
processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": 
false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + 
"csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 
10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": 
{ + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-ell": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + 
"completed": false, + "error": "" + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-ell": { + "completed": false, + "error": "" + } + } + } +] +Error when processing test case +{ + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-ell": { + "completed": false, + "error": "" + }, + "hybrid-sellp": { + "completed": false, + "error": "" + } + } +} +what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-coo": { + "completed": false, + "error": "" + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-ell": { + "completed": false, + "error": "" + }, + "hybrid-sellp": { + "completed": false, + "error": "" + } + } + } +] diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout index c4b657a42c4..cb53bb81a6c 100644 --- a/benchmark/test/reference/conversion.all.stdout +++ b/benchmark/test/reference/conversion.all.stdout @@ -1,23 +1,25 @@ [ { - "size": 100, + "size": 125, "stencil": "7pt", - "conversion": { - "coo-read": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, + "conversions": { "coo-csr": { 
"time": 1.0, "repetitions": 10, "completed": true }, - "csr-read": { - "time": 1.0, - "repetitions": 10, - "completed": true + "coo-ell": { + "completed": false, + "error": "" + }, + "coo-sellp": { + "completed": false, + "error": "" + }, + "coo-hybrid": { + "completed": false, + "error": "" }, "csr-coo": { "time": 1.0, @@ -39,39 +41,57 @@ "repetitions": 10, "completed": true }, - "ell-read": { - "time": 1.0, - "repetitions": 10, - "completed": true + "ell-coo": { + "completed": false, + "error": "" }, "ell-csr": { "time": 1.0, "repetitions": 10, "completed": true }, - "sellp-read": { - "time": 1.0, - "repetitions": 10, - "completed": true + "ell-sellp": { + "completed": false, + "error": "" + }, + "ell-hybrid": { + "completed": false, + "error": "" + }, + "sellp-coo": { + "completed": false, + "error": "" }, "sellp-csr": { "time": 1.0, "repetitions": 10, "completed": true }, - "hybrid-read": { - "time": 1.0, - "repetitions": 10, - "completed": true + "sellp-ell": { + "completed": false, + "error": "" + }, + "sellp-hybrid": { + "completed": false, + "error": "" + }, + "hybrid-coo": { + "completed": false, + "error": "" }, "hybrid-csr": { "time": 1.0, "repetitions": 10, "completed": true + }, + "hybrid-ell": { + "completed": false, + "error": "" + }, + "hybrid-sellp": { + "completed": false, + "error": "" } - }, - "rows": 125, - "cols": 125, - "nonzeros": 725 + } } ] diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 8ea580247d8..e772752ea4a 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -1,19 +1,18 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo,csr +Benchmarking conversions. 
Running test case { "size": 100, "stencil": "7pt", - "conversion": {} + "conversions": {} } -Matrix is of size (125, 125), 725 -DEBUG: begin stencil(100,7pt) - Running conversion: coo-read -DEBUG: begin coo-read +Matrix is of size (125, 125) +DEBUG: begin stencil(125,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -22,17 +21,13 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end coo-read - Running conversion: coo-csr DEBUG: begin coo-csr DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -41,8 +36,12 @@ DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate @@ -50,10 +49,14 @@ DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs @@ -65,15 +68,27 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end coo-csr +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end coo-csr - Running conversion: csr-read -DEBUG: begin csr-read DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array @@ -94,46 +109,32 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin free DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end csr-read - Running conversion: csr-coo DEBUG: begin csr-coo DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin components::convert_idxs_to_ptrs -DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin components::convert_ptrs_to_idxs DEBUG: end components::convert_ptrs_to_idxs DEBUG: end copy() @@ -143,11 +144,30 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end csr-coo +Current state: +[ + { + "size": 125, 
+ "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end csr-coo -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(125,7pt) diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout index b29815f6c17..3e76bc26934 100644 --- a/benchmark/test/reference/conversion.profile.stdout +++ b/benchmark/test/reference/conversion.profile.stdout @@ -1,32 +1,19 @@ [ { - "size": 100, + "size": 125, "stencil": "7pt", - "conversion": { - "coo-read": { - "time": 1.0, - "repetitions": 1, - "completed": true - }, + "conversions": { "coo-csr": { "time": 1.0, "repetitions": 1, "completed": true }, - "csr-read": { - "time": 1.0, - "repetitions": 1, - "completed": true - }, "csr-coo": { "time": 1.0, "repetitions": 1, "completed": true } - }, - "rows": 125, - "cols": 125, - "nonzeros": 725 + } } ] diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr index 1e4dbc4bd51..f044da61804 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -1,17 +1,47 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr +Benchmarking conversions. Running test case { "size": 100, "stencil": "7pt", - "conversion": {} + "conversions": {} } -Matrix is of size (125, 125), 725 - Running conversion: coo-read - Running conversion: coo-csr - Running conversion: csr-read - Running conversion: csr-coo +Matrix is of size (125, 125) +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] +Current state: +[ + { + "size": 125, + "stencil": "7pt", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout index 856f1330eea..9ecdd46f5e1 100644 --- a/benchmark/test/reference/conversion.simple.stdout +++ b/benchmark/test/reference/conversion.simple.stdout @@ -1,32 +1,19 @@ [ { - "size": 100, + "size": 125, "stencil": "7pt", - "conversion": { - "coo-read": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, + "conversions": { "coo-csr": { "time": 1.0, "repetitions": 10, "completed": true }, - "csr-read": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, "csr-coo": { "time": 1.0, "repetitions": 10, "completed": true } - }, - "rows": 125, - "cols": 125, - "nonzeros": 725 + } } ] diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr index e77cd5d413a..69d2bbf9098 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stderr +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 
(develop) Running test case { "size": 100, diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 86ec044eb40..2bebc03be8d 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 @@ -103,7 +103,7 @@ Current state: "none": { "generate": { "components": { - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "overhead": 1.0 }, "time": 1.0, @@ -111,8 +111,8 @@ Current state: }, "apply": { "components": { - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "apply()": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "overhead": 1.0 }, diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr index 4a7ee9498d5..bfec4a697ee 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 @@ -20,7 +20,7 @@ Current state: "none": { "generate": { "components": { - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "overhead": 1.0 }, "time": 1.0, @@ -28,8 +28,8 @@ Current state: }, "apply": { "components": { - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::matrix::Dense,gko::matrix::Dense)": 1.0, + "apply()": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "overhead": 1.0 }, diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 8aa04832601..a601444163d 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 @@ -167,8 +167,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -179,6 +177,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -197,8 +197,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -209,6 +207,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -227,8 +227,6 @@ DEBUG: end 
copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -239,6 +237,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -257,8 +257,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -269,6 +267,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -287,8 +287,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -299,6 +297,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -317,8 +317,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -329,6 +327,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -347,8 +347,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -409,6 +407,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -447,16 +455,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -475,6 +473,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: 
end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -515,16 +523,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -543,6 +541,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -583,16 +591,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -611,6 +609,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -651,16 +659,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -679,6 +677,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -719,16 +727,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -747,6 +745,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin 
dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -787,16 +795,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -815,6 +813,16 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -855,16 +863,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -883,8 +881,14 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: end check() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -923,14 +927,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1067,8 +1063,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -1079,6 +1073,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1097,8 +1093,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch 
@@ -1109,6 +1103,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1127,8 +1123,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -1139,6 +1133,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1157,8 +1153,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -1169,6 +1163,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1187,8 +1183,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -1199,6 +1193,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1217,8 +1213,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch @@ -1229,6 +1223,8 @@ DEBUG: end check() DEBUG: begin check() DEBUG: end check() DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 DEBUG: begin apply() @@ -1247,8 +1243,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration DEBUG: begin check() DEBUG: begin check() DEBUG: begin dense::compute_norm2_dispatch diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr index dad85f1c921..936046c4949 100644 --- a/benchmark/test/reference/solver.simple.stderr +++ b/benchmark/test/reference/solver.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index 02dfdfdacfd..3dee884861e 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand 
sides is 42 diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index a813994e739..e6e0884e267 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index ea170aac1a8..735e4bf5d23 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr index 8a2ebe9fe15..1bb4472bce6 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 From eba1b0b37253a34d1338bb50f20ce910007720fc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 21 May 2023 12:12:04 +0200 Subject: [PATCH 033/583] more strict path removal --- .../test/reference/conversion.all.stderr | 24 +++++++++---------- benchmark/test/test_framework.py.in | 4 +--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index a21a0254200..d6aab6a0331 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -42,7 +42,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -81,7 +81,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -128,7 +128,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -357,7 +357,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -520,7 +520,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -641,7 +641,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -770,7 +770,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -984,7 +984,7 @@ Error when processing test case } } } -what(): 
ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -1139,7 +1139,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -1302,7 +1302,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -1567,7 +1567,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { @@ -1756,7 +1756,7 @@ Error when processing test case } } } -what(): ginkgo/include/ginkgo/core/base/utils_helper.hpp:368: Operation does not support +what(): Current state: [ { diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index a0a7757b043..2d22f11ac4f 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -8,7 +8,6 @@ import pathlib import sys sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") -projectroot = "@PROJECT_SOURCE_DIR@" generate = False if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True @@ -67,7 +66,6 @@ def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patter output_lines = [] patterns = [re.compile(pattern) for pattern in ignore_patterns] for line in lines: - line = line.replace(projectroot, "ginkgo") for pattern, replacement in replace_patterns: line = re.sub(pattern, replacement, line) keep = True @@ -98,7 +96,7 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl ] typename_patterns = [ ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), - ("Operation .* does not support [^\"]*", "Operation does not support") + ("what\\(\\): .*", "what(): ") ] if generate: open(expected_stdout, "w").write("\n".join(determinize_text( From 041e2740f58b1b6f0dff07aecd05312a3672b7a6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 22 May 2023 13:56:13 +0200 Subject: [PATCH 034/583] update distributed outputs --- .../distributed_solver.profile.stderr | 1114 ++++---- .../distributed_solver.profile.stdout | 17 +- .../distributed_solver.simple.stderr | 4 +- .../distributed_solver.simple.stdout | 17 +- .../multi_vector_distributed.profile.stderr | 808 ++++++ .../multi_vector_distributed.simple.stderr | 14 +- .../reference/spmv_distributed.profile.stderr | 2380 +++++++++++++++++ .../reference/spmv_distributed.profile.stdout | 4 +- .../reference/spmv_distributed.simple.stderr | 8 +- .../reference/spmv_distributed.simple.stdout | 4 +- 10 files changed, 3764 insertions(+), 606 deletions(-) diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 64b09a754c3..e0ddd10ab54 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 @@ -56,7 +56,7 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Csr) 
+DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy DEBUG: begin copy @@ -65,12 +65,12 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: end copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy DEBUG: begin copy @@ -79,7 +79,7 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: end copy(gko::matrix::Csr,gko::matrix::Csr) +DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::fill @@ -206,12 +206,12 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::matrix::Dense,gko::matrix::Dense) +DEBUG: end copy() Matrix is of size (125, 125) DEBUG: begin cg Running solver: cg @@ -226,19 +226,19 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin generate(gko::solver::Cg::Factory) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::solver::Cg::Factory) -DEBUG: begin copy(gko::matrix::Identity,gko::matrix::Identity) -DEBUG: end copy(gko::matrix::Identity,gko::matrix::Identity) -DEBUG: begin apply(gko::solver::Cg) +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() +DEBUG: begin copy() +DEBUG: end copy() +DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate DEBUG: end allocate @@ -268,18 +268,18 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -294,304 +294,302 @@ DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end 
dense::compute_conj_dot_dispatch -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: 
begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end 
apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather 
DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end 
check(gko::stop::Combined) +DEBUG: end check() +DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -603,16 +601,16 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end apply() +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end copy() DEBUG: begin free DEBUG: end free -DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin dense::fill DEBUG: end dense::fill @@ -620,18 +618,18 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -646,14 +644,26 @@ DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -668,24 +678,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() 
+DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -704,44 +714,44 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -758,24 +768,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end 
csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -794,44 +804,44 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -848,24 +858,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end 
advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -884,44 +894,44 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -938,24 +948,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end 
advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -974,44 +984,44 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -1028,24 +1038,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end 
allocate DEBUG: begin dense::compute_squared_norm2 @@ -1064,44 +1074,44 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -1118,24 +1128,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -1154,44 +1164,44 @@ 
DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: end iteration DEBUG: begin iteration DEBUG: begin allocate @@ -1208,24 +1218,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -1244,46 +1254,42 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: 
begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: end check() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -1298,24 +1304,24 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end copy() +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -1334,16 +1340,6 @@ DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin 
check(gko::stop::ResidualNorm) -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end check(gko::stop::Combined) DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1355,27 +1351,27 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) +DEBUG: end apply() DEBUG: begin free DEBUG: end free -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end copy() +DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: end copy() DEBUG: begin free DEBUG: end free -DEBUG: begin generate(gko::solver::Cg::Factory) -DEBUG: begin generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::matrix::IdentityFactory) -DEBUG: end generate(gko::solver::Cg::Factory) +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1398,7 +1394,7 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin apply(gko::solver::Cg) +DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate DEBUG: end allocate @@ -1428,18 +1424,18 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize -DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: begin advanced_apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end advanced_apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -1454,304 +1450,302 @@ DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin 
dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin 
dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin 
apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: 
begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: begin check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Iteration) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration DEBUG: begin cg::step_1 DEBUG: end cg::step_1 -DEBUG: begin apply(gko::experimental::distributed::Matrix) +DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather -DEBUG: begin apply(gko::matrix::Csr) +DEBUG: begin apply() DEBUG: begin csr::spmv DEBUG: end csr::spmv -DEBUG: end apply(gko::matrix::Csr) -DEBUG: begin advanced_apply(gko::matrix::Csr) +DEBUG: end apply() +DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply(gko::matrix::Csr) -DEBUG: end apply(gko::experimental::distributed::Matrix) +DEBUG: end advanced_apply() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch DEBUG: begin cg::step_2 DEBUG: end cg::step_2 -DEBUG: begin apply(gko::matrix::Identity) -DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) +DEBUG: begin apply() +DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector) -DEBUG: end apply(gko::matrix::Identity) +DEBUG: end copy() +DEBUG: end apply() DEBUG: begin dense::compute_conj_dot_dispatch DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin check(gko::stop::Combined) -DEBUG: begin check(gko::stop::ResidualNorm) +DEBUG: begin check() +DEBUG: begin check() DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm -DEBUG: end check(gko::stop::ResidualNorm) -DEBUG: end check(gko::stop::Combined) +DEBUG: end check() +DEBUG: end check() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1763,29 +1757,29 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end iteration -DEBUG: end apply(gko::solver::Cg) 
+DEBUG: end apply()
 DEBUG: begin allocate
 DEBUG: end allocate
 DEBUG: begin allocate
 DEBUG: end allocate
-DEBUG: begin copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)
+DEBUG: begin copy()
 DEBUG: begin allocate
 DEBUG: end allocate
 DEBUG: begin dense::copy
 DEBUG: end dense::copy
-DEBUG: end copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)
-DEBUG: begin advanced_apply(gko::experimental::distributed::Matrix)
+DEBUG: end copy()
+DEBUG: begin advanced_apply()
 DEBUG: begin dense::row_gather
 DEBUG: end dense::row_gather
-DEBUG: begin advanced_apply(gko::matrix::Csr)
+DEBUG: begin advanced_apply()
 DEBUG: begin csr::advanced_spmv
 DEBUG: end csr::advanced_spmv
-DEBUG: end advanced_apply(gko::matrix::Csr)
-DEBUG: begin advanced_apply(gko::matrix::Csr)
+DEBUG: end advanced_apply()
+DEBUG: begin advanced_apply()
 DEBUG: begin csr::advanced_spmv
 DEBUG: end csr::advanced_spmv
-DEBUG: end advanced_apply(gko::matrix::Csr)
-DEBUG: end advanced_apply(gko::experimental::distributed::Matrix)
+DEBUG: end advanced_apply()
+DEBUG: end advanced_apply()
 DEBUG: begin allocate
 DEBUG: end allocate
 DEBUG: begin dense::compute_squared_norm2
diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout
index 16dc6741930..a31b88ff582 100644
--- a/benchmark/test/reference/distributed_solver.profile.stdout
+++ b/benchmark/test/reference/distributed_solver.profile.stdout
@@ -16,36 +16,29 @@
 "rhs_norm": 1.0,
 "generate": {
 "components": {
- "generate(gko::solver::Cg::Factory)": 1.0,
- "generate(gko::matrix::IdentityFactory)": 1.0,
+ "generate()": 1.0,
 "overhead": 1.0
 },
 "time": 1.0
 },
 "apply": {
 "components": {
- "apply(gko::solver::Cg)": 1.0,
+ "apply()": 1.0,
 "iteration": 1.0,
 "allocate": 1.0,
 "dense::fill": 1.0,
 "cg::initialize": 1.0,
- "advanced_apply(gko::experimental::distributed::Matrix)": 1.0,
+ "advanced_apply()": 1.0,
 "dense::row_gather": 1.0,
- "advanced_apply(gko::matrix::Csr)": 1.0,
 "csr::advanced_spmv": 1.0,
 "dense::compute_squared_norm2": 1.0,
 "dense::compute_sqrt": 1.0,
- "apply(gko::matrix::Identity)": 1.0,
- "copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)": 1.0,
+ "copy()": 1.0,
 "dense::copy": 1.0,
 "dense::compute_conj_dot_dispatch": 1.0,
- "check(gko::stop::Combined)": 1.0,
- "check(gko::stop::ResidualNorm)": 1.0,
+ "check()": 1.0,
 "residual_norm::residual_norm": 1.0,
- "check(gko::stop::Iteration)": 1.0,
 "cg::step_1": 1.0,
- "apply(gko::experimental::distributed::Matrix)": 1.0,
- "apply(gko::matrix::Csr)": 1.0,
 "csr::spmv": 1.0,
 "cg::step_2": 1.0,
 "free": 1.0,
diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr
index 7800bb0b97e..6a5dab5d844 100644
--- a/benchmark/test/reference/distributed_solver.simple.stderr
+++ b/benchmark/test/reference/distributed_solver.simple.stderr
@@ -1,5 +1,5 @@
-This is Ginkgo 1.5.0 (develop)
- running with core module 1.5.0 (develop)
+This is Ginkgo 1.6.0 (develop)
+ running with core module 1.6.0 (develop)
 Running on reference(0)
 Running with 2 warm iterations and 1 running iterations
 The random seed for right hand sides is 42
diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout
index 96ef102f8b8..54d7233ba77 100644
--- a/benchmark/test/reference/distributed_solver.simple.stdout
+++ b/benchmark/test/reference/distributed_solver.simple.stdout
@@ -16,8
+16,7 @@ "rhs_norm": 1.0, "generate": { "components": { - "generate(gko::solver::Cg::Factory)": 1.0, - "generate(gko::matrix::IdentityFactory)": 1.0, + "generate()": 1.0, "free": 1.0, "overhead": 1.0 }, @@ -25,28 +24,22 @@ }, "apply": { "components": { - "apply(gko::solver::Cg)": 1.0, + "apply()": 1.0, "iteration": 1.0, "allocate": 1.0, "dense::fill": 1.0, "cg::initialize": 1.0, - "advanced_apply(gko::experimental::distributed::Matrix)": 1.0, + "advanced_apply()": 1.0, "dense::row_gather": 1.0, - "advanced_apply(gko::matrix::Csr)": 1.0, "csr::advanced_spmv": 1.0, "dense::compute_squared_norm2": 1.0, "dense::compute_sqrt": 1.0, - "apply(gko::matrix::Identity)": 1.0, - "copy(gko::experimental::distributed::Vector,gko::experimental::distributed::Vector)": 1.0, + "copy()": 1.0, "dense::copy": 1.0, "dense::compute_conj_dot_dispatch": 1.0, - "check(gko::stop::Combined)": 1.0, - "check(gko::stop::ResidualNorm)": 1.0, + "check()": 1.0, "residual_norm::residual_norm": 1.0, - "check(gko::stop::Iteration)": 1.0, "cg::step_1": 1.0, - "apply(gko::experimental::distributed::Matrix)": 1.0, - "apply(gko::matrix::Csr)": 1.0, "csr::spmv": 1.0, "cg::step_2": 1.0, "free": 1.0, diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr index e69de29bb2d..3e650323bfa 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -0,0 +1,808 @@ +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scalRunning test case +{ + "n": DEBUG: begin n = 100 +DEBUG: begin copy +100, + "blaDEBUG: begin allocate +DEBUG: end s": {} +} +DEBUG: begin n = 100 +DEBUG: begin allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +copy +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate + +DEBUG: begin allocate +DEBUG: end DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguousDEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_starting_indicesDEBUG: begin n = 100 +DEBUG: begin copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array + +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin free 
+DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin DEBUG: begin components::fill_array +DEBUG: end components::fill_arrayallocate +DEBUG: end allocate +allocate +DEBUG: end allocate +DEBUG: begin components::fill_array + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end DEBUG: end DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocatecomponents::fill_array +allocate +DEBUG: begin +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array + +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freeDEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end free +DEBUG: end free +DEBUG: begin +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indicescopy 
+DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: begin copy +DEBUG: end copyDEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_arrayallocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin dense::fill +DEBUG: end +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::copy +DEBUG: end dense::copy +dense::fill +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end copy +DEBUG: begin axpy +DEBUG: begin allocate +DEBUG: end copy +Current state: +[ + DEBUG: begin axpy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin { + "n": 10allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin 0, + DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end "blas": { + DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array + "copy": { + components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indicesDEBUG: end 
components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free "ti +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free + +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocateme": 0.000013, + DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end allocate +DEBUG: begin free +free +DEBUG: begin free +DEBUG: end free + "flopDEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin s": 7692307.692307693, + allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free + "DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin bandwidth":DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array 1230allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end 76923.076923DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices09, + components::fill_array +DEBUG: begin components::fill_array +DEBUG: end +DEBUG: begin copy +DEBUG: end copy +components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy + "rcomponents::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copyDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end epetitions": 1, + "completed": t +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin freedense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +rue + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled + } + free +DEBUG: begin dense::fill +DEBUG: end dense::fill + } + DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill + } +] +DEBUG: begin axpy +DEBUG: begin allocate +DEBUG: 
end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end axpy +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end axpy +DEBUG: begin scal +scal +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end axpy +Current state: +[ + { + DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate + "n": 100, + allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end DEBUG: end allocate +DEBUG: begin 
allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate "blasallocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end ": { + DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_arraycomponents::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + "co +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end py": {partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy + free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate + "tiallocate +DEBUG: begin free +DEBUG: end free +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +me": 0.000013, + DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end +DEBUG: begin dense::fill +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill + "flops":DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::scale +DEBUG: end dense::scale + 7692307.692307693, + dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::scale +DEBUG: end dense::scale + "bandwidth": 123076923.07692309, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 0.000017, + "flops": 11764705.88235294, + "bandwidth": 141176470.5882353, + "repetitions": 1, + "completed": true + } + } + } +] +DEBUG: begin scal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin 
dense::scale +DEBUG: end dense::scale +DEBUG: begin freeDEBUG: begin free + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end scal +DEBUG: end n = 100 scal +Current state: +[ + { + "n":scal +DEBUG: end n = 100 + + 100, + "blas": { + "copy": { + "time": 0.000013, + "flops": 7692307.692307693, + "bandwidth": 123076923.07692309, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 0.000017, + "flops": 11764705.88235294, + "bandwidth": 141176470.5882353, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 0.000007, + "flops": 14285714.285714285, + "bandwidth": 228571428.57142857, + "repetitions": 1, + "completed": true + } + } + } +] +DEBUG: end n = 100 diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr index 23f3554e9c4..72a2fbb9b90 100644 --- a/benchmark/test/reference/multi_vector_distributed.simple.stderr +++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr @@ -1,15 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) -Running on reference(0) -Running with 2 warm iterations and 10 running iterations -The random seed for right hand sides is 42 -The operations are copy,axpy,scalThis is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) -Running on reference(0) -Running with 2 warm iterations and 10 running iterations -The random seed for right hand sides is 42 -The operations are copy,axpy,scalThis is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index e69de29bb2d..1ce62b48dc2 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -0,0 +1,2380 @@ +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +DEBUG: begin stencil(100,7pt,stencil) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin stencil(100,7pt,stencil) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end Running test case +{ + DEBUG: end allocate +DEBUG: begin components::fill_array +allocate +DEBUG: begin components::fill_array + "size":DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end 
components::fill_array +DEBUG: begin components::fill_array 100, + DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin "stencil": +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end DEBUG: begin copy +DEBUG: end copy +"7pt", + "comm_copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +pattern": "stallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin encil", + "spmv": {} +} + +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +dense::fill +DEBUG: begin dense::fill +DEBUG: begin stencil(100,7pt,stencil) +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: begin free +DEBUG: end free +DEBUG: begin partition::build_ranges_from_global_size + +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end partition::build_ranges_from_global_size + +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + +DEBUG: end allocate +DEBUG: begin +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end 
components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin allocate +DEBUG: end components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguouscomponents::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copyDEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copyDEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin components::aos_to_soa +DEBUG: end +DEBUG: end copy +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fillcomponents::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: begin dense::fill +allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_dataDEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fillfree +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin free +DEBUG: end free + +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +free +freeDEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end DEBUG: begin copy() +copy() +DEBUG: begin allocate +DEBUG: end 
allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguousDEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguousallocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin copy() +components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_arraycopy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy + +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin copy()DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data + +DEBUG: begin copy +DEBUG: end copy +free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end 
dense::fill +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copyfree +DEBUG: end free +Matrix is of size (81DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal + +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy, 81) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: end allocate +DEBUG: begin allocate +free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_arrayDEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +free +DEBUG: end free +DEBUG: begin allocatecomponents::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indicesDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin allocate 
+DEBUG: end allocate +DEBUG: begin free +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free + +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate + +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end freeDEBUG: end allocate +DEBUG: begin free +DEBUG: end free +allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end 
allocate +DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateallocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocatefree +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: begin free +DEBUG: end free +DEBUG: begin free +allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocatefree +DEBUG: begin free +DEBUG: begin free +DEBUG: end free + +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: begin 
allocate +DEBUG: begin free +DEBUG: end free +DEBUG: end copy + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: begin freeDEBUG: end free +DEBUG: begin freeDEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free + +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin apply() +free +DEBUG: end free +DEBUG: begin apply() +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin apply() + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end DEBUG: begin dense::row_gather +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +allocate +DEBUG: end allocate +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin dense::row_gather +DEBUG: end DEBUG: end dense::row_gather +dense::row_gather +DEBUG: begin DEBUG: begin apply() +apply() +DEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: begin apply() +DEBUG: begin coo::spmv +DEBUG: end coo::spmvDEBUG: begin coo::spmv +DEBUG: end DEBUG: end apply() + +DEBUG: end apply() 
+DEBUG: begin advanced_apply() +coo::spmv +DEBUG: begin advanced_apply() +DEBUG: begin coo::advanced_spmv +DEBUG: end coo::advanced_spmv +DEBUG: begin coo::advanced_spmv +DEBUG: end coo::advanced_spmv +DEBUG: end advanced_apply()DEBUG: end apply() +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin advanced_apply() +DEBUG: begin coo::advanced_spmv +DEBUG: end coo::advanced_spmv +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin free +DEBUG: end freeDEBUG: end apply() +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end +DEBUG: begin free +DEBUG: end free + +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freefree +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +free +DEBUG: end free +free +DEBUG: end free +DEBUG: end free +DEBUG: begin csr-csr +DEBUG: begin free +DEBUG: end free +DEBUG: begin csr-csr +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin csr-csr +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end DEBUG: end components::fill_array +DEBUG: begin components::fill_arraycomponents::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end 
partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin freeDEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +free + +DEBUG: end free +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copyallocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy + +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy()DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array + +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy() +DEBUG: begin allocateDEBUG: begin copy() +DEBUG: begin DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +dense::fill +DEBUG: begin allocate +DEBUG: end DEBUG: begin copy +DEBUG: end DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +allocate +DEBUG: begin allocate +DEBUG: end 
allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy() +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soaDEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa + +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateallocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate + +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freeDEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate + +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin 
allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end freefree +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free + +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateallocate + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocatefree +DEBUG: end free + +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin DEBUG: end allocate +free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateallocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocateallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin 
allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateallocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: end allocate +DEBUG: begin free + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeallocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: end free +DEBUG: begin free +DEBUG: end freefree +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freefree +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy + +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocatedistributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: end copy +free +DEBUG: end free +DEBUG: begin free + +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrsDEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy + +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free 
+DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end DEBUG: end copy +free +DEBUG: begin copy +DEBUG: end copy +free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freeDEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: end free +DEBUG: begin free +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +free +DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: end free +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freefree +DEBUG: end free +DEBUG: begin +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin apply() +DEBUG: begin allocatedense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin apply() +DEBUG: begin allocate + +DEBUG: end allocate +DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate +copy() +DEBUG: begin apply() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateallocate +DEBUG: end allocate +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: end allocate +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather + +DEBUG: begin apply() +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: begin csr::spmv +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end csr::spmv +DEBUG: end DEBUG: end apply() +DEBUG: begin DEBUG: end apply() +DEBUG: begin advanced_apply() +apply() +DEBUG: begin 
advanced_apply()advanced_apply() +DEBUG: begin csr::advanced_spmv + +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end csr::advanced_spmv +DEBUG: end apply() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2dense::compute_squared_norm2 +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv + +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin allocate +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin dense::add_scaled +dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +allocate +DEBUG: end allocate +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: begin dense::compute_sqrt +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy()dense::compute_sqrt +DEBUG: begin copy() +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy + +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end copy() +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end copy() +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin free +DEBUG: end freeDEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy + +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end copy() +free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +free +DEBUG: end free +DEBUG: begin freeDEBUG: begin copy() +DEBUG: begin allocate +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy 
+DEBUG: end copy() +DEBUG: begin apply() + +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin apply() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmvDEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end dense::row_gather +DEBUG: begin apply() +apply() +DEBUG: begin advanced_apply() + +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: begin csr::spmv +DEBUG: end csr::spmvDEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() + +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: end freefree +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freefree +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end freefree +DEBUG: begin free + +DEBUG: end csr-csr +DEBUG: begin free + +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: end csr-csr +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end csr-csr +Current state:free +DEBUG: end free +DEBUG: end stencil(100,7pt,stencil) +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt,stencil) +[ + + { + "size": 81, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 6420, + "max_relative_norm2": 0.0, + "time": 0.000037, + "repetitions": 1, + "completed": true + } + }, + "nnz": 144, + "optimal": {} + } +] +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end stencil(100,7pt,stencil) diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout index 2aeeeb5b0d5..5512866fdf0 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stdout +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -6,14 +6,14 @@ "comm_pattern": "stencil", "spmv": { "csr-csr": { - "storage": 2316, + 
"storage": 6420, "max_relative_norm2": 1.0, "time": 1.0, "repetitions": 1, "completed": true } }, - "nnz": 135, + "nnz": 144, "optimal": { "spmv": "csr-csr" } diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr index 57f31d44686..7fa9aeb581f 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stderr +++ b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.5.0 (develop) - running with core module 1.5.0 (develop) +This is Ginkgo 1.6.0 (develop) + running with core module 1.6.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 @@ -21,14 +21,14 @@ Current state: "comm_pattern": "stencil", "spmv": { "csr-csr": { - "storage": 2316, + "storage": 6420, "max_relative_norm2": 1.0, "time": 1.0, "repetitions": 10, "completed": true } }, - "nnz": 135, + "nnz": 144, "optimal": {} } ] diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout index d8cd32ba834..7b6e0883c14 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stdout +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -6,14 +6,14 @@ "comm_pattern": "stencil", "spmv": { "csr-csr": { - "storage": 2316, + "storage": 6420, "max_relative_norm2": 1.0, "time": 1.0, "repetitions": 10, "completed": true } }, - "nnz": 135, + "nnz": 144, "optimal": { "spmv": "csr-csr" } From 19a4402a27a558a2dac6eb4ae3895811477e8161 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 22 May 2023 13:56:37 +0200 Subject: [PATCH 035/583] sanitize more output --- benchmark/blas/distributed/multi_vector.cpp | 9 +- .../multi_vector_distributed.profile.stderr | 680 +++---- .../reference/spmv_distributed.profile.stderr | 1794 ++++++++--------- 3 files changed, 1243 insertions(+), 1240 deletions(-) diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index 4d3b821ed2e..be326b08b96 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -63,12 +63,15 @@ Parameters for a benchmark case are: std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); - std::string extra_information = "The operations are " + FLAGS_operations; - print_general_information(extra_information); - const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); const auto rank = comm.rank(); + if (rank == 0) { + std::string extra_information = + "The operations are " + FLAGS_operations; + print_general_information(extra_information); + } + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); std::string json_input = broadcast_json_input(get_input_stream(), comm); diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr index 3e650323bfa..3cf18472311 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -5,517 +5,517 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scalRunning test case { - "n": DEBUG: begin n = 100 + "n": 100, + "blasDEBUG: begin n = 100 DEBUG: begin copy -100, - "blaDEBUG: begin allocate -DEBUG: end s": {} +": {} } DEBUG: begin n = 100 +DEBUG: begin copy DEBUG: begin 
allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -copy -DEBUG: begin allocateDEBUG: begin allocate DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_sizeDEBUG: begin n = 100 +DEBUG: begin copy +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguousDEBUG: end allocate +DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_starting_indicesDEBUG: begin n = 100 -DEBUG: begin copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end allocate +DEBUG: begin allocate DEBUG: end allocate - DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array +allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin DEBUG: begin components::fill_array -DEBUG: end components::fill_arrayallocate -DEBUG: end allocate -allocate -DEBUG: end allocate DEBUG: begin components::fill_array - +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous -DEBUG: end DEBUG: begin free -DEBUG: end free +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copyDEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin free -DEBUG: end free +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copyDEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array -partition::build_from_contiguous +DEBUG: end components::fill_array 
+DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy +DEBUG: end copy DEBUG: begin free -DEBUG: end free + +DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin allocate -DEBUG: end DEBUG: end DEBUG: begin copy +DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocatecomponents::fill_array -allocate -DEBUG: begin +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate DEBUG: end allocate + +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end DEBUG: begin free +DEBUG: end freefree +DEBUG: end free +DEBUG: begin free +DEBUG: end free free +DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin freeDEBUG: begin allocate -DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin allocate -components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array - +DEBUG: end allocate +DEBUG: begin free DEBUG: end free -DEBUG: begin copy -DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin freeDEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate +partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free +allocate DEBUG: begin allocate DEBUG: end allocateDEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end free -DEBUG: end free -DEBUG: begin -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous 
-DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indicescopy -DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free +DEBUG: begin components::fill_array +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: begin copy -DEBUG: end copyDEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_arrayDEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free - +DEBUG: begin components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_arrayallocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin dense::fill -DEBUG: end +DEBUG: begin components::fill_array +components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end partition::build_starting_indices +DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate -free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freecopy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::copy -DEBUG: end dense::copy -dense::fill -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end free + DEBUG: begin free DEBUG: end free DEBUG: begin free +DEBUG: begin dense::fill +DEBUG: end dense::fill +free DEBUG: end free DEBUG: begin dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill DEBUG: end dense::fill DEBUG: begin dense::copy DEBUG: end dense::copy +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin dense::copy +DEBUG: end 
dense::copy DEBUG: begin free +DEBUG: end DEBUG: begin free DEBUG: end free +free DEBUG: begin free -DEBUG: end freeDEBUG: begin free -DEBUG: end free -DEBUG: begin free +DEBUG: end DEBUG: begin free DEBUG: end free -DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end copy -DEBUG: begin axpy -DEBUG: begin allocate -DEBUG: end copy Current state: [ - DEBUG: begin axpy + free +DEBUG: end copy +DEBUG: begin axpy DEBUG: begin allocate DEBUG: end allocate +DEBUG: end copy +DEBUG: begin axpy DEBUG: begin allocate -DEBUG: end DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin allocate { + "n": DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin { - "n": 10allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate +DEBUG: end DEBUG: end allocate -DEBUG: begin allocate -partition::build_ranges_from_global_size +DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate +allocate DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin 0, - DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate -allocate -DEBUG: end allocate +100, + "blas": { + allocateDEBUG: end allocate DEBUG: begin allocate -DEBUG: end "blas": { - DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array components::fill_array +DEBUG: begin "copy": { + +DEBUG: begin allocate +components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end -DEBUG: begin components::fill_array +DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate DEBUG: end components::fill_array -DEBUG: begin components::fill_array - "copy": { - components::fill_array -DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indicesDEBUG: end components::fill_array +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free "ti -DEBUG: begin copy -DEBUG: end copy + "time": 0.0000components::fill_array +DEBUG: end components::fill_arrayDEBUG: end copy DEBUG: begin free - +08, + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocateme": 0.000013, - DEBUG: end free -DEBUG: begin copy -DEBUG: end copy DEBUG: begin allocate -DEBUG: end DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end allocate -DEBUG: begin free -free -DEBUG: begin free -DEBUG: end free - "flopDEBUG: end free + "flops": 12DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin free 
-DEBUG: end freeDEBUG: begin free +DEBUG: begin 500000.0, + partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin freefree DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin s": 7692307.692307693, - allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free + DEBUG: end free - "DEBUG: begin allocate -DEBUG: end allocate - +DEBUG: begin copy +DEBUG: end DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin bandwidth":DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array 1230allocate -DEBUG: end allocate -DEBUG: begin allocate - -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end 76923.076923DEBUG: end allocate +DEBUG: begin "bandwidth": 200000000.0, + copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +allocate +DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices09, - components::fill_array -DEBUG: begin components::fill_array -DEBUG: end -DEBUG: begin copy -DEBUG: end copy -components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array -DEBUG: end DEBUG: begin free +DEBUG: end components::fill_array "repetitiDEBUG: begin free DEBUG: end free -DEBUG: begin copy -DEBUG: end copy - "rcomponents::fill_array -DEBUG: begin partition::build_from_contiguous +DEBUG: begin free +DEBUG: end free + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguousons": 1, + DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size + DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy -DEBUG: end copyDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end epetitions": 1, - "completed": t +DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate + "completed": tDEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +rue + DEBUG: end allocate +DEBUG: begin allocateDEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin freedense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -rue - +DEBUG: begin } + +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_arrayfree DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: 
begin free -DEBUG: end DEBUG: begin dense::add_scaled -DEBUG: end dense::add_scaled - } - free +DEBUG: end free +DEBUG: begin } + } +] +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin +DEBUG: begin axpy +DEBUG: begin allocatecomponents::fill_array +DEBUG: end components::fill_array DEBUG: begin dense::fill DEBUG: end dense::fill - } - DEBUG: begin dense::fill +DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill - } -] -DEBUG: begin axpy -DEBUG: begin allocate + DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copypartition::build_ranges_from_global_size DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin dense::add_scaled +DEBUG: end +DEBUG: end copyDEBUG: begin dense::add_scaled DEBUG: end dense::add_scaled allocate +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate DEBUG: end components::fill_array -DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin components::fill_arrayallocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end +DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy +free +DEBUG: end DEBUG: begin free +DEBUG: end free DEBUG: begin free DEBUG: end free +partition::build_starting_indices +DEBUG: begin copy +DEBUG: end DEBUG: begin dense::fill +DEBUG: end dense::fill +copy +DEBUG: begin free +DEBUG: end DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin free +DEBUG: begin DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +free DEBUG: end free DEBUG: begin free DEBUG: end free @@ -573,137 +573,137 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin free +DEBUG: begin DEBUG: begin free +DEBUG: end freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end free + DEBUG: begin free DEBUG: end free -DEBUG: end axpy -DEBUG: begin DEBUG: begin free -DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin free DEBUG: end free DEBUG: end axpy DEBUG: begin scal -scal -DEBUG: begin allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin DEBUG: end free DEBUG: end axpy -Current state: -[ - { - DEBUG: end allocate +DEBUG: begin scal DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin DEBUG: begin allocate +DEBUG: 
begin allocate +DEBUG: end free +DEBUG: end axpy +allocate DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_sizeallocate DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_sizeCurrent state: +[ + { + DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate - "n": 100, - allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end DEBUG: end allocate -DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end "n"allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate "blasallocate +DEBUG: end allocateallocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array: 10 DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array - -DEBUG: begin components::fill_array +DEBUG: end DEBUG: end components::fill_array DEBUG: begin components::fill_array -DEBUG: end ": { - DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_arraycomponents::fill_array +DEBUG: end components::fill_array +0, + components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array - "co DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end DEBUG: begin components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end py": {partition::build_starting_indices +DEBUG: end partition::build_starting_indices DEBUG: begin copy -DEBUG: end copy -DEBUG: begin partition::build_starting_indices +DEBUG: end copy "blas": { + DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy - free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end DEBUG: begin free DEBUG: end free -DEBUG: begin copy +DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate - "tiallocate DEBUG: begin free DEBUG: end free + "copy": { DEBUG: end allocate DEBUG: begin free DEBUG: end free -me": 0.000013, - DEBUG: begin free -DEBUG: end free DEBUG: begin free +DEBUG: end DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freefree +DEBUG: begin free + DEBUG: end free DEBUG: begin free -DEBUG: end freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end -DEBUG: begin dense::fill + DEBUG: end free +DEBUG: begin free DEBUG: end free DEBUG: begin dense::fill DEBUG: end dense::fill - "flops":DEBUG: begin dense::fill +DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::scale -DEBUG: end dense::scale - 7692307.692307693, - dense::fill +DEBUG: end dense::scalefree DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill + "ti DEBUG: begin dense::scale DEBUG: end dense::scale - "bandwidth": 123076923.07692309, +me": 0.000008, + "flops": 12500000.0, + "bandwidth": 200000000.0, "repetitions": 1, "completed": true }, "axpy": { - "time": 0.000017, - "flops": 11764705.88235294, - "bandwidth": 141176470.5882353, + 
"time": 0.00002, + "flops": 10000000.0, + "bandwidth": 119999999.99999999, "repetitions": 1, "completed": true } @@ -759,46 +759,46 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::scale DEBUG: end dense::scale -DEBUG: begin freeDEBUG: begin free - -DEBUG: end free DEBUG: begin free +DEBUG: end freeDEBUG: begin free DEBUG: end free -DEBUG: end DEBUG: begin free +DEBUG: begin freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end DEBUG: end free +DEBUG: end scal +DEBUG: end n = 100 + DEBUG: begin free DEBUG: end free DEBUG: end scal -DEBUG: end n = 100 scal Current state: [ { - "n":scal + "n": 100, + +DEBUG: end free +DEBUG: end scal DEBUG: end n = 100 - - 100, - "blas": { + "blas": { "copy": { - "time": 0.000013, - "flops": 7692307.692307693, - "bandwidth": 123076923.07692309, + "time": 0.000008, + "flops": 12500000.0, + "bandwidth": 200000000.0, "repetitions": 1, "completed": true }, "axpy": { - "time": 0.000017, - "flops": 11764705.88235294, - "bandwidth": 141176470.5882353, + "time": 0.00002, + "flops": 10000000.0, + "bandwidth": 119999999.99999999, "repetitions": 1, "completed": true }, "scal": { - "time": 0.000007, - "flops": 14285714.285714285, - "bandwidth": 228571428.57142857, + "time": 0.000006, + "flops": 16666666.666666666, + "bandwidth": 266666666.66666666, "repetitions": 1, "completed": true } diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index 1ce62b48dc2..b190ac8a458 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -5,227 +5,270 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are [csr]x[csr] The number of right hand sides is 1 -DEBUG: begin stencil(100,7pt,stencil) -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin stencil(100,7pt,stencil) +Running test case +{ + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": {} +} +DEBUG: begin stencil(100,7pt,stencil)DEBUG: begin stencil(100,7pt,stencil) + DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_sizeDEBUG: begin allocate +DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocateDEBUG: begin allocate +DEBUG: begin stencil(100,7pt,stencil) + +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate - DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end allocate DEBUG: begin allocate -DEBUG: end Running test case -{ - DEBUG: end allocate -DEBUG: begin components::fill_array -allocate -DEBUG: begin components::fill_array - "size":DEBUG: end components::fill_array +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_array 100, - DEBUG: begin components::fill_array +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin 
components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocateDEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array + +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin "stencil": DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -partition::build_from_contiguous +DEBUG: end partition::build_starting_indicesDEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy -DEBUG: end DEBUG: begin copy DEBUG: end copy -"7pt", - "comm_copy DEBUG: begin free -DEBUG: end free +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array + DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin allocate DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate DEBUG: end allocate -pattern": "stallocate +DEBUG: begin allocate +DEBUG: end allocatepartition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices + +DEBUG: begin allocate +DEBUG: end DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocateDEBUG: begin allocate +allocate +DEBUG: begin components::aos_to_soa +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate DEBUG: end allocate +DEBUG: begin DEBUG: end components::aos_to_soa + +DEBUG: begin copy +DEBUG: end copycomponents::aos_to_soa DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin encil", - "spmv": {} -} - +DEBUG: begin dense::fill +DEBUG: end +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa +DEBUG: begin allocate DEBUG: end components::aos_to_soa +dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data DEBUG: begin allocate DEBUG: end allocate -dense::fill +DEBUG: end allocate +DEBUG: begin components::aos_to_soa DEBUG: begin dense::fill -DEBUG: begin stencil(100,7pt,stencil) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end freeDEBUG: end allocate DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data + +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data DEBUG: begin free +DEBUG: end dense::fill_in_matrix_data DEBUG: end free DEBUG: begin free -DEBUG: end DEBUG: 
begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: begin free +DEBUG: end free DEBUG: end free DEBUG: begin free +DEBUG: end free + DEBUG: end free DEBUG: begin free -DEBUG: end freeDEBUG: begin allocate -DEBUG: end allocate -free DEBUG: begin free DEBUG: end free -DEBUG: begin partition::build_ranges_from_global_size - DEBUG: begin free +DEBUG: end DEBUG: end free +free DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end freeDEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end partition::build_ranges_from_global_size - +DEBUG: begin allocate +DEBUG: end DEBUG: begin free +DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin allocate +DEBUG: begin partition::build_ranges_from_global_size DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: begin allocate DEBUG: end allocate +partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array - DEBUG: end allocate DEBUG: begin -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin allocatefree +DEBUG: end free +allocate DEBUG: end allocate DEBUG: begin allocate + DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array -DEBUG: end components::fill_array +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin components::fill_array -DEBUG: end components::fill_array +DEBUG: end components::fill_arrayDEBUG: end components::fill_array DEBUG: begin components::fill_array +DEBUG: end +DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate + +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin allocate +components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguouscomponents::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array - +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copyDEBUG: end copy 
-DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copyDEBUG: begin components::fill_array -DEBUG: end components::fill_array + DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end -DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate -partition::build_starting_indices +DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate @@ -233,430 +276,341 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin components::aos_to_soa -DEBUG: end -DEBUG: end copy -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fillcomponents::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -allocate -DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin -DEBUG: end dense::fill -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free DEBUG: begin dense::fill -allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end DEBUG: end allocate +DEBUG: begin allocate DEBUG: end allocate -DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: end dense::fill -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_dataDEBUG: begin components::aos_to_soa +DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin dense::fillfree +DEBUG: begin dense::fill +DEBUG: end dense::fillDEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end free - +copy DEBUG: begin free DEBUG: end free - -DEBUG: end dense::fill +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end free +allocate +DEBUG: end allocate +DEBUG: begin DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: begin DEBUG: begin free +DEBUG: begin free +DEBUG: end freeallocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: begin DEBUG: begin free DEBUG: end free DEBUG: begin free +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +free DEBUG: end free DEBUG: begin free DEBUG: end free free -freeDEBUG: begin 
copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end free +Matrix is of size (81, 81) +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free - +DEBUG: begin DEBUG: begin copy() +DEBUG: begin copy() +free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: begin free +DEBUG: end free DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end DEBUG: begin copy() -copy() -DEBUG: begin allocate +DEBUG: end copy() +DEBUG: begin allocateDEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin free +DEBUG: end free + DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size +DEBUG: end copy() DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size +DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: begin copy() DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate + +DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocateallocate +DEBUG: begin DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copyDEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: end allocate DEBUG: begin allocate -DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: end allocate +allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_array +DEBUG: begin DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array - -DEBUG: end copy() +DEBUG: begin components::fill_arrayDEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin DEBUG: end components::fill_array +components::fill_array +DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array + DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguousDEBUG: begin components::fill_array +DEBUG: begin components::fill_arrayDEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocateDEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguousallocate + DEBUG: end allocate DEBUG: begin allocate -DEBUG: end partition::build_from_contiguous +DEBUG: end allocate +DEBUG: begin allocatepartition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy +DEBUG: begin free +DEBUG: end DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin -DEBUG: end allocate -DEBUG: begin 
allocate + DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free - DEBUG: begin components::fill_array +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin copy() -components::fill_array +DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin components::fill_arraycopy +DEBUG: begin free +DEBUG: begin copy() +DEBUG: begin copycomponents::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end free +partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end DEBUG: begin copy DEBUG: end copy +partition::build_starting_indices DEBUG: begin copy DEBUG: end copy - -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill +DEBUG: begin freeDEBUG: end copy() +DEBUG: begin copy()DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy -DEBUG: end copy() -DEBUG: begin copy()DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data +DEBUG: end free DEBUG: begin copy DEBUG: end copy -free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: begin copy +DEBUG: begin copy + +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy() DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: end copy() +DEBUG: end copy()DEBUG: end copy()DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill + DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy DEBUG: begin copy -free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill DEBUG: begin allocate -DEBUG: begin copy() -DEBUG: begin copy -DEBUG: end copyfree -DEBUG: end free -Matrix is of size (81DEBUG: end allocate -DEBUG: begin distributed_matrix::build_local_nonlocal - +DEBUG: end DEBUG: begin copy DEBUG: end copy DEBUG: begin copy -DEBUG: end copy, 81) -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end copy() +DEBUG: begin copy() +allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate - +DEBUG: begin copy +DEBUG: end copyDEBUG: end allocate +DEBUG: begin components::aos_to_soaDEBUG: end copy DEBUG: end copy() DEBUG: begin allocate -DEBUG: end DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate DEBUG: end allocate -DEBUG: 
begin free -allocate DEBUG: begin dense::fill DEBUG: end dense::fill -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy + +DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end dense::copy DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: begin -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate + +DEBUG: begin distributed_matrix::build_local_nonlocal DEBUG: end allocate DEBUG: begin allocate -free -DEBUG: end free +DEBUG: end allocateDEBUG: begin dense::fill +DEBUG: end dense::fill + DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end allocate -DEBUG: end allocate -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: end allocate -DEBUG: begin allocate -free +DEBUG: begin components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_arrayDEBUG: begin free -DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end DEBUG: end components::aos_to_soa +allocate DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate + DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -free -DEBUG: end free -DEBUG: begin allocatecomponents::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indicesDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free +DEBUG: end allocateDEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin distributed_matrix::build_local_nonlocal + +DEBUG: begin components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate -free +DEBUG: end DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate +allocate DEBUG: begin free -DEBUG: end free - -DEBUG: end free -DEBUG: begin copy() -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy() +DEBUG: begin distributed_matrix::build_local_nonlocal DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate 
-DEBUG: end allocate +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin free +DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin allocateDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end free +free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end free +DEBUG: end DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free +DEBUG: begin freefree +DEBUG: begin allocate DEBUG: end free -DEBUG: begin copy() -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy() +DEBUG: begin allocateDEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate @@ -669,146 +623,119 @@ DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end freeDEBUG: end allocate -DEBUG: begin free DEBUG: end free -allocate -DEBUG: begin allocate +DEBUG: begin allocateDEBUG: end allocate +DEBUG: begin free DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin free DEBUG: end allocate -DEBUG: begin components::aos_to_soa - DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocateDEBUG: end components::aos_to_soa -DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocateDEBUG: begin allocate -DEBUG: end allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end freeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin -DEBUG: end allocate -DEBUG: begin allocate +DEBUG: end DEBUG: end allocate +DEBUG: begin freeDEBUG: end free DEBUG: begin allocate DEBUG: end allocate free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin free DEBUG: end free + DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end +DEBUG: begin allocate +DEBUG: end allocateDEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate + DEBUG: begin free DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: begin DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin freeallocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -allocate -DEBUG: end allocate +DEBUG: begin free +DEBUG: end DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: end allocate 
+DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin allocate + DEBUG: end allocate +DEBUG: begin freefree +DEBUG: begin allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free +DEBUG: end free + DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocateallocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate + DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end allocateDEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin freeDEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate + +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocateallocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate + DEBUG: begin free DEBUG: end free DEBUG: begin allocate @@ -817,180 +744,253 @@ DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: end allocate +DEBUG: begin free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end DEBUG: begin free DEBUG: end free +DEBUG: begin DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free +DEBUG: begin DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free +DEBUG: end freeDEBUG: begin free DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free - DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocatefree -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free + +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin freeallocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocateDEBUG: end free +DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -free +DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free - +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: begin free +DEBUG: begin DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free DEBUG: end free -DEBUG: begin freeDEBUG: begin free +DEBUG: begin allocate DEBUG: end free DEBUG: begin free -allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end freeallocate + 
DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate - -DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: end free +DEBUG: begin allocate + DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin allocatefree DEBUG: begin free +DEBUG: end freeDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin DEBUG: end allocate DEBUG: begin free DEBUG: end free - -DEBUG: end allocate +DEBUG: begin DEBUG: begin free DEBUG: end free +allocate +DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocateallocate DEBUG: end allocate -DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end freeDEBUG: begin free DEBUG: end free DEBUG: end distributed_matrix::build_local_nonlocal DEBUG: begin copy DEBUG: end copy -DEBUG: begin copy + DEBUG: begin allocate -DEBUG: begin free -DEBUG: end free +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin copy DEBUG: end copy +allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocate +free +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin DEBUG: begin free +DEBUG: end allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end distributed_matrix::build_local_nonlocal -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy +DEBUG: begin free DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end free +DEBUG: begin freeallocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free + DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin freeDEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: end 
free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin +DEBUG: begin copyfree +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free + +DEBUG: end copy +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1031,130 +1031,123 @@ DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: begin freeDEBUG: end free -DEBUG: begin freeDEBUG: begin free DEBUG: end free +DEBUG: begin free +DEBUG: end freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free - +DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free - DEBUG: begin free -DEBUG: end free +DEBUG: end DEBUG: begin free DEBUG: end free +free DEBUG: begin free DEBUG: end free -DEBUG: begin freeDEBUG: begin free -DEBUG: end free DEBUG: begin free +DEBUG: end freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: end free +DEBUG: begin free + DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: begin freeDEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin +DEBUG: begin freeDEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free +DEBUG: end free + DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin apply() -free -DEBUG: end free DEBUG: begin apply() -free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin apply() +DEBUG: end free +DEBUG: begin apply() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate -DEBUG: end DEBUG: begin dense::row_gather DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -allocate +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::row_gather +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather DEBUG: end dense::row_gather DEBUG: begin dense::row_gather -DEBUG: end DEBUG: end dense::row_gather -dense::row_gather -DEBUG: begin DEBUG: begin apply() -apply() -DEBUG: begin coo::spmv -DEBUG: end coo::spmv +DEBUG: end dense::row_gather +DEBUG: begin apply() DEBUG: begin apply() +DEBUG: begin apply() +DEBUG: begin coo::spmv DEBUG: begin coo::spmv -DEBUG: end coo::spmvDEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: end coo::spmv DEBUG: end DEBUG: end apply() - +DEBUG: begin coo::spmv DEBUG: end apply() -DEBUG: begin advanced_apply() coo::spmv +DEBUG: end apply() DEBUG: begin advanced_apply() DEBUG: begin coo::advanced_spmv -DEBUG: end coo::advanced_spmv +DEBUG: end DEBUG: begin advanced_apply() DEBUG: begin coo::advanced_spmv DEBUG: end coo::advanced_spmv -DEBUG: end advanced_apply()DEBUG: end apply() +DEBUG: end DEBUG: begin advanced_apply() +DEBUG: begin coo::advanced_spmv +coo::advanced_spmv DEBUG: end advanced_apply() +advanced_apply() DEBUG: end apply() DEBUG: begin free DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin -DEBUG: begin advanced_apply() -DEBUG: begin coo::advanced_spmv DEBUG: end coo::advanced_spmv -free -DEBUG: end free +DEBUG: end advanced_apply() +DEBUG: end apply()DEBUG: end apply() DEBUG: begin free DEBUG: end free -DEBUG: begin freeDEBUG: end advanced_apply() -DEBUG: end apply() DEBUG: begin free -DEBUG: end freeDEBUG: 
end apply() +DEBUG: end DEBUG: begin free +DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: end DEBUG: begin free -DEBUG: end free - DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free -DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free @@ -1166,24 +1159,19 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end freefree -DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: begin free -DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -1191,63 +1179,76 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free -free -DEBUG: end free -free DEBUG: end free +DEBUG: begin free DEBUG: end free -DEBUG: begin csr-csr DEBUG: begin free DEBUG: end free +DEBUG: begin free DEBUG: begin csr-csr DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size + +DEBUG: end free +DEBUG: begin free +DEBUG: end +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin csr-csr +DEBUG: begin free +DEBUG: begin free +DEBUG: end free +DEBUG: begin csr-csrfree +DEBUG: end free +DEBUG: begin csr-csr DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_sizeallocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +partition::build_ranges_from_global_size DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate - +DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end DEBUG: end allocate +DEBUG: end allocate DEBUG: begin allocate -allocate -DEBUG: begin components::fill_array +DEBUG: end allocateDEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array +DEBUG: end components::fill_arrayallocate +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array + +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indicesDEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array -DEBUG: end DEBUG: end components::fill_array -DEBUG: begin components::fill_arraycomponents::fill_array -DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end 
components::fill_array @@ -1255,345 +1256,349 @@ DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices +DEBUG: begin components::fill_array + DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy DEBUG: begin free DEBUG: end free +DEBUG: end components::fill_array +DEBUG: begin components::fill_array DEBUG: begin allocate -partition::build_from_contiguous +DEBUG: end allocate +DEBUG: begin +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy DEBUG: begin free -DEBUG: end -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array +DEBUG: end DEBUG: end components::fill_array DEBUG: begin partition::build_from_contiguous DEBUG: end partition::build_from_contiguous DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy -DEBUG: end copy -DEBUG: begin freeDEBUG: end allocate -DEBUG: begin components::fill_array +DEBUG: end copycomponents::fill_array DEBUG: end components::fill_array DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array free - -DEBUG: end free -DEBUG: end components::fill_array DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocateDEBUG: begin allocate + +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array +DEBUG: begin components::fill_arrayDEBUG: end components::fill_array DEBUG: begin allocate DEBUG: end allocate -allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin components::fill_array -DEBUG: end DEBUG: begin copy() +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin copy() DEBUG: begin allocate -DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copyallocate +DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -copy() DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin copy +DEBUG: end copy()DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy - +DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy -DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin copy()DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy + DEBUG: begin copy -DEBUG: end copy +DEBUG: end copyDEBUG: begin copy() DEBUG: begin copy -DEBUG: end copy DEBUG: begin copy 
DEBUG: end copy DEBUG: end copy() DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy() -DEBUG: begin allocateDEBUG: begin copy() -DEBUG: begin DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end -DEBUG: end allocate -DEBUG: begin components::fill_array +DEBUG: end copyDEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: end copy() +DEBUG: begin +DEBUG: end copy() DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy() -DEBUG: begin allocate -dense::fill +DEBUG: begin copyallocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() DEBUG: begin allocate -DEBUG: end DEBUG: begin copy -DEBUG: end DEBUG: end allocate +DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin allocate -allocate -DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: end copy DEBUG: begin copy +DEBUG: end copy DEBUG: end copy() -DEBUG: end allocate +DEBUG: begin allocateDEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy + DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa + +DEBUG: end allocate +DEBUG: begin dense::fillDEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate + +DEBUG: end dense::fill DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocateDEBUG: end copy() +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::fill -allocate +DEBUG: end DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soaDEBUG: begin distributed_matrix::build_local_nonlocal DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocate DEBUG: end dense::fill DEBUG: begin allocate -DEBUG: end allocate +DEBUG: begin free +DEBUG: end DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate +allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocatefree DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocateDEBUG: begin distributed_matrix::build_local_nonlocal DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin free +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa DEBUG: begin allocate +DEBUG: end DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free +DEBUG: begin DEBUG: end free DEBUG: begin allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa - DEBUG: end allocate DEBUG: begin free +allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end 
allocate +free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate - DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: begin free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocateDEBUG: begin allocate DEBUG: end allocate -DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin DEBUG: end free DEBUG: begin allocate DEBUG: end allocate + +DEBUG: begin free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free +allocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin allocateDEBUG: begin free +DEBUG: end free DEBUG: begin allocate -DEBUG: end allocateallocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate +DEBUG: end DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end -DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate - -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end allocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate - DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin +DEBUG: end free DEBUG: begin allocate -allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: begin free +DEBUG: end allocateDEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin freeDEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: end allocate DEBUG: begin free +free DEBUG: end free -DEBUG: begin allocateDEBUG: begin allocate +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -DEBUG: end free +DEBUG: end DEBUG: end free +DEBUG: begin allocate +free DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate - +DEBUG: begin freeDEBUG: end allocate +DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin freeDEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocateDEBUG: begin free +DEBUG: begin allocate +DEBUG: end allocate DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free DEBUG: end -DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end allocate +allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate +DEBUG: end allocateallocate DEBUG: end allocate DEBUG: begin free -DEBUG: end freefree +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free - +DEBUG: begin allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin DEBUG: begin free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocateDEBUG: begin allocate +DEBUG: begin DEBUG: end allocate DEBUG: begin free DEBUG: end free @@ -1601,82 
+1606,53 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin allocateallocate - DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate DEBUG: end allocate +free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocate DEBUG: begin free DEBUG: end free -DEBUG: begin allocateDEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin allocate +DEBUG: end allocateDEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate +DEBUG: end DEBUG: begin free DEBUG: end free -DEBUG: begin allocatefree -DEBUG: end free -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin allocate +DEBUG: end allocateallocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin -DEBUG: begin allocate +DEBUG: begin allocateDEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free -allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free DEBUG: end free DEBUG: begin allocate -DEBUG: end allocateDEBUG: end free -DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end DEBUG: end allocate DEBUG: begin allocate +DEBUG: end DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate @@ -1684,171 +1660,232 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin DEBUG: begin allocate allocate -DEBUG: end allocate -DEBUG: begin DEBUG: end allocate -free DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate +DEBUG: begin DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin allocateallocate DEBUG: begin allocate +DEBUG: end allocateallocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin +DEBUG: begin free +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate + DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate -DEBUG: end DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin allocate -DEBUG: end allocateallocate DEBUG: end allocate -DEBUG: begin allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin -DEBUG: begin allocate -DEBUG: end allocateallocate DEBUG: begin allocate +DEBUG: end DEBUG: end allocate DEBUG: begin allocate +allocate DEBUG: end allocate +DEBUG: begin freeallocate DEBUG: begin free -DEBUG: end free +DEBUG: end DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin +DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin 
[diff hunks updating the DEBUG: begin/end profiler log lines (allocate, free, copy, apply, csr::spmv, csr::advanced_spmv, dense::* kernels, components::convert_idxs_to_ptrs, distributed_matrix::build_local_nonlocal) in the distributed benchmark profile reference outputs]
 Current state:
 [
     {
         "size": 81,
         "stencil": "7pt",
         "comm_pattern": "stencil",
         "spmv": {
             "csr-csr": {
                 "storage": 6420,
                 "max_relative_norm2": 0.0,
-                "time": 0.000037,
+                "time": 0.000046,
                 "repetitions": 1,
                 "completed": true
             }
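The reference logs above are generated rather than written by hand: each *.profile.stdout/*.profile.stderr pair is the captured output of a benchmark run with "-profile -profiler_hook debug", which prints a DEBUG: begin/DEBUG: end line around every allocation, copy and kernel launch. The test scripts reformatted in the following patch drive exactly these runs through test_framework.compare_output, and the same scripts can rewrite the reference files when the expected output changes. A minimal sketch, with the benchmark binary path left as a placeholder (the scripts take it as their first argument, and "--generate" switches them from comparing to regenerating, as implemented in test_framework.py.in further below):

    # compare the current benchmark output against the checked-in reference files
    python3 benchmark/test/spmv.py <path-to-spmv-benchmark-binary>
    # rewrite the reference files after an intentional change to the output
    python3 benchmark/test/spmv.py <path-to-spmv-benchmark-binary> --generate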
From e3a85538e8e03ad307cd4448661d91fc7f2dea05 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 22 May 2023 13:56:46 +0200
Subject: [PATCH 036/583] format files

---
 benchmark/test/blas.py                     |  39 +++---
 benchmark/test/conversion.py               |  61 ++++++---
 benchmark/test/matrix_statistics.py        |  27 ++--
 benchmark/test/multi_vector_distributed.py |  46 ++++---
 benchmark/test/preconditioner.py           |  41 ++++--
 benchmark/test/solver.py                   |  41 ++++--
 benchmark/test/solver_distributed.py       |  44 +++++--
 benchmark/test/sparse_blas.py              |  48 +++++--
 benchmark/test/spmv.py                     |  41 ++++--
 benchmark/test/spmv_distributed.py         |  42 +++---
 benchmark/test/test_framework.py.in        | 146 +++++++++++++++------
 11 files changed, 396 insertions(+), 180 deletions(-)

diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py
index e099718bae0..160d5364e20 100755
--- a/benchmark/test/blas.py
+++ b/benchmark/test/blas.py
@@ -1,25 +1,34 @@
 #!/usr/bin/env python3
 import test_framework
+
 # check that all input modes work:
 # parameter
-test_framework.compare_output(["-input", '[{"n": 100}]'],
-                              expected_stdout="blas.simple.stdout",
-                              expected_stderr="blas.simple.stderr")
+test_framework.compare_output(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+)
 
 # stdin
-test_framework.compare_output([],
-                              expected_stdout="blas.simple.stdout",
-                              expected_stderr="blas.simple.stderr",
-                              stdin='[{"n": 100}]')
+test_framework.compare_output(
+    [],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+    stdin='[{"n": 100}]',
+)
 
 # file
-test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.blas.json")],
-                              expected_stdout="blas.simple.stdout",
-                              expected_stderr="blas.simple.stderr",
-                              stdin='[{"n": 100}]')
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.blas.json")],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+    stdin='[{"n": 100}]',
+)
 
 # profiler annotations
-test_framework.compare_output(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'],
-                              expected_stdout="blas.profile.stdout",
-                              expected_stderr="blas.profile.stderr",
-                              stdin='[{"n": 100}]')
+test_framework.compare_output(
+    ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
+    expected_stdout="blas.profile.stdout",
+    expected_stderr="blas.profile.stderr",
+    stdin='[{"n": 100}]',
+)
diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py
index 91e71cc9e89..cf2e33983af 100755
--- a/benchmark/test/conversion.py
+++ b/benchmark/test/conversion.py
@@ -1,28 +1,57 @@
 #!/usr/bin/env python3
 import test_framework
+
 # check that all input modes work:
 # parameter
-test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"],
expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", +) # stdin -test_framework.compare_output(["-formats", "coo,csr"], - expected_stdout="conversion.simple.stdout", - expected_stderr="conversion.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt"}]') +test_framework.compare_output( + ["-formats", "coo,csr"], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) # input file -test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json"), "-formats", "coo,csr"], - expected_stdout="conversion.simple.stdout", - expected_stderr="conversion.simple.stderr") +test_framework.compare_output( + [ + "-input", + str(test_framework.sourcepath / "input.mtx.json"), + "-formats", + "coo,csr", + ], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", +) # check that all conversions work -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr,ell,sellp,hybrid"], - expected_stdout="conversion.all.stdout", - expected_stderr="conversion.all.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-formats", + "coo,csr,ell,sellp,hybrid", + ], + expected_stdout="conversion.all.stdout", + expected_stderr="conversion.all.stderr", +) # profiler annotations -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr", '-profile', '-profiler_hook', 'debug'], - expected_stdout="conversion.profile.stdout", - expected_stderr="conversion.profile.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-formats", + "coo,csr", + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="conversion.profile.stdout", + expected_stderr="conversion.profile.stderr", +) diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py index 62547acfbeb..365cfe025dd 100755 --- a/benchmark/test/matrix_statistics.py +++ b/benchmark/test/matrix_statistics.py @@ -1,18 +1,25 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], - expected_stdout="matrix_statistics.simple.stdout", - expected_stderr="matrix_statistics.simple.stderr") +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", +) # stdin -test_framework.compare_output([], - expected_stdout="matrix_statistics.simple.stdout", - expected_stderr="matrix_statistics.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt"}]') +test_framework.compare_output( + [], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) # input file -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], - expected_stdout="matrix_statistics.simple.stdout", - expected_stderr="matrix_statistics.simple.stderr") +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", +) diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py index 808a7c3e458..aab886ca509 100644 --- 
a/benchmark/test/multi_vector_distributed.py +++ b/benchmark/test/multi_vector_distributed.py @@ -1,29 +1,37 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output_distributed(["-input", '[{"n": 100}]'], - expected_stdout="multi_vector_distributed.simple.stdout", - expected_stderr="multi_vector_distributed.simple.stderr", - num_procs=3) +test_framework.compare_output_distributed( + ["-input", '[{"n": 100}]'], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + num_procs=3, +) # stdin -test_framework.compare_output_distributed([], - expected_stdout="multi_vector_distributed.simple.stdout", - expected_stderr="multi_vector_distributed.simple.stderr", - stdin='[{"n": 100}]', - num_procs=3) +test_framework.compare_output_distributed( + [], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) # file -test_framework.compare_output_distributed(["-input", str(test_framework.sourcepath / "input.blas.json")], - expected_stdout="multi_vector_distributed.simple.stdout", - expected_stderr="multi_vector_distributed.simple.stderr", - stdin='[{"n": 100}]', - num_procs=3) +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) # profiler annotations -test_framework.compare_output_distributed(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="multi_vector_distributed.profile.stdout", - expected_stderr="multi_vector_distributed.profile.stderr", - stdin='[{"n": 100}]', - num_procs=3) +# currently still unstable output and thus disabled +# test_framework.compare_output_distributed(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'], +# expected_stdout="multi_vector_distributed.profile.stdout", +# expected_stderr="multi_vector_distributed.profile.stderr", +# stdin='[{"n": 100}]', +# num_procs=3) diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py index 4a044cd25f5..a5a8dd3f13f 100755 --- a/benchmark/test/preconditioner.py +++ b/benchmark/test/preconditioner.py @@ -1,23 +1,38 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], - expected_stdout="preconditioner.simple.stdout", - expected_stderr="preconditioner.simple.stderr") +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", +) # stdin -test_framework.compare_output([], - expected_stdout="preconditioner.simple.stdout", - expected_stderr="preconditioner.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt"}]') +test_framework.compare_output( + [], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) # input file -test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json")], - expected_stdout="preconditioner.simple.stdout", - expected_stderr="preconditioner.simple.stderr") +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / 
"input.mtx.json")], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", +) # profiler annotations -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="preconditioner.profile.stdout", - expected_stderr="preconditioner.profile.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="preconditioner.profile.stdout", + expected_stderr="preconditioner.profile.stderr", +) diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py index fd8130e0ae1..e974f849276 100755 --- a/benchmark/test/solver.py +++ b/benchmark/test/solver.py @@ -1,23 +1,38 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], - expected_stdout="solver.simple.stdout", - expected_stderr="solver.simple.stderr") +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", +) # stdin -test_framework.compare_output([], - expected_stdout="solver.simple.stdout", - expected_stderr="solver.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]') +test_framework.compare_output( + [], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', +) # input file -test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.solver.json")], - expected_stdout="solver.simple.stdout", - expected_stderr="solver.simple.stderr") +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.solver.json")], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", +) # profiler annotations -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="solver.profile.stdout", - expected_stderr="solver.profile.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="solver.profile.stdout", + expected_stderr="solver.profile.stderr", +) diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py index f8a02861e26..c6623723a43 100644 --- a/benchmark/test/solver_distributed.py +++ b/benchmark/test/solver_distributed.py @@ -1,23 +1,41 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]'], - expected_stdout="distributed_solver.simple.stdout", - expected_stderr="distributed_solver.simple.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', + ], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", +) # stdin -test_framework.compare_output([], - expected_stdout="distributed_solver.simple.stdout", - expected_stderr="distributed_solver.simple.stderr", - 
stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]') +test_framework.compare_output( + [], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', +) # input file -test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], - expected_stdout="distributed_solver.simple.stdout", - expected_stderr="distributed_solver.simple.stderr") +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", +) # profiler annotations -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="distributed_solver.profile.stdout", - expected_stderr="distributed_solver.profile.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="distributed_solver.profile.stdout", + expected_stderr="distributed_solver.profile.stderr", +) diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py index 913aac94d07..7b0968a710c 100755 --- a/benchmark/test/sparse_blas.py +++ b/benchmark/test/sparse_blas.py @@ -1,23 +1,45 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], - expected_stdout="sparse_blas.simple.stdout", - expected_stderr="sparse_blas.simple.stderr") +test_framework.compare_output( + ["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", +) # stdin -test_framework.compare_output(["-operations", "transpose"], - expected_stdout="sparse_blas.simple.stdout", - expected_stderr="sparse_blas.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt"}]') +test_framework.compare_output( + ["-operations", "transpose"], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) # input file -test_framework.compare_output(["-operations", "transpose", "-input", str(test_framework.sourcepath / "input.mtx.json")], - expected_stdout="sparse_blas.simple.stdout", - expected_stderr="sparse_blas.simple.stderr") +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input", + str(test_framework.sourcepath / "input.mtx.json"), + ], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", +) # profiler annotations (transpose has the smallest number of allocations) -test_framework.compare_output(["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="sparse_blas.profile.stdout", - expected_stderr="sparse_blas.profile.stderr") +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="sparse_blas.profile.stdout", + 
expected_stderr="sparse_blas.profile.stderr", +) diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py index d3f3015b9dd..6e2d9f05d49 100755 --- a/benchmark/test/spmv.py +++ b/benchmark/test/spmv.py @@ -1,23 +1,38 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]'], - expected_stdout="spmv.simple.stdout", - expected_stderr="spmv.simple.stderr") +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", +) # stdin -test_framework.compare_output([], - expected_stdout="spmv.simple.stdout", - expected_stderr="spmv.simple.stderr", - stdin='[{"size": 100, "stencil": "7pt"}]') +test_framework.compare_output( + [], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) # input file -test_framework.compare_output(["-input", str(test_framework.sourcepath / "input.mtx.json")], - expected_stdout="spmv.simple.stdout", - expected_stderr="spmv.simple.stderr") +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", +) # profiler annotations -test_framework.compare_output(["-input", '[{"size": 100, "stencil": "7pt"}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="spmv.profile.stdout", - expected_stderr="spmv.profile.stderr") +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="spmv.profile.stdout", + expected_stderr="spmv.profile.stderr", +) diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py index f6aa1accbe9..1b219b34cda 100644 --- a/benchmark/test/spmv_distributed.py +++ b/benchmark/test/spmv_distributed.py @@ -1,27 +1,35 @@ #!/usr/bin/env python3 import test_framework + # check that all input modes work: # parameter -test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], - expected_stdout="spmv_distributed.simple.stdout", - expected_stderr="spmv_distributed.simple.stderr", - num_procs=3) +test_framework.compare_output_distributed( + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, +) # stdin -test_framework.compare_output_distributed([], - expected_stdout="spmv_distributed.simple.stdout", - expected_stderr="spmv_distributed.simple.stderr", - num_procs=3, - stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]') +test_framework.compare_output_distributed( + [], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', +) # input file -test_framework.compare_output_distributed(["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], - expected_stdout="spmv_distributed.simple.stdout", - expected_stderr="spmv_distributed.simple.stderr", - num_procs=3) +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", 
+ num_procs=3, +) # profiler annotations -test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'], - expected_stdout="spmv_distributed.profile.stdout", - expected_stderr="spmv_distributed.profile.stderr", - num_procs=3) +# currently still unstable output and thus disabled +# test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'], +# expected_stdout="spmv_distributed.profile.stdout", +# expected_stderr="spmv_distributed.profile.stderr", +# num_procs=3) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 2d22f11ac4f..fff93548ad6 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -6,6 +6,7 @@ import typing import re import pathlib import sys + sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") generate = False @@ -13,11 +14,22 @@ if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True -denumberify_paths = ["time", "bandwidth", "flops", "components", - "residual_norm", "rhs_norm", "max_relative_norm2"] +denumberify_paths = [ + "time", + "bandwidth", + "flops", + "components", + "residual_norm", + "rhs_norm", + "max_relative_norm2", +] empty_string_paths = ["error"] -empty_array_paths = ["recurrent_residuals", "true_residuals", - "implicit_residuals", "iteration_timestamps"] +empty_array_paths = [ + "recurrent_residuals", + "true_residuals", + "implicit_residuals", + "iteration_timestamps", +] def sanitize_json_single(key, value, sanitize_all): @@ -34,7 +46,10 @@ def sanitize_json_single(key, value, sanitize_all): def sanitize_json(parsed_input, sanitize_all=False): if isinstance(parsed_input, typing.Dict): - return {key: sanitize_json_single(key, value, sanitize_all) for key, value in parsed_input.items()} + return { + key: sanitize_json_single(key, value, sanitize_all) + for key, value in parsed_input.items() + } elif isinstance(parsed_input, typing.List): return [sanitize_json(e, sanitize_all) for e in parsed_input] elif sanitize_all and isinstance(parsed_input, float): @@ -47,21 +62,36 @@ def sanitize_text(lines): json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]] json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]] json_pairs = list(zip(json_begins, json_ends)) - if (len(json_pairs) == 0): + if len(json_pairs) == 0: return lines - assert (all(begin < end for begin, end in json_pairs)) - nonjson_pairs = [(0, json_begins[0])] + list(zip(json_ends[:-1], - json_begins[1:])) + [(json_ends[-1], len(lines))] - combined_pairs = sorted([(begin, end, False) for begin, end in nonjson_pairs] + [ - (begin, end, True) for begin, end in json_pairs]) - texts = [("\n".join(lines[begin:end]), do_sanitize) - for begin, end, do_sanitize in combined_pairs] - reconstructed = [json.dumps(sanitize_json(json.loads( - t)), indent=4) if do_sanitize else t for t, do_sanitize in texts] + assert all(begin < end for begin, end in json_pairs) + nonjson_pairs = ( + [(0, json_begins[0])] + + list(zip(json_ends[:-1], json_begins[1:])) + + [(json_ends[-1], len(lines))] + ) + combined_pairs = sorted( + [(begin, end, False) for begin, end in nonjson_pairs] + + [(begin, end, True) for begin, end in json_pairs] + ) + texts = [ + ("\n".join(lines[begin:end]), do_sanitize) + for begin, end, do_sanitize in combined_pairs + ] + reconstructed = [ + 
json.dumps(sanitize_json(json.loads(t)), indent=4) if do_sanitize else t + for t, do_sanitize in texts + ] return "\n".join(reconstructed).split("\n") -def determinize_text(input, denumberify_paths=[], remove_paths=[], ignore_patterns=[], replace_patterns=[]): +def determinize_text( + input, + denumberify_paths=[], + remove_paths=[], + ignore_patterns=[], + replace_patterns=[], +): lines = input.splitlines() output_lines = [] patterns = [re.compile(pattern) for pattern in ignore_patterns] @@ -87,48 +117,88 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl args = [sys.argv[1]] + args expected_stdout = str(sourcepath / "reference" / expected_stdout) expected_stderr = str(sourcepath / "reference" / expected_stderr) - result = subprocess.run(args=launcher_flags + args, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, input=bytes(stdin, "utf-8")) - print("TEST: {}".format( - " ".join(["'{}'".format(arg) for arg in launcher_flags + args]))) + result = subprocess.run( + args=launcher_flags + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + input=bytes(stdin, "utf-8"), + ) + print( + "TEST: {}".format( + " ".join(["'{}'".format(arg) for arg in launcher_flags + args]) + ) + ) version_patterns = [ " the .* module is", ] typename_patterns = [ ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), - ("what\\(\\): .*", "what(): ") + ("what\\(\\): .*", "what(): "), ] if generate: - open(expected_stdout, "w").write("\n".join(determinize_text( - result.stdout.decode(), replace_patterns=typename_patterns))) - open(expected_stderr, "w").write("\n".join(determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns, replace_patterns=typename_patterns))) + open(expected_stdout, "w").write( + "\n".join( + determinize_text( + result.stdout.decode(), replace_patterns=typename_patterns + ) + ) + ) + open(expected_stderr, "w").write( + "\n".join( + determinize_text( + result.stderr.decode(), + ignore_patterns=version_patterns, + replace_patterns=typename_patterns, + ) + ) + ) print("GENERATED") return result_stdout_processed = determinize_text( - result.stdout.decode(), replace_patterns=typename_patterns) - result_stderr_processed = determinize_text(result.stderr.decode( - ), ignore_patterns=version_patterns, replace_patterns=typename_patterns) + result.stdout.decode(), replace_patterns=typename_patterns + ) + result_stderr_processed = determinize_text( + result.stderr.decode(), + ignore_patterns=version_patterns, + replace_patterns=typename_patterns, + ) expected_stdout_processed = determinize_text( - open(expected_stdout).read(), replace_patterns=typename_patterns) - expected_stderr_processed = determinize_text(open(expected_stderr).read( - ), ignore_patterns=version_patterns, replace_patterns=typename_patterns) + open(expected_stdout).read(), replace_patterns=typename_patterns + ) + expected_stderr_processed = determinize_text( + open(expected_stderr).read(), + ignore_patterns=version_patterns, + replace_patterns=typename_patterns, + ) failed = False if result_stdout_processed != expected_stdout_processed: print("FAIL: stdout differs") - print("\n".join(difflib.unified_diff( - expected_stdout_processed, result_stdout_processed))) + print( + "\n".join( + difflib.unified_diff(expected_stdout_processed, result_stdout_processed) + ) + ) failed = True if result_stderr_processed != expected_stderr_processed: print("FAIL: stderr differs") - print("\n".join(difflib.unified_diff( - expected_stderr_processed, result_stderr_processed))) + print( + "\n".join( + 
difflib.unified_diff(expected_stderr_processed, result_stderr_processed) + ) + ) failed = True if failed: exit(1) print("PASS") -def compare_output_distributed(args, expected_stdout, expected_stderr, num_procs, stdin=""): - compare_output(args, expected_stdout, expected_stderr, stdin, [ - "@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)]) +def compare_output_distributed( + args, expected_stdout, expected_stderr, num_procs, stdin="" +): + compare_output( + args, + expected_stdout, + expected_stderr, + stdin, + ["@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)], + ) From bd7f565b30d8d10d3f27589af6ae7ac4ac4fffdb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 22 May 2023 15:10:03 +0200 Subject: [PATCH 037/583] disable unstable tests --- benchmark/test/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt index 1cd589927fa..0a2be0e662c 100644 --- a/benchmark/test/CMakeLists.txt +++ b/benchmark/test/CMakeLists.txt @@ -22,7 +22,8 @@ add_benchmark_test(solver) add_benchmark_test(sparse_blas) add_benchmark_test(spmv) if (GINKGO_BUILD_MPI) - add_benchmark_test(multi_vector_distributed) - add_benchmark_test(spmv_distributed) + # the distributed tests are still failing due to unstable output + #add_benchmark_test(multi_vector_distributed) + #add_benchmark_test(spmv_distributed) add_benchmark_test(solver_distributed) endif() \ No newline at end of file From 82d567fb88562f458cb35b1f0e9ddaae3a20aca8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 23 May 2023 10:52:20 +0200 Subject: [PATCH 038/583] move SYCL_DEVICE_FILTER by ONEAPI_DEVICE_SELECTOR --- .gitlab-ci.yml | 10 +++++----- .gitlab/scripts.yml | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 85683fc100c..d899ff00ad0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -615,7 +615,7 @@ build/dpcpp/2022-1/cpu/release/static: BUILD_DPCPP: "ON" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" - SYCL_DEVICE_FILTER: "CPU" + ONEAPI_DEVICE_SELECTOR: "*:cpu" SLURM_PARTITION: "cpu" SLURM_TIME: "2:00:00" # This job is not in exclusive mode @@ -634,7 +634,7 @@ build/dpcpp/igpu/release/shared: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_FILTER: "GPU" + ONEAPI_DEVICE_SELECTOR: "*:gpu" # TODO: Enable when debug shared library size issues are fixed # build/dpcpp/level_zero_igpu/debug/shared: @@ -650,7 +650,7 @@ build/dpcpp/igpu/release/shared: # BUILD_TYPE: "Debug" # BUILD_SHARED_LIBS: "ON" # DPCPP_SINGLE_MODE: "ON" -# SYCL_DEVICE_FILTER: "Level_Zero:GPU" +# ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" # It gives two available backends of GPU on tests build/dpcpp/dgpu/release/static: @@ -666,7 +666,7 @@ build/dpcpp/dgpu/release/static: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OF" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_FILTER: "GPU" + ONEAPI_DEVICE_SELECTOR: "*:gpu" build/dpcpp/level_zero_dgpu/release/shared: extends: @@ -680,7 +680,7 @@ build/dpcpp/level_zero_dgpu/release/shared: BUILD_DPCPP: "ON" BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_FILTER: "Level_Zero:GPU" + ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" # Job with important warnings as error warnings: diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 537f2e5e83e..7b1c30c27c0 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -68,6 +68,7 @@ fi - if [ -n "${SYCL_DEVICE_TYPE}" ]; then export SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" 
]; then export SYCL_DEVICE_FILTER; fi + - if [ -n "${ONEAPI_DEVICE_SELECTOR}" ]; then export ONEAPI_DEVICE_SELECTOR; fi - if [[ "${MPI_AS_ROOT}" == "ON" ]];then export OMPI_ALLOW_RUN_AS_ROOT=1; export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1; From 2241e7ea7e00bfd8a21f4d8860e84cecaffb7ea1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 4 Jun 2023 16:16:20 +0200 Subject: [PATCH 039/583] update benchmark outputs - no more -detailed information in the output - moved the range annotation closer to the hot loop --- .../reference/preconditioner.profile.stderr | 28 +- .../reference/preconditioner.profile.stdout | 12 +- .../test/reference/solver.profile.stderr | 890 ------------------ .../test/reference/solver.profile.stdout | 27 +- .../test/reference/sparse_blas.profile.stderr | 27 +- .../test/reference/sparse_blas.profile.stdout | 7 - benchmark/test/reference/spmv.profile.stderr | 65 -- benchmark/test/reference/spmv.profile.stdout | 1 - 8 files changed, 8 insertions(+), 1049 deletions(-) diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 2bebc03be8d..bd8628be212 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -77,22 +77,6 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: begin free DEBUG: end free -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin generate() -DEBUG: end generate() -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin free -DEBUG: end free DEBUG: end none Current state: [ @@ -102,20 +86,12 @@ Current state: "preconditioner": { "none": { "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0, "repetitions": 1 }, "apply": { - "components": { - "apply()": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0, "repetitions": 1 }, diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout index ba967989af4..cc73c4c4552 100644 --- a/benchmark/test/reference/preconditioner.profile.stdout +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -6,20 +6,12 @@ "preconditioner": { "none": { "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0, "repetitions": 1 }, "apply": { - "components": { - "apply()": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0, "repetitions": 1 }, diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index a601444163d..3d9b9a3ad10 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -79,874 +79,6 @@ DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() -DEBUG: begin generate() -DEBUG: begin generate() -DEBUG: end generate() -DEBUG: end generate() -DEBUG: begin copy() -DEBUG: end copy() -DEBUG: begin apply() -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin 
allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin cg::initialize -DEBUG: end cg::initialize -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin 
dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: 
end check() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end iteration -DEBUG: end apply() -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin apply() -DEBUG: begin iteration -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin cg::initialize -DEBUG: end cg::initialize -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin 
allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: 
end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() 
-DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: end check() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate 
-DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end iteration -DEBUG: end apply() -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate @@ -959,28 +91,6 @@ DEBUG: begin generate() DEBUG: begin generate() DEBUG: end generate() DEBUG: end generate() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout index f66daea1f30..128a8a1f169 100644 --- a/benchmark/test/reference/solver.profile.stdout +++ b/benchmark/test/reference/solver.profile.stdout @@ -14,37 +14,14 @@ "iteration_timestamps": [], "rhs_norm": 1.0, "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0 }, "apply": { - "components": { - "apply()": 1.0, - "iteration": 1.0, - "allocate": 1.0, - "dense::fill": 1.0, - "cg::initialize": 1.0, - "advanced_apply()": 1.0, - "csr::advanced_spmv": 1.0, - "dense::compute_norm2_dispatch": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "dense::compute_conj_dot_dispatch": 1.0, - "check()": 1.0, - "residual_norm::residual_norm": 1.0, - "cg::step_1": 1.0, - "csr::spmv": 1.0, - "cg::step_2": 1.0, - "free": 1.0, - "overhead": 1.0 - }, + "components": {}, "iterations": 7, "time": 1.0 }, - "preconditioner": {}, "residual_norm": 1.0, "repetitions": 1, "completed": true diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index 3dee884861e..66c67cf84ea 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -9,7 +9,6 @@ The operations are transposeRunning test case "stencil": "7pt", "sparse_blas": {} } -DEBUG: begin stencil(100,7pt) Matrix is of size (125, 125), 725 DEBUG: begin allocate DEBUG: end allocate @@ -35,6 +34,7 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin free DEBUG: end free +DEBUG: begin stencil(100,7pt) DEBUG: begin transpose DEBUG: begin allocate DEBUG: end allocate @@ -46,22 +46,6 @@ DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin csr::transpose DEBUG: end csr::transpose -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin csr::transpose -DEBUG: end csr::transpose -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free 
-DEBUG: begin free -DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -80,13 +64,6 @@ Current state: "flops": 1.0, "bandwidth": 1.0, "repetitions": 1, - "components": { - "allocate": 1.0, - "components::fill_array": 1.0, - "csr::transpose": 1.0, - "free": 1.0, - "overhead": 1.0 - }, "completed": true } }, @@ -95,10 +72,10 @@ Current state: "nonzeros": 725 } ] +DEBUG: end stencil(100,7pt) DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout index ba92c30298a..848fb503ed4 100644 --- a/benchmark/test/reference/sparse_blas.profile.stdout +++ b/benchmark/test/reference/sparse_blas.profile.stdout @@ -9,13 +9,6 @@ "flops": 1.0, "bandwidth": 1.0, "repetitions": 1, - "components": { - "allocate": 1.0, - "components::fill_array": 1.0, - "csr::transpose": 1.0, - "free": 1.0, - "overhead": 1.0 - }, "completed": true } }, diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 735e4bf5d23..5a12a077bc5 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -59,24 +59,6 @@ DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: begin apply() -DEBUG: begin coo::spmv -DEBUG: end coo::spmv -DEBUG: end apply() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin coo DEBUG: begin allocate DEBUG: end allocate @@ -96,52 +78,6 @@ DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv DEBUG: end apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::add_scaled -DEBUG: end dense::add_scaled -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_norm2_dispatch -DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin apply() -DEBUG: begin coo::spmv -DEBUG: end coo::spmv -DEBUG: end apply() DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -159,7 +95,6 @@ Current state: "spmv": { "coo": { "storage": 11600, - "max_relative_norm2": 1.0, "time": 1.0, "repetitions": 1, "completed": true diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout index ec7309613b6..dacc490ddf0 100644 --- a/benchmark/test/reference/spmv.profile.stdout +++ b/benchmark/test/reference/spmv.profile.stdout @@ -6,7 +6,6 @@ "spmv": { "coo": { "storage": 11600, - "max_relative_norm2": 1.0, 
"time": 1.0, "repetitions": 1, "completed": true From c5d24893b6a446fe69e670a738d24ea577233a65 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 5 Jun 2023 10:23:43 +0200 Subject: [PATCH 040/583] update distributed benchmark outputs --- .../distributed_solver.profile.stderr | 1150 ----------------- .../distributed_solver.profile.stdout | 29 +- 2 files changed, 2 insertions(+), 1177 deletions(-) diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index e0ddd10ab54..718240f5a38 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -232,1134 +232,6 @@ DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() -DEBUG: begin generate() -DEBUG: begin generate() -DEBUG: end generate() -DEBUG: end generate() -DEBUG: begin copy() -DEBUG: end copy() -DEBUG: begin apply() -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin cg::initialize -DEBUG: end cg::initialize -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end 
dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: 
begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin 
dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: end check() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end iteration -DEBUG: end apply() -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin apply() -DEBUG: begin iteration -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin cg::initialize -DEBUG: end cg::initialize -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end 
csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin 
copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin 
dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end 
dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: begin check() -DEBUG: end check() -DEBUG: end check() -DEBUG: end iteration -DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy 
-DEBUG: end copy -DEBUG: begin cg::step_1 -DEBUG: end cg::step_1 -DEBUG: begin apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin apply() -DEBUG: begin csr::spmv -DEBUG: end csr::spmv -DEBUG: end apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin cg::step_2 -DEBUG: end cg::step_2 -DEBUG: begin apply() -DEBUG: begin copy() -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: end apply() -DEBUG: begin dense::compute_conj_dot_dispatch -DEBUG: end dense::compute_conj_dot_dispatch -DEBUG: begin check() -DEBUG: begin check() -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin residual_norm::residual_norm -DEBUG: end residual_norm::residual_norm -DEBUG: end check() -DEBUG: end check() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin advanced_apply() -DEBUG: begin dense::row_gather -DEBUG: end dense::row_gather -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: begin advanced_apply() -DEBUG: begin csr::advanced_spmv -DEBUG: end csr::advanced_spmv -DEBUG: end advanced_apply() -DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::compute_squared_norm2 -DEBUG: end dense::compute_squared_norm2 -DEBUG: begin dense::compute_sqrt -DEBUG: end dense::compute_sqrt -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end iteration -DEBUG: end apply() -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate @@ -1372,28 +244,6 @@ DEBUG: begin generate() DEBUG: begin generate() DEBUG: end generate() DEBUG: end generate() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin apply() DEBUG: begin iteration DEBUG: begin allocate diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout index a31b88ff582..c61541a5d5b 100644 --- 
a/benchmark/test/reference/distributed_solver.profile.stdout +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -15,39 +15,14 @@ "iteration_timestamps": [], "rhs_norm": 1.0, "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, + "components": {}, "time": 1.0 }, "apply": { - "components": { - "apply()": 1.0, - "iteration": 1.0, - "allocate": 1.0, - "dense::fill": 1.0, - "cg::initialize": 1.0, - "advanced_apply()": 1.0, - "dense::row_gather": 1.0, - "csr::advanced_spmv": 1.0, - "dense::compute_squared_norm2": 1.0, - "dense::compute_sqrt": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "dense::compute_conj_dot_dispatch": 1.0, - "check()": 1.0, - "residual_norm::residual_norm": 1.0, - "cg::step_1": 1.0, - "csr::spmv": 1.0, - "cg::step_2": 1.0, - "free": 1.0, - "overhead": 1.0 - }, + "components": {}, "iterations": 7, "time": 1.0 }, - "preconditioner": {}, "residual_norm": 1.0, "repetitions": 1, "completed": true From edf3c6b6a3b2f4b19480b7ef80c2260accca8a13 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 6 Jun 2023 10:59:22 +0200 Subject: [PATCH 041/583] Replace deprecated SYCL_DEVICE_FILTER --- .github/workflows/intel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index e612c72b7e7..0d8acd52a34 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -37,7 +37,7 @@ jobs: cd build cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON make -j8 - SYCL_DEVICE_FILTER=level_zero ctest -j10 --output-on-failure + ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure - name: install run: | From d194cb01cf73ed341e15aa11c6844a42a0b7727a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 20 Jun 2023 14:30:08 +0200 Subject: [PATCH 042/583] improve documentation and function naming Co-authored-by: Gregor Olenik --- benchmark/test/test_framework.py.in | 86 ++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index fff93548ad6..09986fabdf1 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -12,8 +12,6 @@ binpath = pathlib.Path("@PROJECT_BINARY_DIR@") generate = False if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True - - denumberify_paths = [ "time", "bandwidth", @@ -32,25 +30,35 @@ empty_array_paths = [ ] -def sanitize_json_single(key, value, sanitize_all): +def sanitize_json_key_value(key: str, value, sanitize_all: bool): + """Applies sanitation to a single key-value pair. 
+ + Strings with a key in empty_string_paths will be emptied + Numbers with a key in denumberify_paths will be set to 1.0 + + """ if key in empty_string_paths and isinstance(value, str): return "" if key in denumberify_paths and isinstance(value, float): return 1.0 - if key in denumberify_paths and isinstance(value, typing.Dict): + if key in denumberify_paths and isinstance(value, dict): return sanitize_json(value, True) - if key in empty_array_paths and isinstance(value, typing.List): + if key in empty_array_paths and isinstance(value, list): return [] return sanitize_json(value, sanitize_all) def sanitize_json(parsed_input, sanitize_all=False): - if isinstance(parsed_input, typing.Dict): + """Removes non-deterministic parts of a parsed JSON input. + + If sanitize_all is set to True, all nested float values will be set to 0. + Otherwise, the sanitation""" + if isinstance(parsed_input, dict): return { - key: sanitize_json_single(key, value, sanitize_all) + key: sanitize_json_key_value(key, value, sanitize_all) for key, value in parsed_input.items() } - elif isinstance(parsed_input, typing.List): + elif isinstance(parsed_input, list): return [sanitize_json(e, sanitize_all) for e in parsed_input] elif sanitize_all and isinstance(parsed_input, float): return 1.0 @@ -58,7 +66,15 @@ def sanitize_json(parsed_input, sanitize_all=False): return parsed_input -def sanitize_text(lines): +def sanitize_json_in_text(lines: list[str]) -> list[str]: + """Sanitizes all occurrences of JSON content inside text input. + + Takes a list of text lines and detects any pretty-printed JSON output inside + (recognized by a single [, {, } or ] in an otherwise empty line). + The JSON output will be parsed and sanitized through sanitize_json(...) + and pretty-printed to replace the original JSON input. + The function returns the resulting output""" + json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]] json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]] json_pairs = list(zip(json_begins, json_ends)) @@ -86,12 +102,20 @@ def sanitize_text(lines): def determinize_text( - input, - denumberify_paths=[], - remove_paths=[], - ignore_patterns=[], - replace_patterns=[], -): + input: str, + ignore_patterns: list[str], + replace_patterns: list[(str, str)], +) -> list[str]: + """Sanitizes the given input string. + + Every input line matching an entry from ignore_patterns will be removed. + Every line matching the first string in an entry from replace_patterns + will be replaced by the second string. + Finally, the text will be passed to sanitize_json_in_text, which removes + nondeterministic parts from JSON objects/arrays in the input, + if it can be parsed correctly. + The output is guaranteed to end with an empty line. 
+ """ lines = input.splitlines() output_lines = [] patterns = [re.compile(pattern) for pattern in ignore_patterns] @@ -108,12 +132,18 @@ def determinize_text( if output_lines[-1] != "": output_lines.append("") try: - return sanitize_text(output_lines) + return sanitize_json_in_text(output_lines) except json.decoder.JSONDecodeError: return output_lines -def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_flags=[]): +def compare_output_impl( + args: list[str], + expected_stdout: str, + expected_stderr: str, + stdin: str, + launcher_flags: list[str], +): args = [sys.argv[1]] + args expected_stdout = str(sourcepath / "reference" / expected_stdout) expected_stderr = str(sourcepath / "reference" / expected_stderr) @@ -139,7 +169,9 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl open(expected_stdout, "w").write( "\n".join( determinize_text( - result.stdout.decode(), replace_patterns=typename_patterns + result.stdout.decode(), + ignore_patterns=[], + replace_patterns=typename_patterns, ) ) ) @@ -155,7 +187,7 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl print("GENERATED") return result_stdout_processed = determinize_text( - result.stdout.decode(), replace_patterns=typename_patterns + result.stdout.decode(), ignore_patterns=[], replace_patterns=typename_patterns ) result_stderr_processed = determinize_text( result.stderr.decode(), @@ -163,7 +195,9 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl replace_patterns=typename_patterns, ) expected_stdout_processed = determinize_text( - open(expected_stdout).read(), replace_patterns=typename_patterns + open(expected_stdout).read(), + ignore_patterns=[], + replace_patterns=typename_patterns, ) expected_stderr_processed = determinize_text( open(expected_stderr).read(), @@ -192,6 +226,18 @@ def compare_output(args, expected_stdout, expected_stderr, stdin="", launcher_fl print("PASS") +def compare_output( + args: list[str], expected_stdout: str, expected_stderr: str, stdin: str = "" +): + compare_output_impl( + args, + expected_stdout=expected_stdout, + expected_stderr=expected_stderr, + stdin=stdin, + launcher_flags=[], + ) + + def compare_output_distributed( args, expected_stdout, expected_stderr, num_procs, stdin="" ): From 5047d143622facb66c983ef8bb9356ba73001324 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 21 Jun 2023 12:20:48 +0200 Subject: [PATCH 043/583] update version --- benchmark/test/reference/blas.profile.stderr | 4 ++-- benchmark/test/reference/blas.simple.stderr | 4 ++-- benchmark/test/reference/conversion.all.stderr | 4 ++-- benchmark/test/reference/conversion.profile.stderr | 4 ++-- benchmark/test/reference/conversion.simple.stderr | 4 ++-- benchmark/test/reference/distributed_solver.profile.stderr | 4 ++-- benchmark/test/reference/distributed_solver.simple.stderr | 4 ++-- benchmark/test/reference/matrix_statistics.simple.stderr | 4 ++-- benchmark/test/reference/preconditioner.profile.stderr | 4 ++-- benchmark/test/reference/preconditioner.simple.stderr | 4 ++-- benchmark/test/reference/solver.profile.stderr | 4 ++-- benchmark/test/reference/solver.simple.stderr | 4 ++-- benchmark/test/reference/sparse_blas.profile.stderr | 4 ++-- benchmark/test/reference/sparse_blas.simple.stderr | 4 ++-- benchmark/test/reference/spmv.profile.stderr | 4 ++-- benchmark/test/reference/spmv.simple.stderr | 4 ++-- 16 files changed, 32 insertions(+), 32 deletions(-) diff --git 
a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index 16a86bd4c94..b697ad41392 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index 72a2fbb9b90..02b6f94ba28 100644 --- a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index d6aab6a0331..9ab8a899649 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index e772752ea4a..6733472be8f 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr index f044da61804..d221ead12a4 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 718240f5a38..efd79f66dc5 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr index 6a5dab5d844..9feb7fa9522 100644 --- a/benchmark/test/reference/distributed_solver.simple.stderr +++ 
b/benchmark/test/reference/distributed_solver.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr index 69d2bbf9098..6b853c3f4ea 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stderr +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running test case { "size": 100, diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index bd8628be212..5b47bc9bd94 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr index bfec4a697ee..d480d4fedbd 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 3d9b9a3ad10..65b7560d936 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr index 936046c4949..c5e4267a6bd 100644 --- a/benchmark/test/reference/solver.simple.stderr +++ b/benchmark/test/reference/solver.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index 66c67cf84ea..d05f5117b8e 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm 
iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index e6e0884e267..bf5001f67b7 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 5a12a077bc5..961ac587990 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr index 1bb4472bce6..dc9933b40ec 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -1,5 +1,5 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 From ccabb4d052a966b7f073e24b1c0dba024865d851 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 21 Jun 2023 14:16:06 +0200 Subject: [PATCH 044/583] review updates - add missing newline - remove disable test outputs - fix docstrings - fix duplicate matrix_statistics test Co-authored-by: Yuhsiang M. 
Tsai --- .gitlab-ci.yml | 2 +- benchmark/blas/blas.cpp | 3 +- benchmark/test/matrix_statistics.py | 2 +- benchmark/test/reference/blas.profile.stderr | 3 +- benchmark/test/reference/blas.simple.stderr | 3 +- .../multi_vector_distributed.profile.stderr | 808 ------ .../multi_vector_distributed.profile.stdout | 29 - .../multi_vector_distributed.simple.stderr | 76 - .../multi_vector_distributed.simple.stdout | 29 - .../reference/spmv_distributed.profile.stderr | 2380 ----------------- .../reference/spmv_distributed.profile.stdout | 21 - .../reference/spmv_distributed.simple.stderr | 34 - .../reference/spmv_distributed.simple.stdout | 21 - benchmark/test/test_framework.py.in | 13 +- 14 files changed, 17 insertions(+), 3407 deletions(-) delete mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stderr delete mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stdout delete mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stderr delete mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stdout delete mode 100644 benchmark/test/reference/spmv_distributed.profile.stderr delete mode 100644 benchmark/test/reference/spmv_distributed.profile.stdout delete mode 100644 benchmark/test/reference/spmv_distributed.simple.stderr delete mode 100644 benchmark/test/reference/spmv_distributed.simple.stdout diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d899ff00ad0..eafeae20729 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -615,7 +615,7 @@ build/dpcpp/2022-1/cpu/release/static: BUILD_DPCPP: "ON" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" - ONEAPI_DEVICE_SELECTOR: "*:cpu" + SYCL_DEVICE_TYPE: "CPU" SLURM_PARTITION: "cpu" SLURM_TIME: "2:00:00" # This job is not in exclusive mode diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index ee2dc06d01b..11228ed5818 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -133,7 +133,8 @@ Parameters for a benchmark case are: std::string format = example_config; initialize_argument_parsing(&argc, &argv, header, format); - std::string extra_information = "The operations are " + FLAGS_operations; + std::string extra_information = + "The operations are " + FLAGS_operations + "\n"; print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py index 365cfe025dd..a29c80a0a7a 100755 --- a/benchmark/test/matrix_statistics.py +++ b/benchmark/test/matrix_statistics.py @@ -19,7 +19,7 @@ # input file test_framework.compare_output( - ["-input", '[{"size": 100, "stencil": "7pt"}]'], + ["-input", str(test_framework.sourcepath / "input.mtx.json")], expected_stdout="matrix_statistics.simple.stdout", expected_stderr="matrix_statistics.simple.stderr", ) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index b697ad41392..abc496b0921 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -3,7 +3,8 @@ This is Ginkgo 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 -The operations are copy,axpy,scalRunning test case +The operations are copy,axpy,scal +Running test case { "n": 100, "blas": {} diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index 02b6f94ba28..9508b0dcf1e 100644 --- 
a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -3,7 +3,8 @@ This is Ginkgo 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 -The operations are copy,axpy,scalRunning test case +The operations are copy,axpy,scal +Running test case { "n": 100, "blas": {} diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr deleted file mode 100644 index 3cf18472311..00000000000 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ /dev/null @@ -1,808 +0,0 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) -Running on reference(0) -Running with 0 warm iterations and 1 running iterations -The random seed for right hand sides is 42 -The operations are copy,axpy,scalRunning test case -{ - "n": 100, - "blasDEBUG: begin n = 100 -DEBUG: begin copy -": {} -} -DEBUG: begin n = 100 -DEBUG: begin copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_sizeDEBUG: begin n = 100 -DEBUG: begin copy -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copyDEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copyDEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: 
begin copy -DEBUG: end copy -DEBUG: end copy -DEBUG: begin free - -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: begin free -DEBUG: end freefree -DEBUG: end free -DEBUG: begin free -DEBUG: end free -free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_arrayDEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin 
copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin freecopy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end free - -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: begin dense::fill -DEBUG: end dense::fill -free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: begin free -DEBUG: end DEBUG: begin free -DEBUG: end free -free -DEBUG: begin free -DEBUG: end DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end copy -Current state: -[ - free -DEBUG: end copy -DEBUG: begin axpy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end copy -DEBUG: begin axpy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate { - "n": DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -allocate -DEBUG: begin allocate -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -100, - "blas": { - allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin "copy": { - -DEBUG: begin allocate -components::fill_array -DEBUG: end components::fill_array -DEBUG: begin DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy - "time": 0.0000components::fill_array -DEBUG: end components::fill_arrayDEBUG: end copy -DEBUG: begin free -08, - -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free - "flops": 12DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin 500000.0, - partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin freefree -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size - -DEBUG: end free -DEBUG: begin copy -DEBUG: end DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin 
allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin "bandwidth": 200000000.0, - copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array "repetitiDEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free - -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguousons": 1, - DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size - -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate - "completed": tDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -rue - DEBUG: end allocate -DEBUG: begin allocateDEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin } - -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_arrayfree -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin } - } -] -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin -DEBUG: begin axpy -DEBUG: begin allocatecomponents::fill_array -DEBUG: end components::fill_array -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill - -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copypartition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end -DEBUG: end copyDEBUG: begin dense::add_scaled -DEBUG: end dense::add_scaled -allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin -DEBUG: begin allocate -DEBUG: end components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_arrayallocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -free -DEBUG: end DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -partition::build_starting_indices -DEBUG: begin copy -DEBUG: end DEBUG: begin dense::fill -DEBUG: end dense::fill -copy -DEBUG: begin free -DEBUG: end DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: 
begin dense::fill -DEBUG: end dense::fill -free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: begin dense::add_scaled -DEBUG: end dense::add_scaled -free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::add_scaled -DEBUG: end dense::add_scaled -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: begin free -DEBUG: end freeDEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free - -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end free -DEBUG: end axpy -DEBUG: begin scal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: end free -DEBUG: end axpy -DEBUG: begin scal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end free -DEBUG: end axpy -allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_sizeallocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_sizeCurrent state: -[ - { - -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end "n"allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocateallocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array: 10 -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -0, - components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array 
-DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy "blas": { - DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: begin free -DEBUG: end free - "copy": { -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin freefree -DEBUG: begin free - -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end - DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::scale -DEBUG: end dense::scalefree -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill - "ti -DEBUG: begin dense::scale -DEBUG: end dense::scale -me": 0.000008, - "flops": 12500000.0, - "bandwidth": 200000000.0, - "repetitions": 1, - "completed": true - }, - "axpy": { - "time": 0.00002, - "flops": 10000000.0, - "bandwidth": 119999999.99999999, - "repetitions": 1, - "completed": true - } - } - } -] -DEBUG: begin scal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::scale -DEBUG: end dense::scale -DEBUG: begin free -DEBUG: end freeDEBUG: begin free -DEBUG: end free -DEBUG: begin freeDEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end scal -DEBUG: end n = 100 - -DEBUG: begin free -DEBUG: end free -DEBUG: end scal -Current state: -[ - { - "n": 100, - -DEBUG: end free -DEBUG: end scal -DEBUG: end n = 100 - "blas": { - "copy": { - "time": 0.000008, - "flops": 12500000.0, - "bandwidth": 200000000.0, - "repetitions": 1, - "completed": true - }, - "axpy": { - "time": 0.00002, - "flops": 10000000.0, - "bandwidth": 119999999.99999999, - "repetitions": 1, - "completed": true - }, - "scal": { - "time": 0.000006, - "flops": 16666666.666666666, - "bandwidth": 266666666.66666666, - "repetitions": 1, - "completed": true - } - } - } -] -DEBUG: end n = 100 diff --git 
a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout deleted file mode 100644 index 3a2e7e54f80..00000000000 --- a/benchmark/test/reference/multi_vector_distributed.profile.stdout +++ /dev/null @@ -1,29 +0,0 @@ - -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - }, - "scal": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr deleted file mode 100644 index 72a2fbb9b90..00000000000 --- a/benchmark/test/reference/multi_vector_distributed.simple.stderr +++ /dev/null @@ -1,76 +0,0 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) -Running on reference(0) -Running with 2 warm iterations and 10 running iterations -The random seed for right hand sides is 42 -The operations are copy,axpy,scalRunning test case -{ - "n": 100, - "blas": {} -} -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "scal": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout deleted file mode 100644 index 08e692727fe..00000000000 --- a/benchmark/test/reference/multi_vector_distributed.simple.stdout +++ /dev/null @@ -1,29 +0,0 @@ - -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "scal": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr deleted file mode 100644 index b190ac8a458..00000000000 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ /dev/null @@ -1,2380 +0,0 @@ -This is Ginkgo 1.6.0 (develop) - running with core module 1.6.0 (develop) -Running on reference(0) -Running with 0 warm iterations and 1 running iterations -The random seed for right hand sides is 42 -The formats are [csr]x[csr] -The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "comm_pattern": "stencil", - "spmv": {} -} -DEBUG: begin stencil(100,7pt,stencil)DEBUG: begin stencil(100,7pt,stencil) - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin 
partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_sizeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin stencil(100,7pt,stencil) - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocateDEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array - -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indicesDEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array - -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocatepartition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices - -DEBUG: begin allocate -DEBUG: end DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -allocate -DEBUG: begin components::aos_to_soa -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end freeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: end components::aos_to_soa - -DEBUG: begin copy -DEBUG: end copycomponents::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end components::aos_to_soa -dense::fill -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: begin dense::fill -DEBUG: begin free 
-DEBUG: end free -DEBUG: begin free -DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end freeDEBUG: end allocate -DEBUG: end dense::fill -DEBUG: begin dense::fill_in_matrix_data - -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end dense::fill_in_matrix_data -DEBUG: end free -DEBUG: begin free -DEBUG: begin free -DEBUG: end free -DEBUG: begin freeDEBUG: begin free -DEBUG: end free -DEBUG: end free -DEBUG: begin free -DEBUG: end free - -DEBUG: end free -DEBUG: begin free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: end free -free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end allocate -DEBUG: end allocate -partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocateDEBUG: begin DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin -DEBUG: end allocate -DEBUG: begin allocatefree -DEBUG: end free -allocate -DEBUG: end allocate -DEBUG: begin allocate - -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_arrayDEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end partition::build_ranges_from_global_size - -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices - -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin 
components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fillDEBUG: end dense::fill -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -allocate -DEBUG: end allocate -DEBUG: begin DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end freeallocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill -free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -free -Matrix is of size (81, 81) -DEBUG: begin dense::fill_in_matrix_data -DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: begin copy() -DEBUG: begin copy() -free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end freeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin free -DEBUG: end free -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() -DEBUG: begin allocateDEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: begin free -DEBUG: end free - -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_sizeDEBUG: begin copy() -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copyDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end dense::copy -allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_arrayDEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin partition::build_ranges_from_global_size -components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array 
-DEBUG: begin components::fill_array - -DEBUG: end components::fill_array -DEBUG: begin components::fill_arrayDEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocateDEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array - -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocatepartition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices - -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin free -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin free -DEBUG: begin copy() -DEBUG: begin copycomponents::fill_array -DEBUG: end components::fill_array -DEBUG: begin partition::build_from_contiguous -DEBUG: end -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end free -partition::build_from_contiguous -DEBUG: begin partition::build_starting_indices -DEBUG: end DEBUG: begin copy -DEBUG: end copy -partition::build_starting_indices -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin freeDEBUG: end copy() -DEBUG: begin copy()DEBUG: begin copy() -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end free - -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy - -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy() -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: end copy()DEBUG: end copy()DEBUG: begin copy -DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin copy() -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy() -DEBUG: begin copy() -allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: begin copy -DEBUG: end copyDEBUG: end allocate -DEBUG: begin components::aos_to_soaDEBUG: end copy -DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::fill -DEBUG: end dense::fill -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy - -DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: begin dense::fill -DEBUG: end dense::fill - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocateDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end DEBUG: end components::aos_to_soa -allocate -DEBUG: begin free -DEBUG: begin allocate -DEBUG: end allocate - 
-DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin distributed_matrix::build_local_nonlocal - -DEBUG: begin components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -allocate -DEBUG: begin free -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin freefree -DEBUG: begin allocate -DEBUG: end free -DEBUG: begin allocateDEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate - -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocateDEBUG: end allocate -DEBUG: begin free -DEBUG: end allocate -DEBUG: begin free -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free - -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end -DEBUG: end allocate -DEBUG: begin freeDEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: end free - -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin free -DEBUG: end free -DEBUG: begin DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin freeallocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: begin free -DEBUG: end DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free - -DEBUG: end allocate -DEBUG: begin freefree -DEBUG: begin allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free - -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end allocate -DEBUG: begin free -DEBUG: end freeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate - -DEBUG: begin allocate -DEBUG: end allocateDEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin freeDEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate - -DEBUG: end free -DEBUG: begin 
[... several hundred deleted lines of interleaved "DEBUG: begin/end allocate/free/copy/apply()" profiler-hook output are elided here: the three MPI ranks of the distributed SpMV profile run write to the same stream concurrently, so the deleted reference output cannot be reconstructed line by line ...]
-Current state:
-[
-    {
-        "size": 81,
-        "stencil": "7pt",
-        "comm_pattern": "stencil",
-        "spmv": {
-            "csr-csr": {
-                "storage": 6420,
-                "max_relative_norm2": 0.0,
-                "time": 0.000046,
-                "repetitions": 1,
-                "completed": true
-            }
-        },
-        "nnz": 144,
-        "optimal": {}
-    }
-]
-DEBUG: begin free
-DEBUG: end free
-DEBUG: begin free
-DEBUG: end free
-DEBUG: begin free
-DEBUG: end free
-DEBUG: end stencil(100,7pt,stencil)
diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout
deleted file mode 100644
index 5512866fdf0..00000000000
--- a/benchmark/test/reference/spmv_distributed.profile.stdout
+++ /dev/null
@@ -1,21 +0,0 @@
-
-[
-    {
-        "size": 81,
-        "stencil": "7pt",
-        "comm_pattern": "stencil",
-        "spmv": {
-            "csr-csr": {
-                "storage": 6420,
-                "max_relative_norm2": 1.0,
-                "time": 1.0,
-                "repetitions": 1,
-                "completed": true
-            }
-        },
-        "nnz": 144,
-        "optimal": {
-            "spmv": "csr-csr"
-        }
-    }
-]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr
deleted file mode 100644
index 7fa9aeb581f..00000000000
--- a/benchmark/test/reference/spmv_distributed.simple.stderr
+++ /dev/null
@@ -1,34 +0,0 @@
-This is Ginkgo 1.6.0 (develop)
-    running with core module 1.6.0 (develop)
-Running on reference(0)
-Running with 2 warm iterations and 10 running iterations
-The random seed for right hand sides is 42
-The formats are [csr]x[csr]
-The number of right hand sides is 1
-Running test case
-{
-    "size": 100,
-    "stencil": "7pt",
-    "comm_pattern": "stencil",
-    "spmv": {}
-}
-Matrix is of size (81, 81)
-Current state:
-[
-    {
-        "size": 81,
-        "stencil": "7pt",
-        "comm_pattern": "stencil",
-        "spmv": {
-            "csr-csr": {
-                "storage": 6420,
-                "max_relative_norm2": 1.0,
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "nnz": 144,
-        "optimal": {}
-    }
-]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout
deleted file mode 100644
index 7b6e0883c14..00000000000
--- a/benchmark/test/reference/spmv_distributed.simple.stdout
+++ /dev/null
@@ -1,21 +0,0 @@
-
-[
-    {
-        "size": 81,
-        "stencil": "7pt",
-        "comm_pattern": "stencil",
-        "spmv": {
-            "csr-csr": {
-                "storage": 6420,
-                "max_relative_norm2": 1.0,
-                "time": 1.0,
-                "repetitions": 10,
-                "completed": true
-            }
-        },
-        "nnz": 144,
-        "optimal": {
-            "spmv": "csr-csr"
-        }
-    }
-]
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 09986fabdf1..16a30c35410 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -35,8 +35,8 @@ def sanitize_json_key_value(key: str, value, sanitize_all: bool):
 
     Strings with a key in empty_string_paths will be emptied
     Numbers with a key in denumberify_paths will be set to 1.0
-
     """
+
     if key in empty_string_paths and isinstance(value, str):
         return ""
     if key in denumberify_paths and isinstance(value, float):
@@ -48,11 +48,14 @@ def sanitize_json_key_value(key: str, value, sanitize_all: bool):
     return sanitize_json(value, sanitize_all)
 
 
-def sanitize_json(parsed_input, sanitize_all=False):
+def sanitize_json(parsed_input, sanitize_all: bool = False):
     """Removes non-deterministic parts of a parsed JSON input.
 
     If sanitize_all is set to True, all nested float values will be set to 0.
-    Otherwise, the sanitation"""
+    Otherwise, only JSON object entries will be sanitized
+    using sanitize_json_key_value.
+    """
+
     if isinstance(parsed_input, dict):
         return {
             key: sanitize_json_key_value(key, value, sanitize_all)
@@ -73,7 +76,8 @@ def sanitize_json_in_text(lines: list[str]) -> list[str]:
     (recognized by a single [, {, } or ] in an otherwise empty line).
     The JSON output will be parsed and sanitized through sanitize_json(...)
     and pretty-printed to replace the original JSON input.
-    The function returns the resulting output"""
+    The function returns the resulting output.
+    """
     json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]]
     json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]]
@@ -116,6 +120,7 @@ def determinize_text(
     if it can be parsed correctly.
     The output is guaranteed to end with an empty line.
     """
+
     lines = input.splitlines()
     output_lines = []
     patterns = [re.compile(pattern) for pattern in ignore_patterns]

From 1b648448905ff513b99e7bef8893f683e08247ea Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 19 Jul 2023 15:21:34 +0200
Subject: [PATCH 045/583] support older python versions

---
 benchmark/test/test_framework.py.in | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 16a30c35410..912d2f0d203 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -2,7 +2,7 @@
 import subprocess
 import difflib
 import json
-import typing
+from typing import List, Tuple
 import re
 import pathlib
 import sys
@@ -69,7 +69,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False):
     return parsed_input
 
 
-def sanitize_json_in_text(lines: list[str]) -> list[str]:
+def sanitize_json_in_text(lines: List[str]) -> List[str]:
     """Sanitizes all occurrences of JSON content inside text input.
 
     Takes a list of text lines and detects any pretty-printed JSON output inside
@@ -99,7 +99,8 @@ def sanitize_json_in_text(lines: list[str]) -> list[str]:
         for begin, end, do_sanitize in combined_pairs
     ]
     reconstructed = [
-        json.dumps(sanitize_json(json.loads(t)), indent=4) if do_sanitize else t
+        json.dumps(sanitize_json(json.loads(t)),
+                   indent=4) if do_sanitize else t
         for t, do_sanitize in texts
     ]
     return "\n".join(reconstructed).split("\n")
@@ -107,9 +108,9 @@ def sanitize_json_in_text(lines: list[str]) -> list[str]:
 
 def determinize_text(
     input: str,
-    ignore_patterns: list[str],
-    replace_patterns: list[(str, str)],
-) -> list[str]:
+    ignore_patterns: List[str],
+    replace_patterns: List[Tuple[str, str]],
+) -> List[str]:
     """Sanitizes the given input string.
 
     Every input line matching an entry from ignore_patterns will be removed.
@@ -143,11 +144,11 @@ def determinize_text(
 
 
 def compare_output_impl(
-    args: list[str],
+    args: List[str],
     expected_stdout: str,
     expected_stderr: str,
     stdin: str,
-    launcher_flags: list[str],
+    launcher_flags: List[str],
 ):
     args = [sys.argv[1]] + args
     expected_stdout = str(sourcepath / "reference" / expected_stdout)
@@ -214,7 +215,8 @@ def compare_output_impl(
         print("FAIL: stdout differs")
         print(
             "\n".join(
-                difflib.unified_diff(expected_stdout_processed, result_stdout_processed)
+                difflib.unified_diff(
+                    expected_stdout_processed, result_stdout_processed)
             )
         )
         failed = True
@@ -222,7 +224,8 @@ def compare_output_impl(
         print("FAIL: stderr differs")
         print(
             "\n".join(
-                difflib.unified_diff(expected_stderr_processed, result_stderr_processed)
+                difflib.unified_diff(
+                    expected_stderr_processed, result_stderr_processed)
             )
         )
         failed = True
@@ -232,7 +235,7 @@ def compare_output_impl(
 
 
 def compare_output(
-    args: list[str], expected_stdout: str, expected_stderr: str, stdin: str = ""
+    args: List[str], expected_stdout: str, expected_stderr: str, stdin: str = ""
 ):
     compare_output_impl(
         args,

From 4fa0a5a6554f10b88cb277295443ffa99c782bbc Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 19 Jul 2023 15:21:43 +0200
Subject: [PATCH 046/583] fix typing error

---
 benchmark/test/test_framework.py.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
index 912d2f0d203..6037f8c594e 100644
--- a/benchmark/test/test_framework.py.in
+++ b/benchmark/test/test_framework.py.in
@@ -129,8 +129,8 @@ def determinize_text(
         for pattern, replacement in replace_patterns:
             line = re.sub(pattern, replacement, line)
         keep = True
-        for pattern in patterns:
-            if re.match(pattern, line):
+        for compiled_pattern in patterns:
+            if re.match(compiled_pattern, line):
                 keep = False
                 break
         if keep:

From 2de207dbb10ea3d877485d111114545777eee020 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 19 Jul 2023 15:21:57 +0200
Subject: [PATCH 047/583] remove unused tests

---
 benchmark/test/CMakeLists.txt | 5 +--
 benchmark/test/input.distributed_mtx.json | 7 ----
 benchmark/test/multi_vector_distributed.py | 37 ----------------------
 benchmark/test/spmv_distributed.py | 35 --------------------
 4 files changed, 1 insertion(+), 83 deletions(-)
 delete mode 100644 benchmark/test/input.distributed_mtx.json
 delete mode 100644 benchmark/test/multi_vector_distributed.py
 delete mode 100644 benchmark/test/spmv_distributed.py

diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt
index 0a2be0e662c..e1aab6dd75d 100644
--- a/benchmark/test/CMakeLists.txt
+++ b/benchmark/test/CMakeLists.txt
@@ -22,8 +22,5 @@ add_benchmark_test(solver)
 add_benchmark_test(sparse_blas)
 add_benchmark_test(spmv)
 if (GINKGO_BUILD_MPI)
-    # the distributed tests are still failing due to unstable output
-    #add_benchmark_test(multi_vector_distributed)
-    #add_benchmark_test(spmv_distributed)
     add_benchmark_test(solver_distributed)
-endif()
\ No newline at end of file
+endif()
diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json
deleted file mode 100644
index aca115179e6..00000000000
--- a/benchmark/test/input.distributed_mtx.json
+++ /dev/null
@@ -1,7 +0,0 @@
-[
-    {
-        "size": 100,
-        "stencil": "7pt",
-        "comm_pattern": "stencil"
-    }
-]
\ No newline at end of file
diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py
deleted file mode 100644
index aab886ca509..00000000000
--- a/benchmark/test/multi_vector_distributed.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python3
-import test_framework
-
-# check that all input modes work:
-# parameter
-test_framework.compare_output_distributed(
-    ["-input", '[{"n": 100}]'],
-    expected_stdout="multi_vector_distributed.simple.stdout",
-    expected_stderr="multi_vector_distributed.simple.stderr",
-    num_procs=3,
-)
-
-# stdin
-test_framework.compare_output_distributed(
-    [],
-    expected_stdout="multi_vector_distributed.simple.stdout",
-    expected_stderr="multi_vector_distributed.simple.stderr",
-    stdin='[{"n": 100}]',
-    num_procs=3,
-)
-
-# file
-test_framework.compare_output_distributed(
-    ["-input", str(test_framework.sourcepath / "input.blas.json")],
-    expected_stdout="multi_vector_distributed.simple.stdout",
-    expected_stderr="multi_vector_distributed.simple.stderr",
-    stdin='[{"n": 100}]',
-    num_procs=3,
-)
-
-# profiler annotations
-# currently still unstable output and thus disabled
-# test_framework.compare_output_distributed(["-input", '[{"n": 100}]', '-profile', '-profiler_hook', 'debug'],
-#                                           expected_stdout="multi_vector_distributed.profile.stdout",
-#                                           expected_stderr="multi_vector_distributed.profile.stderr",
-#                                           stdin='[{"n": 100}]',
-#                                           num_procs=3)
diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py
deleted file mode 100644
index 1b219b34cda..00000000000
--- a/benchmark/test/spmv_distributed.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-import test_framework
-
-# check that all input modes work:
-# parameter
-test_framework.compare_output_distributed(
-    ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'],
-    expected_stdout="spmv_distributed.simple.stdout",
-    expected_stderr="spmv_distributed.simple.stderr",
-    num_procs=3,
-)
-
-# stdin
-test_framework.compare_output_distributed(
-    [],
-    expected_stdout="spmv_distributed.simple.stdout",
-    expected_stderr="spmv_distributed.simple.stderr",
-    num_procs=3,
-    stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]',
-)
-
-# input file
-test_framework.compare_output_distributed(
-    ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")],
-    expected_stdout="spmv_distributed.simple.stdout",
-    expected_stderr="spmv_distributed.simple.stderr",
-    num_procs=3,
-)
-
-# profiler annotations
-# currently still unstable output and thus disabled
-# test_framework.compare_output_distributed(["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', '-profile', '-profiler_hook', 'debug'],
-#                                           expected_stdout="spmv_distributed.profile.stdout",
-#                                           expected_stderr="spmv_distributed.profile.stderr",
-#                                           num_procs=3)

From 138f44c15404ac02a1dc6a73255f56a6855d526b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 19 Jul 2023 15:48:29 +0200
Subject: [PATCH 048/583] fix device memory access segfault

---
 test/mpi/preconditioner/schwarz.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp
index 95bfe3f66b4..8586711a114 100644
--- a/test/mpi/preconditioner/schwarz.cpp
+++ b/test/mpi/preconditioner/schwarz.cpp
@@ -162,13 +162,14 @@ class SchwarzPreconditioner : public CommonMpiTestFixture {
         std::shared_ptr dist_vec,
         std::shared_ptr local_vec)
     {
+        auto host_row_part = row_part->clone(ref);
         auto l_dist_vec = dist_vec->get_local_vector();
         auto vec_view = local_vec_type::create_const(
             exec, l_dist_vec->get_size(),
             gko::array::const_view(
                 exec, l_dist_vec->get_size()[0],
                 local_vec->get_const_values() +
-                    row_part->get_range_bounds()[comm.rank()]),
+                    host_row_part->get_range_bounds()[comm.rank()]),
             l_dist_vec->get_size()[1]);
         GKO_ASSERT_MTX_NEAR(l_dist_vec, vec_view.get(), r::value);
     }

From 684df2076d97d298d74771bd1207cefac8351d6f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 19 Jul 2023 19:38:32 +0200
Subject: [PATCH 049/583] remove deprecated SYCL environment variables

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index eafeae20729..4ad66eca652 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -615,7 +615,7 @@ build/dpcpp/2022-1/cpu/release/static:
     BUILD_DPCPP: "ON"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "ON"
-    SYCL_DEVICE_TYPE: "CPU"
+    SYCL_DEVICE_FILTER: "*:cpu"
     SLURM_PARTITION: "cpu"
     SLURM_TIME: "2:00:00"
     # This job is not in exclusive mode

From 9b614a3c3e5d7945cfcf551aba493acd2151935e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 30 Mar 2023 12:20:20 +0200
Subject: [PATCH 050/583] add allocator support to all executors

---
 benchmark/utils/general.hpp | 4 +-
 core/CMakeLists.txt | 1 +
 core/base/memory.cpp | 59 ++++
 core/device_hooks/cuda_hooks.cpp | 44 +++
 core/device_hooks/dpcpp_hooks.cpp | 18 ++
 core/device_hooks/hip_hooks.cpp | 19 ++
 core/test/base/executor.cpp | 79 +-----
 cuda/CMakeLists.txt | 4 +
 cuda/base/device.cpp | 65 +++++
 cuda/base/executor.cpp | 173 ++----------
 cuda/base/memory.cpp | 168 +++++++++++
 cuda/base/nvtx.cpp | 96 +++++++
 .../stream.cpp} | 53 ++--
 cuda/test/base/CMakeLists.txt | 2 +-
 cuda/test/base/cuda_executor.cu | 15 +-
 cuda/test/base/memory.cpp | 126 +++++++++
 cuda/test/utils.hpp | 4 +-
 devices/cuda/executor.cpp | 27 --
 devices/hip/executor.cpp | 34 ---
 devices/omp/executor.cpp | 7 +-
 dpcpp/base/executor.dp.cpp | 33 +++
 .../base/memory.dp.cpp | 47 ++--
 dpcpp/test/base/CMakeLists.txt | 1 +
 dpcpp/test/base/memory.dp.cpp | 98 +++++++
 .../adaptiveprecision-blockjacobi.cpp | 7 +-
 examples/cb-gmres/cb-gmres.cpp | 7 +-
 examples/custom-logger/custom-logger.cpp | 7 +-
 .../custom-matrix-format.cpp | 7 +-
 .../custom-stopping-criterion.cpp | 7 +-
 .../ilu-preconditioned-solver.cpp | 7 +-
 .../inverse-iteration/inverse-iteration.cpp | 7 +-
 .../ir-ilu-preconditioned-solver.cpp | 7 +-
 .../iterative-refinement.cpp | 7 +-
 .../minimal-cuda-solver.cpp | 2 +-
 .../mixed-multigrid-preconditioned-solver.cpp | 7 +-
 .../mixed-multigrid-solver.cpp | 7 +-
 .../mixed-precision-ir/mixed-precision-ir.cpp | 7 +-
 examples/mixed-spmv/mixed-spmv.cpp | 7 +-
 ...igrid-preconditioned-solver-customized.cpp | 7 +-
 .../multigrid-preconditioned-solver.cpp | 7 +-
 .../nine-pt-stencil-solver.cpp | 7 +-
 examples/papi-logging/papi-logging.cpp | 7 +-
 .../performance-debugging.cpp | 7 +-
 examples/poisson-solver/poisson-solver.cpp | 7 +-
 .../preconditioned-solver.cpp | 7 +-
 .../simple-solver-logging.cpp | 7 +-
 examples/simple-solver/simple-solver.cpp | 7 +-
 .../three-pt-stencil-solver.cpp | 7 +-
 hip/CMakeLists.txt | 4 +
 hip/base/device.hip.cpp | 67 +++++
 hip/base/executor.hip.cpp | 137 +--------
 hip/base/memory.hip.cpp | 97 +++++++
 hip/base/roctx.hip.cpp | 70 +++++
 hip/base/stream.hip.cpp | 78 +++++
 hip/test/base/CMakeLists.txt | 1 -
 hip/test/base/hip_executor.hip.cpp | 14 +-
 hip/test/utils.hip.hpp | 4 +-
 include/ginkgo/core/base/executor.hpp | 266 +++++-------------
 include/ginkgo/core/base/fwd_defs.hpp | 90 ++++++
 include/ginkgo/core/base/memory.hpp | 211 ++++++++++++++
 include/ginkgo/core/base/stream.hpp | 124 ++++++++
 include/ginkgo/ginkgo.hpp | 3 +
 test/utils/executor.hpp | 19 +-
 test/utils/mpi/executor.hpp | 5 +-
 64 files changed, 1755 insertions(+), 775 deletions(-)
 create mode 100644 core/base/memory.cpp
 create mode 100644 cuda/base/device.cpp
 create mode 100644 cuda/base/memory.cpp
 create mode 100644 cuda/base/nvtx.cpp
 rename cuda/{test/base/cuda_executor_reset.cpp => base/stream.cpp} (62%)
 create mode 100644 cuda/test/base/memory.cpp
 rename hip/test/base/hip_executor_reset.cpp => dpcpp/base/memory.dp.cpp (63%)
 create mode 100644 dpcpp/test/base/memory.dp.cpp
 create mode 100644 hip/base/device.hip.cpp
 create mode 100644 hip/base/memory.hip.cpp
 create mode 100644 hip/base/roctx.hip.cpp
 create mode 100644 hip/base/stream.hip.cpp
 create mode 100644 include/ginkgo/core/base/fwd_defs.hpp
 create mode 100644 include/ginkgo/core/base/memory.hpp
 create mode 100644 include/ginkgo/core/base/stream.hpp

diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 92c3e5c9b13..35077f66d4b 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -337,12 +337,12 @@ const std::map(bool)>>
     {"cuda",
      [](bool) {
          return gko::CudaExecutor::create(FLAGS_device_id,
-                                          gko::OmpExecutor::create(), true);
+                                          gko::OmpExecutor::create());
      }},
     {"hip",
      [](bool) {
          return gko::HipExecutor::create(FLAGS_device_id,
-                                         gko::OmpExecutor::create(), true);
+                                         gko::OmpExecutor::create());
      }},
     {"dpcpp", [](bool use_gpu_timer) {
         auto property = dpcpp_queue_property::in_order;
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 2f9643115c9..49cf89b66d6 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(ginkgo
     base/device_matrix_data.cpp
     base/executor.cpp
     base/index_set.cpp
+    base/memory.cpp
     base/mpi.cpp
     base/mtx_io.cpp
     base/perturbation.cpp
diff --git a/core/base/memory.cpp b/core/base/memory.cpp
new file mode 100644
index 00000000000..88d97bcc765
--- /dev/null
+++ b/core/base/memory.cpp
@@ -0,0 +1,59 @@
+/*************************************************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include
+
+
+#include
+
+
+#include
+
+
+namespace gko {
+
+
+void* CpuAllocator::allocate(size_type num_bytes) const
+{
+    auto ptr = ::operator new (num_bytes, std::nothrow_t{});
+    GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes);
+    return ptr;
+}
+
+
+void CpuAllocator::deallocate(void* ptr) const
+{
+    ::operator delete (ptr, std::nothrow_t{});
+}
+
+
+}  // namespace gko
\ No newline at end of file
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index dd4c3f19f7c..cdecf735a9d 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -35,6 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -52,6 +54,45 @@ version version_info::get_cuda_version() noexcept
 }
 
 
+void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda);
+
+
+void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaUnifiedAllocator::deallocate(void* dev_ptr) const
+    GKO_NOT_COMPILED(cuda);
+
+
+CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda);
+
+
+void* CudaHostAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
 std::shared_ptr CudaExecutor::create(
     int device_id, std::shared_ptr master, bool device_reset,
     allocation_mode alloc_mode, CUstream_st* stream)
@@ -154,6 +195,9 @@ scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec,
     GKO_NOT_COMPILED(cuda);
 
 
+cuda_stream::cuda_stream() GKO_NOT_COMPILED(cuda);
+
+
 cuda_stream::cuda_stream(int device_id) GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index a08f1f608fb..0ee3e6f289f 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -52,6 +53,23 @@ version version_info::get_dpcpp_version() noexcept
 }
 
 
+void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
+                                           size_type size) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
 std::shared_ptr DpcppExecutor::create(
     int device_id, std::shared_ptr master,
     std::string device_type, dpcpp_queue_property property)
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 50637f7b3f0..739dac39f08 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -53,6 +53,22 @@ version version_info::get_hip_version() noexcept
 }
 
 
+void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
 std::shared_ptr HipExecutor::create(
     int device_id, std::shared_ptr master, bool device_reset,
     allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
@@ -155,6 +171,9 @@ scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec,
     GKO_NOT_COMPILED(hip);
 
 
+hip_stream::hip_stream() GKO_NOT_COMPILED(hip);
+
+
 hip_stream::hip_stream(int device_id) GKO_NOT_COMPILED(hip);
 
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 0d64dfcf3cf..71064cf01d2 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include
 #include
+#include "ginkgo/core/base/memory.hpp"
 
 
 #if defined(__unix__) || defined(__APPLE__)
@@ -263,35 +264,6 @@ TEST(CudaExecutor, KnowsItsDeviceId)
 }
 
 
-TEST(CudaExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    ASSERT_EQ(false, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    cuda->set_device_reset(true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
 TEST(HipExecutor, KnowsItsMaster)
 {
     auto omp = gko::OmpExecutor::create();
@@ -310,35 +282,6 @@ TEST(HipExecutor, KnowsItsDeviceId)
 }
 
 
-TEST(HipExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    ASSERT_EQ(false, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    hip->set_device_reset(true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
 TEST(DpcppExecutor, KnowsItsMaster)
 {
     auto omp = gko::OmpExecutor::create();
@@ -442,20 +385,11 @@ TEST(Executor, CanVerifyMemory)
 }
 
 
-template
-struct mock_free : T {
-    /**
-     * @internal Due to a bug with gcc 5.3, the constructor needs to be called
-     * with `()` operator instead of `{}`.
-     */
-    template
-    mock_free(Params&&... params) : T(std::forward(params)...)
-    {}
-
-    void raw_free(void* ptr) const noexcept override
+struct MockAllocator : gko::CpuAllocator {
+    void deallocate(void* ptr) const noexcept override
     {
         called_free = true;
-        T::raw_free(ptr);
+        CpuAllocator::deallocate(ptr);
     }
 
     mutable bool called_free{false};
@@ -464,12 +398,13 @@ struct mock_free : T {
 
 TEST(ExecutorDeleter, DeletesObject)
 {
-    auto ref = std::make_shared>();
+    auto alloc = std::make_shared();
+    auto ref = gko::ReferenceExecutor::create(alloc);
     auto x = ref->alloc(5);
 
     gko::executor_deleter{ref}(x);
 
-    ASSERT_TRUE(ref->called_free);
+    ASSERT_TRUE(alloc->called_free);
 }
 
 
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bbe7a953dbd..aecf4e1c2f2 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -1,11 +1,15 @@
 add_library(ginkgo_cuda $ "")
 target_sources(ginkgo_cuda
     PRIVATE
+    base/device.cpp
     base/device_matrix_data_kernels.cu
     base/exception.cpp
     base/executor.cpp
     base/index_set_kernels.cpp
+    base/memory.cpp
+    base/nvtx.cpp
     base/scoped_device_id.cpp
+    base/stream.cpp
     base/timer.cpp
     base/version.cpp
     components/prefix_sum_kernels.cu
diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp
new file mode 100644
index 00000000000..31ab5bcde63
--- /dev/null
+++ b/cuda/base/device.cpp
@@ -0,0 +1,65 @@
+/*************************************************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include "cuda/base/device.hpp"
+
+
+#include
+
+
+#include
+
+
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+void reset_device(int device_id)
+{
+    gko::detail::cuda_scoped_device_id_guard guard{device_id};
+    cudaDeviceReset();
+}
+
+
+void destroy_event(CUevent_st* event)
+{
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index e474d9c9f49..f6e838dd2dd 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -39,18 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include
 
 
-#ifdef GKO_LEGACY_NVTX
-#include
-#else
-#include
-#endif
 #include
 #include
 #include
 #include
 
-#include
+#include
 
 
 #include "cuda/base/config.hpp"
@@ -65,25 +60,38 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
+std::unique_ptr allocator_from_mode(int device_id,
+                                    allocation_mode mode)
+{
+    switch (mode) {
+    case allocation_mode::device:
+        return std::make_unique();
+    case allocation_mode::unified_global:
+        return std::make_unique(device_id);
+    case allocation_mode::unified_host:
+        return std::make_unique(device_id);
+    default:
+        GKO_NOT_SUPPORTED(mode);
+    }
+}
+
+
 std::shared_ptr CudaExecutor::create(
     int device_id, std::shared_ptr master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
     return std::shared_ptr(
-        new CudaExecutor(device_id, std::move(master), device_reset, alloc_mode,
-                         stream),
-        [device_id](CudaExecutor* exec) {
-            auto device_reset = exec->get_device_reset();
-            std::lock_guard guard(
-                nvidia_device::get_mutex(device_id));
-            delete exec;
-            auto& num_execs = nvidia_device::get_num_execs(device_id);
-            num_execs--;
-            if (!num_execs && device_reset) {
-                detail::cuda_scoped_device_id_guard g(device_id);
-                cudaDeviceReset();
-            }
-        });
+        new CudaExecutor(device_id, std::move(master),
+                         allocator_from_mode(device_id, alloc_mode), stream));
+}
+
+
+std::shared_ptr CudaExecutor::create(
+    int device_id, std::shared_ptr master,
+    std::shared_ptr alloc, cudaStream_t stream)
+{
+    return std::shared_ptr(new CudaExecutor(
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -123,41 +131,14 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes,
 void CudaExecutor::raw_free(void* ptr) const noexcept
 {
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    auto error_code = cudaFree(ptr);
-    if (error_code != cudaSuccess) {
-#if GKO_VERBOSE_LEVEL >= 1
-        // Unfortunately, if memory free fails, there's not much we can do
-        std::cerr << "Unrecoverable CUDA error on device "
-                  << this->get_device_id() << " in " << __func__ << ": "
-                  << cudaGetErrorName(error_code) << ": "
-                  << cudaGetErrorString(error_code) << std::endl
-                  << "Exiting program" << std::endl;
-#endif  // GKO_VERBOSE_LEVEL >= 1
-        std::exit(error_code);
-    }
+    alloc_->deallocate(ptr);
 }
 
 
 void* CudaExecutor::raw_alloc(size_type num_bytes) const
 {
-    void* dev_ptr = nullptr;
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    int error_code = 0;
-    if (this->alloc_mode_ == allocation_mode::unified_host) {
-        error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost);
-    } else if (this->alloc_mode_ == allocation_mode::unified_global) {
-        error_code =
-            cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachGlobal);
-    } else if (this->alloc_mode_ == allocation_mode::device) {
-        error_code = cudaMalloc(&dev_ptr, num_bytes);
-    } else {
-        GKO_NOT_SUPPORTED(this->alloc_mode_);
-    }
-    if (error_code != cudaErrorMemoryAllocation) {
-        GKO_ASSERT_NO_CUDA_ERRORS(error_code);
-    }
-    GKO_ENSURE_ALLOCATED(dev_ptr, "cuda", num_bytes);
-    return dev_ptr;
+    return alloc_->allocate(num_bytes);
 }
 
 
@@ -298,98 +279,4 @@ void CudaExecutor::init_handles()
 }
 
 
-cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
-{
-    detail::cuda_scoped_device_id_guard g(device_id_);
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_));
-}
-
-
-cuda_stream::~cuda_stream()
-{
-    if (stream_) {
-        detail::cuda_scoped_device_id_guard g(device_id_);
-        cudaStreamDestroy(stream_);
-    }
-}
-
-
-cuda_stream::cuda_stream(cuda_stream&& other)
-    : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_(std::exchange(other.device_id_, -1))
-{}
-
-
-CUstream_st* cuda_stream::get() const { return stream_; }
-
-
-namespace log {
-
-
-// "GKO" in ASCII to avoid collision with other application's categories
-constexpr static uint32 category_magic_offset = 0x676B6FU;
-
-
-void init_nvtx()
-{
-#define NAMED_CATEGORY(_name)                                            \
-    nvtxNameCategory(static_cast(profile_event_category::_name) +       \
-                         category_magic_offset,                         \
-                     "gko::" #_name)
-    NAMED_CATEGORY(memory);
-    NAMED_CATEGORY(operation);
-    NAMED_CATEGORY(object);
-    NAMED_CATEGORY(linop);
-    NAMED_CATEGORY(factory);
-    NAMED_CATEGORY(solver);
-    NAMED_CATEGORY(criterion);
-    NAMED_CATEGORY(user);
-    NAMED_CATEGORY(internal);
-#undef NAMED_CATEGORY
-}
-
-
-std::function begin_nvtx_fn(
-    uint32_t color_argb)
-{
-    return [color_argb](const char* name, profile_event_category category) {
-        nvtxEventAttributes_t attr{};
-        attr.version = NVTX_VERSION;
-        attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-        attr.category = static_cast(category) + category_magic_offset;
-        attr.colorType = NVTX_COLOR_ARGB;
-        attr.color = color_argb;
-        attr.payloadType = NVTX_PAYLOAD_UNKNOWN;
-        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
-        attr.message.ascii = name;
-        nvtxRangePushEx(&attr);
-    };
-}
-
-
-void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); }
-
-
-}  // namespace log
-
-
-namespace kernels {
-namespace cuda {
-
-
-void reset_device(int device_id)
-{
-    gko::detail::cuda_scoped_device_id_guard guard{device_id};
-    cudaDeviceReset();
-}
-
-
-void destroy_event(CUevent_st* event)
-{
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
new file mode 100644
index 00000000000..11dee81ad42
--- /dev/null
+++ b/cuda/base/memory.cpp
@@ -0,0 +1,168 @@
+/*************************************************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/ + +#include + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace gko { + + +#define GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(_operation, _size) \ + { \ + auto error_code = _operation; \ + if (error_code == cudaErrorMemoryAllocation) { \ + throw AllocationError(__FILE__, __LINE__, "cuda", _size); \ + } else { \ + GKO_ASSERT_NO_CUDA_ERRORS(error_code); \ + } \ + } + + +#if GKO_VERBOSE_LEVEL >= 1 +#define GKO_EXIT_ON_CUDA_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != cudaSuccess) { \ + int device_id{-1}; \ + cudaGetDevice(&device_id); \ + std::cerr << "Unrecoverable CUDA error on device " << device_id \ + << " in " << __func__ << ":" << __LINE__ << ": " \ + << cudaGetErrorName(error_code) << ": " \ + << cudaGetErrorString(error_code) << std::endl \ + << "Exiting program" << std::endl; \ + std::exit(error_code); \ + } \ + } +#else +#define GKO_EXIT_ON_CUDA_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != cudaSuccess) { \ + std::exit(error_code); \ + } \ + } +#endif + + +void* CudaAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} + + +void* CudaAsyncAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( + cudaMallocAsync(&ptr, num_bytes, stream_), num_bytes); + return ptr; +} + +void CudaAsyncAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); +} + + +CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) + : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} +{} + + +CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) + : device_id_{device_id}, flags_{flags} +{} + + +void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( + cudaMallocManaged(&ptr, num_bytes, flags_), num_bytes); + return ptr; +} + + +void CudaUnifiedAllocator::deallocate(void* ptr) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {} + + +void* CudaHostAllocator::allocate(size_type num_bytes) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMallocHost(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaHostAllocator::deallocate(void* ptr) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_CUDA_ERROR(cudaFreeHost(ptr)); +} + + +} // namespace gko \ No newline at end of file diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp new file mode 100644 index 00000000000..e313c110ea2 --- /dev/null +++ b/cuda/base/nvtx.cpp @@ -0,0 +1,96 @@ 
+/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#ifdef GKO_LEGACY_NVTX +#include +#else +#include +#endif + + +#include + + +namespace gko { +namespace log { + + +// "GKO" in ASCII to avoid collision with other application's categories +constexpr static uint32 category_magic_offset = 0x676B6FU; + + +void init_nvtx() +{ +#define NAMED_CATEGORY(_name) \ + nvtxNameCategory(static_cast(profile_event_category::_name) + \ + category_magic_offset, \ + "gko::" #_name) + NAMED_CATEGORY(memory); + NAMED_CATEGORY(operation); + NAMED_CATEGORY(object); + NAMED_CATEGORY(linop); + NAMED_CATEGORY(factory); + NAMED_CATEGORY(solver); + NAMED_CATEGORY(criterion); + NAMED_CATEGORY(user); + NAMED_CATEGORY(internal); +#undef NAMED_CATEGORY +} + + +std::function begin_nvtx_fn( + uint32_t color_argb) +{ + return [color_argb](const char* name, profile_event_category category) { + nvtxEventAttributes_t attr{}; + attr.version = NVTX_VERSION; + attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + attr.category = static_cast(category) + category_magic_offset; + attr.colorType = NVTX_COLOR_ARGB; + attr.color = color_argb; + attr.payloadType = NVTX_PAYLOAD_UNKNOWN; + attr.messageType = NVTX_MESSAGE_TYPE_ASCII; + attr.message.ascii = name; + nvtxRangePushEx(&attr); + }; +} + + +void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); } + + +} // namespace log +} // namespace gko diff --git a/cuda/test/base/cuda_executor_reset.cpp b/cuda/base/stream.cpp similarity index 62% rename from cuda/test/base/cuda_executor_reset.cpp rename to cuda/base/stream.cpp index c8159b9c4d7..8c6aa92c28b 100644 --- a/cuda/test/base/cuda_executor_reset.cpp +++ b/cuda/base/stream.cpp @@ -30,58 +30,45 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include +#include -#include +#include +#include -#include +#include "cuda/base/scoped_device_id.hpp" -namespace { +namespace gko { -#define GTEST_ASSERT_NO_EXIT(statement) \ - ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "") +cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {} -TEST(DeviceReset, HipCuda) +cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id) { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto hip = gko::HipExecutor::create(0, ref, true); - auto cuda = gko::CudaExecutor::create(0, ref, true); - }); + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_)); } -TEST(DeviceReset, CudaHip) +cuda_stream::~cuda_stream() { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, ref, true); - auto hip = gko::HipExecutor::create(0, ref, true); - }); + if (stream_) { + detail::cuda_scoped_device_id_guard g(device_id_); + cudaStreamDestroy(stream_); + } } -void func() -{ - auto ref = gko::ReferenceExecutor::create(); - auto exec = gko::CudaExecutor::create(0, ref, true); -} +cuda_stream::cuda_stream(cuda_stream&& other) + : stream_{std::exchange(other.stream_, nullptr)}, + device_id_(std::exchange(other.device_id_, -1)) +{} -TEST(DeviceReset, CudaCuda) -{ - GTEST_ASSERT_NO_EXIT({ - std::thread t1(func); - std::thread t2(func); - t1.join(); - t2.join(); - }); -} +CUstream_st* cuda_stream::get() const { return stream_; } -} // namespace +} // namespace gko diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index 9be3caf9faa..a213e65277a 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -1,7 +1,6 @@ ginkgo_create_cuda_test(array) ginkgo_create_cuda_test(cuda_executor) ginkgo_create_test(index_set) -ginkgo_create_test(cuda_executor_reset ADDITIONAL_LIBRARIES Threads::Threads) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) @@ -10,4 +9,5 @@ ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) +ginkgo_create_test(memory) ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index 5f489ac22f0..afb23c06186 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include + #include "common/cuda_hip/base/executor.hpp.inc" #include "cuda/base/scoped_device_id.hpp" @@ -103,18 +105,19 @@ protected: ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM cuda = gko::CudaExecutor::create( - 0, omp, false, gko::default_cuda_alloc_mode, stream.get()); + 0, omp, std::make_shared(), stream.get()); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp, false, - gko::default_cuda_alloc_mode, other_stream.get()); + gko::CudaExecutor::get_num_devices() - 1, omp, + std::make_shared(), other_stream.get()); cuda3 = gko::CudaExecutor::create( - 0, omp, false, gko::allocation_mode::unified_global, stream.get()); + 0, omp, std::make_shared(0), + stream.get()); #else cuda = gko::CudaExecutor::create(0, omp); cuda2 = gko::CudaExecutor::create( gko::CudaExecutor::get_num_devices() - 1, omp); - cuda3 = gko::CudaExecutor::create(0, omp, false, - gko::allocation_mode::unified_global); + cuda3 = gko::CudaExecutor::create( + 0, omp, std::make_shared(0)); #endif } diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp new file mode 100644 index 00000000000..a329817f4af --- /dev/null +++ b/cuda/test/base/memory.cpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +class Memory : public CudaTestFixture { +protected: + Memory() + : host_exec_with_pinned{gko::OmpExecutor::create( + std::make_shared(0))}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(0))}, + exec_with_normal{gko::CudaExecutor::create( + 0, ref, std::make_shared(), + exec->get_stream())}, + exec_with_async{gko::CudaExecutor::create( + 0, host_exec_with_pinned, + std::make_shared(exec->get_stream()), + exec->get_stream())}, + exec_with_unified{gko::CudaExecutor::create( + 0, host_exec_with_unified, + std::make_shared(0), + exec->get_stream())} + {} + + std::shared_ptr host_exec_with_pinned; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_normal; + std::shared_ptr exec_with_async; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec_with_normal, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, AsyncDeviceAllocationWorks) +{ + gko::array data{exec_with_async, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostPinnedAllocationWorks) +{ + gko::array data{host_exec_with_pinned, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 814405ba0d9..e1156b91903 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "cuda/base/device.hpp" @@ -60,8 +61,9 @@ class CudaTestFixture : public ::testing::Test { CudaTestFixture() : ref(gko::ReferenceExecutor::create()), #ifdef GKO_TEST_NONDEFAULT_STREAM + stream(0), exec(gko::CudaExecutor::create( - 0, ref, false, gko::default_cuda_alloc_mode, stream.get())) + 0, ref, std::make_shared(), stream.get())) #else exec(gko::CudaExecutor::create(0, ref)) #endif diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp index d066d272f81..3789274c3f3 100644 --- a/devices/cuda/executor.cpp +++ b/devices/cuda/executor.cpp @@ -64,31 +64,4 @@ bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const } -void CudaExecutor::increase_num_execs(unsigned device_id) -{ -#ifdef GKO_COMPILING_CUDA_DEVICE - // increase the Cuda Device count only when ginkgo build cuda - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - nvidia_device::get_num_execs(device_id)++; -#endif // GKO_COMPILING_CUDA_DEVICE -} - - -void CudaExecutor::decrease_num_execs(unsigned device_id) -{ -#ifdef GKO_COMPILING_CUDA_DEVICE - // increase the Cuda Device count only when ginkgo build cuda - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - nvidia_device::get_num_execs(device_id)--; -#endif // GKO_COMPILING_CUDA_DEVICE -} - - -unsigned CudaExecutor::get_num_execs(unsigned device_id) -{ - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - return nvidia_device::get_num_execs(device_id); -} - - } // namespace gko diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp index 60efb4c53a3..b044074c19e 100644 --- a/devices/hip/executor.cpp +++ b/devices/hip/executor.cpp @@ -61,38 +61,4 @@ bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const } -#if (GINKGO_HIP_PLATFORM_NVCC == 1) -using hip_device_class = nvidia_device; -#else -using hip_device_class = amd_device; -#endif - - -void HipExecutor::increase_num_execs(int device_id) -{ -#ifdef GKO_COMPILING_HIP_DEVICE - // increase the HIP Device count only when ginkgo build hip - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - hip_device_class::get_num_execs(device_id)++; -#endif // GKO_COMPILING_HIP_DEVICE -} - - -void HipExecutor::decrease_num_execs(int device_id) -{ -#ifdef GKO_COMPILING_HIP_DEVICE - // increase the HIP Device count only when ginkgo build hip - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - hip_device_class::get_num_execs(device_id)--; -#endif // GKO_COMPILING_HIP_DEVICE -} - - -int HipExecutor::get_num_execs(int device_id) -{ - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - return hip_device_class::get_num_execs(device_id); -} - - } // namespace gko diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp index 352216f7633..f8e700bc2d5 100644 --- a/devices/omp/executor.cpp +++ b/devices/omp/executor.cpp @@ -55,7 +55,10 @@ void OmpExecutor::populate_exec_info(const machine_topology* mach_topo) } -void OmpExecutor::raw_free(void* ptr) const noexcept { std::free(ptr); } +void OmpExecutor::raw_free(void* ptr) const noexcept +{ + return alloc_->deallocate(ptr); +} std::shared_ptr OmpExecutor::get_master() noexcept @@ -72,7 +75,7 @@ std::shared_ptr OmpExecutor::get_master() const noexcept void* OmpExecutor::raw_alloc(size_type num_bytes) const { - return GKO_ENSURE_ALLOCATED(std::malloc(num_bytes), "OMP", num_bytes); + return alloc_->allocate(num_bytes); } diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index c2015c8664c..d668331a43b 100644 --- 
a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -51,6 +51,39 @@ namespace gko { namespace detail { +DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {} + + +void* DpcppAllocator::allocate(size_type size) +{ + return sycl::malloc_device(size, *queue_); +} + + +void DpcppAllocator::deallocate(void* ptr) +{ + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); +} + + +DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue} +{} + + +void* DpcppUnifiedAllocator::allocate(size_type size) +{ + return sycl::malloc_shared(size, *queue_); +} + + +void DpcppUnifiedAllocator::deallocate(void* ptr) +{ + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); +} + + const std::vector get_devices(std::string device_type) { std::map device_type_map{ diff --git a/hip/test/base/hip_executor_reset.cpp b/dpcpp/base/memory.dp.cpp similarity index 63% rename from hip/test/base/hip_executor_reset.cpp rename to dpcpp/base/memory.dp.cpp index 39e3252e053..b1ccd007dea 100644 --- a/hip/test/base/hip_executor_reset.cpp +++ b/dpcpp/base/memory.dp.cpp @@ -30,58 +30,43 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include -#include +#include -#include +namespace gko { -namespace { +DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} -#define GTEST_ASSERT_NO_EXIT(statement) \ - ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "") - - -TEST(DeviceReset, HipCuda) +void* DpcppAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) const { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto hip = gko::HipExecutor::create(0, ref, true); - auto cuda = gko::CudaExecutor::create(0, ref, true); - }); + return sycl::malloc_device(num_bytes, *queue); } -TEST(DeviceReset, CudaHip) +void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, ref, true); - auto hip = gko::HipExecutor::create(0, ref, true); - }); + queue->wait_and_throw(); + sycl::free(ptr, queue->get_context()); } -void func() +void* DpcppUnifiedAllocator::allocate(size_type num_bytes) { - auto ref = gko::ReferenceExecutor::create(); - auto exec = gko::HipExecutor::create(0, ref, true); + return sycl::malloc_shared(num_bytes, *queue_); } -TEST(DeviceReset, HipHip) +void DpcppUnifiedAllocator::deallocate(void* ptr) { - GTEST_ASSERT_NO_EXIT({ - std::thread t1(func); - std::thread t2(func); - t1.join(); - t2.join(); - }); + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); } -} // namespace +} // namespace gko diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index bb9c8a75050..5c0ca601f04 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -3,3 +3,4 @@ ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) +ginkgo_create_dpcpp_test(memory) \ No newline at end of file diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp new file mode 100644 index 00000000000..e587660cde3 --- /dev/null +++ b/dpcpp/test/base/memory.dp.cpp @@ -0,0 +1,98 @@
+/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Memory : public ::testing::Test { +protected: + Memory() + : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(exec->get_queue()))}, + exec_with_unified{gko::DpcppExecutor::create( + exec->get_queue(), host_exec_with_unified, + std::make_shared(exec->get_queue()))} + {} + + std::shared_ptr exec; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp index b300292e9a3..79b197aacc8 100644 --- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp +++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return 
gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp index c0235f75e55..b096e48c71a 100644 --- a/examples/cb-gmres/cb-gmres.cpp +++ b/examples/cb-gmres/cb-gmres.cpp @@ -108,13 +108,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp index c2270cadb0d..7e6cf531edd 100644 --- a/examples/custom-logger/custom-logger.cpp +++ b/examples/custom-logger/custom-logger.cpp @@ -249,13 +249,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp index af08dbdf226..4610413fe9c 100644 --- a/examples/custom-matrix-format/custom-matrix-format.cpp +++ b/examples/custom-matrix-format/custom-matrix-format.cpp @@ -255,13 +255,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 9389f86cc45..e07f1bf92fb 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -158,13 +158,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp index aa32e0e879a..33946b7de44 100644 --- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp +++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + 
return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp index 5d8270f1ca1..460370b7e00 100644 --- a/examples/inverse-iteration/inverse-iteration.cpp +++ b/examples/inverse-iteration/inverse-iteration.cpp @@ -72,13 +72,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp index e676e15cc6d..407a083e548 100644 --- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp +++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp @@ -71,13 +71,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp index cbd2156be60..14384eaab52 100644 --- a/examples/iterative-refinement/iterative-refinement.cpp +++ b/examples/iterative-refinement/iterative-refinement.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp index 7182bc9ad8c..5a7a8c086af 100644 --- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp +++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int main() { // Instantiate a CUDA executor - auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); // Read data auto A = gko::read>(std::cin, gpu); auto b = gko::read>(std::cin, gpu); diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp index 6f1600d2805..9edd7ff29a1 100644 --- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp +++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp @@ -71,13 +71,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp index d3f45cda916..cbecbbbdc02 100644 --- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp +++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp @@ -69,13 +69,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp index 3510a2163e1..0882d755cdc 100644 --- a/examples/mixed-precision-ir/mixed-precision-ir.cpp +++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp @@ -76,13 +76,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-spmv/mixed-spmv.cpp b/examples/mixed-spmv/mixed-spmv.cpp index 78461de39ef..6b327c1c708 100644 --- a/examples/mixed-spmv/mixed-spmv.cpp +++ b/examples/mixed-spmv/mixed-spmv.cpp @@ -170,13 +170,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp index 6f75ca29630..a455ca2e8ed 100644 --- 
a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp +++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp @@ -64,13 +64,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp index 7f47d039072..75c03259c67 100644 --- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp +++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp @@ -62,13 +62,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp index 51fdf97d4a4..05ee0503a5f 100644 --- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp +++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp @@ -230,13 +230,12 @@ void solve_system(const std::string& executor_string, {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp index 0d81ef65909..1ae2ae9ec08 100644 --- a/examples/papi-logging/papi-logging.cpp +++ b/examples/papi-logging/papi-logging.cpp @@ -151,13 +151,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index f357a8d4619..5f036728924 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -371,13 +371,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return 
gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp index 7602600a514..e16f0b26968 100644 --- a/examples/poisson-solver/poisson-solver.cpp +++ b/examples/poisson-solver/poisson-solver.cpp @@ -144,13 +144,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp index 37963f205cc..b64b588c4ef 100644 --- a/examples/preconditioned-solver/preconditioned-solver.cpp +++ b/examples/preconditioned-solver/preconditioned-solver.cpp @@ -69,13 +69,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp index 6aa85462605..02318dd7784 100644 --- a/examples/simple-solver-logging/simple-solver-logging.cpp +++ b/examples/simple-solver-logging/simple-solver-logging.cpp @@ -85,13 +85,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp index 8f665f98496..81dc9ee6d74 100644 --- a/examples/simple-solver/simple-solver.cpp +++ b/examples/simple-solver/simple-solver.cpp @@ -89,13 +89,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp index 6bf3cc21a8a..63adfaa5571 100644 --- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp +++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp @@ -165,13 +165,12 @@ void solve_system(const std::string& executor_string, {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, 
gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 6c6fc235f45..61b06ad4058 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,9 +1,13 @@ set(GINKGO_HIP_SOURCES + base/device.hip.cpp base/device_matrix_data_kernels.hip.cpp base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp + base/memory.hip.cpp + base/roctx.hip.cpp base/scoped_device_id.hip.cpp + base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp components/prefix_sum_kernels.hip.cpp diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp new file mode 100644 index 00000000000..b5ec1bec6d6 --- /dev/null +++ b/hip/base/device.hip.cpp @@ -0,0 +1,67 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +void reset_device(int device_id) +{ + gko::detail::hip_scoped_device_id_guard guard{device_id}; + hipDeviceReset(); +} + + +void destroy_event(GKO_HIP_EVENT_STRUCT* event) +{ + GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index cd8a485c19d..6b4b0fd5ddc 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -37,15 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX -#include -#endif #include #include #include -#include #include "hip/base/config.hip.hpp" @@ -60,32 +56,22 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -#if (GINKGO_HIP_PLATFORM_NVCC == 1) -using hip_device_class = nvidia_device; -#else -using hip_device_class = amd_device; -#endif - - std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { return std::shared_ptr( - new HipExecutor(device_id, std::move(master), device_reset, alloc_mode, - stream), - [device_id](HipExecutor* exec) { - auto device_reset = exec->get_device_reset(); - std::lock_guard guard( - hip_device_class::get_mutex(device_id)); - delete exec; - auto& num_execs = hip_device_class::get_num_execs(device_id); - num_execs--; - if (!num_execs && device_reset) { - detail::hip_scoped_device_id_guard g(device_id); - hipDeviceReset(); - } - }); + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} + + +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, + std::shared_ptr alloc, hipStream_t stream) +{ + return std::shared_ptr(new HipExecutor( + device_id, std::move(master), std::move(alloc), stream)); } @@ -125,42 +111,14 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::raw_free(void* ptr) const noexcept { detail::hip_scoped_device_id_guard g(this->get_device_id()); - auto error_code = hipFree(ptr); - if (error_code != hipSuccess) { -#if GKO_VERBOSE_LEVEL >= 1 - // Unfortunately, if memory free fails, there's not much we can do - std::cerr << "Unrecoverable HIP error on device " - << this->get_device_id() << " in " << __func__ << ": " - << hipGetErrorName(error_code) << ": " - << hipGetErrorString(error_code) << std::endl - << "Exiting program" << std::endl; -#endif // GKO_VERBOSE_LEVEL >= 1 - std::exit(error_code); - } + alloc_->deallocate(ptr); } void* HipExecutor::raw_alloc(size_type num_bytes) const { - void* dev_ptr = nullptr; detail::hip_scoped_device_id_guard g(this->get_device_id()); - int error_code = 0; - if (this->alloc_mode_ == allocation_mode::device) { - error_code = hipMalloc(&dev_ptr, num_bytes); -#if !(GKO_HIP_PLATFORM_HCC == 1) - } else if (this->alloc_mode_ == allocation_mode::unified_global) { - error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachGlobal); - } else if (this->alloc_mode_ == allocation_mode::unified_host) { - error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachHost); -#endif - } else { - GKO_NOT_SUPPORTED(this->alloc_mode_); - } - if (error_code != hipErrorMemoryAllocation) { - GKO_ASSERT_NO_HIP_ERRORS(error_code); - } - GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes); - return dev_ptr; + return alloc_->allocate(num_bytes); } @@ -309,73 +267,4 @@ void HipExecutor::init_handles() } -hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) -{ - detail::hip_scoped_device_id_guard g(device_id_); - GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_)); -} - - -hip_stream::~hip_stream() -{ - if (stream_) { - detail::hip_scoped_device_id_guard g(device_id_); - hipStreamDestroy(stream_); - } -} - - -hip_stream::hip_stream(hip_stream&& other) - : stream_{std::exchange(other.stream_, nullptr)}, - device_id_{std::exchange(other.device_id_, -1)} -{} - - -GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } - - -namespace log { - - -#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX - -void 
begin_roctx(const char* name, profile_event_category) -{ - roctxRangePush(name); -} - -void end_roctx(const char*, profile_event_category) { roctxRangePop(); } - -#else - -void begin_roctx(const char* name, profile_event_category) - GKO_NOT_COMPILED(roctx); - -void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); - -#endif - - -} // namespace log - - -namespace kernels { -namespace hip { - - -void reset_device(int device_id) -{ - gko::detail::hip_scoped_device_id_guard guard{device_id}; - hipDeviceReset(); -} - - -void destroy_event(GKO_HIP_EVENT_STRUCT* event) -{ - GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); -} - - -} // namespace hip -} // namespace kernels } // namespace gko diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp new file mode 100644 index 00000000000..f2a8977525f --- /dev/null +++ b/hip/base/memory.hip.cpp @@ -0,0 +1,97 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +namespace gko { + + +#define GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(_operation, _size) \ + { \ + auto error_code = _operation; \ + if (error_code == hipErrorMemoryAllocation) { \ + throw AllocationError(__FILE__, __LINE__, "hip", _size); \ + } else { \ + GKO_ASSERT_NO_HIP_ERRORS(error_code); \ + } \ + } + + +#if GKO_VERBOSE_LEVEL >= 1 +#define GKO_EXIT_ON_HIP_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != hipSuccess) { \ + int device_id{-1}; \ + hipGetDevice(&device_id); \ + std::cerr << "Unrecoverable HIP error on device " << device_id \ + << " in " << __func__ << ": " \ + << hipGetErrorName(error_code) << ": " \ + << hipGetErrorString(error_code) << std::endl \ + << "Exiting program" << std::endl; \ + std::exit(error_code); \ + } \ + } +#else +#define GKO_EXIT_ON_HIP_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != hipSuccess) { \ + std::exit(error_code); \ + } \ + } +#endif + + +void* HipAllocator::allocate(size_type num_bytes) const +{ + void* dev_ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes), + num_bytes); + return dev_ptr; +} + + +void HipAllocator::deallocate(void* dev_ptr) const +{ + GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr)); +} + + +} // namespace gko diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp new file mode 100644 index 00000000000..9f309b93362 --- /dev/null +++ b/hip/base/roctx.hip.cpp @@ -0,0 +1,70 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX +#include +#endif + + +#include +#include + + +namespace gko { +namespace log { + + +#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX + +void begin_roctx(const char* name, profile_event_category) +{ + roctxRangePush(name); +} + +void end_roctx(const char*, profile_event_category) { roctxRangePop(); } + +#else + +void begin_roctx(const char* name, profile_event_category) + GKO_NOT_COMPILED(roctx); + +void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); + +#endif + + +} // namespace log +} // namespace gko diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp new file mode 100644 index 00000000000..e5817eb9ebd --- /dev/null +++ b/hip/base/stream.hip.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { + + +hip_stream::hip_stream() : stream_{}, device_id_{-1} {} + + +hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) +{ + detail::hip_scoped_device_id_guard g(device_id_); + GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_)); +} + + +hip_stream::~hip_stream() +{ + if (stream_) { + detail::hip_scoped_device_id_guard g(device_id_); + hipStreamDestroy(stream_); + } +} + + +hip_stream::hip_stream(hip_stream&& other) + : stream_{std::exchange(other.stream_, nullptr)}, + device_id_{std::exchange(other.device_id_, -1)} +{} + + +GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } + + +} // namespace gko diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 7ed0d2ceb52..f597a3d6e3d 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -1,6 +1,5 @@ ginkgo_create_hip_test(hip_executor) ginkgo_create_test(index_set) -ginkgo_create_test(hip_executor_reset ADDITIONAL_LIBRARIES Threads::Threads) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_hip_test(hip_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index d27dd58d132..e531fa739e6 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -109,18 +109,18 @@ class HipExecutor : public ::testing::Test { ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM hip = gko::HipExecutor::create( - 0, omp, false, gko::default_hip_alloc_mode, stream.get()); - hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp, false, gko::default_hip_alloc_mode, - other_stream.get()); + 0, omp, std::make_shared(), stream.get()); + hip2 = gko::HipExecutor::create( + gko::HipExecutor::get_num_devices() - 1, omp, + std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, omp, false, gko::allocation_mode::unified_global, stream.get()); + 0, omp, std::make_shared(), stream.get()); #else hip = gko::HipExecutor::create(0, omp); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, omp); - hip3 = gko::HipExecutor::create(0, omp, false, - gko::allocation_mode::unified_global); + hip3 = gko::HipExecutor::create(0, omp, + std::make_shared()); #endif } diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index 9337da14139..bf7073cf9a1 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "hip/base/device.hpp" @@ -60,8 +61,9 @@ class HipTestFixture : public ::testing::Test { HipTestFixture() : ref(gko::ReferenceExecutor::create()), #ifdef GKO_TEST_NONDEFAULT_STREAM + stream(0), exec(gko::HipExecutor::create( - 0, ref, false, gko::default_hip_alloc_mode, stream.get())) + 0, ref, std::make_shared(), stream.get())) #else exec(gko::HipExecutor::create(0, ref)) #endif diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 965cd562bff..4545b216f86 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -47,7 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
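A minimal usage sketch of the RAII stream wrapper implemented above, combined with the allocator-based HipExecutor::create overload this patch introduces (gko::HipAllocator is assumed here, matching what the updated tests pass in; any HipAllocatorBase implementation would work the same way):

    #include <ginkgo/ginkgo.hpp>

    void run_on_custom_stream()
    {
        gko::hip_stream stream{0};  // hipStreamCreate on device 0
        auto ref = gko::ReferenceExecutor::create();
        // the executor only stores the raw handle, so `stream` must outlive `hip`
        auto hip = gko::HipExecutor::create(
            0, ref, std::make_shared<gko::HipAllocator>(), stream.get());
        // ... Ginkgo operations on `hip` are submitted to `stream` ...
    }   // `hip` is released first, then the wrapper destroys the HIP stream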
#include +#include #include +#include #include #include #include @@ -121,33 +123,6 @@ constexpr allocation_mode default_hip_alloc_mode = } // namespace gko -// after intel/llvm September'22 release, which uses major version 6, they -// introduce another inline namespace _V1. -#if GINKGO_DPCPP_MAJOR_VERSION >= 6 -namespace sycl { -inline namespace _V1 { - - -class queue; -class event; - - -} // namespace _V1 -} // namespace sycl -#else // GINKGO_DPCPP_MAJOR_VERSION < 6 -inline namespace cl { -namespace sycl { - - -class queue; -class event; - - -} // namespace sycl -} // namespace cl -#endif - - /** * The enum class is for the dpcpp queue property. It's legal to use a binary * or(|) operation to combine several properties. @@ -172,29 +147,6 @@ GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a, } -struct cublasContext; - -struct cusparseContext; - -struct CUstream_st; - -struct CUevent_st; - -struct hipblasContext; - -struct hipsparseContext; - -#if GINKGO_HIP_PLATFORM_HCC -struct ihipStream_t; -struct ihipEvent_t; -#define GKO_HIP_STREAM_STRUCT ihipStream_t -#define GKO_HIP_EVENT_STRUCT ihipEvent_t -#else -#define GKO_HIP_STREAM_STRUCT CUstream_st -#define GKO_HIP_EVENT_STRUCT CUevent_st -#endif - - namespace gko { @@ -1355,26 +1307,14 @@ class EnableDeviceReset { * * @param device_reset whether to allow a device reset or not */ - void set_device_reset(bool device_reset) { device_reset_ = device_reset; } + void set_device_reset(bool device_reset) {} /** * Returns the current status of the device reset boolean for this executor. * * @return the current status of the device reset boolean for this executor. */ - bool get_device_reset() { return device_reset_; } - -protected: - /** - * Instantiate an EnableDeviceReset class - * - * @param device_reset the starting device_reset status. Defaults to false. - */ - EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset} - {} - -private: - bool device_reset_{}; + bool get_device_reset() { return false; } }; @@ -1411,9 +1351,11 @@ class OmpExecutor : public detail::ExecutorBase, /** * Creates a new OmpExecutor. 
*/ - static std::shared_ptr create() + static std::shared_ptr create( + std::shared_ptr alloc = + std::make_shared()) { - return std::shared_ptr(new OmpExecutor()); + return std::shared_ptr(new OmpExecutor(std::move(alloc))); } std::shared_ptr get_master() noexcept override; @@ -1435,7 +1377,8 @@ class OmpExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; protected: - OmpExecutor() + OmpExecutor(std::shared_ptr alloc) + : alloc_{std::move(alloc)} { this->OmpExecutor::populate_exec_info(machine_topology::get_instance()); } @@ -1457,6 +1400,8 @@ class OmpExecutor : public detail::ExecutorBase, GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false); bool verify_memory_to(const DpcppExecutor* dest_exec) const override; + + std::shared_ptr alloc_; }; @@ -1476,9 +1421,12 @@ using DefaultExecutor = OmpExecutor; */ class ReferenceExecutor : public OmpExecutor { public: - static std::shared_ptr create() + static std::shared_ptr create( + std::shared_ptr alloc = + std::make_shared()) { - return std::shared_ptr(new ReferenceExecutor()); + return std::shared_ptr( + new ReferenceExecutor(std::move(alloc))); } scoped_device_id_guard get_scoped_device_id_guard() const override @@ -1495,7 +1443,8 @@ class ReferenceExecutor : public OmpExecutor { } protected: - ReferenceExecutor() + ReferenceExecutor(std::shared_ptr alloc) + : OmpExecutor{std::move(alloc)} { this->ReferenceExecutor::populate_exec_info( machine_topology::get_instance()); @@ -1550,15 +1499,32 @@ class CudaExecutor : public detail::ExecutorBase, * @param device_id the CUDA device id of this device * @param master an executor on the host that is used to invoke the device * kernels - * @param device_reset whether to reset the device after the object exits - * the scope. + * @param device_reset this option no longer has any effect. * @param alloc_mode the allocation mode that the executor should operate * on. See @allocation_mode for more details + * @param stream the stream to execute operations on. + */ + [[deprecated( + "device_reset is deprecated entirely, call cudaDeviceReset directly. " + "alloc_mode was replaced by the Allocator type " + "hierarchy.")]] static std::shared_ptr + create(int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_cuda_alloc_mode, + CUstream_st* stream = nullptr); + + /** + * Creates a new CudaExecutor with a custom allocator and device stream. + * + * @param device_id the CUDA device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels. + * @param alloc the allocator to use for device memory allocations. + * @param stream the stream to execute operations on. 
*/ static std::shared_ptr create( int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_cuda_alloc_mode, + std::shared_ptr alloc = + std::make_shared(), CUstream_st* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1679,26 +1645,15 @@ class CudaExecutor : public detail::ExecutorBase, void init_handles(); CudaExecutor(int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_cuda_alloc_mode, - CUstream_st* stream = nullptr) - : EnableDeviceReset{device_reset}, - master_(master), - alloc_mode_{alloc_mode}, - stream_{stream} + std::shared_ptr alloc, CUstream_st* stream) + : alloc_{std::move(alloc)}, master_(master), stream_{stream} { this->get_exec_info().device_id = device_id; this->get_exec_info().num_computing_units = 0; this->get_exec_info().num_pu_per_cu = 0; this->CudaExecutor::populate_exec_info( machine_topology::get_instance()); - - // it only gets attribute from device, so it should not be affected by - // DeviceReset. this->set_gpu_property(); - // increase the number of executor before any operations may be affected - // by DeviceReset. - increase_num_execs(this->get_exec_info().device_id); this->init_handles(); } @@ -1718,12 +1673,6 @@ class CudaExecutor : public detail::ExecutorBase, bool verify_memory_to(const CudaExecutor* dest_exec) const override; - static void increase_num_execs(unsigned device_id); - - static void decrease_num_execs(unsigned device_id); - - static unsigned get_num_execs(unsigned device_id); - void populate_exec_info(const machine_topology* mach_topo) override; private: @@ -1733,45 +1682,8 @@ class CudaExecutor : public detail::ExecutorBase, using handle_manager = std::unique_ptr>; handle_manager cublas_handle_; handle_manager cusparse_handle_; + std::shared_ptr alloc_; CUstream_st* stream_; - - allocation_mode alloc_mode_; -}; - - -/** - * An RAII wrapper for a custom CUDA stream. - * The stream will be created on construction and destroyed when the lifetime of - * the wrapper ends. - */ -class cuda_stream { -public: - /** Creates a new custom CUDA stream. */ - cuda_stream(int device_id = 0); - - /** Destroys the custom CUDA stream, if it wasn't moved-from already. */ - ~cuda_stream(); - - cuda_stream(const cuda_stream&) = delete; - - /** Move-constructs from an existing stream, which will be emptied. */ - cuda_stream(cuda_stream&&); - - cuda_stream& operator=(const cuda_stream&) = delete; - - /** Move-assigns from an existing stream, which will be emptied. */ - cuda_stream& operator=(cuda_stream&&) = delete; - - /** - * Returns the native CUDA stream handle. - * In a moved-from cuda_stream, this will return nullptr. - */ - CUstream_st* get() const; - -private: - CUstream_st* stream_; - - int device_id_; }; @@ -1805,10 +1717,15 @@ class HipExecutor : public detail::ExecutorBase, * @param alloc_mode the allocation mode that the executor should operate * on. 
See @allocation_mode for more details */ + [[deprecated("")]] static std::shared_ptr create( + int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_hip_alloc_mode, + GKO_HIP_STREAM_STRUCT* stream = nullptr); + static std::shared_ptr create( int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_hip_alloc_mode, + std::shared_ptr alloc = + std::make_shared(), GKO_HIP_STREAM_STRUCT* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1923,25 +1840,15 @@ class HipExecutor : public detail::ExecutorBase, void init_handles(); HipExecutor(int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_hip_alloc_mode, - GKO_HIP_STREAM_STRUCT* stream = nullptr) - : EnableDeviceReset{device_reset}, - master_(master), - alloc_mode_(alloc_mode), - stream_{stream} + std::shared_ptr alloc, + GKO_HIP_STREAM_STRUCT* stream) + : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream} { this->get_exec_info().device_id = device_id; this->get_exec_info().num_computing_units = 0; this->get_exec_info().num_pu_per_cu = 0; this->HipExecutor::populate_exec_info(machine_topology::get_instance()); - - // it only gets attribute from device, so it should not be affected by - // DeviceReset. this->set_gpu_property(); - // increase the number of executor before any operations may be affected - // by DeviceReset. - increase_num_execs(this->get_exec_info().device_id); this->init_handles(); } @@ -1961,12 +1868,6 @@ class HipExecutor : public detail::ExecutorBase, bool verify_memory_to(const HipExecutor* dest_exec) const override; - static void increase_num_execs(int device_id); - - static void decrease_num_execs(int device_id); - - static int get_num_execs(int device_id); - void populate_exec_info(const machine_topology* mach_topo) override; private: @@ -1976,48 +1877,11 @@ class HipExecutor : public detail::ExecutorBase, using handle_manager = std::unique_ptr>; handle_manager hipblas_handle_; handle_manager hipsparse_handle_; - - allocation_mode alloc_mode_; + std::shared_ptr alloc_; GKO_HIP_STREAM_STRUCT* stream_; }; -/** - * An RAII wrapper for a custom HIP stream. - * The stream will be created on construction and destroyed when the lifetime of - * the wrapper ends. - */ -class hip_stream { -public: - /** Creates a new custom HIP stream. */ - hip_stream(int device_id = 0); - - /** Destroys the custom HIP stream, if it wasn't moved-from already. */ - ~hip_stream(); - - hip_stream(const hip_stream&) = delete; - - /** Move-constructs from an existing stream, which will be emptied. */ - hip_stream(hip_stream&&); - - hip_stream& operator=(const hip_stream&) = delete; - - /** Move-assigns from an existing stream, which will be emptied. */ - hip_stream& operator=(hip_stream&&) = delete; - - /** - * Returns the native HIP stream handle. - * In a moved-from hip_stream, this will return nullptr. - */ - GKO_HIP_STREAM_STRUCT* get() const; - -private: - GKO_HIP_STREAM_STRUCT* stream_; - - int device_id_; -}; - - namespace kernels { namespace hip { using DefaultExecutor = HipExecutor; @@ -2050,6 +1914,28 @@ class DpcppExecutor : public detail::ExecutorBase, std::string device_type = "all", dpcpp_queue_property property = dpcpp_queue_property::in_order); + /** + * Creates a new DpcppExecutor from an existing SYCL queue. 
+ * + * @param queue the DPCPP device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels + */ + static std::shared_ptr create( + sycl::queue* queue, std::shared_ptr master); + + /** + * Creates a new DpcppExecutor from an existing SYCL queue. + * + * @param queue the DPCPP device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels + * @param alloc the allocator used for memory allocation + */ + static std::shared_ptr create( + sycl::queue* queue, std::shared_ptr master, + std::shared_ptr alloc); + std::shared_ptr get_master() noexcept override; std::shared_ptr get_master() const noexcept override; diff --git a/include/ginkgo/core/base/fwd_defs.hpp b/include/ginkgo/core/base/fwd_defs.hpp new file mode 100644 index 00000000000..5f0cbd9d960 --- /dev/null +++ b/include/ginkgo/core/base/fwd_defs.hpp @@ -0,0 +1,90 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ +#define GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ + + +#include + + +struct cublasContext; + +struct cusparseContext; + +struct CUstream_st; + +struct CUevent_st; + +struct hipblasContext; + +struct hipsparseContext; + +#if GINKGO_HIP_PLATFORM_HCC +struct ihipStream_t; +struct ihipEvent_t; +#define GKO_HIP_STREAM_STRUCT ihipStream_t +#define GKO_HIP_EVENT_STRUCT ihipEvent_t +#else +#define GKO_HIP_STREAM_STRUCT CUstream_st +#define GKO_HIP_EVENT_STRUCT CUevent_st +#endif + + +// after intel/llvm September'22 release, which uses major version 6, they +// introduce another inline namespace _V1. 
+#if GINKGO_DPCPP_MAJOR_VERSION >= 6 +namespace sycl { +inline namespace _V1 { + + +class queue; +class event; + + +} // namespace _V1 +} // namespace sycl +#else // GINKGO_DPCPP_MAJOR_VERSION < 6 +inline namespace cl { +namespace sycl { + + +class queue; +class event; + + +} // namespace sycl +} // namespace cl +#endif + + +#endif // GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp new file mode 100644 index 00000000000..ec25920dcea --- /dev/null +++ b/include/ginkgo/core/base/memory.hpp @@ -0,0 +1,211 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ +#define GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ + + +#include +#include + + +namespace gko { + + +/** + * Provides generic allocation and deallocation functionality to be used by an + * Executor. + */ +class Allocator { +public: + virtual ~Allocator() = default; + + virtual void* allocate(size_type num_bytes) const = 0; + + virtual void deallocate(void* ptr) const = 0; +}; + + +/** + * Implement this interface to provide an allocator for OmpExecutor or + * ReferenceExecutor. + */ +class CpuAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for CudaExecutor. + */ +class CudaAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for HipExecutor. + */ +class HipAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for DpcppExecutor. + */ +class DpcppAllocatorBase : public Allocator { +public: + DpcppAllocatorBase(sycl::queue* queue); + +protected: + virtual void* allocate_impl(sycl::queue* queue, + size_type num_bytes) const = 0; + + virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0; + +private: + sycl::queue* queue_; +}; + + +/** + * Allocator using new/delete. 
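The interface above is all the executors rely on: a custom allocator derives from the base class of the backend it targets and implements allocate/deallocate. A minimal sketch with a hypothetical LoggingCpuAllocator (not part of the patch), using the const-qualified signatures as declared at this point in the series (a later commit in the series drops the const again):

    #include <iostream>

    #include <ginkgo/ginkgo.hpp>

    class LoggingCpuAllocator : public gko::CpuAllocatorBase {
    public:
        void* allocate(gko::size_type num_bytes) const override
        {
            std::clog << "allocating " << num_bytes << " bytes\n";
            return ::operator new(num_bytes);
        }

        void deallocate(void* ptr) const override
        {
            std::clog << "deallocating\n";
            ::operator delete(ptr);
        }
    };

Such an allocator can then be handed to the allocator-aware OmpExecutor::create and ReferenceExecutor::create overloads added earlier in this patch, e.g. gko::OmpExecutor::create(std::make_shared<LoggingCpuAllocator>()).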
+ */ +class CpuAllocator : public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/** + * Allocator using cudaMalloc. + */ +class CudaAllocator : public CudaAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/* + * Allocator using cudaMallocAsync. + */ +class CudaAsyncAllocator : public CudaAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaAsyncAllocator(CUstream_st* stream); + +private: + CUstream_st* stream_; +}; + + +/* + * Allocator using cudaMallocManaged + */ +class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaUnifiedAllocator(int device_id); + + CudaUnifiedAllocator(int device_id, unsigned int flags); + +private: + int device_id_; + unsigned int flags_; +}; + + +/* + * Allocator using cudaMallocHost. + */ +class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaHostAllocator(int device_id); + +private: + int device_id_; +}; + + +/* + * Allocator using hipMalloc. + */ +class HipAllocator : public HipAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/* + * Allocator using sycl::malloc_device. + */ +class DpcppAllocator : public DpcppAllocatorBase { +public: + using DpcppAllocatorBase::DpcppAllocatorBase; + +protected: + void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + + void deallocate_impl(sycl::queue* queue, void* ptr) const override; +}; + + +/* + * Allocator using sycl::malloc_shared. + */ +class DpcppUnifiedAllocator : public DpcppAllocatorBase, + public CpuAllocatorBase { +public: + using DpcppAllocatorBase::DpcppAllocatorBase; + +protected: + void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + + void deallocate_impl(sycl::queue* queue, void* ptr) const override; +}; + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp new file mode 100644 index 00000000000..4bb4aeecf9e --- /dev/null +++ b/include/ginkgo/core/base/stream.hpp @@ -0,0 +1,124 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
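Note that CudaUnifiedAllocator and CudaHostAllocator above derive from CpuAllocatorBase as well, so managed or pinned memory can also back a host executor. A short sketch of that combination (illustrative only; device 0 is an assumption):

    #include <ginkgo/ginkgo.hpp>

    // pinned host memory behind the OpenMP executor
    std::shared_ptr<gko::OmpExecutor> make_pinned_host_executor()
    {
        return gko::OmpExecutor::create(
            std::make_shared<gko::CudaHostAllocator>(0));
    }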
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_STREAM_HPP_ +#define GKO_PUBLIC_CORE_BASE_STREAM_HPP_ + + +#include + + +namespace gko { + + +/** + * An RAII wrapper for a custom CUDA stream. + * The stream will be created on construction and destroyed when the lifetime of + * the wrapper ends. + */ +class cuda_stream { +public: + /** Creates an empty stream wrapper, representing the default stream. */ + cuda_stream(); + + /** Creates a new custom CUDA stream. */ + cuda_stream(int device_id); + + /** Destroys the custom CUDA stream, if it isn't empty. */ + ~cuda_stream(); + + cuda_stream(const cuda_stream&) = delete; + + /** Move-constructs from an existing stream, which will be emptied. */ + cuda_stream(cuda_stream&&); + + cuda_stream& operator=(const cuda_stream&) = delete; + + /** Move-assigns from an existing stream, which will be emptied. */ + cuda_stream& operator=(cuda_stream&&) = delete; + + /** + * Returns the native CUDA stream handle. + * In an empty cuda_stream, this will return nullptr. + */ + CUstream_st* get() const; + +private: + CUstream_st* stream_; + + int device_id_; +}; + + +/** + * An RAII wrapper for a custom HIP stream. + * The stream will be created on construction and destroyed when the lifetime of + * the wrapper ends. + */ +class hip_stream { +public: + /** Creates an empty stream wrapper, representing the default stream. */ + hip_stream(); + + /** Creates a new custom HIP stream. */ + hip_stream(int device_id); + + /** Destroys the custom HIP stream, if it isn't empty. */ + ~hip_stream(); + + hip_stream(const hip_stream&) = delete; + + /** Move-constructs from an existing stream, which will be emptied. */ + hip_stream(hip_stream&&); + + hip_stream& operator=(const hip_stream&) = delete; + + /** Move-assigns from an existing stream, which will be emptied. */ + hip_stream& operator=(hip_stream&&) = delete; + + /** + * Returns the native HIP stream handle. + * In an empty hip_stream, this will return nullptr. + */ + GKO_HIP_STREAM_STRUCT* get() const; + +private: + GKO_HIP_STREAM_STRUCT* stream_; + + int device_id_; +}; + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 93663b02290..d73bf669700 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -65,6 +67,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 25482cf18c8..33e6258fbbd 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #ifdef GKO_COMPILING_CUDA #include "cuda/base/device.hpp" @@ -106,8 +109,8 @@ inline void init_executor(std::shared_ptr ref, if (gko::CudaExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable CUDA devices"}; } - exec = gko::CudaExecutor::create(0, ref, false, - gko::default_cuda_alloc_mode, stream); + exec = gko::CudaExecutor::create( + 0, ref, std::make_shared(stream), stream); } } @@ -119,8 +122,8 @@ inline void init_executor(std::shared_ptr ref, if (gko::HipExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable HIP devices"}; } - exec = gko::HipExecutor::create(0, ref, false, gko::default_hip_alloc_mode, - stream); + exec = gko::HipExecutor::create( + 0, ref, std::make_shared(), stream); } @@ -146,7 +149,13 @@ class CommonTestFixture : public ::testing::Test { #endif using index_type = int; - CommonTestFixture() : ref{gko::ReferenceExecutor::create()} + CommonTestFixture() + : +#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ + (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) + stream{0}, +#endif + ref{gko::ReferenceExecutor::create()} { #if defined(GKO_TEST_NONDEFAULT_STREAM) && \ (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index 59c3f1e3f3b..d8c94e01804 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
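The init_executor change above is the general migration pattern for this release: the (device_reset, allocation_mode) overloads still exist but are deprecated, and an allocator object replaces allocation_mode. A hedged before/after sketch for user code (CudaAsyncAllocator is one possible choice when a non-default stream is used, mirroring the test utility above):

    #include <ginkgo/ginkgo.hpp>

    std::shared_ptr<gko::CudaExecutor> make_cuda(
        std::shared_ptr<gko::Executor> ref, CUstream_st* stream)
    {
        // before: gko::CudaExecutor::create(0, ref, false,
        //                                   gko::default_cuda_alloc_mode, stream);
        // after: pick the allocator explicitly
        return gko::CudaExecutor::create(
            0, ref, std::make_shared<gko::CudaAsyncAllocator>(stream), stream);
    }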
#include +#include inline void init_executor(std::shared_ptr, @@ -71,7 +72,7 @@ inline void init_executor(std::shared_ptr ref, exec = gko::CudaExecutor::create( gko::experimental::mpi::map_rank_to_device_id( MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()), - ref, false, gko::default_cuda_alloc_mode, stream); + ref, std::make_shared(), stream); } } @@ -86,7 +87,7 @@ inline void init_executor(std::shared_ptr ref, exec = gko::HipExecutor::create( gko::experimental::mpi::map_rank_to_device_id( MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()), - ref, false, gko::default_hip_alloc_mode, stream); + ref, std::make_shared(), stream); } From 5dfebba3cbe3495ae150bc5b902cb25851a6796d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 30 Mar 2023 12:23:58 +0200 Subject: [PATCH 051/583] reset to default CUDA allocator --- test/utils/executor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 33e6258fbbd..c588ac74260 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - 0, ref, std::make_shared(stream), stream); + 0, ref, std::make_shared(stream), stream); } } From cd7cfc224321bd1013161b92226ce905e4b4d16d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 30 Mar 2023 16:18:42 +0200 Subject: [PATCH 052/583] fix some compilation issues --- core/device_hooks/cuda_hooks.cpp | 12 ++++++++++- core/device_hooks/dpcpp_hooks.cpp | 15 +++++++++++-- core/device_hooks/hip_hooks.cpp | 25 +++++++++------------- cuda/base/memory.cpp | 28 ++++++++++++++++++++++++ dpcpp/CMakeLists.txt | 1 + dpcpp/base/executor.dp.cpp | 33 ----------------------------- dpcpp/base/memory.dp.cpp | 25 ++++++++++++++++------ include/ginkgo/core/base/memory.hpp | 4 ++++ 8 files changed, 86 insertions(+), 57 deletions(-) diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index cdecf735a9d..f8489908cc9 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -96,9 +96,19 @@ void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, CUstream_st* stream) +{ + return std::shared_ptr( + new CudaExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} + + +std::shared_ptr CudaExecutor::create( + int device_id, std::shared_ptr master, + std::shared_ptr alloc, CUstream_st* stream) { return std::shared_ptr(new CudaExecutor( - device_id, std::move(master), device_reset, alloc_mode, stream)); + device_id, std::move(master), std::move(alloc), stream)); } diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 0ee3e6f289f..1981c712872 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -53,7 +53,18 @@ version version_info::get_dpcpp_version() noexcept } -void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const +DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp); + + +void* DpcppAllocatorBase::allocate(size_type num_bytes) const + GKO_NOT_COMPILED(dpcpp); + + +void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp); + + +void* DpcppAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) const GKO_NOT_COMPILED(dpcpp); @@ -62,7 
+73,7 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type size) const + size_type num_bytes) const GKO_NOT_COMPILED(dpcpp); diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 739dac39f08..54486cc4e74 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -36,10 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include #include +#include "ginkgo/core/base/memory.hpp" namespace gko { @@ -53,29 +55,22 @@ version version_info::get_hip_version() noexcept } -void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); +void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip); -void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); +void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); -HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) GKO_NOT_COMPILED(hip); -void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); - - -void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); - - std::shared_ptr HipExecutor::create( - int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) -{ - return std::shared_ptr(new HipExecutor( - device_id, std::move(master), device_reset, alloc_mode, stream)); -} + int device_id, std::shared_ptr master, + std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) + GKO_NOT_COMPILED(hip); void HipExecutor::populate_exec_info(const machine_topology* mach_topo) diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 11dee81ad42..c1b0a5d517f 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -97,6 +97,9 @@ void CudaAllocator::deallocate(void* ptr) const } +#if CUDA_VERSION >= 11020 + + CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} @@ -108,12 +111,37 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const return ptr; } + void CudaAsyncAllocator::deallocate(void* ptr) const { GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); } +#else // Fall back to regular allocation + + +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {} + + +void* CudaAsyncAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaAsyncAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +#endif + + CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} {} diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 31b5e0543ba..55763ca5525 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -10,6 +10,7 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp + base/memory.dp.cpp base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index d668331a43b..c2015c8664c 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -51,39 +51,6 @@ namespace gko { namespace detail { 
-DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {} - - -void* DpcppAllocator::allocate(size_type size) -{ - return sycl::malloc_device(size, *queue_); -} - - -void DpcppAllocator::deallocate(void* ptr) -{ - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); -} - - -DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue} -{} - - -void* DpcppUnifiedAllocator::allocate(size_type size) -{ - return sycl::malloc_shared(size, *queue_); -} - - -void DpcppUnifiedAllocator::deallocate(void* ptr) -{ - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); -} - - const std::vector get_devices(std::string device_type) { std::map device_type_map{ diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp index b1ccd007dea..2582fa331a0 100644 --- a/dpcpp/base/memory.dp.cpp +++ b/dpcpp/base/memory.dp.cpp @@ -42,10 +42,22 @@ namespace gko { DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} +void* DpcppAllocatorBase::allocate(size_type num_bytes) const +{ + return this->allocate_impl(queue_, num_bytes); +} + + +void DpcppAllocatorBase::deallocate(void* ptr) const +{ + this->deallocate_impl(queue_, ptr); +} + + void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type num_bytes) const { - return sycl::malloc_device(size, *queue); + return sycl::malloc_device(num_bytes, *queue); } @@ -56,16 +68,17 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const } -void* DpcppUnifiedAllocator::allocate(size_type num_bytes) +void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) { - return sycl::malloc_shared(size, *queue_); + return sycl::malloc_shared(num_bytes, *queue); } -void DpcppUnifiedAllocator::deallocate(void* ptr) +void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) { - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); + queue->wait_and_throw(); + sycl::free(ptr, queue->get_context()); } diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index ec25920dcea..872a25a9a33 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -81,6 +81,10 @@ class DpcppAllocatorBase : public Allocator { public: DpcppAllocatorBase(sycl::queue* queue); + void* allocate(size_type num_bytes) const final; + + void deallocate(void* ptr) const final; + protected: virtual void* allocate_impl(sycl::queue* queue, size_type num_bytes) const = 0; From 223b06b477de4570a27b94d175d0f30219faf1c5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 31 May 2023 14:08:44 +0200 Subject: [PATCH 053/583] formatting --- core/base/memory.cpp | 2 +- core/device_hooks/hip_hooks.cpp | 2 +- core/test/base/executor.cpp | 2 +- cuda/base/device.cpp | 4 +--- cuda/base/memory.cpp | 2 +- cuda/base/nvtx.cpp | 4 +++- cuda/base/stream.cpp | 4 +++- cuda/test/base/cuda_executor.cu | 1 - hip/base/device.hip.cpp | 4 ++-- hip/base/roctx.hip.cpp | 4 +++- include/ginkgo/core/base/stream.hpp | 2 +- 11 files changed, 17 insertions(+), 14 deletions(-) diff --git a/core/base/memory.cpp b/core/base/memory.cpp index 88d97bcc765..4e9f0b7e24a 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -56,4 +56,4 @@ void CpuAllocator::deallocate(void* ptr) const } -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 54486cc4e74..4dbe6409c01 100644 --- a/core/device_hooks/hip_hooks.cpp +++ 
b/core/device_hooks/hip_hooks.cpp @@ -36,12 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include #include #include -#include "ginkgo/core/base/memory.hpp" namespace gko { diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 71064cf01d2..94e7bc02d79 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include "ginkgo/core/base/memory.hpp" #if defined(__unix__) || defined(__APPLE__) @@ -47,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include namespace { diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp index 31ab5bcde63..2db0876ca95 100644 --- a/cuda/base/device.cpp +++ b/cuda/base/device.cpp @@ -30,15 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "cuda/base/device.hpp" - - #include #include +#include "cuda/base/device.hpp" #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index c1b0a5d517f..afc1f9f62fa 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -193,4 +193,4 @@ void CudaHostAllocator::deallocate(void* ptr) const } -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp index e313c110ea2..3cbc59299b0 100644 --- a/cuda/base/nvtx.cpp +++ b/cuda/base/nvtx.cpp @@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #ifdef GKO_LEGACY_NVTX #include #else diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index 8c6aa92c28b..0bbc9b1cc83 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -30,11 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include #include -#include #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index afb23c06186..c81799e0dae 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include - #include "common/cuda_hip/base/executor.hpp.inc" #include "cuda/base/scoped_device_id.hpp" #include "cuda/test/utils.hpp" diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index b5ec1bec6d6..9a01d6aacee 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -30,15 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include +#include #include #include -#include #include +#include #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 9f309b93362..a01bc11dc47 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX #include #endif diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index 4bb4aeecf9e..8ee8333e41a 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -121,4 +121,4 @@ class hip_stream { } // namespace gko -#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_STREAM_HPP_ From 19c23eca2d0472d5a003bdcbe026315d14d0a761 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 1 Jun 2023 12:00:18 +0200 Subject: [PATCH 054/583] fix compilation --- core/device_hooks/hip_hooks.cpp | 12 ++++++++++-- test/utils/executor.hpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 4dbe6409c01..ba7563f1ef0 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -64,13 +64,21 @@ void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) - GKO_NOT_COMPILED(hip); +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) - GKO_NOT_COMPILED(hip); +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} void HipExecutor::populate_exec_info(const machine_topology* mach_topo) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index c588ac74260..200f4652644 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - 0, ref, std::make_shared(stream), stream); + 0, ref, std::make_shared(), stream); } } From eb31e6215cbaf00ddc1aed87c1af925b5b50a68b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 6 Jul 2023 15:45:46 +0200 Subject: [PATCH 055/583] update interface and remove DPC++ allocator --- core/base/memory.cpp | 4 +- core/device_hooks/cuda_hooks.cpp | 32 ++++-- core/device_hooks/dpcpp_hooks.cpp | 28 ----- core/device_hooks/hip_hooks.cpp | 51 ++++++++- core/test/base/executor.cpp | 2 +- cuda/base/executor.cpp | 9 +- cuda/base/memory.cpp | 43 ++++++-- dpcpp/CMakeLists.txt | 1 - dpcpp/base/memory.dp.cpp | 85 --------------- dpcpp/test/base/CMakeLists.txt | 1 - dpcpp/test/base/memory.dp.cpp | 98 ----------------- hip/base/executor.hip.cpp | 9 +- hip/base/memory.hip.cpp | 123 ++++++++++++++++++++- include/ginkgo/core/base/executor.hpp | 33 ++---- include/ginkgo/core/base/memory.hpp | 151 ++++++++++++++++++-------- 15 files changed, 348 insertions(+), 322 deletions(-) delete mode 100644 
dpcpp/base/memory.dp.cpp delete mode 100644 dpcpp/test/base/memory.dp.cpp diff --git a/core/base/memory.cpp b/core/base/memory.cpp index 4e9f0b7e24a..b6c6f8f265c 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -void* CpuAllocator::allocate(size_type num_bytes) const +void* CpuAllocator::allocate(size_type num_bytes) { auto ptr = ::operator new (num_bytes, std::nothrow_t{}); GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes); @@ -50,7 +50,7 @@ void* CpuAllocator::allocate(size_type num_bytes) const } -void CpuAllocator::deallocate(void* ptr) const +void CpuAllocator::deallocate(void* ptr) { ::operator delete (ptr, std::nothrow_t{}); } diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index f8489908cc9..03ab12deb46 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -54,43 +54,55 @@ version version_info::get_cuda_version() noexcept } -void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda); +void* CudaAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); -void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); +void CudaAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream) GKO_NOT_COMPILED(cuda); -void* CudaAsyncAllocator::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(cuda); +void* CudaAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); + +void CudaAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); -void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + +bool CudaAsyncAllocator::check_environment(int device_id, + CUstream_st* stream) const + GKO_NOT_COMPILED(cuda); CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) GKO_NOT_COMPILED(cuda); -void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +void* CudaUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); -void CudaUnifiedAllocator::deallocate(void* dev_ptr) const +void CudaUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); + + +bool CudaUnifiedAllocator::check_environment(int device_id, + CUstream_st* stream) const GKO_NOT_COMPILED(cuda); CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda); -void* CudaHostAllocator::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(cuda); +void* CudaHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); + +void CudaHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); -void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + +bool CudaHostAllocator::check_environment(int device_id, + CUstream_st* stream) const + GKO_NOT_COMPILED(cuda); std::shared_ptr CudaExecutor::create( diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 1981c712872..532e9c55bbe 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -53,34 +53,6 @@ version version_info::get_dpcpp_version() noexcept } -DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp); - - -void* DpcppAllocatorBase::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(dpcpp); - - -void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp); - - -void* DpcppAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const - 
GKO_NOT_COMPILED(dpcpp); - - -void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const - GKO_NOT_COMPILED(dpcpp); - - -void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const - GKO_NOT_COMPILED(dpcpp); - - -void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const - GKO_NOT_COMPILED(dpcpp); - - std::shared_ptr DpcppExecutor::create( int device_id, std::shared_ptr master, std::string device_type, dpcpp_queue_property property) diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index ba7563f1ef0..dec1de15933 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -55,10 +55,54 @@ version version_info::get_hip_version() noexcept } -void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip); +void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); -void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); +void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) + GKO_NOT_COMPILED(hip); + + +void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipAsyncAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags) + GKO_NOT_COMPILED(hip); + + +void* HipUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipUnifiedAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); + + +HipHostAllocator::HipHostAllocator(int device_id) GKO_NOT_COMPILED(hip); + + +void* HipHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipHostAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( @@ -76,8 +120,7 @@ std::shared_ptr HipExecutor::create( std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) { return std::shared_ptr( - new HipExecutor(device_id, std::move(master), - std::make_shared(), stream)); + new HipExecutor(device_id, std::move(master), alloc, stream)); } diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 94e7bc02d79..13cba09e2b6 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -386,7 +386,7 @@ TEST(Executor, CanVerifyMemory) struct MockAllocator : gko::CpuAllocator { - void deallocate(void* ptr) const noexcept override + void deallocate(void* ptr) noexcept override { called_free = true; CpuAllocator::deallocate(ptr); diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index f6e838dd2dd..faf90037a0f 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -80,9 +80,8 @@ std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, cudaStream_t stream) { - return std::shared_ptr( - new CudaExecutor(device_id, std::move(master), - allocator_from_mode(device_id, alloc_mode), stream)); + return create(device_id, master, allocator_from_mode(device_id, alloc_mode), + stream); } @@ -90,6 +89,10 @@ std::shared_ptr 
CudaExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, cudaStream_t stream) { + if (!alloc->check_environment(device_id, stream)) { + throw Error{__FILE__, __LINE__, + "Allocator uses incorrect stream or device ID."}; + } return std::shared_ptr(new CudaExecutor( device_id, std::move(master), std::move(alloc), stream)); } diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index afc1f9f62fa..08c64c0ba05 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -82,7 +82,7 @@ namespace gko { #endif -void* CudaAllocator::allocate(size_type num_bytes) const +void* CudaAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), @@ -91,7 +91,7 @@ void* CudaAllocator::allocate(size_type num_bytes) const } -void CudaAllocator::deallocate(void* ptr) const +void CudaAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); } @@ -103,7 +103,7 @@ void CudaAllocator::deallocate(void* ptr) const CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} -void* CudaAsyncAllocator::allocate(size_type num_bytes) const +void* CudaAsyncAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( @@ -112,7 +112,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const } -void CudaAsyncAllocator::deallocate(void* ptr) const +void CudaAsyncAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); } @@ -121,10 +121,10 @@ void CudaAsyncAllocator::deallocate(void* ptr) const #else // Fall back to regular allocation -CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {} +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} -void* CudaAsyncAllocator::allocate(size_type num_bytes) const +void* CudaAsyncAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), @@ -133,7 +133,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const } -void CudaAsyncAllocator::deallocate(void* ptr) const +void CudaAsyncAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); } @@ -142,6 +142,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) const #endif +bool CudaAsyncAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return stream == stream_; +} + + CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} {} @@ -152,7 +159,7 @@ CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) {} -void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +void* CudaUnifiedAllocator::allocate(size_type num_bytes) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -163,7 +170,7 @@ void* CudaUnifiedAllocator::allocate(size_type num_bytes) const } -void CudaUnifiedAllocator::deallocate(void* ptr) const +void CudaUnifiedAllocator::deallocate(void* ptr) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -171,10 +178,17 @@ void CudaUnifiedAllocator::deallocate(void* ptr) const } +bool CudaUnifiedAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return device_id == device_id_; +} + + CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {} -void* CudaHostAllocator::allocate(size_type 
num_bytes) const +void* CudaHostAllocator::allocate(size_type num_bytes) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -185,7 +199,7 @@ void* CudaHostAllocator::allocate(size_type num_bytes) const } -void CudaHostAllocator::deallocate(void* ptr) const +void CudaHostAllocator::deallocate(void* ptr) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -193,4 +207,11 @@ void CudaHostAllocator::deallocate(void* ptr) const } +bool CudaHostAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return device_id == device_id_; +} + + } // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 55763ca5525..31b5e0543ba 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -10,7 +10,6 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp - base/memory.dp.cpp base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp deleted file mode 100644 index 2582fa331a0..00000000000 --- a/dpcpp/base/memory.dp.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
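// With the environment check in CudaExecutor::create above, an allocator that
// is tied to a specific stream or device has to match the executor it is
// handed to. A minimal construction sketch, assuming a CUDA build and device 0
// (it mirrors the updated cuda_executor test further down):

#include <memory>

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto ref = gko::ReferenceExecutor::create();
    gko::cuda_stream stream{0};  // custom stream on device 0
    // allocator and executor refer to the same stream, so check_environment
    // passes; an allocator created for a different stream makes create() throw
    auto exec = gko::CudaExecutor::create(
        0, ref, std::make_shared<gko::CudaAsyncAllocator>(stream.get()),
        stream.get());
    gko::array<int> data{exec, {1, 2}};  // allocated through the async allocator
}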
-*************************************************************/ - -#include - - -#include - - -namespace gko { - - -DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} - - -void* DpcppAllocatorBase::allocate(size_type num_bytes) const -{ - return this->allocate_impl(queue_, num_bytes); -} - - -void DpcppAllocatorBase::deallocate(void* ptr) const -{ - this->deallocate_impl(queue_, ptr); -} - - -void* DpcppAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const -{ - return sycl::malloc_device(num_bytes, *queue); -} - - -void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const -{ - queue->wait_and_throw(); - sycl::free(ptr, queue->get_context()); -} - - -void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) -{ - return sycl::malloc_shared(num_bytes, *queue); -} - - -void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) -{ - queue->wait_and_throw(); - sycl::free(ptr, queue->get_context()); -} - - -} // namespace gko diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index 5c0ca601f04..bb9c8a75050 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -3,4 +3,3 @@ ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) -ginkgo_create_dpcpp_test(memory) \ No newline at end of file diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp deleted file mode 100644 index e587660cde3..00000000000 --- a/dpcpp/test/base/memory.dp.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ - -#include - - -#include -#include - - -#include - - -#include -#include -#include - - -#include "dpcpp/test/utils.hpp" - - -namespace { - - -class Memory : public ::testing::Test { -protected: - Memory() - : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())}, - host_exec_with_unified{gko::OmpExecutor::create( - std::make_shared(exec->get_queue()))}, - exec_with_unified{gko::DpcppExecutor::create( - exec->get_queue(), host_exec_with_unified, - std::make_shared(exec->get_queue()))} - {} - - std::shared_ptr exec; - std::shared_ptr host_exec_with_unified; - std::shared_ptr exec_with_unified; -}; - - -TEST_F(Memory, DeviceAllocationWorks) -{ - gko::array data{exec, {1, 2}}; - - GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); -} - - -TEST_F(Memory, UnifiedDeviceAllocationWorks) -{ - gko::array data{exec_with_unified, {1, 2}}; - exec->synchronize(); - - ASSERT_EQ(data.get_const_data()[0], 1); - ASSERT_EQ(data.get_const_data()[1], 2); -} - - -TEST_F(Memory, HostUnifiedAllocationWorks) -{ - gko::array data{host_exec_with_unified, {1, 2}}; - - ASSERT_EQ(data.get_const_data()[0], 1); - ASSERT_EQ(data.get_const_data()[1], 2); -} - - -} // namespace diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 6b4b0fd5ddc..2df5c9a4847 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -60,9 +60,8 @@ std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { - return std::shared_ptr( - new HipExecutor(device_id, std::move(master), - std::make_shared(), stream)); + return create(device_id, std::move(master), + std::make_shared(), stream); } @@ -70,6 +69,10 @@ std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, hipStream_t stream) { + if (!alloc->check_environment(device_id, stream)) { + throw Error{__FILE__, __LINE__, + "Allocator uses incorrect stream or device ID."}; + } return std::shared_ptr(new HipExecutor( device_id, std::move(master), std::move(alloc), stream)); } diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index f2a8977525f..7acb208173a 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "hip/base/scoped_device_id.hip.hpp" + + namespace gko { @@ -79,7 +82,7 @@ namespace gko { #endif -void* HipAllocator::allocate(size_type num_bytes) const +void* HipAllocator::allocate(size_type num_bytes) { void* dev_ptr{}; GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes), @@ -88,10 +91,126 @@ void* HipAllocator::allocate(size_type num_bytes) const } -void HipAllocator::deallocate(void* dev_ptr) const +void HipAllocator::deallocate(void* dev_ptr) { GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr)); } +#if HIP_VERSION_MAJOR >= 5 + + +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} + + +void* HipAsyncAllocator::allocate(size_type num_bytes) +{ + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS( + hipMallocAsync(&ptr, num_bytes, stream_), num_bytes); + return ptr; +} + + +void HipAsyncAllocator::deallocate(void* ptr) +{ + GKO_EXIT_ON_HIP_ERROR(hipFreeAsync(ptr, stream_)); +} + + +#else // Fall back to regular allocation + + +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} + + +void* HipAsyncAllocator::allocate(size_type num_bytes) +{ + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&ptr, num_bytes), num_bytes); + return ptr; +} + + +void HipAsyncAllocator::deallocate(void* ptr) +{ + GKO_EXIT_ON_HIP_ERROR(hipFree(ptr)); +} + + +#endif + + +bool HipAsyncAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return stream == stream_; +} + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id) + : HipUnifiedAllocator{device_id, hipMemAttachGlobal} +{} + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags) + : device_id_{device_id}, flags_{flags} +{} + + +void* HipUnifiedAllocator::allocate(size_type num_bytes) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS( + hipMallocManaged(&ptr, num_bytes, flags_), num_bytes); + return ptr; +} + + +void HipUnifiedAllocator::deallocate(void* ptr) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_HIP_ERROR(hipFree(ptr)); +} + + +bool HipUnifiedAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return device_id == device_id_; +} + + +HipHostAllocator::HipHostAllocator(int device_id) : device_id_{device_id} {} + + +void* HipHostAllocator::allocate(size_type num_bytes) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipHostMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void HipHostAllocator::deallocate(void* ptr) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_HIP_ERROR(hipFreeHost(ptr)); +} + + +bool HipHostAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return device_id == device_id_; +} + + } // namespace gko diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 4545b216f86..f033873e392 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1717,10 +1717,13 @@ class HipExecutor : public detail::ExecutorBase, * @param alloc_mode the allocation mode that the executor should operate * on. 
See @allocation_mode for more details */ - [[deprecated("")]] static std::shared_ptr create( - int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode = default_hip_alloc_mode, - GKO_HIP_STREAM_STRUCT* stream = nullptr); + [[deprecated( + "device_reset is deprecated entirely, call hipDeviceReset directly. " + "alloc_mode was replaced by the Allocator type " + "hierarchy.")]] static std::shared_ptr + create(int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_hip_alloc_mode, + GKO_HIP_STREAM_STRUCT* stream = nullptr); static std::shared_ptr create( int device_id, std::shared_ptr master, @@ -1914,28 +1917,6 @@ class DpcppExecutor : public detail::ExecutorBase, std::string device_type = "all", dpcpp_queue_property property = dpcpp_queue_property::in_order); - /** - * Creates a new DpcppExecutor from an existing SYCL queue. - * - * @param queue the DPCPP device id of this device - * @param master an executor on the host that is used to invoke the device - * kernels - */ - static std::shared_ptr create( - sycl::queue* queue, std::shared_ptr master); - - /** - * Creates a new DpcppExecutor from an existing SYCL queue. - * - * @param queue the DPCPP device id of this device - * @param master an executor on the host that is used to invoke the device - * kernels - * @param alloc the allocator used for memory allocation - */ - static std::shared_ptr create( - sycl::queue* queue, std::shared_ptr master, - std::shared_ptr alloc); - std::shared_ptr get_master() noexcept override; std::shared_ptr get_master() const noexcept override; diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 872a25a9a33..1086c9aacb4 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -49,9 +49,9 @@ class Allocator { public: virtual ~Allocator() = default; - virtual void* allocate(size_type num_bytes) const = 0; + virtual void* allocate(size_type num_bytes) = 0; - virtual void deallocate(void* ptr) const = 0; + virtual void deallocate(void* ptr) = 0; }; @@ -65,34 +65,49 @@ class CpuAllocatorBase : public Allocator {}; /** * Implement this interface to provide an allocator for CudaExecutor. */ -class CudaAllocatorBase : public Allocator {}; +class CudaAllocatorBase : public Allocator { + friend class CudaExecutor; - -/** - * Implement this interface to provide an allocator for HipExecutor. - */ -class HipAllocatorBase : public Allocator {}; +protected: + /** + * Checks if the allocator can be used safely with the provided device ID + * and stream. The check is necessary to ensure safe usage of stream-ordered + * allocators and unified shared memory allocators. + * + * @param device_id the device ID the allocator will be used in. + * @param stream the stream the allocator will be used with. + * @return true if and only if the allocator can be used by CudaExecutor in + * the given environment. + */ + virtual bool check_environment(int device_id, CUstream_st* stream) const + { + return true; + } +}; /** - * Implement this interface to provide an allocator for DpcppExecutor. + * Implement this interface to provide an allocator for HipExecutor. 
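// check_environment is the hook through which a user-provided device allocator
// can reject executors it cannot serve. A sketch of a device-pinned CUDA
// allocator built on the interface above (the DevicePinnedAllocator name and
// the omitted error handling are illustrative, not part of this patch):

#include <cuda_runtime.h>

#include <ginkgo/core/base/memory.hpp>

class DevicePinnedAllocator : public gko::CudaAllocatorBase {
public:
    explicit DevicePinnedAllocator(int device_id) : device_id_{device_id} {}

    void* allocate(gko::size_type num_bytes) override
    {
        void* ptr{};
        cudaSetDevice(device_id_);  // error handling omitted for brevity
        cudaMalloc(&ptr, num_bytes);
        return ptr;
    }

    void deallocate(void* ptr) override
    {
        cudaSetDevice(device_id_);
        cudaFree(ptr);
    }

protected:
    // only executors on the matching device may use this allocator
    bool check_environment(int device_id, CUstream_st* stream) const override
    {
        return device_id == device_id_;
    }

private:
    int device_id_;
};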
*/ -class DpcppAllocatorBase : public Allocator { -public: - DpcppAllocatorBase(sycl::queue* queue); - - void* allocate(size_type num_bytes) const final; - - void deallocate(void* ptr) const final; +class HipAllocatorBase : public Allocator { + friend class HipExecutor; protected: - virtual void* allocate_impl(sycl::queue* queue, - size_type num_bytes) const = 0; - - virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0; - -private: - sycl::queue* queue_; + /** + * Checks if the allocator can be used safely with the provided device ID + * and stream. The check is necessary to ensure safe usage of stream-ordered + * allocators and unified shared memory allocators. + * + * @param device_id the device ID the allocator will be used in. + * @param stream the stream the allocator will be used with. + * @return true if and only if the allocator can be used by CudaExecutor in + * the given environment. + */ + virtual bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + { + return true; + } }; @@ -101,9 +116,9 @@ class DpcppAllocatorBase : public Allocator { */ class CpuAllocator : public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; }; @@ -112,9 +127,9 @@ class CpuAllocator : public CpuAllocatorBase { */ class CudaAllocator : public CudaAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; }; @@ -123,12 +138,14 @@ class CudaAllocator : public CudaAllocatorBase { */ class CudaAsyncAllocator : public CudaAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaAsyncAllocator(CUstream_st* stream); + bool check_environment(int device_id, CUstream_st* stream) const override; + private: CUstream_st* stream_; }; @@ -139,14 +156,17 @@ class CudaAsyncAllocator : public CudaAllocatorBase { */ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaUnifiedAllocator(int device_id); CudaUnifiedAllocator(int device_id, unsigned int flags); +protected: + bool check_environment(int device_id, CUstream_st* stream) const override; + private: int device_id_; unsigned int flags_; @@ -154,16 +174,19 @@ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { /* - * Allocator using cudaMallocHost. + * Allocator using cudaHostMalloc. 
*/ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaHostAllocator(int device_id); +protected: + bool check_environment(int device_id, CUstream_st* stream) const override; + private: int device_id_; }; @@ -174,38 +197,72 @@ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { */ class HipAllocator : public HipAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; +}; + + +/* + * Allocator using hipMallocAsync. + */ +class HipAsyncAllocator : public HipAllocatorBase { +public: + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; + + HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream); + +protected: + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; + +private: + GKO_HIP_STREAM_STRUCT* stream_; }; /* - * Allocator using sycl::malloc_device. + * Allocator using hipMallocManaged */ -class DpcppAllocator : public DpcppAllocatorBase { +class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase { public: - using DpcppAllocatorBase::DpcppAllocatorBase; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; + + HipUnifiedAllocator(int device_id); + + HipUnifiedAllocator(int device_id, unsigned int flags); protected: - void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; - void deallocate_impl(sycl::queue* queue, void* ptr) const override; +private: + int device_id_; + unsigned int flags_; }; /* - * Allocator using sycl::malloc_shared. + * Allocator using hipHostAlloc. */ -class DpcppUnifiedAllocator : public DpcppAllocatorBase, - public CpuAllocatorBase { +class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase { public: - using DpcppAllocatorBase::DpcppAllocatorBase; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; + + HipHostAllocator(int device_id); protected: - void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; - void deallocate_impl(sycl::queue* queue, void* ptr) const override; +private: + int device_id_; }; From 7a7960774b3e8bc5e6461357f4360cd5fdb2373b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 7 Jul 2023 11:49:57 +0200 Subject: [PATCH 056/583] add HIP allocator tests --- benchmark/utils/general.hpp | 9 ++- hip/base/memory.hip.cpp | 2 +- hip/test/base/CMakeLists.txt | 1 + hip/test/base/memory.hip.cpp | 126 +++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 hip/test/base/memory.hip.cpp diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 35077f66d4b..19c71b74a1a 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -368,16 +368,15 @@ const std::map****************************** +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Memory : public HipTestFixture { +protected: + Memory() + : host_exec_with_pinned{gko::OmpExecutor::create( + std::make_shared(0))}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(0))}, + exec_with_normal{gko::HipExecutor::create( + 0, ref, std::make_shared(), + exec->get_stream())}, + exec_with_async{gko::HipExecutor::create( + 0, host_exec_with_pinned, + std::make_shared(exec->get_stream()), + exec->get_stream())}, + exec_with_unified{gko::HipExecutor::create( + 0, host_exec_with_unified, + std::make_shared(0), + exec->get_stream())} + {} + + std::shared_ptr host_exec_with_pinned; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_normal; + std::shared_ptr exec_with_async; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec_with_normal, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, AsyncDeviceAllocationWorks) +{ + gko::array data{exec_with_async, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostPinnedAllocationWorks) +{ + gko::array data{host_exec_with_pinned, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace From 5e4881a2ca0498350b8ed9dbdbc287c48d2e2e95 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 7 Jul 2023 13:22:33 +0200 Subject: [PATCH 057/583] review updates * honor allocation_mode for HIP * use correct allocation 
flags for cudaMallocManaged allocation_mode * use valid device_id in moved-from stream wrapper * add more deprecation warnings for device_reset functionality * documentation Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Marcel Koch --- cuda/base/executor.cpp | 6 ++++-- cuda/base/stream.cpp | 4 ++-- hip/base/executor.hip.cpp | 20 ++++++++++++++++++- hip/base/roctx.hip.cpp | 2 ++ hip/base/stream.hip.cpp | 4 ++-- include/ginkgo/core/base/executor.hpp | 28 +++++++++++++++++++++++++-- include/ginkgo/core/base/stream.hpp | 12 ++++++++++-- 7 files changed, 65 insertions(+), 11 deletions(-) diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index faf90037a0f..fd16815456a 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -67,9 +67,11 @@ std::unique_ptr allocator_from_mode(int device_id, case allocation_mode::device: return std::make_unique(); case allocation_mode::unified_global: - return std::make_unique(device_id); + return std::make_unique(device_id, + cudaMemAttachGlobal); case allocation_mode::unified_host: - return std::make_unique(device_id); + return std::make_unique(device_id, + cudaMemAttachHost); default: GKO_NOT_SUPPORTED(mode); } diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index 0bbc9b1cc83..76027bd51e2 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {} +cuda_stream::cuda_stream() : stream_{nullptr}, device_id_{} {} cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id) @@ -66,7 +66,7 @@ cuda_stream::~cuda_stream() cuda_stream::cuda_stream(cuda_stream&& other) : stream_{std::exchange(other.stream_, nullptr)}, - device_id_(std::exchange(other.device_id_, -1)) + device_id_(std::exchange(other.device_id_, 0)) {} diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 2df5c9a4847..a89e765becb 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -56,12 +56,30 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" +std::unique_ptr allocator_from_mode(int device_id, + allocation_mode mode) +{ + switch (mode) { + case allocation_mode::device: + return std::make_unique(); + case allocation_mode::unified_global: + return std::make_unique(device_id, + hipMemAttachGlobal); + case allocation_mode::unified_host: + return std::make_unique(device_id, + hipMemAttachHost); + default: + GKO_NOT_SUPPORTED(mode); + } +} + + std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { return create(device_id, std::move(master), - std::make_shared(), stream); + allocator_from_mode(device_id, alloc_mode), stream); } diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index a01bc11dc47..23b07e60254 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -56,6 +56,7 @@ void begin_roctx(const char* name, profile_event_category) roctxRangePush(name); } + void end_roctx(const char*, profile_event_category) { roctxRangePop(); } #else @@ -63,6 +64,7 @@ void end_roctx(const char*, profile_event_category) { roctxRangePop(); } void begin_roctx(const char* name, profile_event_category) GKO_NOT_COMPILED(roctx); + void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); #endif diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index e5817eb9ebd..dc2d99b8b17 100644 --- a/hip/base/stream.hip.cpp +++ 
b/hip/base/stream.hip.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -hip_stream::hip_stream() : stream_{}, device_id_{-1} {} +hip_stream::hip_stream() : stream_{}, device_id_{} {} hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) @@ -68,7 +68,7 @@ hip_stream::~hip_stream() hip_stream::hip_stream(hip_stream&& other) : stream_{std::exchange(other.stream_, nullptr)}, - device_id_{std::exchange(other.device_id_, -1)} + device_id_{std::exchange(other.device_id_, 0)} {} diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index f033873e392..4f476b9286d 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1307,14 +1307,38 @@ class EnableDeviceReset { * * @param device_reset whether to allow a device reset or not */ - void set_device_reset(bool device_reset) {} + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] void + set_device_reset(bool device_reset) + {} /** * Returns the current status of the device reset boolean for this executor. * * @return the current status of the device reset boolean for this executor. */ - bool get_device_reset() { return false; } + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] bool + get_device_reset() + { + return false; + } + +protected: + /** + * Instantiate an EnableDeviceReset class + * + * @param device_reset the starting device_reset status. Defaults to false. + */ + EnableDeviceReset() {} + + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] EnableDeviceReset(bool + device_reset) + {} }; diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index 8ee8333e41a..f7d45f59c5a 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -50,7 +50,11 @@ class cuda_stream { /** Creates an empty stream wrapper, representing the default stream. */ cuda_stream(); - /** Creates a new custom CUDA stream. */ + /** + * Creates a new custom CUDA stream on the given device. + * + * @param device_id the device ID to create the stream on. + */ cuda_stream(int device_id); /** Destroys the custom CUDA stream, if it isn't empty. */ @@ -89,7 +93,11 @@ class hip_stream { /** Creates an empty stream wrapper, representing the default stream. */ hip_stream(); - /** Creates a new custom HIP stream. */ + /** + * Creates a new custom HIP stream on the given device. + * + * @param device_id the device ID to create the stream on. + */ hip_stream(int device_id); /** Destroys the custom HIP stream, if it isn't empty. 
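// set_device_reset/get_device_reset are now no-ops that only carry deprecation
// warnings, so a device reset becomes an explicit application-level call made
// after every Ginkgo object on that device is gone. A migration sketch,
// assuming the application previously enabled the executor-managed reset:

#include <cuda_runtime.h>

#include <ginkgo/ginkgo.hpp>

int main()
{
    {
        auto exec =
            gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
        // ... build and run solvers on exec ...
    }  // executor and all device allocations are released here
    cudaDeviceReset();  // formerly requested via exec->set_device_reset(true)
}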
*/ From 0954951f813e27d2e83023fcbae7940e68f65a74 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:23:11 +0000 Subject: [PATCH 058/583] avoid creating OmpExecutor in tests --- core/test/base/executor.cpp | 54 ++++++++++----------- cuda/test/base/cuda_executor.cu | 28 +++++------ cuda/test/base/cuda_executor_topology.cu | 18 +++---- hip/test/base/hip_executor.hip.cpp | 28 +++++------ hip/test/base/hip_executor_topology.hip.cpp | 19 ++++---- 5 files changed, 74 insertions(+), 73 deletions(-) diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 13cba09e2b6..a331d8f3485 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -248,17 +248,17 @@ TEST(ReferenceExecutor, IsItsOwnMaster) TEST(CudaExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr cuda = gko::CudaExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr cuda = gko::CudaExecutor::create(0, ref); - ASSERT_EQ(omp, cuda->get_master()); + ASSERT_EQ(ref, cuda->get_master()); } TEST(CudaExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto cuda = gko::CudaExecutor::create(0, ref); ASSERT_EQ(0, cuda->get_device_id()); } @@ -266,17 +266,17 @@ TEST(CudaExecutor, KnowsItsDeviceId) TEST(HipExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr hip = gko::HipExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr hip = gko::HipExecutor::create(0, ref); - ASSERT_EQ(omp, hip->get_master()); + ASSERT_EQ(ref, hip->get_master()); } TEST(HipExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto hip = gko::HipExecutor::create(0, ref); ASSERT_EQ(0, hip->get_device_id()); } @@ -284,17 +284,17 @@ TEST(HipExecutor, KnowsItsDeviceId) TEST(DpcppExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr dpcpp = gko::DpcppExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr dpcpp = gko::DpcppExecutor::create(0, ref); - ASSERT_EQ(omp, dpcpp->get_master()); + ASSERT_EQ(ref, dpcpp->get_master()); } TEST(DpcppExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto dpcpp = gko::DpcppExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto dpcpp = gko::DpcppExecutor::create(0, ref); ASSERT_EQ(0, dpcpp->get_device_id()); } @@ -304,13 +304,13 @@ TEST(Executor, CanVerifyMemory) { auto ref = gko::ReferenceExecutor::create(); auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); - auto cuda = gko::CudaExecutor::create(0, omp); + auto hip = gko::HipExecutor::create(0, ref); + auto cuda = gko::CudaExecutor::create(0, ref); auto omp2 = gko::OmpExecutor::create(); - auto hip2 = gko::HipExecutor::create(0, omp); - auto cuda2 = gko::CudaExecutor::create(0, omp); - auto hip_1 = gko::HipExecutor::create(1, omp); - auto cuda_1 = gko::CudaExecutor::create(1, omp); + auto hip2 = gko::HipExecutor::create(0, ref); + auto cuda2 = gko::CudaExecutor::create(0, ref); + auto hip_1 = gko::HipExecutor::create(1, ref); + auto cuda_1 = gko::CudaExecutor::create(1, ref); std::shared_ptr host_dpcpp; std::shared_ptr cpu_dpcpp; std::shared_ptr gpu_dpcpp; @@ -318,16 +318,16 @@ TEST(Executor, CanVerifyMemory) std::shared_ptr cpu_dpcpp_dup; std::shared_ptr 
gpu_dpcpp_dup; if (gko::DpcppExecutor::get_num_devices("host")) { - host_dpcpp = gko::DpcppExecutor::create(0, omp, "host"); - host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host"); + host_dpcpp = gko::DpcppExecutor::create(0, ref, "host"); + host_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "host"); } if (gko::DpcppExecutor::get_num_devices("cpu")) { - cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu"); - cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu"); + cpu_dpcpp = gko::DpcppExecutor::create(0, ref, "cpu"); + cpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "cpu"); } if (gko::DpcppExecutor::get_num_devices("gpu")) { - gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu"); - gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu"); + gpu_dpcpp = gko::DpcppExecutor::create(0, ref, "gpu"); + gpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "gpu"); } ASSERT_EQ(false, ref->memory_accessible(omp)); diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index c81799e0dae..83cfd1827ad 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -93,7 +93,7 @@ protected: stream(0), other_stream(gko::CudaExecutor::get_num_devices() - 1), #endif - omp(gko::OmpExecutor::create()), + ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr), cuda3(nullptr) @@ -104,19 +104,19 @@ protected: ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM cuda = gko::CudaExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp, + gko::CudaExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); cuda3 = gko::CudaExecutor::create( - 0, omp, std::make_shared(0), + 0, ref, std::make_shared(0), stream.get()); #else - cuda = gko::CudaExecutor::create(0, omp); + cuda = gko::CudaExecutor::create(0, ref); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp); + gko::CudaExecutor::get_num_devices() - 1, ref); cuda3 = gko::CudaExecutor::create( - 0, omp, std::make_shared(0)); + 0, ref, std::make_shared(0)); #endif } @@ -132,7 +132,7 @@ protected: gko::cuda_stream stream; gko::cuda_stream other_stream; #endif - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr cuda; std::shared_ptr cuda2; std::shared_ptr cuda3; @@ -141,8 +141,8 @@ protected: TEST_F(CudaExecutor, CanInstantiateTwoExecutorsOnOneDevice) { - auto cuda = gko::CudaExecutor::create(0, omp); - auto cuda2 = gko::CudaExecutor::create(0, omp); + auto cuda = gko::CudaExecutor::create(0, ref); + auto cuda2 = gko::CudaExecutor::create(0, ref); // We want automatic deinitialization to not create any error } @@ -197,7 +197,7 @@ TEST_F(CudaExecutor, CopiesDataToCuda) int orig[] = {3, 8}; auto* copy = cuda->alloc(2); - cuda->copy_from(omp, 2, orig, copy); + cuda->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, cuda->get_stream()>>>(copy); ASSERT_NO_THROW(cuda->synchronize()); @@ -218,7 +218,7 @@ TEST_F(CudaExecutor, CanAllocateOnUnifiedMemory) int orig[] = {3, 8}; auto* copy = cuda3->alloc(2); - cuda3->copy_from(omp, 2, orig, copy); + cuda3->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, cuda3->get_stream()>>>(copy); ASSERT_NO_THROW(cuda3->synchronize()); @@ -240,7 +240,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda) auto orig = cuda->alloc(2); init_data<<<1, 1, 0, cuda->get_stream()>>>(orig); - omp->copy_from(cuda, 2, orig, copy); + ref->copy_from(cuda, 2, 
orig, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); @@ -293,7 +293,7 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda) cuda2->run(ExampleOperation(value)); ASSERT_EQ(value, cuda2->get_device_id()); // Put the results on OpenMP and run CPU side assertions - omp->copy_from(cuda2, 2, copy_cuda2, copy); + ref->copy_from(cuda2, 2, copy_cuda2, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); cuda2->free(copy_cuda2); diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu index a0ee6826ded..3b91cc7941a 100644 --- a/cuda/test/base/cuda_executor_topology.cu +++ b/cuda/test/base/cuda_executor_topology.cu @@ -60,15 +60,15 @@ namespace { class CudaExecutor : public ::testing::Test { protected: CudaExecutor() - : omp(gko::OmpExecutor::create()), cuda(nullptr), cuda2(nullptr) + : ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr) {} void SetUp() { ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); - cuda = gko::CudaExecutor::create(0, omp); + cuda = gko::CudaExecutor::create(0, ref); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp); + gko::CudaExecutor::get_num_devices() - 1, ref); } void TearDown() @@ -79,7 +79,7 @@ protected: } } - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr cuda; std::shared_ptr cuda2; }; @@ -102,7 +102,7 @@ inline int get_core_os_id(int log_id) TEST_F(CudaExecutor, CanBindToSinglePu) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); const int bind_pu = 1; gko::machine_topology::get_instance()->bind_to_pu(bind_pu); @@ -114,7 +114,7 @@ TEST_F(CudaExecutor, CanBindToSinglePu) TEST_F(CudaExecutor, CanBindToPus) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_pus = {1, 3}; gko::machine_topology::get_instance()->bind_to_pus(bind_pus); @@ -126,7 +126,7 @@ TEST_F(CudaExecutor, CanBindToPus) TEST_F(CudaExecutor, CanBindToCores) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_cores = {1, 3}; gko::machine_topology::get_instance()->bind_to_cores(bind_cores); @@ -138,7 +138,7 @@ TEST_F(CudaExecutor, CanBindToCores) TEST_F(CudaExecutor, ClosestCpusIsPopulated) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); auto close_cpus = cuda->get_closest_pus(); if (close_cpus.size() == 0) { GTEST_SKIP(); @@ -150,7 +150,7 @@ TEST_F(CudaExecutor, ClosestCpusIsPopulated) TEST_F(CudaExecutor, KnowsItsNuma) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); auto numa0 = cuda->get_closest_numa(); auto close_cpus = cuda->get_closest_pus(); if (close_cpus.size() == 0) { diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index e531fa739e6..e63543ef77c 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -98,7 +98,7 @@ class HipExecutor : public ::testing::Test { stream(0), other_stream(gko::HipExecutor::get_num_devices() - 1), #endif - omp(gko::OmpExecutor::create()), + ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr), hip3(nullptr) @@ -109,17 +109,17 @@ class HipExecutor : public ::testing::Test { 
ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM hip = gko::HipExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); hip2 = gko::HipExecutor::create( - gko::HipExecutor::get_num_devices() - 1, omp, + gko::HipExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); #else - hip = gko::HipExecutor::create(0, omp); + hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp); - hip3 = gko::HipExecutor::create(0, omp, + ref); + hip3 = gko::HipExecutor::create(0, ref, std::make_shared()); #endif } @@ -136,7 +136,7 @@ class HipExecutor : public ::testing::Test { gko::hip_stream stream; gko::hip_stream other_stream; #endif - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr hip; std::shared_ptr hip2; std::shared_ptr hip3; @@ -145,8 +145,8 @@ class HipExecutor : public ::testing::Test { TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice) { - auto hip = gko::HipExecutor::create(0, omp); - auto hip2 = gko::HipExecutor::create(0, omp); + auto hip = gko::HipExecutor::create(0, ref); + auto hip2 = gko::HipExecutor::create(0, ref); // We want automatic deinitialization to not create any error } @@ -204,7 +204,7 @@ TEST_F(HipExecutor, CopiesDataToHip) int orig[] = {3, 8}; auto* copy = hip->alloc(2); - hip->copy_from(omp, 2, orig, copy); + hip->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, hip->get_stream()>>>(copy); ASSERT_NO_THROW(hip->synchronize()); @@ -232,7 +232,7 @@ TEST_F(HipExecutor, CanAllocateOnUnifiedMemory) int orig[] = {3, 8}; auto* copy = hip3->alloc(2); - hip3->copy_from(omp, 2, orig, copy); + hip3->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, hip3->get_stream()>>>(copy); ASSERT_NO_THROW(hip3->synchronize()); @@ -257,7 +257,7 @@ TEST_F(HipExecutor, CopiesDataFromHip) auto orig = hip->alloc(2); init_data<<<1, 1, 0, hip->get_stream()>>>(orig); - omp->copy_from(hip, 2, orig, copy); + ref->copy_from(hip, 2, orig, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); @@ -310,7 +310,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip) hip2->run(ExampleOperation(value)); ASSERT_EQ(value, hip2->get_device_id()); // Put the results on OpenMP and run CPU side assertions - omp->copy_from(hip2, 2, copy_hip2, copy); + ref->copy_from(hip2, 2, copy_hip2, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); hip2->free(copy_hip2); diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp index 394b2776319..3d6e3f2bddc 100644 --- a/hip/test/base/hip_executor_topology.hip.cpp +++ b/hip/test/base/hip_executor_topology.hip.cpp @@ -65,15 +65,16 @@ namespace { class HipExecutor : public ::testing::Test { protected: - HipExecutor() : omp(gko::OmpExecutor::create()), hip(nullptr), hip2(nullptr) + HipExecutor() + : ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr) {} void SetUp() { ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); - hip = gko::HipExecutor::create(0, omp); + hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp); + ref); } void TearDown() @@ -84,7 +85,7 @@ class HipExecutor : public ::testing::Test { } } - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr hip; std::shared_ptr hip2; }; @@ -107,7 +108,7 @@ inline int get_core_os_id(int log_id) 
TEST_F(HipExecutor, CanBindToSinglePu) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); const int bind_pu = 1; gko::machine_topology::get_instance()->bind_to_pu(bind_pu); @@ -119,7 +120,7 @@ TEST_F(HipExecutor, CanBindToSinglePu) TEST_F(HipExecutor, CanBindToPus) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_pus = {1, 3}; gko::machine_topology::get_instance()->bind_to_pus(bind_pus); @@ -131,7 +132,7 @@ TEST_F(HipExecutor, CanBindToPus) TEST_F(HipExecutor, CanBindToCores) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_cores = {1, 3}; gko::machine_topology::get_instance()->bind_to_cores(bind_cores); @@ -143,7 +144,7 @@ TEST_F(HipExecutor, CanBindToCores) TEST_F(HipExecutor, ClosestCpusIsPopulated) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); auto close_cpus = hip->get_closest_pus(); if (close_cpus.size() == 0) { GTEST_SKIP(); @@ -155,7 +156,7 @@ TEST_F(HipExecutor, ClosestCpusIsPopulated) TEST_F(HipExecutor, KnowsItsNuma) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); auto numa0 = hip->get_closest_numa(); auto close_cpus = hip->get_closest_pus(); if (close_cpus.size() == 0) { From e365d292f73b3e8423e25c39f9fdc107316ea9ed Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:24:15 +0000 Subject: [PATCH 059/583] warn if using unsupported allocator --- cuda/base/memory.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 08c64c0ba05..f605d9135ea 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -121,7 +121,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) #else // Fall back to regular allocation -CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} +{ +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr << "This version of CUDA does not support cudaMallocAsync, " + "please use CudaAllocator instead of CudaAsyncAllocator.\n"; +#endif +} void* CudaAsyncAllocator::allocate(size_type num_bytes) From d01e1a957a4d370cc02d9294cb685461373a4d35 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:27:48 +0000 Subject: [PATCH 060/583] improve documentation Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Pratik Nayak --- include/ginkgo/core/base/memory.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 1086c9aacb4..f421abf7da4 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -76,6 +76,7 @@ class CudaAllocatorBase : public Allocator { * * @param device_id the device ID the allocator will be used in. * @param stream the stream the allocator will be used with. + * * @return true if and only if the allocator can be used by CudaExecutor in * the given environment. */ @@ -100,7 +101,8 @@ class HipAllocatorBase : public Allocator { * * @param device_id the device ID the allocator will be used in. * @param stream the stream the allocator will be used with. 
- * @return true if and only if the allocator can be used by CudaExecutor in + * + * @return true if and only if the allocator can be used by HipExecutor in * the given environment. */ virtual bool check_environment(int device_id, From 1420f3e2d5fe1b76e8d92e82be39c54bd19c311f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 11:46:47 +0200 Subject: [PATCH 061/583] fix HIP requirements for stream-ordered allocation --- hip/base/memory.hip.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 86ac31c3154..be795bb3397 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -97,7 +97,7 @@ void HipAllocator::deallocate(void* dev_ptr) } -#if HIP_VERSION_MAJOR >= 5 +#if HIP_VERSION >= 50200000 HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} @@ -121,7 +121,13 @@ void HipAsyncAllocator::deallocate(void* ptr) #else // Fall back to regular allocation -HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} +{ +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr << "This version of HIP does not support hipMallocAsync, " + "please use HipAllocator instead of HipAsyncAllocator.\n"; +#endif +} void* HipAsyncAllocator::allocate(size_type num_bytes) From a3ab2a253024d257fa55f9f10f4b65926bfcf5f5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 12:13:05 +0200 Subject: [PATCH 062/583] use unified allocator in some HIP tests --- hip/test/base/hip_executor.hip.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index e63543ef77c..42499384704 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -114,13 +114,14 @@ class HipExecutor : public ::testing::Test { gko::HipExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, ref, std::make_shared(), stream.get()); + 0, ref, std::make_shared(0), + stream.get()); #else hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, ref); - hip3 = gko::HipExecutor::create(0, ref, - std::make_shared()); + hip3 = gko::HipExecutor::create( + 0, ref, std::make_shared(0)); #endif } From 6cda4fc348710efa4e2d825e15367dacce1966b6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 13:52:49 +0200 Subject: [PATCH 063/583] resolve ambiguous symbol --- cuda/base/executor.cpp | 8 ++++---- hip/base/executor.hip.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index fd16815456a..f296fb9da86 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -60,8 +60,8 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -std::unique_ptr allocator_from_mode(int device_id, - allocation_mode mode) +std::unique_ptr cuda_allocator_from_mode( + int device_id, allocation_mode mode) { switch (mode) { case allocation_mode::device: @@ -82,8 +82,8 @@ std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, cudaStream_t stream) { - return create(device_id, master, allocator_from_mode(device_id, alloc_mode), - stream); + return create(device_id, master, + cuda_allocator_from_mode(device_id, alloc_mode), stream); } diff --git 
a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index a89e765becb..8d175c0e424 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -56,8 +56,8 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -std::unique_ptr allocator_from_mode(int device_id, - allocation_mode mode) +std::unique_ptr hip_allocator_from_mode(int device_id, + allocation_mode mode) { switch (mode) { case allocation_mode::device: @@ -79,7 +79,7 @@ std::shared_ptr HipExecutor::create( allocation_mode alloc_mode, hipStream_t stream) { return create(device_id, std::move(master), - allocator_from_mode(device_id, alloc_mode), stream); + hip_allocator_from_mode(device_id, alloc_mode), stream); } From c64da5b092a3daebddad7430cf1779dcf4371087 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 27 Apr 2022 15:08:52 +0200 Subject: [PATCH 064/583] add partition dpcpp kernels --- dpcpp/distributed/partition_kernels.dp.cpp | 115 ++++++++++++++++++++- 1 file changed, 112 insertions(+), 3 deletions(-) diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp index 7d9210894e2..42cc0a72711 100644 --- a/dpcpp/distributed/partition_kernels.dp.cpp +++ b/dpcpp/distributed/partition_kernels.dp.cpp @@ -30,17 +30,86 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +// force-top: on +#include +#include +#include +// force-top: off + + #include "core/distributed/partition_kernels.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" + + namespace gko { namespace kernels { namespace dpcpp { namespace partition { +namespace kernel { + + +template +void setup_sizes_ids_permutation( + std::shared_ptr exec, size_type num_ranges, + comm_index_type num_parts, const GlobalIndexType* range_offsets, + const comm_index_type* range_parts, Array& range_sizes, + Array& part_ids, Array& permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto num_ranges, auto num_parts, + auto range_offsets, auto range_parts, auto range_sizes, + auto part_ids, auto permutation) { + if (i == 0) { + // set sentinel value at the end + part_ids[num_ranges] = num_parts; + } + range_sizes[i] = range_offsets[i + 1] - range_offsets[i]; + part_ids[i] = range_parts[i]; + permutation[i] = static_cast(i); + }, + num_ranges, num_ranges, num_parts, range_offsets, range_parts, + range_sizes.get_data(), part_ids.get_data(), permutation.get_data()); +} + + +template +void compute_part_sizes_and_starting_indices( + std::shared_ptr exec, size_type num_ranges, + const Array& range_sizes, + const Array& part_ids, + const Array& permutation, LocalIndexType* starting_indices, + LocalIndexType* part_sizes) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto grouped_starting_indices, + auto grouped_part_ids, auto orig_idxs, + auto starting_indices, auto part_sizes) { + auto prev_part = i > 0 ? grouped_part_ids[i - 1] + : invalid_index(); + auto cur_part = grouped_part_ids[i]; + auto next_part = + grouped_part_ids[i + 1]; // last element has to be num_parts + if (cur_part != next_part) { + part_sizes[cur_part] = grouped_starting_indices[i]; + } + // write result shifted by one entry to get exclusive prefix sum + starting_indices[orig_idxs[i]] = + prev_part == cur_part ? 
grouped_starting_indices[i - 1] + : LocalIndexType{}; + }, + num_ranges, range_sizes.get_const_data(), part_ids.get_const_data(), + permutation.get_const_data(), starting_indices, part_sizes); +} + + +} // namespace kernel -// TODO: wait until https://github.com/oneapi-src/oneDPL/pull/388 is release to -// implement it similar to cuda/hip template void build_starting_indices(std::shared_ptr exec, const GlobalIndexType* range_offsets, @@ -48,7 +117,47 @@ void build_starting_indices(std::shared_ptr exec, size_type num_ranges, comm_index_type num_parts, comm_index_type& num_empty_parts, LocalIndexType* starting_indices, - LocalIndexType* part_sizes) GKO_NOT_IMPLEMENTED; + LocalIndexType* part_sizes) +{ + if (num_ranges > 0) { + auto policy = + oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + + Array range_sizes{exec, num_ranges}; + // num_parts sentinel at the end + Array tmp_part_ids{exec, num_ranges + 1}; + Array permutation{exec, num_ranges}; + // set part_sizes to 0 in case of empty parts + components::fill_array(exec, part_sizes, num_parts, + zero()); + + kernel::setup_sizes_ids_permutation( + exec, num_ranges, num_parts, range_offsets, range_parts, + range_sizes, tmp_part_ids, permutation); + + auto tmp_part_id_ptr = tmp_part_ids.get_data(); + auto range_sizes_ptr = range_sizes.get_data(); + auto sort_it = oneapi::dpl::make_zip_iterator( + tmp_part_id_ptr, range_sizes_ptr, permutation.get_data()); + // group range_sizes by part ID + oneapi::dpl::stable_sort(policy, sort_it, sort_it + num_ranges, + [](const auto t_a, const auto t_b) { + return std::get<0>(t_a) < std::get<0>(t_b); + }); + // compute inclusive prefix sum for each part + oneapi::dpl::inclusive_scan_by_segment( + policy, tmp_part_id_ptr, tmp_part_id_ptr + num_ranges, + range_sizes_ptr, range_sizes_ptr); + // write back the results + kernel::compute_part_sizes_and_starting_indices( + exec, num_ranges, range_sizes, tmp_part_ids, permutation, + starting_indices, part_sizes); + num_empty_parts = + oneapi::dpl::count(policy, part_sizes, part_sizes + num_parts, 0); + } else { + num_empty_parts = num_parts; + } +} GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES); From fe3ff935db618637df0e633560e9c0899d278c7b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 7 Jul 2023 11:19:38 +0200 Subject: [PATCH 065/583] review updates: - adds helper to create oneDPL policy - re-enable distributed matrix test with dpcpp Co-authored-by: Tobias Ribizel --- dpcpp/base/device_matrix_data_kernels.dp.cpp | 13 ++--- dpcpp/base/onedpl.hpp | 61 ++++++++++++++++++++ dpcpp/distributed/partition_kernels.dp.cpp | 11 ++-- dpcpp/multigrid/pgm_kernels.dp.cpp | 10 ++-- test/mpi/CMakeLists.txt | 2 +- 5 files changed, 78 insertions(+), 19 deletions(-) create mode 100644 dpcpp/base/onedpl.hpp diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp index 9d387ce7ecf..f8185d884c1 100644 --- a/dpcpp/base/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp @@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // force-top: on // oneDPL needs to be first to avoid issues with libstdc++ TBB impl #include -#include // force-top: off @@ -43,6 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "dpcpp/base/onedpl.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -56,8 +58,7 @@ void remove_zeros(std::shared_ptr exec, { using nonzero_type = matrix_data_entry; auto size = values.get_num_elems(); - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); auto nnz = std::count_if( policy, values.get_const_data(), values.get_const_data() + size, [](ValueType val) { return is_nonzero(val); }); @@ -96,8 +97,7 @@ void sum_duplicates(std::shared_ptr exec, size_type, if (size == 0) { return; } - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); auto in_loc_it = oneapi::dpl::make_zip_iterator(row_idxs.get_const_data(), col_idxs.get_const_data()); auto adj_in_loc_it = @@ -136,8 +136,7 @@ template void sort_row_major(std::shared_ptr exec, device_matrix_data& data) { - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); auto input_it = oneapi::dpl::make_zip_iterator( data.get_row_idxs(), data.get_col_idxs(), data.get_values()); std::sort(policy, input_it, input_it + data.get_num_elems(), diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp new file mode 100644 index 00000000000..4af31d3e115 --- /dev/null +++ b/dpcpp/base/onedpl.hpp @@ -0,0 +1,61 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_DPCPP_BASE_ONEDPL_HPP_ +#define GKO_DPCPP_BASE_ONEDPL_HPP_ + + +// force-top: on +#include +// force-top: off + + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +inline auto onedpl_policy(std::shared_ptr exec) +{ + return oneapi::dpl::execution::make_device_policy(*exec->get_queue()); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_ONEDPL_HPP_ diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp index 42cc0a72711..04b7ff215ed 100644 --- a/dpcpp/distributed/partition_kernels.dp.cpp +++ b/dpcpp/distributed/partition_kernels.dp.cpp @@ -32,7 +32,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // force-top: on #include -#include #include // force-top: off @@ -42,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" +#include "dpcpp/base/onedpl.hpp" namespace gko { @@ -72,7 +72,7 @@ void setup_sizes_ids_permutation( permutation[i] = static_cast(i); }, num_ranges, num_ranges, num_parts, range_offsets, range_parts, - range_sizes.get_data(), part_ids.get_data(), permutation.get_data()); + range_sizes, part_ids, permutation); } @@ -102,8 +102,8 @@ void compute_part_sizes_and_starting_indices( prev_part == cur_part ? grouped_starting_indices[i - 1] : LocalIndexType{}; }, - num_ranges, range_sizes.get_const_data(), part_ids.get_const_data(), - permutation.get_const_data(), starting_indices, part_sizes); + num_ranges, range_sizes, part_ids, permutation, starting_indices, + part_sizes); } @@ -120,8 +120,7 @@ void build_starting_indices(std::shared_ptr exec, LocalIndexType* part_sizes) { if (num_ranges > 0) { - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); Array range_sizes{exec, num_ranges}; // num_parts sentinel at the end diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index 15bd22180c0..2234d8ffe38 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // force-top: on // oneDPL needs to be first to avoid issues with libstdc++ TBB impl #include -#include // force-top: off @@ -48,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "dpcpp/base/onedpl.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -63,8 +65,7 @@ template void sort_agg(std::shared_ptr exec, IndexType num, IndexType* row_idxs, IndexType* col_idxs) { - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); auto it = oneapi::dpl::make_zip_iterator(row_idxs, col_idxs); std::sort(policy, it, it + num, [](auto a, auto b) { return std::tie(std::get<0>(a), std::get<1>(a)) < @@ -79,8 +80,7 @@ template void sort_row_major(std::shared_ptr exec, size_type nnz, IndexType* row_idxs, IndexType* col_idxs, ValueType* vals) { - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto policy = onedpl_policy(exec); auto it = oneapi::dpl::make_zip_iterator(row_idxs, col_idxs, vals); // Because reduce_by_segment is not determinstic, so we do not need // stable_sort diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt index 3d5e3cadd58..08050bde58f 100644 --- a/test/mpi/CMakeLists.txt +++ b/test/mpi/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3) ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) add_subdirectory(preconditioner) From 20b7f8cc9ea21cd2a8b01b3d780dddace5b03144 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 10:36:15 +0200 Subject: [PATCH 066/583] Fix some typos --- core/base/dispatch_helper.hpp | 4 ++-- core/base/iterator_factory.hpp | 2 +- core/base/mtx_io.cpp | 8 ++++---- core/base/types.hpp | 2 +- core/solver/gcr.cpp | 2 +- core/solver/multigrid.cpp | 4 ++-- core/test/base/mtx_io.cpp | 6 +++--- core/test/mpi/base/polymorphic_object.cpp | 2 +- core/test/solver/idr.cpp | 2 +- core/test/utils/assertions_test.cpp | 2 +- core/test/utils/fb_matrix_generator.hpp | 2 +- core/test/utils/matrix_generator.hpp | 2 +- examples/CMakeLists.txt | 2 +- .../custom-stopping-criterion.cpp | 2 +- .../distributed-solver/distributed-solver.cpp | 4 ++-- .../external-lib-interfacing.cpp | 2 +- .../doc/results.dox | 2 +- .../mixed-multigrid-preconditioned-solver.cpp | 2 +- .../mixed-multigrid-solver/doc/results.dox | 2 +- .../mixed-multigrid-solver.cpp | 2 +- .../doc/results.dox | 2 +- ...igrid-preconditioned-solver-customized.cpp | 2 +- .../doc/results.dox | 2 +- .../multigrid-preconditioned-solver.cpp | 2 +- include/ginkgo/core/base/composition.hpp | 4 ++-- include/ginkgo/core/base/exception.hpp | 6 +++--- include/ginkgo/core/base/executor.hpp | 12 +++++------ include/ginkgo/core/base/index_set.hpp | 2 +- include/ginkgo/core/base/lin_op.hpp | 2 +- include/ginkgo/core/base/machine_topology.hpp | 2 +- include/ginkgo/core/base/perturbation.hpp | 2 +- .../ginkgo/core/base/polymorphic_object.hpp | 4 ++-- include/ginkgo/core/base/range.hpp | 20 +++++++++---------- .../core/base/scoped_device_id_guard.hpp | 2 +- .../distributed/preconditioner/schwarz.hpp | 4 ++-- include/ginkgo/core/distributed/vector.hpp | 8 ++++---- include/ginkgo/core/factorization/par_ic.hpp | 2 +- include/ginkgo/core/factorization/par_ict.hpp | 2 +- include/ginkgo/core/factorization/par_ilu.hpp | 2 +- .../ginkgo/core/factorization/par_ilut.hpp | 2 +- include/ginkgo/core/log/logger.hpp | 4 ++-- include/ginkgo/core/log/profiler_hook.hpp | 4 ++-- include/ginkgo/core/matrix/coo.hpp | 2 +- include/ginkgo/core/matrix/dense.hpp | 8 ++++---- include/ginkgo/core/matrix/hybrid.hpp | 8 ++++---- include/ginkgo/core/preconditioner/isai.hpp 
| 2 +- include/ginkgo/core/preconditioner/jacobi.hpp | 6 +++--- include/ginkgo/core/reorder/rcm.hpp | 2 +- .../ginkgo/core/reorder/reordering_base.hpp | 2 +- include/ginkgo/core/solver/idr.hpp | 2 +- include/ginkgo/core/solver/ir.hpp | 4 ++-- include/ginkgo/core/solver/solver_base.hpp | 2 +- include/ginkgo/core/stop/criterion.hpp | 2 +- include/ginkgo/core/stop/stopping_status.hpp | 4 ++-- include/ginkgo/core/stop/time.hpp | 2 +- 55 files changed, 97 insertions(+), 97 deletions(-) diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp index 155d5ef6c23..2226ffc6b6d 100644 --- a/core/base/dispatch_helper.hpp +++ b/core/base/dispatch_helper.hpp @@ -63,7 +63,7 @@ void run(T, Func, Args...) * run uses template to go through the list and select the valid * template and run it. * - * @tparam K the current type tried in the convertion + * @tparam K the current type tried in the conversion * @tparam ...Types the other types will be tried in the conversion if K fails * @tparam T the type of input object * @tparam Func the function will run if the object can be converted to K @@ -108,7 +108,7 @@ void run(T, Func, Args...) * * @tparam Base the Base class with one template * @tparam K the current template type of B. pointer of const Base is tried - * in the convertion. + * in the conversion. * @tparam ...Types the other types will be tried in the conversion if K fails * @tparam T the type of input object waiting converted * @tparam Func the function will run if the object can be converted to pointer diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 6384d5bfbce..7ebbc510f74 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -84,7 +84,7 @@ class zip_iterator_reference template value_type cast_impl(std::index_sequence) const { - // gcc 5 throws error as using unintialized array + // gcc 5 throws error as using uninitialized array // std::tuple t = { 1, '2' }; is not allowed. // converting to 'std::tuple<...>' from initializer list would use // explicit constructor diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index d8604e95b5f..de4f6ec1e86 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -267,7 +267,7 @@ class mtx_io { /** * storage modifier hierarchy provides algorithms for handling storage - * modifiers (general, symetric, skew symetric, hermitian) and filling the + * modifiers (general, symmetric, skew symmetric, hermitian) and filling the * entire matrix from the stored parts */ struct storage_modifier { @@ -491,7 +491,7 @@ class mtx_io { * @param os The output stream to write to * @param data The matrix data to write * @param entry_writer The entry format to write in. - * @param modifier The strorage modifer + * @param modifier The storage modifier */ virtual void write_data(std::ostream& os, const matrix_data& data, @@ -554,7 +554,7 @@ class mtx_io { * @param os The output stream to write to * @param data The matrix data to write * @param entry_writer The entry format to write in. - * @param modifier The strorage modifer + * @param modifier The storage modifier */ void write_data(std::ostream& os, const matrix_data& data, @@ -623,7 +623,7 @@ class mtx_io { * @param os The output stream to write to * @param data The matrix data to write * @param entry_writer The entry format to write in. 
- * @param modifier The strorage modifer + * @param modifier The storage modifier */ void write_data(std::ostream& os, const matrix_data& data, diff --git a/core/base/types.hpp b/core/base/types.hpp index 5f90ed2cafe..39ca169d486 100644 --- a/core/base/types.hpp +++ b/core/base/types.hpp @@ -109,7 +109,7 @@ constexpr std::enable_if_t<(num_groups > current_shift + 1), int> shift( * * The usage will be the following * Set the method with bits Cfg = ConfigSet - * Encode the given infomation encoded = Cfg::encode(x_0, x_1, ..., x_k) + * Encode the given information encoded = Cfg::encode(x_0, x_1, ..., x_k) * Decode the specific position information x_t = Cfg::decode(encoded) * The encoded result will use 32 bits to record * rrrrr0..01....1...k..k, which 1/2/.../k means the bits store the information diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp index e1df71491e5..4b767ad40ad 100644 --- a/core/solver/gcr.cpp +++ b/core/solver/gcr.cpp @@ -186,7 +186,7 @@ void Gcr::apply_dense_impl(const VectorType* dense_b, size_type restart_iter = 0; /* Memory movement summary for average iteration with krylov_dim d: - * (4d+22+4/d)n+(d+1+1/d) * values + matrix/preconditioner stroage + * (4d+22+4/d)n+(d+1+1/d) * values + matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage * 1x step 1 (scal, axpys) 6n diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 7a521f5f53e..074fa95d848 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -182,8 +182,8 @@ namespace multigrid { /** - * The enum class is to combine the cycle infomation It's legal to use a binary - * or(|) operation to combine several properties. + * The enum class is to combine the cycle information It's legal to use a + * binary or(|) operation to combine several properties. */ enum class cycle_mode { /** diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index a1029bd9d12..a25f462556a 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -286,7 +286,7 @@ TEST(MtxReader, ReadsSparseRealMtx) } -TEST(MtxReader, ReadsSparseRealSymetricMtx) +TEST(MtxReader, ReadsSparseRealSymmetricMtx) { using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( @@ -310,7 +310,7 @@ TEST(MtxReader, ReadsSparseRealSymetricMtx) } -TEST(MtxReader, ReadsSparseRealSkewSymetricMtx) +TEST(MtxReader, ReadsSparseRealSkewSymmetricMtx) { using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( @@ -330,7 +330,7 @@ TEST(MtxReader, ReadsSparseRealSkewSymetricMtx) } -TEST(MtxReader, ReadsSparseRealSkewSymetricMtxWithExplicitDiagonal) +TEST(MtxReader, ReadsSparseRealSkewSymmetricMtxWithExplicitDiagonal) { using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( diff --git a/core/test/mpi/base/polymorphic_object.cpp b/core/test/mpi/base/polymorphic_object.cpp index 88bcb756f4b..1cacc5d52f4 100644 --- a/core/test/mpi/base/polymorphic_object.cpp +++ b/core/test/mpi/base/polymorphic_object.cpp @@ -152,7 +152,7 @@ class EnableDistributedPolymorphicObject : public testing::Test { protected: std::shared_ptr ref{ gko::ReferenceExecutor::create()}; - // TDOD: We can't rely on Omp module being available in this test! + // TODO: We can't rely on Omp module being available in this test! 
std::shared_ptr omp{gko::OmpExecutor::create()}; gko::experimental::mpi::communicator comm{MPI_COMM_WORLD}; gko::experimental::mpi::communicator split_comm{comm.get(), comm.rank() < 2, diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index f9109acb69e..45511be8e1b 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -420,7 +420,7 @@ TYPED_TEST(Idr, CanSetComplexSubspaceAgain) auto solver = idr_factory->generate(this->mtx); - solver->set_complex_subpsace(false); + solver->set_complex_subspace(false); ASSERT_EQ(solver->get_complex_subspace(), false); } diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index 2e3cbefaaf6..029af45e076 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -98,7 +98,7 @@ class MatricesNear : public ::testing::Test { }; -TEST_F(MatricesNear, SuceedsIfSame) +TEST_F(MatricesNear, SucceedsIfSame) { ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(), mtx1.get(), 0.0); diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 1c5c818757b..7c43b0905c1 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -129,7 +129,7 @@ std::unique_ptr generate_random_matrix_with_diag( * generated FBCSR matrix. * @param block_size Block size of output Fbcsr matrix. * @param row_diag_dominant If true, a row-diagonal-dominant Fbcsr matrix is - * generated. Note that in this case, the intput Csr + * generated. Note that in this case, the input Csr * matrix must have diagonal entries in all rows. * @param rand_engine Random number engine to use, such as * std::default_random_engine. diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 23ab84cc491..6928c5424a5 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -553,7 +553,7 @@ std::unique_ptr generate_tridiag_matrix( /** * This computes an inverse of an tridiagonal Toeplitz matrix. * - * The compuation is based on the formula is from + * The computation is based on the formula is from * https://en.wikipedia.org/wiki/Tridiagonal_matrix#Inversion * * @param size the (square) size of the resulting matrix diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 41ed77d9002..33e3bab735a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -59,7 +59,7 @@ find_package(Kokkos QUIET) if(Kokkos_FOUND) if(GINKGO_WITH_CCACHE) message(WARNING "The CMAKE_CXX_COMPILER_LAUNCHER is set due to " - "GINKGO_WITH_CCACHE=ON which is known to casue issues with CUDA enabled " + "GINKGO_WITH_CCACHE=ON which is known to cause issues with CUDA enabled " "Kokkos (https://github.com/kokkos/kokkos/issues/4821) including compilation " "failures. 
This can be prevented by setting GINKGO_WITH_CCACHE=OFF.") endif() diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index e07f1bf92fb..800846cfbd9 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -175,7 +175,7 @@ int main(int argc, char* argv[]) // executor where Ginkgo will perform the computation const auto exec = exec_map.at(executor_string)(); // throws if not valid - // Declare a user controled boolean for the iteration process + // Declare a user controlled boolean for the iteration process volatile bool stop_iteration_process{}; // Create a new a thread to launch the solver diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp index 865a44b0643..123f93775f5 100644 --- a/examples/distributed-solver/distributed-solver.cpp +++ b/examples/distributed-solver/distributed-solver.cpp @@ -51,9 +51,9 @@ int main(int argc, char* argv[]) // done with the following helper construct that uses RAII to automate the // initialization and finalization. const gko::experimental::mpi::environment env(argc, argv); - // @sect3{Type Definitiions} + // @sect3{Type Definitions} // Define the needed types. In a parallel program we need to differentiate - // beweeen global and local indices, thus we have two index types. + // between global and local indices, thus we have two index types. using GlobalIndexType = gko::int64; using LocalIndexType = gko::int32; // The underlying value type. diff --git a/examples/external-lib-interfacing/external-lib-interfacing.cpp b/examples/external-lib-interfacing/external-lib-interfacing.cpp index 08b35923b30..1766af3001f 100644 --- a/examples/external-lib-interfacing/external-lib-interfacing.cpp +++ b/examples/external-lib-interfacing/external-lib-interfacing.cpp @@ -1324,7 +1324,7 @@ void GradientEstimation::estimate_cell( // set_thread_limit, the default value from the Intel Threading // Building Blocks (TBB) library is used. If the call to // set_thread_limit is omitted, the number of threads will be -// chosen by TBB indepently of DEAL_II_NUM_THREADS. +// chosen by TBB independently of DEAL_II_NUM_THREADS. 
int main() { try { diff --git a/examples/mixed-multigrid-preconditioned-solver/doc/results.dox b/examples/mixed-multigrid-preconditioned-solver/doc/results.dox index af922a27ebc..dccd3ccad93 100644 --- a/examples/mixed-multigrid-preconditioned-solver/doc/results.dox +++ b/examples/mixed-multigrid-preconditioned-solver/doc/results.dox @@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r): CG iteration count: 39 CG generation time [ms]: 2.04293 CG execution time [ms]: 22.3874 -CG execution time per iteraion[ms]: 0.574036 +CG execution time per iteration[ms]: 0.574036 @endcode diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp index 9edd7ff29a1..cef918983e9 100644 --- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp +++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp @@ -250,7 +250,7 @@ int main(int argc, char* argv[]) << static_cast(gen_time.count()) / 1000000.0 << std::endl; std::cout << "CG execution time [ms]: " << static_cast(time.count()) / 1000000.0 << std::endl; - std::cout << "CG execution time per iteraion[ms]: " + std::cout << "CG execution time per iteration[ms]: " << static_cast(time.count()) / 1000000.0 / logger->get_num_iterations() << std::endl; diff --git a/examples/mixed-multigrid-solver/doc/results.dox b/examples/mixed-multigrid-solver/doc/results.dox index 7cbaa772d18..045fe343743 100644 --- a/examples/mixed-multigrid-solver/doc/results.dox +++ b/examples/mixed-multigrid-solver/doc/results.dox @@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r): Multigrid iteration count: 9 Multigrid generation time [ms]: 3.35361 Multigrid execution time [ms]: 10.048 -Multigrid execution time per iteraion[ms]: 1.11644 +Multigrid execution time per iteration[ms]: 1.11644 @endcode diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp index cbecbbbdc02..4241a74cdf2 100644 --- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp +++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp @@ -232,7 +232,7 @@ int main(int argc, char* argv[]) << static_cast(gen_time.count()) / 1000000.0 << std::endl; std::cout << "Multigrid execution time [ms]: " << static_cast(time.count()) / 1000000.0 << std::endl; - std::cout << "Multigrid execution time per iteraion[ms]: " + std::cout << "Multigrid execution time per iteration[ms]: " << static_cast(time.count()) / 1000000.0 / logger->get_num_iterations() << std::endl; diff --git a/examples/multigrid-preconditioned-solver-customized/doc/results.dox b/examples/multigrid-preconditioned-solver-customized/doc/results.dox index c7ba90d2fbb..2135f715934 100644 --- a/examples/multigrid-preconditioned-solver-customized/doc/results.dox +++ b/examples/multigrid-preconditioned-solver-customized/doc/results.dox @@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r): CG iteration count: 12 CG generation time [ms]: 1.41642 CG execution time [ms]: 6.59244 -CG execution time per iteraion[ms]: 0.54937 +CG execution time per iteration[ms]: 0.54937 @endcode diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp index a455ca2e8ed..f82a603d662 100644 --- 
a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp +++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp @@ -202,7 +202,7 @@ int main(int argc, char* argv[]) << static_cast(gen_time.count()) / 1000000.0 << std::endl; std::cout << "CG execution time [ms]: " << static_cast(time.count()) / 1000000.0 << std::endl; - std::cout << "CG execution time per iteraion[ms]: " + std::cout << "CG execution time per iteration[ms]: " << static_cast(time.count()) / 1000000.0 / logger->get_num_iterations() << std::endl; diff --git a/examples/multigrid-preconditioned-solver/doc/results.dox b/examples/multigrid-preconditioned-solver/doc/results.dox index af922a27ebc..dccd3ccad93 100644 --- a/examples/multigrid-preconditioned-solver/doc/results.dox +++ b/examples/multigrid-preconditioned-solver/doc/results.dox @@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r): CG iteration count: 39 CG generation time [ms]: 2.04293 CG execution time [ms]: 22.3874 -CG execution time per iteraion[ms]: 0.574036 +CG execution time per iteration[ms]: 0.574036 @endcode diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp index 75c03259c67..b31b7906902 100644 --- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp +++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp @@ -161,7 +161,7 @@ int main(int argc, char* argv[]) << static_cast(gen_time.count()) / 1000000.0 << std::endl; std::cout << "CG execution time [ms]: " << static_cast(time.count()) / 1000000.0 << std::endl; - std::cout << "CG execution time per iteraion[ms]: " + std::cout << "CG execution time per iteration[ms]: " << static_cast(time.count()) / 1000000.0 / logger->get_num_iterations() << std::endl; diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index 44c24b901b3..5091b4a439e 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -176,7 +176,7 @@ class Composition : public EnableLinOp>, * @tparam Rest types of trailing parameters * * @param oper the first operator - * @param rest remainging operators + * @param rest remaining operators */ template explicit Composition(std::shared_ptr oper, Rest&&... rest) @@ -217,7 +217,7 @@ class UseComposition { } /** - * Returns the operator at index-th poistion of composition + * Returns the operator at index-th position of composition * * @return index-th operator * diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index ad39adf7a36..8b270ed7a98 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -73,7 +73,7 @@ namespace gko { * try { * auto y = apply(A, x); * } catch(Error e) { - * // an error occured, write the message to screen and exit + * // an error occurred, write the message to screen and exit * std::cout << e.what() << std::endl; * return -1; * } @@ -160,7 +160,7 @@ class NotSupported : public Error { * * @param file The name of the offending source file * @param line The source code line number where the error occurred - * @param func The name of the function where the error occured + * @param func The name of the function where the error occurred * @param obj_type The object type on which the requested operation cannot be performed. 
*/ @@ -513,7 +513,7 @@ class BadDimension : public Error { * Error that denotes issues between block sizes and matrix dimensions * * \tparam IndexType Type of index used by the linear algebra object that is - * incompatible with the requried block size. + * incompatible with the required block size. */ template class BlockSizeError : public Error { diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 4f476b9286d..456b69d3d7e 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -85,8 +85,8 @@ enum class log_propagation_mode { * host through the Unified memory model. * * `unified_host` allocates memory on the - * host and it is not available on devices which do not have concurrent acesses - * switched on, but this access can be explictly switched on, when necessary. + * host and it is not available on devices which do not have concurrent accesses + * switched on, but this access can be explicitly switched on, when necessary. */ enum class allocation_mode { device, unified_global, unified_host }; @@ -1606,7 +1606,7 @@ class CudaExecutor : public detail::ExecutorBase, } /** - * Get the major verion of compute capability. + * Get the major version of compute capability. */ int get_major_version() const noexcept { @@ -1614,7 +1614,7 @@ class CudaExecutor : public detail::ExecutorBase, } /** - * Get the minor verion of compute capability. + * Get the minor version of compute capability. */ int get_minor_version() const noexcept { @@ -1793,7 +1793,7 @@ class HipExecutor : public detail::ExecutorBase, } /** - * Get the major verion of compute capability. + * Get the major version of compute capability. */ int get_major_version() const noexcept { @@ -1801,7 +1801,7 @@ class HipExecutor : public detail::ExecutorBase, } /** - * Get the minor verion of compute capability. + * Get the minor version of compute capability. */ int get_minor_version() const noexcept { diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 3594d837f88..281690b7807 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -360,7 +360,7 @@ class index_set { const bool is_sorted = false) const; /** - * This function allows the user obtain a decompresed global_indices array + * This function allows the user obtain a decompressed global_indices array * from the indices stored in the index set * * @return the decompressed set of indices. diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index c7043f4ae25..c06c43bbb6e 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -931,7 +931,7 @@ class EnableLinOp * template parameters to enable a subclass of LinOpFactory. * * @tparam ConcreteFactory the concrete factory which is being implemented - * [CRTP parmeter] + * [CRTP parameter] * @tparam ConcreteLinOp the concrete LinOp type which this factory produces, * needs to have a constructor which takes a * const ConcreteFactory *, and an diff --git a/include/ginkgo/core/base/machine_topology.hpp b/include/ginkgo/core/base/machine_topology.hpp index 4fa7c2f8e17..317a768fb8a 100644 --- a/include/ginkgo/core/base/machine_topology.hpp +++ b/include/ginkgo/core/base/machine_topology.hpp @@ -71,7 +71,7 @@ namespace gko { /** * The machine topology class represents the hierarchical topology of a machine, - * including NUMA nodes, cores and PCI Devices. 
Various infomation of the + * including NUMA nodes, cores and PCI Devices. Various information of the * machine are gathered with the help of the Hardware Locality library (hwloc). * * This class also provides functionalities to bind objects in the topology to diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp index 4e9adc4e94e..e0378b8cec2 100644 --- a/include/ginkgo/core/base/perturbation.hpp +++ b/include/ginkgo/core/base/perturbation.hpp @@ -186,7 +186,7 @@ class Perturbation : public EnableLinOp>, cache_struct(const cache_struct& other) {} cache_struct& operator=(const cache_struct& other) { return *this; } - // allocate linops of cache. The dimenstion of `intermediate` is + // allocate linops of cache. The dimension of `intermediate` is // (the number of rows of projector, the number of columns of b). Others // are 1x1 scalar. void allocate(std::shared_ptr exec, dim<2> size) diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index 8d4c327ac33..da19a63d51d 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -59,7 +59,7 @@ namespace gko { * @note Most of the public methods of this class should not be overridden * directly, and are thus not virtual. Instead, there are equivalent * protected methods (ending in _impl) that should be - * overriden instead. This allows polymorphic objects to implement default + * overridden instead. This allows polymorphic objects to implement default * behavior around virtual methods (parameter checking, type casting). * * @see EnablePolymorphicObject if you wish to implement a concrete polymorphic @@ -657,7 +657,7 @@ std::shared_ptr copy_and_convert_to( * The mixin changes parameter and return types of appropriate public methods of * PolymorphicObject in the same way EnableAbstractPolymorphicObject does. * In addition, it also provides default implementations of PolymorphicObject's - * vritual methods by using the _executor default constructor_ and the + * virtual methods by using the _executor default constructor_ and the * assignment operator of ConcreteObject. Consequently, the following is a * minimal example of PolymorphicObject: * diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index ed8901075bd..29c7baba8d8 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -276,7 +276,7 @@ using head_t = typename head::type; * `x` an `y` are ranges, and `alpha` is a scalar. * Range operations are optimized for memory access, and the above code does not * allocate additional storage for intermediate ranges `alpha * x` - * or `aplha * x + y`. In fact, the entire computation is done during the + * or `alpha * x + y`. In fact, the entire computation is done during the * assignment, and the results of operations `+` and `*` only register the data, * and the types of operations that will be computed once the results are * needed. @@ -295,7 +295,7 @@ using head_t = typename head::type; * * __`mmul` is not a highly-optimized BLAS-3 version of the matrix * multiplication.__ The current design of ranges and accessors prevents that, - * so if you need a high-perfromance matrix multiplication, you should use one + * so if you need a high-performance matrix multiplication, you should use one * of the libraries that provide that, or implement your own * (you can use pointwise range operations to help simplify that). 
However, * range design might get improved in the future to allow efficient @@ -710,17 +710,17 @@ GKO_ENABLE_UNARY_RANGE_OPERATION(bitwise_not, operator~, // common unary functions GKO_ENABLE_UNARY_RANGE_OPERATION(zero_operation, zero, accessor::detail::zero_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(one_operaton, one, +GKO_ENABLE_UNARY_RANGE_OPERATION(one_operation, one, accessor::detail::one_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operaton, abs, +GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operation, abs, accessor::detail::abs_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(real_operaton, real, +GKO_ENABLE_UNARY_RANGE_OPERATION(real_operation, real, accessor::detail::real_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(imag_operaton, imag, +GKO_ENABLE_UNARY_RANGE_OPERATION(imag_operation, imag, accessor::detail::imag_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(conj_operaton, conj, +GKO_ENABLE_UNARY_RANGE_OPERATION(conj_operation, conj, accessor::detail::conj_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(squared_norm_operaton, squared_norm, +GKO_ENABLE_UNARY_RANGE_OPERATION(squared_norm_operation, squared_norm, accessor::detail::squared_norm_operation); namespace accessor { @@ -961,9 +961,9 @@ GKO_ENABLE_BINARY_RANGE_OPERATION(right_shift, operator>>, accessor::detail::right_shift); // common binary functions -GKO_ENABLE_BINARY_RANGE_OPERATION(max_operaton, max, +GKO_ENABLE_BINARY_RANGE_OPERATION(max_operation, max, accessor::detail::max_operation); -GKO_ENABLE_BINARY_RANGE_OPERATION(min_operaton, min, +GKO_ENABLE_BINARY_RANGE_OPERATION(min_operation, min, accessor::detail::min_operation); diff --git a/include/ginkgo/core/base/scoped_device_id_guard.hpp b/include/ginkgo/core/base/scoped_device_id_guard.hpp index 52fccdd241c..6b236a6a37e 100644 --- a/include/ginkgo/core/base/scoped_device_id_guard.hpp +++ b/include/ginkgo/core/base/scoped_device_id_guard.hpp @@ -58,7 +58,7 @@ class generic_scoped_device_id_guard { public: generic_scoped_device_id_guard() = default; - // TODO: this should be a purely virtual funtion, but somehow that leads to + // TODO: this should be a purely virtual function, but somehow that leads to // linker errors virtual ~generic_scoped_device_id_guard() = default; diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index 9016442df67..441bc63d22c 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -112,7 +112,7 @@ class Schwarz /** * Creates a Schwarz preconditioner from a matrix using a Schwarz::Factory. * - * @param factory the factory to use to create the preconditoner + * @param factory the factory to use to create the preconditioner * @param system_matrix the matrix this preconditioner should be created * from */ @@ -126,7 +126,7 @@ class Schwarz } /** - * Generates the preconditoner. + * Generates the preconditioner. */ void generate(std::shared_ptr system_matrix); diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 1ad0b171788..61ceab8e380 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -337,7 +337,7 @@ class Vector array& tmp) const; /** - * Computes the square of the column-wise Euclidian ($L^2$) norm of this + * Computes the square of the column-wise Euclidean ($L^2$) norm of this * (multi-)vector using a global reduction. 
* * @param result a Dense row vector, used to store the norm @@ -347,7 +347,7 @@ class Vector void compute_squared_norm2(ptr_param result) const; /** - * Computes the square of the column-wise Euclidian ($L^2$) norm of this + * Computes the square of the column-wise Euclidean ($L^2$) norm of this * (multi-)vector using a global reduction. * * @param result a Dense row vector, used to store the norm @@ -360,7 +360,7 @@ class Vector void compute_squared_norm2(ptr_param result, array& tmp) const; /** - * Computes the Euclidian (L^2) norm of this (multi-)vector using a global + * Computes the Euclidean (L^2) norm of this (multi-)vector using a global * reduction. * * @param result a Dense row matrix, used to store the norm @@ -370,7 +370,7 @@ class Vector void compute_norm2(ptr_param result) const; /** - * Computes the Euclidian (L^2) norm of this (multi-)vector using a global + * Computes the Euclidean (L^2) norm of this (multi-)vector using a global * reduction. * * @param result a Dense row matrix, used to store the norm diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index 365a431208a..2df350f31a2 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -130,7 +130,7 @@ class ParIc : public Composition { * The number of iterations the `compute` kernel will use when doing * the factorization. The default value `0` means `Auto`, so the * implementation decides on the actual value depending on the - * ressources that are available. + * resources that are available. */ size_type GKO_FACTORY_PARAMETER_SCALAR(iterations, 0); diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index a9b41f33d90..173136fa682 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -236,7 +236,7 @@ class ParIct : public Composition { * matrix_type * * @param system_matrix the source matrix used to generate the factors. - * @note: system_matrix must be convertable to a Csr + * @note: system_matrix must be convertible to a Csr * Matrix, otherwise, an exception is thrown. * @return A Composition, containing the incomplete LU factors for the * given system_matrix (first element is L, then L^T) diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 539946befec..878721afbd5 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -128,7 +128,7 @@ class ParIlu : public Composition { * The number of iterations the `compute` kernel will use when doing * the factorization. The default value `0` means `Auto`, so the * implementation decides on the actual value depending on the - * ressources that are available. + * resources that are available. */ size_type GKO_FACTORY_PARAMETER_SCALAR(iterations, 0); diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index ba4ce7d1629..76f3789a44e 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -242,7 +242,7 @@ class ParIlut : public Composition { * while the dynamic type of U is u_matrix_type. * * @param system_matrix the source matrix used to generate the factors. - * @note: system_matrix must be convertable to a Csr + * @note: system_matrix must be convertible to a Csr * Matrix, otherwise, an exception is thrown. 
* @return A Composition, containing the incomplete LU factors for the * given system_matrix (first element is L, then U) diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 0f22663347c..b700e1e703a 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -111,10 +111,10 @@ class Logger { * call only if the user activates this event through the mask. If the * event is activated, we rely on polymorphism and the virtual method * `on_##_event_name()` to either call the Logger class's function, - * which does nothing, or the overriden version in the derived class if + * which does nothing, or the overridden version in the derived class if * any. Therefore, to support a new event in any Logger (i.e. class * which derive from this class), the function `on_##_event_name()` - * should be overriden and implemented. + * should be overridden and implemented. * * @param _id the unique id of the event * diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp index 9a26acd6ab0..6a9b00dfac7 100644 --- a/include/ginkgo/core/log/profiler_hook.hpp +++ b/include/ginkgo/core/log/profiler_hook.hpp @@ -298,7 +298,7 @@ class ProfilerHook : public Logger { std::vector children{}; }; - /** Recieves the results from ProfilerHook::create_summary(). */ + /** Receives the results from ProfilerHook::create_summary(). */ class SummaryWriter { public: virtual ~SummaryWriter() = default; @@ -313,7 +313,7 @@ class ProfilerHook : public Logger { std::chrono::nanoseconds overhead) = 0; }; - /** Recieves the results from ProfilerHook::create_nested_summary(). */ + /** Receives the results from ProfilerHook::create_nested_summary(). */ class NestedSummaryWriter { public: virtual ~NestedSummaryWriter() = default; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 9ccd02d48db..15662294607 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -63,7 +63,7 @@ class Hybrid; /** * COO stores a matrix in the coordinate matrix format. * - * The nonzero elements are stored in an array row-wise (but not neccessarily + * The nonzero elements are stored in an array row-wise (but not necessarily * sorted by column index within a row). Two extra arrays contain the row and * column indexes of each nonzero element of the matrix. * diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index a1e08d38c65..16bff356223 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -853,7 +853,7 @@ class Dense array& tmp) const; /** - * Computes the column-wise Euclidian (L^2) norm of this matrix. + * Computes the column-wise Euclidean (L^2) norm of this matrix. * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the number @@ -862,7 +862,7 @@ class Dense void compute_norm2(ptr_param result) const; /** - * Computes the column-wise Euclidian (L^2) norm of this matrix. + * Computes the column-wise Euclidean (L^2) norm of this matrix. * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the @@ -895,7 +895,7 @@ class Dense void compute_norm1(ptr_param result, array& tmp) const; /** - * Computes the square of the column-wise Euclidian (L^2) norm of this + * Computes the square of the column-wise Euclidean (L^2) norm of this * matrix. 
* * @param result a Dense row vector, used to store the norm @@ -905,7 +905,7 @@ class Dense void compute_squared_norm2(ptr_param result) const; /** - * Computes the square of the column-wise Euclidian (L^2) norm of this + * Computes the square of the column-wise Euclidean (L^2) norm of this * matrix. * * @param result a Dense row vector, used to store the norm diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index a923e7b9079..db65b57b6fb 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -279,7 +279,7 @@ class Hybrid /** * Get the percent setting * - * @retrun percent + * @return percent */ auto get_percentage() const { return percent_; } @@ -314,14 +314,14 @@ class Hybrid /** * Get the percent setting * - * @retrun percent + * @return percent */ auto get_percentage() const { return strategy_.get_percentage(); } /** * Get the ratio setting * - * @retrun ratio + * @return ratio */ auto get_ratio() const { return ratio_; } @@ -356,7 +356,7 @@ class Hybrid /** * Get the percent setting * - * @retrun percent + * @return percent */ auto get_percentage() const { return strategy_.get_percentage(); } diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp index 7f03deae0a1..c5d5ddc6471 100644 --- a/include/ginkgo/core/preconditioner/isai.hpp +++ b/include/ginkgo/core/preconditioner/isai.hpp @@ -233,7 +233,7 @@ class Isai : public EnableLinOp>, /** * Creates an Isai preconditioner from a matrix using an Isai::Factory. * - * @param factory the factory to use to create the preconditoner + * @param factory the factory to use to create the preconditioner * @param system_matrix the matrix for which an ISAI is to be computed */ explicit Isai(const Factory* factory, diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp index bf215082a85..f48d8e34c8c 100644 --- a/include/ginkgo/core/preconditioner/jacobi.hpp +++ b/include/ginkgo/core/preconditioner/jacobi.hpp @@ -546,7 +546,7 @@ class Jacobi : public EnableLinOp>, /** * Creates a Jacobi preconditioner from a matrix using a Jacobi::Factory. * - * @param factory the factory to use to create the preconditoner + * @param factory the factory to use to create the preconditioner * @param system_matrix the matrix this preconditioner should be created * from */ @@ -593,7 +593,7 @@ class Jacobi : public EnableLinOp>, max_block_stride = param_max_block_stride; if (this->get_executor() != this->get_executor()->get_master() && max_block_stride != default_block_stride) { - // only support the default value on the gpu devive + // only support the default value on the gpu device GKO_NOT_SUPPORTED(this); } } @@ -612,7 +612,7 @@ class Jacobi : public EnableLinOp>, } /** - * Generates the preconditoner. + * Generates the preconditioner. * * @param system_matrix the source matrix used to generate the * preconditioner diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index bb6e7986206..72ba6827f2b 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -77,7 +77,7 @@ enum class starting_strategy { minimum_degree, pseudo_peripheral }; * * There are two "starting strategies" currently available: minimum degree and * pseudo-peripheral. 
These strategies control how a starting vertex for a - * connected component is choosen, which is then renumbered as first vertex in + * connected component is chosen, which is then renumbered as first vertex in * the component, starting the algorithm from there. * In general, the bandwidths obtained by choosing a pseudo-peripheral vertex * are slightly smaller than those obtained from choosing a vertex of minimum diff --git a/include/ginkgo/core/reorder/reordering_base.hpp b/include/ginkgo/core/reorder/reordering_base.hpp index 8cfb4c10c48..e0b80adb4cd 100644 --- a/include/ginkgo/core/reorder/reordering_base.hpp +++ b/include/ginkgo/core/reorder/reordering_base.hpp @@ -113,7 +113,7 @@ using ReorderingBaseFactory = * template parameters to enable a subclass of ReorderingBaseFactory. * * @tparam ConcreteFactory the concrete factory which is being implemented - * [CRTP parmeter] + * [CRTP parameter] * @tparam ConcreteReorderingBase the concrete ReorderingBase type which this * factory produces, needs to have a constructor which takes a const * ConcreteFactory *, and a const ReorderingBaseArgs * as parameters. diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index 7ad152f6808..fde0bc67157 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -163,7 +163,7 @@ class Idr * * @param other the new complex_subspace parameter */ - void set_complex_subpsace(const bool other) + void set_complex_subspace(const bool other) { parameters_.complex_subspace = other; } diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 251924b70ff..c5c69c1fb67 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -306,7 +306,7 @@ struct workspace_traits> { * limited stop criterion(iterations and relacation_factor). * * @param factory the shared pointer of factory - * @param iteration the maximum number of iteraion, which default is 1 + * @param iteration the maximum number of iteration, which default is 1 * @param relaxation_factor the relaxation factor for Richardson * * @return the pointer of Ir(Richardson) @@ -329,7 +329,7 @@ auto build_smoother(std::shared_ptr factory, * limited stop criterion(iterations and relacation_factor). * * @param solver the shared pointer of solver - * @param iteration the maximum number of iteraion, which default is 1 + * @param iteration the maximum number of iteration, which default is 1 * @param relaxation_factor the relaxation factor for Richardson * * @return the pointer of Ir(Richardson) diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 6687a6df82e..53909337554 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -244,7 +244,7 @@ class EnableApplyWithInitialGuess : public ApplyWithInitialGuess { self(), alpha, b, beta, x); } - // TODO: should we provide the defaule implementation? + // TODO: should we provide the default implementation? /** * The class should override this method and must modify the input vectors * according to the initial_guess_mode diff --git a/include/ginkgo/core/stop/criterion.hpp b/include/ginkgo/core/stop/criterion.hpp index e094cc90206..1a52da3efae 100644 --- a/include/ginkgo/core/stop/criterion.hpp +++ b/include/ginkgo/core/stop/criterion.hpp @@ -259,7 +259,7 @@ using CriterionFactory = AbstractFactory; * template parameters to enable a subclass of CriterionFactory. 
* * @tparam ConcreteFactory the concrete factory which is being implemented - * [CRTP parmeter] + * [CRTP parameter] * @tparam ConcreteCriterion the concrete Criterion type which this factory * produces, needs to have a constructor which takes * a const ConcreteFactory *, and a diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index c644e1977df..ee7d7890cf4 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -96,7 +96,7 @@ class stopping_status { GKO_ATTRIBUTES GKO_INLINE void reset() noexcept { data_ = uint8{0}; } /** - * Call if a stop occured due to a hard limit (and convergence was not + * Call if a stop occurred due to a hard limit (and convergence was not * reached). * @param id id of the stopping criteria. * @param set_finalized Controls if the current version should count as @@ -114,7 +114,7 @@ class stopping_status { } /** - * Call if convergence occured. + * Call if convergence occurred. * @param id id of the stopping criteria. * @param set_finalized Controls if the current version should count as * finalized (set to true) or not (set to false). diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp index d1a752c5042..3d39b1de082 100644 --- a/include/ginkgo/core/stop/time.hpp +++ b/include/ginkgo/core/stop/time.hpp @@ -45,7 +45,7 @@ namespace stop { /** * The Time class is a stopping criterion which stops the iteration process - * after a certain amout of time has passed. + * after a certain amount of time has passed. * * @ingroup stop */ From 8612d9ca09051e52a333f60ad401668469e60bef Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 12:39:50 +0200 Subject: [PATCH 067/583] Manual typo fix --- CHANGELOG.md | 48 +++++++++---------- CMakeLists.txt | 2 +- CONTRIBUTING.md | 2 +- _typos.toml | 10 ++++ accessor/accessor_helper.hpp | 2 +- accessor/row_major.hpp | 2 +- accessor/utils.hpp | 2 +- benchmark/CMakeLists.txt | 4 +- benchmark/tools/mtx_to_binary.cpp | 4 +- benchmark/utils/formats.hpp | 4 +- benchmark/utils/general.hpp | 2 +- cmake/CTestScript.cmake | 2 +- cmake/Modules/CudaArchitectureSelector.cmake | 2 +- cmake/hip.cmake | 2 +- cmake/information_helpers.cmake | 2 +- common/cuda_hip/base/executor.hpp.inc | 2 +- .../cuda_hip/components/segment_scan.hpp.inc | 2 +- common/cuda_hip/matrix/csr_kernels.hpp.inc | 2 +- common/cuda_hip/multigrid/pgm_kernels.hpp.inc | 4 +- common/unified/multigrid/pgm_kernels.cpp | 6 +-- core/solver/multigrid.cpp | 2 +- cuda/components/cooperative_groups.cuh | 2 +- cuda/solver/common_trs_kernels.cuh | 2 +- dev_tools/oneapi/convert_source.sh | 8 ++-- dev_tools/scripts/format_header.sh | 6 +-- devices/reference/dummy.cpp | 2 +- doc/examples/examples.hpp.in | 2 +- dpcpp/base/executor.dp.cpp | 2 +- dpcpp/base/helper.hpp | 2 +- dpcpp/components/segment_scan.dp.hpp | 2 +- dpcpp/components/thread_ids.dp.hpp | 4 +- dpcpp/multigrid/pgm_kernels.dp.cpp | 5 +- .../ginkgo/core/base/polymorphic_object.hpp | 4 +- include/ginkgo/core/matrix/dense.hpp | 2 +- include/ginkgo/core/solver/solver_base.hpp | 3 +- omp/reorder/rcm_kernels.cpp | 6 +-- reference/reorder/rcm_kernels.cpp | 2 +- reference/test/matrix/csr_kernels.cpp | 2 +- .../test/matrix/sparsity_csr_kernels.cpp | 2 +- reference/test/preconditioner/ilu.cpp | 2 +- reference/test/stop/residual_norm_kernels.cpp | 6 +-- 41 files changed, 92 insertions(+), 82 deletions(-) create mode 100644 _typos.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index 
34d53363898..e5728ef2cc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -215,7 +215,7 @@ Supported systems and requirements: + Add reduce_add for arrays ([#831](https://github.com/ginkgo-project/ginkgo/pull/831)) + Add utility to simplify Dense View creation from an existing Dense vector ([#1136](https://github.com/ginkgo-project/ginkgo/pull/1136)). + Add a custom transpose implementation for Fbcsr and Csr transpose for unsupported vendor types ([#1123](https://github.com/ginkgo-project/ginkgo/pull/1123)) -+ Make IDR random initilization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116)) ++ Make IDR random initialization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116)) + Move the algorithm choice for triangular solvers from Csr::strategy_type to a factory parameter ([#1088](https://github.com/ginkgo-project/ginkgo/pull/1088)) + Update CUDA archCoresPerSM ([#1175](https://github.com/ginkgo-project/ginkgo/pull/1116)) + Add kernels for Csr sparsity pattern lookup ([#994](https://github.com/ginkgo-project/ginkgo/pull/994)) @@ -620,7 +620,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues). ### Additions -+ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342)) ++ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342)) + New factorization support in Ginkgo, and addition of the ParILU algorithm ([#305](https://github.com/ginkgo-project/ginkgo/issues/305), [#315](https://github.com/ginkgo-project/ginkgo/issues/315), [#319](https://github.com/ginkgo-project/ginkgo/issues/319), [#324](https://github.com/ginkgo-project/ginkgo/issues/324)) + New ILU preconditioner ([#348](https://github.com/ginkgo-project/ginkgo/issues/348), [#353](https://github.com/ginkgo-project/ginkgo/issues/353)) @@ -632,7 +632,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues). 
+ Allow benchmarking CuSPARSE spmv formats through Ginkgo's benchmarks ([#303](https://github.com/ginkgo-project/ginkgo/issues/303)) + New benchmark for sparse matrix format conversions ([#312](https://github.com/ginkgo-project/ginkgo/issues/312)[#317](https://github.com/ginkgo-project/ginkgo/issues/317)) + Add conversions between CSR and Hybrid formats ([#302](https://github.com/ginkgo-project/ginkgo/issues/302), [#310](https://github.com/ginkgo-project/ginkgo/issues/310)) -+ Support for sorting rows in the CSR format by column idices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322)) ++ Support for sorting rows in the CSR format by column indices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322)) + Addition of a CUDA COO SpMM kernel for improved performance ([#345](https://github.com/ginkgo-project/ginkgo/issues/345)) + Addition of a LinOp to handle perturbations of the form (identity + scalar * basis * projector) ([#334](https://github.com/ginkgo-project/ginkgo/issues/334)) @@ -845,35 +845,35 @@ About Ginkgo 1.0.0 is brought to you by: -**Karlsruhe Institute of Technology**, Germany -**Universitat Jaume I**, Spain -**University of Tennessee, Knoxville**, US +**Karlsruhe Institute of Technology**, Germany +**Universitat Jaume I**, Spain +**University of Tennessee, Knoxville**, US These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo. Ginkgo 1.0.0 contains contributions from: -**Hartwig Anzt**, Karlsruhe Institute of Technology -**Yenchen Chen**, National Taiwan University -**Terry Cojean**, Karlsruhe Institute of Technology -**Goran Flegar**, Universitat Jaume I -**Fritz Göbel**, Karlsruhe Institute of Technology -**Thomas Grützmacher**, Karlsruhe Institute of Technology -**Pratik Nayak**, Karlsruhe Institue of Technologgy -**Tobias Ribizel**, Karlsruhe Institute of Technology -**Yuhsiang Tsai**, National Taiwan University +**Hartwig Anzt**, Karlsruhe Institute of Technology +**Yenchen Chen**, National Taiwan University +**Terry Cojean**, Karlsruhe Institute of Technology +**Goran Flegar**, Universitat Jaume I +**Fritz Göbel**, Karlsruhe Institute of Technology +**Thomas Grützmacher**, Karlsruhe Institute of Technology +**Pratik Nayak**, Karlsruhe Institute of Technology +**Tobias Ribizel**, Karlsruhe Institute of Technology +**Yuhsiang Tsai**, National Taiwan University Supporting materials are provided by the following individuals: -**David Rogers** - the Ginkgo logo -**Frithjof Fleischhammer** - the Ginkgo website +**David Rogers** - the Ginkgo logo +**Frithjof Fleischhammer** - the Ginkgo website The development team is grateful to the following individuals for discussions and comments: - -**Erik Boman** -**Jelena Držaić** -**Mike Heroux** -**Mark Hoemmen** -**Timo Heister** -**Jens Saak** + +**Erik Boman** +**Jelena Držaić** +**Mike Heroux** +**Mark Hoemmen** +**Timo Heister** +**Jens Saak** diff --git a/CMakeLists.txt b/CMakeLists.txt index df6f0ffb89a..6351ce98bfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ endif() set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING "Set the required NVCC compiler flags, mainly used for warnings. Current default is an empty string") set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING - "A list of target NVIDIA GPU achitectures. See README.md for more detail.") + "A list of target NVIDIA GPU architectures. 
See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1dd6f412876..8e2f3990aca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -312,7 +312,7 @@ Thus, contributors should be aware of the following rules for blank lines: However, simply calling function `f` from function `g` does not imply that `f` and `g` are "related". 2. Statements within structures / classes are separated with 1 blank line. - There are no blank lines betweeen the first / last statement in the + There are no blank lines between the first / last statement in the structure / class. 1. _exception_: there is no blank line between an access modifier (`private`, `protected`, `public`) and the following statement. _example_: diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 00000000000..5ba4cd4f662 --- /dev/null +++ b/_typos.toml @@ -0,0 +1,10 @@ +[files] +extend-exclude = ["third_party/*", "*.svg"] + +[default.extend-words] +dout = "dout" +nd = "nd" +tht = "tht" +automatical = "automatical" +strat = "strat" +entrie = "entrie" diff --git a/accessor/accessor_helper.hpp b/accessor/accessor_helper.hpp index 5ee536d28db..5b80f4e13d8 100644 --- a/accessor/accessor_helper.hpp +++ b/accessor/accessor_helper.hpp @@ -78,7 +78,7 @@ struct row_major_helper_s { const std::array 1 ? total_dim - 1 : 0)>& stride, IndexType first, Indices&&... idxs) { - // The ASSERT size check must NOT be indexed with `dim_idx` directy, + // The ASSERT size check must NOT be indexed with `dim_idx` directly, // otherwise, it leads to a linker error. The reason is likely that // `std::array::operator[](const size_type &)` uses a // reference. Since `dim_idx` is constexpr (and not defined in a diff --git a/accessor/row_major.hpp b/accessor/row_major.hpp index 757110f4912..9026cef2116 100644 --- a/accessor/row_major.hpp +++ b/accessor/row_major.hpp @@ -55,7 +55,7 @@ namespace acc { * constructor parameters for this class to the range (it will forward it to * this class). * - * @warning For backward compatability reasons, a specialization is provided + * @warning For backward compatibility reasons, a specialization is provided * for dimensionality == 2. * * @tparam ValueType type of values this accessor returns diff --git a/accessor/utils.hpp b/accessor/utils.hpp index e692138ee4d..dfe30188f83 100644 --- a/accessor/utils.hpp +++ b/accessor/utils.hpp @@ -243,7 +243,7 @@ to_arithmetic_type(const Ref& ref) * @internal * Struct used for testing if an implicit cast is present. The constructor only * takes an OutType, so any argument of a type that is not implicitly - * convertable to OutType is incompatible. + * convertible to OutType is incompatible. 
*/ template struct test_for_implicit_cast { diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 434474fd336..f12dbad7f19 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -33,7 +33,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE - ${HSA_HEADER} ${HIP_INCLUDE_DIRS} + ${HAS_HEADER} ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() @@ -133,7 +133,7 @@ if (GINKGO_BUILD_HIP) add_library(hip_timer utils/hip_timer.hip.cpp) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) set_target_properties(hip_timer PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - target_include_directories(hip_timer SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS}) + target_include_directories(hip_timer SYSTEM PRIVATE ${HAS_HEADER} ${HIP_INCLUDE_DIRS}) target_link_libraries(hip_timer ginkgo) endif() diff --git a/benchmark/tools/mtx_to_binary.cpp b/benchmark/tools/mtx_to_binary.cpp index 487687ff605..1d2f4f94e02 100644 --- a/benchmark/tools/mtx_to_binary.cpp +++ b/benchmark/tools/mtx_to_binary.cpp @@ -61,8 +61,8 @@ void process(const char* input, const char* output, bool validate) } } if (validate) { - std::ifstream ois(output, std::ios_base::in | std::ios_base::binary); - auto data2 = gko::read_binary_raw(ois); + std::ifstream is(output, std::ios_base::in | std::ios_base::binary); + auto data2 = gko::read_binary_raw(is); std::cerr << "Comparing against previously read data\n"; if (data.size != data2.size) { throw GKO_STREAM_ERROR("Mismatching sizes!"); diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index deecc4b530c..6b024b16d1c 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -78,8 +78,8 @@ std::string format_description = " Irregular Sparse Matrices.\n" "csr: Compressed Sparse Row storage. Ginkgo implementation with\n" " automatic strategy.\n" - "csrc: Ginkgo's CSR implementation with automatic stategy.\n" - "csri: Ginkgo's CSR implementation with inbalance strategy.\n" + "csrc: Ginkgo's CSR implementation with automatic strategy.\n" + "csri: Ginkgo's CSR implementation with imbalance strategy.\n" "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "csrs: Ginkgo's CSR implementation with sparselib strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n" diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 19c71b74a1a..5c6d849fe36 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -179,7 +179,7 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, } /** - * Print general benchmark informations using the common available parameters + * Print general benchmark information using the common available parameters * * @param extra describes benchmark specific extra parameters to output */ diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake index 61d53b0442a..81ff86625d1 100644 --- a/cmake/CTestScript.cmake +++ b/cmake/CTestScript.cmake @@ -4,7 +4,7 @@ # # Runs our tests through CTest, with support for Coverage or memory checking. 
# -# This script provides a full CTest run whith result submission to Ginkgo's +# This script provides a full CTest run with result submission to Ginkgo's # CDash dashboard. The supported runs are: # + With or without coverage, requires the gcov tool. # + With or without address sanitizers. diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake index 63e8c767446..1838ed4b932 100644 --- a/cmake/Modules/CudaArchitectureSelector.cmake +++ b/cmake/Modules/CudaArchitectureSelector.cmake @@ -119,7 +119,7 @@ # identifiers in this list will be removed from the list specified by the # ``ARCHITECTURES`` list. A warning will be printed for each removed entry. # The list also supports aggregates ``All``, ``Auto`` and GPU generation names -# wich have the same meaning as in the ``ARCHITECTURES'' specification list. +# which have the same meaning as in the ``ARCHITECTURES'' specification list. if(NOT DEFINED CMAKE_CUDA_COMPILER) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 1b9aa0e8723..5b7a268c7b6 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -197,7 +197,7 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") # Remove false positive CUDA warnings when calling one() and zero() list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda) - if (GINKGO_HIP_PLATFROM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" + if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index 8bed7320caa..9a6a4481bf5 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -103,7 +103,7 @@ macro(ginkgo_interface_information) get_target_property(GINKGO_INTERFACE_LINK_LIBRARIES ginkgo INTERFACE_LINK_LIBRARIES) ginkgo_interface_libraries_recursively("${GINKGO_INTERFACE_LINK_LIBRARIES}") # Format and store the interface libraries found - # remove duplicates on the reversed list to keep the dependecy in the end of list. + # remove duplicates on the reversed list to keep the dependency in the end of list. list(REVERSE GINKGO_INTERFACE_LIBS_FOUND) list(REMOVE_DUPLICATES GINKGO_INTERFACE_LIBS_FOUND) list(REVERSE GINKGO_INTERFACE_LIBS_FOUND) diff --git a/common/cuda_hip/base/executor.hpp.inc b/common/cuda_hip/base/executor.hpp.inc index 7e71a3e24c0..ad641ecea5b 100644 --- a/common/cuda_hip/base/executor.hpp.inc +++ b/common/cuda_hip/base/executor.hpp.inc @@ -40,7 +40,7 @@ inline int convert_sm_ver_to_cores(int major, int minor) // Defines for GPU Architecture types (using the SM version to determine // the # of cores per SM typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, + int SM; // 0xMm (hexadecimal notation), M = SM Major version, // and m = SM minor version int Cores; } sSMtoCores; diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 947c2c3afd7..584f44b6415 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * @internal * - * Compute a segement scan using add operation (+) of a subwarp. Each segment + * Compute a segment scan using add operation (+) of a subwarp. 
Each segment * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index e73dfde00fb..1fca1ee7215 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -606,7 +606,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam( } // advance by the number of merged elements // in theory, we would need to mask by `valid`, but this - // would only be false somwhere in the last iteration, where + // would only be false somewhere in the last iteration, where // we don't need the value of c_begin afterwards, anyways. c_begin += popcnt(~prev_equal_mask & lanemask_full); return true; diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc index 30cce92b8de..d8b6c4786b0 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc @@ -51,9 +51,9 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz, using device_value_type = device_member_type<ValueType>; auto vals_it = reinterpret_cast<device_value_type*>(vals); auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); - // Because reduce_by_key is not determinstic, so we do not need + // Because reduce_by_key is not deterministic, so we do not need // stable_sort_by_key - // TODO: If we have determinstic reduce_by_key, it should be + // TODO: If we have deterministic reduce_by_key, it should be // stable_sort_by_key thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it); } diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 5836486f2a6..a61b32dacbd 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -135,7 +135,7 @@ void map_row(std::shared_ptr<const DefaultExecutor> exec, exec, [] GKO_KERNEL(auto tidx, auto fine_row_ptrs, auto agg, auto row_idxs) { const auto coarse_row = agg[tidx]; - // TODO: when it is neccessary, it can use warp per row to improve. + // TODO: when it is necessary, it can use warp per row to improve. for (auto i = fine_row_ptrs[tidx]; i < fine_row_ptrs[tidx + 1]; i++) { row_idxs[i] = coarse_row; @@ -232,7 +232,7 @@ void find_strongest_neighbor( // all neighbor is agg, connect to the strongest agg // Also, no others will use this item as their // strongest_neighbor because they are already aggregated. 
Thus, - // it is determinstic behavior + // it is deterministic behavior agg[row] = agg[strongest_agg]; } else if (strongest_unagg != -1) { // set the strongest neighbor in the unagg group @@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr exec, { const auto num = agg.get_num_elems(); if (intermediate_agg.get_num_elems() > 0) { - // determinstic kernel + // deterministic kernel run_kernel( exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 074fa95d848..303106fa4f6 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -197,7 +197,7 @@ enum class cycle_mode { first_of_cycle = 2, /** - * current procees is the end one of the cycle + * current process is the end one of the cycle */ end_of_cycle = 4 }; diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index 93db80f2c31..db59a47658d 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -399,7 +399,7 @@ using cooperative_groups::thread_block_tile; // public API: // void sync() const // unsigned thread_rank() const -// usigned size() const +// unsigned size() const // T shfl(T, int) // T shfl_up(T, unsigned) // T shfl_down(T, unsigned) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index a8b134cebf2..bfdb4a5f854 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -445,7 +445,7 @@ __global__ void sptrsv_naive_caching_kernel( store(x_s, self_shid, r); x[row * x_stride + rhs] = r; - // This check to ensure no infinte loops happen. + // This check to ensure no infinite loops happen. if (is_nan(r)) { store(x_s, self_shid, zero()); x[row * x_stride + rhs] = zero(); diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh index f6983dd500a..7aabca6f17d 100755 --- a/dev_tools/oneapi/convert_source.sh +++ b/dev_tools/oneapi/convert_source.sh @@ -3,7 +3,7 @@ # convert_source.sh converts cuda (and c++ code) to dpcpp code with ginkgo design. # Usage: -# EnvironementSet ./dev_tools/oneapi/convert_source.sh +# EnvironmentSet ./dev_tools/oneapi/convert_source.sh # can be .hpp/.cpp/.cu/.cuh # the following are parameters set by environment variables @@ -18,7 +18,7 @@ # ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}" # GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include" # CLANG_FORMAT: the clang-format exec. The default is "clang-format" -# VERBOSE: if it is set as 1, script will ouput the path information +# VERBOSE: if it is set as 1, script will output the path information CURRENT_DIR="$( pwd )" cd "$( dirname "${BASH_SOURCE[0]}" )" SCRIPT_DIR="$( pwd )" @@ -33,7 +33,7 @@ GTEST_HEADER_DIR="${GTEST_HEADER_DIR:="${ROOT_BUILD_DIR}/_deps/googletest-src/go CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} if [[ "${VERBOSE}" == 1 ]]; then echo "#####################" - echo "# Enviroment Setting:" + echo "# Environment Setting:" echo "CURRENT_DIR ${CURRENT_DIR}" echo "SCRIPT_DIR ${SCRIPT_DIR}" echo "ROOT_DIR ${ROOT_DIR}" @@ -262,7 +262,7 @@ rm "${OUTPUT_FOLDER}/${OUTPUT_FILE}" rm "${OUTPUT_FOLDER}/${OUTPUT_FILE}.dp.cpp" # Call DPCT -echo "# Call DPCT on the previosly generated file." +echo "# Call DPCT on the previously generated file." 
echo "############################################" dpct --extra-arg="-std=c++14" --extra-arg="-I ${ROOT_DIR}" --extra-arg="-I ${ROOT_DIR}/include" --extra-arg="-I ${ROOT_BUILD_DIR}/include" --extra-arg="-I ${ROOT_DIR}/dev_tools/oneapi" --extra-arg="-I ${GTEST_HEADER_DIR}" --cuda-include-path="${CUDA_HEADER_DIR}" --format-range=none ${OUTPUT_FILE} --suppress-warnings=1049 --out-root=${OUTPUT_FOLDER} echo "############################################" diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index a501b6f97d2..2437a03d623 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -266,7 +266,7 @@ while IFS='' read -r line || [ -n "$line" ]; do echo "${line}" >> "${CONTENT}" SKIP="false" if [[ "${line}" =~ $START_BLOCK_REX ]]; then - # keep everythin in #if block and /* block + # keep everything in #if block and /* block IN_BLOCK=$((IN_BLOCK+1)) if [ -z "${ALARM}" ]; then ALARM="set" @@ -291,13 +291,13 @@ if [ "${ALARM}" = "true" ]; then echo "Warning $1: sorting is probably incorrect" fi -# Wrtie license +# Write license echo "/*${GINKGO_LICENSE_BEACON}" > "$1" cat LICENSE >> "$1" echo "${GINKGO_LICENSE_BEACON}*/" >> "$1" echo "" >> "$1" -# Wrtie the definition of header according to path +# Write the definition of header according to path if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then IFNDEF="#ifndef ${HEADER_DEF}" DEFINE="#define ${HEADER_DEF}" diff --git a/devices/reference/dummy.cpp b/devices/reference/dummy.cpp index 210666655f7..6ab5dde07f3 100644 --- a/devices/reference/dummy.cpp +++ b/devices/reference/dummy.cpp @@ -31,4 +31,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ // Remove this file once there is at least one source file in -// ginkgo_referece_device +// ginkgo_reference_device diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in index 5e685e2aa7b..a75ac59f186 100644 --- a/doc/examples/examples.hpp.in +++ b/doc/examples/examples.hpp.in @@ -212,7 +212,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @ref heat_equation * Solving a 2D heat equation and showing matrix assembly, vector - * initalization and solver setup in a more complex setting with + * initialization and solver setup in a more complex setting with * output visualization. * * diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index c2015c8664c..3d01e271f15 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -103,7 +103,7 @@ void DpcppExecutor::populate_exec_info(const machine_topology* mach_topo) void DpcppExecutor::raw_free(void* ptr) const noexcept { - // the free function may syncronize excution or not, which depends on + // the free function may synchronize execution or not, which depends on // implementation or backend, so it is not guaranteed. // TODO: maybe a light wait implementation? try { diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 714f5a0d37a..b38b6c1ef8b 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -203,7 +203,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * get_first_cfg will return the first valid config by validate function from * given config array. 
* - * @tparam IterArr the iteratable array type + * @tparam IterArr the iterable array type * @tparam Validate the validate function type * * @param arr the config array diff --git a/dpcpp/components/segment_scan.dp.hpp b/dpcpp/components/segment_scan.dp.hpp index ba0d9577fe3..b73ae12e9b3 100644 --- a/dpcpp/components/segment_scan.dp.hpp +++ b/dpcpp/components/segment_scan.dp.hpp @@ -50,7 +50,7 @@ namespace dpcpp { /** * @internal * - * Compute a segement scan using add operation (+) of a subgroup_size. Each + * Compute a segment scan using add operation (+) of a subgroup_size. Each * segment performs suffix sum. Works on the source array and returns whether * the thread is the first element of its segment with same `ind`. */ diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 2792e2307e4..e689e9f14ba 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -238,7 +238,7 @@ __dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) * * Returns the global ID of the thread in the given index type. * This function assumes one-dimensional thread and block indexing in cuda - * sense. It uses the third position infomation to get the information. + * sense. It uses the third position information to get the information. * * @return the global ID of the thread in the given index type. * @@ -258,7 +258,7 @@ __dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) * * Returns the total number of threads in the given index type. * This function assumes one-dimensional thread and block indexing in cuda - * sense. It uses the third position infomation to get the information. + * sense. It uses the third position information to get the information. * * @return the total number of threads in the given index type. * diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index 2234d8ffe38..b404b1c10ab 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -82,9 +82,10 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, { auto policy = onedpl_policy(exec); auto it = oneapi::dpl::make_zip_iterator(row_idxs, col_idxs, vals); - // Because reduce_by_segment is not determinstic, so we do not need + // Because reduce_by_segment is not deterministic, so we do not need + // stable_sort + // TODO: If we have deterministic reduce_by_segment, it should be // stable_sort - // TODO: If we have determinstic reduce_by_segment, it should be stable_sort std::sort(policy, it, it + nnz, [](auto a, auto b) { return std::tie(std::get<0>(a), std::get<1>(a)) < std::tie(std::get<0>(b), std::get<1>(b)); diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index da19a63d51d..fe5fdeae258 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -59,8 +59,8 @@ namespace gko { * @note Most of the public methods of this class should not be overridden * directly, and are thus not virtual. Instead, there are equivalent * protected methods (ending in _impl) that should be - * overridden instead. This allows polymorphic objects to implement default - * behavior around virtual methods (parameter checking, type casting). + * overridden instead. This allows polymorphic objects to implement + * default behavior around virtual methods (parameter checking, type casting). 
* * @see EnablePolymorphicObject if you wish to implement a concrete polymorphic * object and have sensible defaults generated automatically. diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 16bff356223..ae738d49b93 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -792,7 +792,7 @@ class Dense void add_scaled(ptr_param<const LinOp> alpha, ptr_param<const LinOp> b); /** - * Subtracts `b` scaled by `alpha` fron the matrix (aka: BLAS axpy). + * Subtracts `b` scaled by `alpha` from the matrix (aka: BLAS axpy). * * @param alpha If alpha is 1x1 Dense matrix, b is scaled * by alpha. If it is a Dense row vector of values, diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 53909337554..ca2b5cee1b1 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -537,8 +537,7 @@ class // clang-format off [[deprecated("This class will be replaced by the template-less detail::SolverBaseLinOp in a future release")]] SolverBase // clang-format on - : public detail::SolverBaseLinOp -{ + : public detail::SolverBaseLinOp { public: using detail::SolverBaseLinOp::SolverBaseLinOp; diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp index 579770b9b2f..c0042224b3c 100644 --- a/omp/reorder/rcm_kernels.cpp +++ b/omp/reorder/rcm_kernels.cpp @@ -99,7 +99,7 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_GET_DEGREE_OF_NODES_KERNEL); // This constant controls how many nodes can be dequeued from the // UbfsLinearQueue at once at most. Increasing it reduces lock contention and -// "unneccesary work", but disturbs queue ordering, generating extra work. constexpr int32 chunk_bound = 512; @@ -633,7 +633,7 @@ vector compute_level_offsets(std::shared_ptr exec, } -// Signal value to which the entire permutation is intialized. +// Signal value to which the entire permutation is initialized. // Threads spin on this value, until it is replaced by another value, // written by another thread. constexpr int32 perm_untouched = -1; @@ -697,7 +697,7 @@ void write_permutation(std::shared_ptr<const OmpExecutor> exec, // Will not be written by multiple threads, but can be read // while written. This is only necessary to guarantee the - // abscence of reads-while-writes. + // absence of reads-while-writes. IndexType neighbour_level; #pragma omp atomic read neighbour_level = levels[neighbour]; diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp index be14aeb557d..9ad8de1d170 100644 --- a/reference/reorder/rcm_kernels.cpp +++ b/reference/reorder/rcm_kernels.cpp @@ -255,7 +255,7 @@ void get_permutation(std::shared_ptr<const ReferenceExecutor> exec, ++tail_offset; } - // Get the neigbours of the next vertex, + // Get the neighbours of the next vertex, // check if they have already been visited, // if no, insert them to sort. 
auto prev_head_offset = head_offset; diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 0c5ac3bde53..d56201ade02 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -123,7 +123,7 @@ class Csr : public ::testing::Test { value_type* v = m->get_values(); index_type* c = m->get_col_idxs(); index_type* r = m->get_row_ptrs(); - // It keeps an explict zero + // It keeps an explicit zero /* * 1 3 2 * {0} 5 0 diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index 4d356ffd828..dde558d27fd 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -96,7 +96,7 @@ class SparsityCsr : public ::testing::Test { { index_type* c = m->get_col_idxs(); index_type* r = m->get_row_ptrs(); - // It keeps an explict zero + // It keeps an explicit zero /* * 1 1 1 * {0} 1 0 diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 3511de4f011..ce3ea72725f 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -301,7 +301,7 @@ TYPED_TEST(Ilu, SolvesCustomTypeDefaultFactorySingleRhs) ilu_prec_type::build().on(this->exec)->generate(this->mtx); preconditioner->apply(b, x); - // Since it uses Bicgstab with default parmeters, the result will not be + // Since it uses Bicgstab with default parameters, the result will not be // accurate GKO_ASSERT_MTX_NEAR(x, l({-0.125, 0.25, 1.0}), 1e-1); } diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index cc8d145231e..1c18fbb895d 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -240,7 +240,7 @@ TYPED_TEST(ResidualNorm, WaitsTillResidualGoal) } -TYPED_TEST(ResidualNorm, SelfCalulatesThrowWithoutMatrix) +TYPED_TEST(ResidualNorm, SelfCalculatesThrowWithoutMatrix) { using Mtx = typename TestFixture::Mtx; using NormVector = typename TestFixture::NormVector; @@ -297,7 +297,7 @@ TYPED_TEST(ResidualNorm, SelfCalulatesThrowWithoutMatrix) } -TYPED_TEST(ResidualNorm, RelativeSelfCalulatesThrowWithoutRhs) +TYPED_TEST(ResidualNorm, RelativeSelfCalculatesThrowWithoutRhs) { // only relative residual norm allows generation without rhs. 
using Mtx = typename TestFixture::Mtx; @@ -322,7 +322,7 @@ TYPED_TEST(ResidualNorm, RelativeSelfCalulatesThrowWithoutRhs) } -TYPED_TEST(ResidualNorm, SelfCalulatesAndWaitsTillResidualGoal) +TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal) { using Mtx = typename TestFixture::Mtx; using NormVector = typename TestFixture::NormVector; From 42cc35a1cf2a83dbe03d72919aff5ba53d5a99c1 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 12 Jun 2023 10:42:59 +0000 Subject: [PATCH 068/583] Format files Co-authored-by: Gregor Olenik --- include/ginkgo/core/solver/solver_base.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index ca2b5cee1b1..53909337554 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -537,7 +537,8 @@ class // clang-format off [[deprecated("This class will be replaced by the template-less detail::SolverBaseLinOp in a future release")]] SolverBase // clang-format on - : public detail::SolverBaseLinOp { + : public detail::SolverBaseLinOp +{ public: using detail::SolverBaseLinOp::SolverBaseLinOp; From 06aa470f563200e1aca95e879dae629b62834134 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 12:51:38 +0200 Subject: [PATCH 069/583] Deprecate set_complex_subpsace --- _typos.toml | 1 + include/ginkgo/core/solver/idr.hpp | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/_typos.toml b/_typos.toml index 5ba4cd4f662..e3229ce22f5 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,3 +8,4 @@ tht = "tht" automatical = "automatical" strat = "strat" entrie = "entrie" +set_complex_subpsace = "set_complex_subpsace" diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index fde0bc67157..da41e6229a5 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -158,6 +158,18 @@ class Idr */ bool get_complex_subspace() const { return parameters_.complex_subspace; } + /** + * Sets the complex_subspace parameter of the solver. + * + * @param other the new complex_subspace parameter + * @deprecated Please use set_complex_subspace instead + */ + [[deprecated("Use set_complex_subspace instead")]] + void set_complex_subpsace(const bool other) + { + set_complex_subspace(other); + } + /** * Sets the complex_subspace parameter of the solver. 
* From 4a3f722500bdb2bb6ffb339d5e0e4798dc9afbec Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 12 Jun 2023 10:59:45 +0000 Subject: [PATCH 070/583] Format files Co-authored-by: Gregor Olenik --- include/ginkgo/core/solver/idr.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index da41e6229a5..61020eb4f05 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -164,8 +164,8 @@ class Idr * @param other the new complex_subspace parameter * @deprecated Please use set_complex_subspace instead */ - [[deprecated("Use set_complex_subspace instead")]] - void set_complex_subpsace(const bool other) + [[deprecated("Use set_complex_subspace instead")]] void + set_complex_subpsace(const bool other) { set_complex_subspace(other); } From d95f7e812cab65694064e91f50970df2c48f94ab Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 13:16:26 +0200 Subject: [PATCH 071/583] add spell check workflow --- .github/workflows/spell_check.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/workflows/spell_check.yml diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml new file mode 100644 index 00000000000..9aaeca57385 --- /dev/null +++ b/.github/workflows/spell_check.yml @@ -0,0 +1,12 @@ +name: Test GitHub Action +on: [pull_request] + +jobs: + run: + name: Spell Check with Typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check for typos + uses: crate-ci/typos@master + From 102ac329cd5bb572ff55699a2bc09531a554404a Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 13:35:18 +0200 Subject: [PATCH 072/583] add exception for idr --- _typos.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index e3229ce22f5..b3456752b8f 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,4 +8,7 @@ tht = "tht" automatical = "automatical" strat = "strat" entrie = "entrie" -set_complex_subpsace = "set_complex_subpsace" +agregate = "agregate" # since that script name is already in ginkgo-data repo + +[default.extend-identifiers] +set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone From e3651aaea1b6a3c00070b194ee093c88e7992d0c Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 12 Jun 2023 14:18:36 +0200 Subject: [PATCH 073/583] revert HAS_HEADER to HSA_HEADER --- _typos.toml | 1 + benchmark/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/_typos.toml b/_typos.toml index b3456752b8f..e261242ffa3 100644 --- a/_typos.toml +++ b/_typos.toml @@ -12,3 +12,4 @@ agregate = "agregate" # since that script name is already in ginkgo-data repo [default.extend-identifiers] set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone +HSA_HEADER = "HSA_HEADER" diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index f12dbad7f19..434474fd336 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -33,7 +33,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE - ${HAS_HEADER} ${HIP_INCLUDE_DIRS} + ${HSA_HEADER} ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS} 
${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() @@ -133,7 +133,7 @@ if (GINKGO_BUILD_HIP) add_library(hip_timer utils/hip_timer.hip.cpp) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) set_target_properties(hip_timer PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - target_include_directories(hip_timer SYSTEM PRIVATE ${HAS_HEADER} ${HIP_INCLUDE_DIRS}) + target_include_directories(hip_timer SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS}) target_link_libraries(hip_timer ginkgo) endif() From a465b7d82b54b8c4dbb99d6e9b22699e52765f39 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 14 Jun 2023 13:52:08 +0200 Subject: [PATCH 074/583] Address review comments --- .github/workflows/spell_check.yml | 2 ++ CHANGELOG.md | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml index 9aaeca57385..0fee0cce1aa 100644 --- a/.github/workflows/spell_check.yml +++ b/.github/workflows/spell_check.yml @@ -9,4 +9,6 @@ jobs: - uses: actions/checkout@v3 - name: Check for typos uses: crate-ci/typos@master + with: + config: ./_typos.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index e5728ef2cc2..834bb6aa061 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -853,15 +853,15 @@ These universities, along with various project grants, supported the development Ginkgo 1.0.0 contains contributions from: -**Hartwig Anzt**, Karlsruhe Institute of Technology -**Yenchen Chen**, National Taiwan University -**Terry Cojean**, Karlsruhe Institute of Technology -**Goran Flegar**, Universitat Jaume I -**Fritz Göbel**, Karlsruhe Institute of Technology -**Thomas Grützmacher**, Karlsruhe Institute of Technology -**Pratik Nayak**, Karlsruhe Institute of Technology -**Tobias Ribizel**, Karlsruhe Institute of Technology -**Yuhsiang Tsai**, National Taiwan University +**Hartwig Anzt**, Karlsruhe Institute of Technology +**Yenchen Chen**, National Taiwan University +**Terry Cojean**, Karlsruhe Institute of Technology +**Goran Flegar**, Universitat Jaume I +**Fritz Göbel**, Karlsruhe Institute of Technology +**Thomas Grützmacher**, Karlsruhe Institute of Technology +**Pratik Nayak**, Karlsruhe Institute of Technology +**Tobias Ribizel**, Karlsruhe Institute of Technology +**Yuhsiang Tsai**, National Taiwan University Supporting materials are provided by the following individuals: From 1751160aae41811efbd90cab80963a683a0c6b37 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 14 Jun 2023 13:57:28 +0200 Subject: [PATCH 075/583] Address review comments Co-authored-by: Yuhsiang Tsai --- include/ginkgo/core/base/polymorphic_object.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index fe5fdeae258..fc758f97699 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -60,7 +60,8 @@ namespace gko { * directly, and are thus not virtual. Instead, there are equivalent * protected methods (ending in _impl) that should be * overridden instead. This allows polymorphic objects to implement - * default behavior around virtual methods (parameter checking, type casting). + * default behavior around virtual methods (parameter checking, type + * casting). 
* * @see EnablePolymorphicObject if you wish to implement a concrete polymorphic * object and have sensible defaults generated automatically. From 376184d29a7f008891deaf36a413b1a7c35347a8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 14 Jun 2023 14:00:37 +0200 Subject: [PATCH 076/583] Address review comments Co-authored-by: Yuhsiang Tsai --- include/ginkgo/core/solver/idr.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index 61020eb4f05..fc677f33171 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -167,7 +167,7 @@ class Idr [[deprecated("Use set_complex_subspace instead")]] void set_complex_subpsace(const bool other) { - set_complex_subspace(other); + this->set_complex_subspace(other); } /** From a3a6b9e5cee39442d586163d5091c34543183a25 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 14 Jun 2023 17:05:06 +0200 Subject: [PATCH 077/583] Update CHANGELOG.md Add linebreaks --- CHANGELOG.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 834bb6aa061..af4d3c06bb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -845,9 +845,9 @@ About Ginkgo 1.0.0 is brought to you by: -**Karlsruhe Institute of Technology**, Germany -**Universitat Jaume I**, Spain -**University of Tennessee, Knoxville**, US +**Karlsruhe Institute of Technology**, Germany +**Universitat Jaume I**, Spain +**University of Tennessee, Knoxville**, US These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo. @@ -865,15 +865,15 @@ Ginkgo 1.0.0 contains contributions from: Supporting materials are provided by the following individuals: -**David Rogers** - the Ginkgo logo -**Frithjof Fleischhammer** - the Ginkgo website +**David Rogers** - the Ginkgo logo +**Frithjof Fleischhammer** - the Ginkgo website The development team is grateful to the following individuals for discussions and comments: -**Erik Boman** -**Jelena Držaić** -**Mike Heroux** -**Mark Hoemmen** -**Timo Heister** -**Jens Saak** +**Erik Boman** +**Jelena Držaić** +**Mike Heroux** +**Mark Hoemmen** +**Timo Heister** +**Jens Saak** From daa4cc6edb89c9dc268f60a6e0a9536484bb424d Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 20 Jun 2023 20:25:06 +0200 Subject: [PATCH 078/583] move spellchecker config to .github folder --- _typos.toml => .github/_typos.toml | 0 .github/workflows/spell_check.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename _typos.toml => .github/_typos.toml (100%) diff --git a/_typos.toml b/.github/_typos.toml similarity index 100% rename from _typos.toml rename to .github/_typos.toml diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml index 0fee0cce1aa..fa0550858f9 100644 --- a/.github/workflows/spell_check.yml +++ b/.github/workflows/spell_check.yml @@ -10,5 +10,5 @@ jobs: - name: Check for typos uses: crate-ci/typos@master with: - config: ./_typos.toml + config: .github/_typos.toml From 0734b0b1316b9a338b55659387cd9a22f6d61c22 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 20 Jun 2023 20:26:41 +0200 Subject: [PATCH 079/583] consistent us spelling --- reference/reorder/rcm_kernels.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp index 9ad8de1d170..5e357cf775c 100644 --- 
a/reference/reorder/rcm_kernels.cpp +++ b/reference/reorder/rcm_kernels.cpp @@ -111,7 +111,7 @@ std::pair rls_contender_and_height( // The last levels size is required to compute the contender. IndexType last_level_size = 0; - // While there are still nodes whose neighbours haven't been inspected. + // While there are still nodes whose neighbors haven't been inspected. while (rls_index < rls_offset) { auto parent = rls_p[rls_index]; --current_level_countdown; @@ -255,12 +255,12 @@ void get_permutation(std::shared_ptr exec, ++tail_offset; } - // Get the neighbours of the next vertex, + // Get the neighbors of the next vertex, // check if they have already been visited, // if no, insert them to sort. auto prev_head_offset = head_offset; - // Get the next vertex neighbours. + // Get the next vertex neighbors. auto row_start = row_ptrs[next_vertex]; auto row_end = row_ptrs[next_vertex + 1]; for (auto neighbor_idx = row_start; neighbor_idx < row_end; @@ -276,7 +276,7 @@ void get_permutation(std::shared_ptr exec, } } - // Sort all just-added neighbours by degree. + // Sort all just-added neighbors by degree. std::sort( linear_queue_p + prev_head_offset, linear_queue_p + head_offset, [&](IndexType i, IndexType j) { return degrees[i] < degrees[j]; }); From 32f3d54e22077235c22a3c515194865c2d58b5a8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 20 Jun 2023 20:41:46 +0200 Subject: [PATCH 080/583] Fix interface break Co-authored-by: Yuhsiang Tsai --- .github/_typos.toml | 2 ++ include/ginkgo/core/base/range.hpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/_typos.toml b/.github/_typos.toml index e261242ffa3..c9643175c07 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -13,3 +13,5 @@ agregate = "agregate" # since that script name is already in ginkgo-data repo [default.extend-identifiers] set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone HSA_HEADER = "HSA_HEADER" +one_operaton = "one_operaton" # considered interface break in range.hpp + diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 29c7baba8d8..6e1fdb3a007 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -710,7 +710,7 @@ GKO_ENABLE_UNARY_RANGE_OPERATION(bitwise_not, operator~, // common unary functions GKO_ENABLE_UNARY_RANGE_OPERATION(zero_operation, zero, accessor::detail::zero_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(one_operation, one, +GKO_ENABLE_UNARY_RANGE_OPERATION(one_operaton, one, accessor::detail::one_operation); GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operation, abs, accessor::detail::abs_operation); From fc4a9a79854c377b5cae242ffc738b30c1dacaa8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 21 Jun 2023 12:53:39 +0200 Subject: [PATCH 081/583] Update .github/workflows/spell_check.yml Co-authored-by: Marcel Koch --- .github/workflows/spell_check.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml index fa0550858f9..0049dce9180 100644 --- a/.github/workflows/spell_check.yml +++ b/.github/workflows/spell_check.yml @@ -1,5 +1,7 @@ name: Test GitHub Action -on: [pull_request] +on: + pull_request: + types: [opened, synchronize] jobs: run: From 2827fbe811c7b9a925c5d3a4a2bec3d5314cce60 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 21 Jun 2023 14:20:05 +0200 Subject: [PATCH 082/583] Add deprecation notes Co-authored-by: Marcel Koch --- 
include/ginkgo/core/base/range.hpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 6e1fdb3a007..4bc9a77267f 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -614,6 +614,18 @@ struct implement_binary_operation \ + struct [[deprecated( \ + "Please use " #_operation_name)]] _operation_deprecated_name \ + : _operation_name {}; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + #define GKO_ENABLE_UNARY_RANGE_OPERATION(_operation_name, _operator_name, \ _operator) \ @@ -708,9 +720,10 @@ GKO_ENABLE_UNARY_RANGE_OPERATION(bitwise_not, operator~, accessor::detail::bitwise_not); // common unary functions + GKO_ENABLE_UNARY_RANGE_OPERATION(zero_operation, zero, accessor::detail::zero_operation); -GKO_ENABLE_UNARY_RANGE_OPERATION(one_operaton, one, +GKO_ENABLE_UNARY_RANGE_OPERATION(one_operation, one, accessor::detail::one_operation); GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operation, abs, accessor::detail::abs_operation); @@ -723,6 +736,14 @@ GKO_ENABLE_UNARY_RANGE_OPERATION(conj_operation, conj, GKO_ENABLE_UNARY_RANGE_OPERATION(squared_norm_operation, squared_norm, accessor::detail::squared_norm_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(one_operaton, one_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(abs_operaton, abs_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(real_operaton, real_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(imag_operaton, imag_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(conj_operaton, conj_operation); +GKO_DEPRECATED_UNARY_RANGE_OPERATION(squared_norm_operaton, + squared_norm_operation); + namespace accessor { @@ -766,6 +787,7 @@ struct transpose_operation { GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); +#undef GKO_DEPRECATED_UNARY_RANGE_OPERATION #undef GKO_DEFINE_SIMPLE_UNARY_OPERATION #undef GKO_ENABLE_UNARY_RANGE_OPERATION @@ -841,6 +863,9 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); "semi-colon warnings") +#define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \ + struct [[deprecated("Please use " #_name)]] _deprecated_name : _name {}; + #define GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...) 
\ struct _name { \ private: \ @@ -919,6 +944,8 @@ GKO_DEFINE_SIMPLE_BINARY_OPERATION(right_shift, first >> second); GKO_DEFINE_SIMPLE_BINARY_OPERATION(max_operation, max(first, second)); GKO_DEFINE_SIMPLE_BINARY_OPERATION(min_operation, min(first, second)); +GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(max_operaton, max_operation); +GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(min_operaton, min_operation); } // namespace detail } // namespace accessor From 8c96828a4e4618213befb359e2a5ee332252ed16 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 24 Jul 2023 04:33:48 +0000 Subject: [PATCH 083/583] Format files Co-authored-by: Gregor Olenik --- include/ginkgo/core/base/range.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 4bc9a77267f..5ba07aa834f 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -620,7 +620,7 @@ struct implement_binary_operation \ struct [[deprecated( \ "Please use " #_operation_name)]] _operation_deprecated_name \ - : _operation_name {}; \ + : _operation_name{}; \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ @@ -864,7 +864,7 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); #define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \ - struct [[deprecated("Please use " #_name)]] _deprecated_name : _name {}; + struct [[deprecated("Please use " #_name)]] _deprecated_name : _name{}; #define GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...) \ struct _name { \ From 7708236f1bb5b8571521921ffff5b5ac8da21537 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 21 Jun 2023 15:15:15 +0200 Subject: [PATCH 084/583] add operaton exception, reformat --- .github/_typos.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/_typos.toml b/.github/_typos.toml index c9643175c07..4b9d9be6403 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -13,5 +13,11 @@ agregate = "agregate" # since that script name is already in ginkgo-data repo [default.extend-identifiers] set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone HSA_HEADER = "HSA_HEADER" +conj_operaton = "conj_operaton" # considered interface break in range.hpp +imag_operaton = "imag_operaton" # considered interface break in range.hpp +real_operaton = "real_operaton" # considered interface break in range.hpp one_operaton = "one_operaton" # considered interface break in range.hpp - +abs_operaton = "abs_operaton" # considered interface break in range.hpp +max_operaton = "max_operaton" # considered interface break in range.hpp +min_operaton = "min_operaton" # considered interface break in range.hpp +squared_norm_operaton = "squared_norm_operaton" # considered interface break in range.hpp From 905658eb953473a1822834c040d589e71a1a77aa Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Apr 2023 11:09:18 +0200 Subject: [PATCH 085/583] clean up cholesky reference tests --- .../test/factorization/cholesky_kernels.cpp | 660 ++++++++---------- 1 file changed, 279 insertions(+), 381 deletions(-) diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index d5fae12a2e9..36bbd7e176e 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include @@ -67,12 +68,15 @@ class Cholesky : public ::testing::Test { using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; using matrix_type = gko::matrix::Csr; + using sparsity_matrix_type = + gko::matrix::SparsityCsr; using elimination_forest = gko::factorization::elimination_forest; Cholesky() : ref(gko::ReferenceExecutor::create()), tmp{ref}, + ref_row_nnz{ref}, storage_offsets{ref}, storage{ref}, row_descs{ref} @@ -98,13 +102,27 @@ class Cholesky : public ::testing::Test { return result; } + void setup( + std::initializer_list> mtx_list, + std::initializer_list> factor_list) + { + mtx = gko::initialize(mtx_list, ref); + l_factor_ref = gko::initialize(factor_list, ref); + setup_impl(); + } + void setup(const char* name_mtx, const char* name_factor) { std::ifstream stream{name_mtx}; std::ifstream ref_stream{name_factor}; mtx = gko::read(stream, this->ref); - num_rows = mtx->get_size()[0]; l_factor_ref = gko::read(ref_stream, this->ref); + setup_impl(); + } + + void setup_impl() + { + num_rows = mtx->get_size()[0]; combined_ref = combined_factor(l_factor_ref.get()); l_factor = matrix_type::create(ref, l_factor_ref->get_size(), l_factor_ref->get_num_stored_elements()); @@ -123,6 +141,13 @@ class Cholesky : public ::testing::Test { storage_offsets.resize_and_reset(num_rows + 1); row_descs.resize_and_reset(num_rows); + ref_row_nnz.resize_and_reset(num_rows); + const auto ref_row_ptrs = l_factor_ref->get_const_row_ptrs(); + for (gko::size_type row = 0; row < num_rows; row++) { + ref_row_nnz.get_data()[row] = + ref_row_ptrs[row + 1] - ref_row_ptrs[row]; + } + const auto allowed = gko::matrix::csr::sparsity_type::bitmap | gko::matrix::csr::sparsity_type::full | gko::matrix::csr::sparsity_type::hash; @@ -149,7 +174,7 @@ class Cholesky : public ::testing::Test { } } - void forall_matrices(std::function fn) + void forall_matrices(std::function fn, bool non_spd) { { SCOPED_TRACE("ani1"); @@ -163,11 +188,87 @@ class Cholesky : public ::testing::Test { gko::matrices::location_ani1_amd_chol_mtx); fn(); } + { + SCOPED_TRACE("example"); + this->setup( + {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0}, + {0, 4, 0, 0, 1, 0, 0, 0, 0, 1}, + {1, 0, 4.25, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 0, 4, 0, 0, 0, 0, 1, 1}, + {0, 1, 0, 0, 4.25, 0, 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 4, 2, 4, 0, 0}, + {0, 0, 1, 0, 0, 2, 5.25, 0, 0, 0}, + {1, 0, 0, 0, 0, 4, 0, 8, 1, 1}, + {0, 0, 0, 1, 1, 0, 0, 1, 4, 0}, + {0, 1, 0, 1, 1, 0, 0, 1, 0, 4}}, + {{2, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 2, 0, 0, 0, 0, 0, 0, 0, 0}, + {0.5, 0, 2, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, 0, 0, 0, 0, 0, 0}, + {0, 0.5, 0, 0, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 2, 0, 0, 0, 0}, + {0, 0, 0.5, 0, 0, 1, 2, 0, 0, 0}, + {0.5, 0, -0.125, 0, 0, 2, -0.96875, 1.67209402770897, 0, 0}, + {0, 0, 0, 0.5, 0.5, 0, 0, 0.598052491922453, 1.7726627476498, + 0}, + {0, 0.5, 0, 0.5, 0.375, 0, 0, 0.598052491922453, + -0.448571948696326, 1.67346688755653}}); + fn(); + } + { + SCOPED_TRACE("separable"); + this->setup({{4, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 4, 2, 0, 0, 0, 0, 0, 0, 0}, + {1, 2, 5.25, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 4, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, 4.25, 1, 0, 0, 0, 4}, + {0, 0, 0, 0, 1, 4.25, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 4, 1, 0, 4}, + {0, 0, 0, 0, 0, 0, 1, 4.25, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 4, 1}, + {0, 0, 0, 0, 4, 0, 4, 0, 1, 17.75}}, + {{2, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 2, 0, 0, 0, 0, 0, 0, 0, 0}, + {0.5, 1, 2, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 2, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0.5, 2, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 
0.5, 2, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 2, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0.5, 2, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 0}, + {0, 0, 0, 0, 2, -0.5, 2, -0.5, 0.5, 3}}); + fn(); + } + if (non_spd) { + SCOPED_TRACE("missing diagonal"); + this->setup({{1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 1, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 0, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, 0, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 1, 1, 0, 0, 0}, + {0, 0, 0, 0, 0, 1, 1, 1, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, + {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, + {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, + {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.}, + {0., 0., 1., 1., 0., 0., 0., 0., 0., 0.}, + {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.}, + {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0., 1., 1., 0., 0., 0.}, + {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.}, + {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.}, + {0., 0., 0., 0., 0., 0., 1., 1., 1., 1.}}); + fn(); + } } std::shared_ptr ref; gko::size_type num_rows; gko::array tmp; + gko::array ref_row_nnz; gko::array storage_offsets; gko::array storage; gko::array row_descs; @@ -183,255 +284,51 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, PairTypenameNameGenerator); -TYPED_TEST(Cholesky, KernelSymbolicCountExample) -{ - using matrix_type = typename TestFixture::matrix_type; - using elimination_forest = typename TestFixture::elimination_forest; - using index_type = typename TestFixture::index_type; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, - {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, - {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, - {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, - {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, - {0, 1, 0, 1, 1, 0, 0, 1, 0, 1}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::array row_nnz{this->ref, 10}; - - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp); - - GKO_ASSERT_ARRAY_EQ(row_nnz, I({1, 1, 2, 1, 2, 1, 3, 5, 4, 6})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicFactorizeExample) -{ - using matrix_type = typename TestFixture::matrix_type; - using elimination_forest = typename TestFixture::elimination_forest; - using index_type = typename TestFixture::index_type; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, - {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, - {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, - {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, - {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, - {0, 1, 0, 1, 1, 0, 0, 1, 0, 1}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 26); - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp); - gko::kernels::reference::components::prefix_sum_nonnegative( - this->ref, l_factor->get_row_ptrs(), 11); - - gko::kernels::reference::cholesky::symbolic_factorize( - this->ref, mtx.get(), *forest, l_factor.get(), this->tmp); - - GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, - l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, - {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, - {1., 0., 1., 0., 0., 0., 0., 
0., 0., 0.}, - {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.}, - {0., 1., 0., 0., 1., 0., 0., 0., 0., 0.}, - {0., 0., 0., 0., 0., 1., 0., 0., 0., 0.}, - {0., 0., 1., 0., 0., 1., 1., 0., 0., 0.}, - {1., 0., 1., 0., 0., 1., 1., 1., 0., 0.}, - {0., 0., 0., 1., 1., 0., 0., 1., 1., 0.}, - {0., 1., 0., 1., 1., 0., 0., 1., 1., 1.}})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicCountSeparable) -{ - using matrix_type = typename TestFixture::matrix_type; - using elimination_forest = typename TestFixture::elimination_forest; - using index_type = typename TestFixture::index_type; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {1, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, - {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, - {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::array row_nnz{this->ref, 10}; - - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp); - - GKO_ASSERT_ARRAY_EQ(row_nnz, I({1, 1, 3, 1, 2, 2, 1, 2, 1, 6})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicFactorizeSeparable) +TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; - using index_type = typename TestFixture::index_type; + using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {1, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, - {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, - {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 26); - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp); - gko::kernels::reference::components::prefix_sum_nonnegative( - this->ref, l_factor->get_row_ptrs(), 11); - - gko::kernels::reference::cholesky::symbolic_factorize( - this->ref, mtx.get(), *forest, l_factor.get(), this->tmp); - - GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, - l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, - {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, - {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.}, - {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.}, - {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.}, - {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.}, - {0., 0., 0., 0., 0., 0., 1., 0., 0., 0.}, - {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.}, - {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.}, - {0., 0., 0., 0., 1., 1., 1., 1., 1., 1.}})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicCountMissingDiagonal) -{ - using matrix_type = typename TestFixture::matrix_type; using index_type = typename TestFixture::index_type; - using elimination_forest = typename TestFixture::elimination_forest; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {1, 1, 0, 1, 0, 0, 0, 0, 0, 0}, - {0, 0, 1, 1, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 1, 1, 1, 0, 0, 0}, - {0, 0, 0, 0, 0, 1, 1, 1, 0, 1}, - {0, 0, 0, 0, 0, 
0, 1, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::array row_nnz{this->ref, 10}; - - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp); - - GKO_ASSERT_ARRAY_EQ(row_nnz, I({1, 1, 3, 2, 2, 2, 2, 2, 1, 4})); + this->forall_matrices( + [this] { + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + gko::array row_nnz{this->ref, this->num_rows}; + + gko::kernels::reference::cholesky::symbolic_count( + this->ref, this->mtx.get(), *this->forest, row_nnz.get_data(), + this->tmp); + + GKO_ASSERT_ARRAY_EQ(row_nnz, this->ref_row_nnz); + }, + true); } -TYPED_TEST(Cholesky, KernelSymbolicFactorizeMissingDiagonal) +TYPED_TEST(Cholesky, KernelSymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; - using index_type = typename TestFixture::index_type; + using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - auto mtx = gko::initialize( - {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - {0, 1, 1, 0, 0, 0, 0, 0, 0, 0}, - {1, 1, 0, 1, 0, 0, 0, 0, 0, 0}, - {0, 0, 1, 1, 1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 1, 1, 1, 0, 0, 0}, - {0, 0, 0, 0, 0, 1, 1, 1, 0, 1}, - {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, - this->ref); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 20); - gko::kernels::reference::cholesky::symbolic_count( - this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp); - gko::kernels::reference::components::prefix_sum_nonnegative( - this->ref, l_factor->get_row_ptrs(), 11); - - gko::kernels::reference::cholesky::symbolic_factorize( - this->ref, mtx.get(), *forest, l_factor.get(), this->tmp); - - GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, - l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, - {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, - {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.}, - {0., 0., 1., 1., 0., 0., 0., 0., 0., 0.}, - {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.}, - {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.}, - {0., 0., 0., 0., 0., 1., 1., 0., 0., 0.}, - {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.}, - {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.}, - {0., 0., 0., 0., 0., 0., 1., 1., 1., 1.}})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicCountAni1) -{ using index_type = typename TestFixture::index_type; - using elimination_forest = typename TestFixture::elimination_forest; - this->setup(gko::matrices::location_ani1_mtx, - gko::matrices::location_ani1_chol_mtx); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(this->mtx.get(), forest); - gko::array row_nnz{this->ref, this->mtx->get_size()[0]}; - - gko::kernels::reference::cholesky::symbolic_count( - this->ref, this->mtx.get(), *forest, row_nnz.get_data(), this->tmp); - - GKO_ASSERT_ARRAY_EQ( - row_nnz, I({1, 2, 3, 3, 2, 2, 7, 7, 7, 8, 8, 7, - 8, 8, 8, 8, 2, 10, 10, 10, 10, 9, 8, 8, - 8, 7, 8, 2, 8, 8, 7, 5, 8, 6, 4, 4})); -} - - -TYPED_TEST(Cholesky, KernelSymbolicFactorize) -{ - using elimination_forest = typename TestFixture::elimination_forest; - this->forall_matrices([this] { - std::unique_ptr forest; - gko::factorization::compute_elim_forest(this->mtx.get(), forest); - gko::kernels::reference::cholesky::symbolic_count( - 
this->ref, this->mtx.get(), *forest, this->l_factor->get_row_ptrs(), - this->tmp); - gko::kernels::reference::components::prefix_sum_nonnegative( - this->ref, this->l_factor->get_row_ptrs(), - this->mtx->get_size()[0] + 1); - - gko::kernels::reference::cholesky::symbolic_factorize( - this->ref, this->mtx.get(), *forest, this->l_factor.get(), - this->tmp); - - GKO_ASSERT_MTX_EQ_SPARSITY(this->l_factor, this->l_factor_ref); - }); + this->forall_matrices( + [this] { + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + gko::kernels::reference::cholesky::symbolic_count( + this->ref, this->mtx.get(), *this->forest, + this->l_factor->get_row_ptrs(), this->tmp); + gko::kernels::reference::components::prefix_sum_nonnegative( + this->ref, this->l_factor->get_row_ptrs(), this->num_rows + 1); + + gko::kernels::reference::cholesky::symbolic_factorize( + this->ref, this->mtx.get(), *this->forest, this->l_factor.get(), + this->tmp); + + GKO_ASSERT_MTX_EQ_SPARSITY(this->l_factor, this->l_factor_ref); + }, + true); } @@ -439,14 +336,16 @@ TYPED_TEST(Cholesky, SymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - this->forall_matrices([this] { - std::unique_ptr combined_factor; - std::unique_ptr forest; - gko::factorization::symbolic_cholesky(this->mtx.get(), true, - combined_factor, forest); - - GKO_ASSERT_MTX_EQ_SPARSITY(combined_factor, this->combined_ref); - }); + this->forall_matrices( + [this] { + std::unique_ptr combined_factor; + std::unique_ptr forest; + gko::factorization::symbolic_cholesky(this->mtx.get(), true, + combined_factor, forest); + + GKO_ASSERT_MTX_EQ_SPARSITY(combined_factor, this->combined_ref); + }, + true); } @@ -454,55 +353,39 @@ TYPED_TEST(Cholesky, SymbolicFactorizeOnlyLower) { using matrix_type = typename TestFixture::matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - this->forall_matrices([this] { - std::unique_ptr l_factor; - std::unique_ptr forest; - gko::factorization::symbolic_cholesky(this->mtx.get(), false, l_factor, - forest); - - GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, this->l_factor_ref); - }); + this->forall_matrices( + [this] { + std::unique_ptr l_factor; + std::unique_ptr forest; + gko::factorization::symbolic_cholesky(this->mtx.get(), false, + l_factor, forest); + + GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, this->l_factor_ref); + }, + true); } -TYPED_TEST(Cholesky, KernelSymbolicCountAni1Amd) -{ - using index_type = typename TestFixture::index_type; - using elimination_forest = typename TestFixture::elimination_forest; - this->setup(gko::matrices::location_ani1_amd_mtx, - gko::matrices::location_ani1_amd_chol_mtx); - std::unique_ptr forest; - gko::factorization::compute_elim_forest(this->mtx.get(), forest); - gko::array row_nnz{this->ref, this->mtx->get_size()[0]}; - - gko::kernels::reference::cholesky::symbolic_count( - this->ref, this->mtx.get(), *forest, row_nnz.get_data(), this->tmp); - - GKO_ASSERT_ARRAY_EQ( - row_nnz, I({1, 1, 2, 3, 5, 4, 1, 2, 3, 4, 1, 2, - 2, 2, 5, 1, 4, 4, 4, 1, 2, 3, 4, 3, - 8, 10, 4, 8, 10, 7, 7, 13, 21, 6, 11, 14})); -} - - -TYPED_TEST(Cholesky, KernelForestFromFactor) +TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing) { using matrix_type = typename TestFixture::matrix_type; using index_type = typename TestFixture::index_type; using elimination_forest = typename TestFixture::elimination_forest; - this->forall_matrices([this] { - std::unique_ptr combined_factor; - std::unique_ptr 
forest_ref; - gko::factorization::symbolic_cholesky(this->mtx.get(), true, - combined_factor, forest_ref); - elimination_forest forest{this->ref, - static_cast(this->num_rows)}; - - gko::kernels::reference::cholesky::forest_from_factor( - this->ref, combined_factor.get(), forest); - - this->assert_equal_forests(forest, *forest_ref); - }); + this->forall_matrices( + [this] { + std::unique_ptr combined_factor; + std::unique_ptr forest_ref; + gko::factorization::symbolic_cholesky(this->mtx.get(), true, + combined_factor, forest_ref); + elimination_forest forest{this->ref, + static_cast(this->num_rows)}; + + gko::kernels::reference::cholesky::forest_from_factor( + this->ref, combined_factor.get(), forest); + + this->assert_equal_forests(forest, *forest_ref); + }, + true); } @@ -510,39 +393,46 @@ TYPED_TEST(Cholesky, KernelInitializeWorks) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - this->forall_matrices([this] { - std::fill_n(this->combined->get_values(), - this->combined->get_num_stored_elements(), - gko::zero()); - gko::array diag_idxs{this->ref, this->num_rows}; - gko::array transpose_idxs{ - this->ref, this->combined->get_num_stored_elements()}; - - gko::kernels::reference::cholesky::initialize( - this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), - this->row_descs.get_const_data(), this->storage.get_const_data(), - diag_idxs.get_data(), transpose_idxs.get_data(), - this->combined.get()); - - GKO_ASSERT_MTX_NEAR(this->mtx, this->combined, 0.0); - for (gko::size_type row = 0; row < this->num_rows; row++) { - const auto diag_pos = diag_idxs.get_const_data()[row]; - const auto begin_pos = this->combined->get_const_row_ptrs()[row]; - const auto end_pos = this->combined->get_const_row_ptrs()[row + 1]; - ASSERT_GE(diag_pos, begin_pos); - ASSERT_LT(diag_pos, end_pos); - ASSERT_EQ(this->combined->get_const_col_idxs()[diag_pos], row); - for (auto nz = begin_pos; nz < end_pos; nz++) { - const auto trans_pos = transpose_idxs.get_const_data()[nz]; - const auto col = this->combined->get_const_col_idxs()[nz]; - ASSERT_GE(trans_pos, this->combined->get_const_row_ptrs()[col]); - ASSERT_LT(trans_pos, - this->combined->get_const_row_ptrs()[col + 1]); - ASSERT_EQ(this->combined->get_const_col_idxs()[trans_pos], row); - ASSERT_EQ(transpose_idxs.get_const_data()[trans_pos], nz); + this->forall_matrices( + [this] { + std::fill_n(this->combined->get_values(), + this->combined->get_num_stored_elements(), + gko::zero()); + gko::array diag_idxs{this->ref, this->num_rows}; + gko::array transpose_idxs{ + this->ref, this->combined->get_num_stored_elements()}; + + gko::kernels::reference::cholesky::initialize( + this->ref, this->mtx.get(), + this->storage_offsets.get_const_data(), + this->row_descs.get_const_data(), + this->storage.get_const_data(), diag_idxs.get_data(), + transpose_idxs.get_data(), this->combined.get()); + + GKO_ASSERT_MTX_NEAR(this->mtx, this->combined, 0.0); + for (gko::size_type row = 0; row < this->num_rows; row++) { + const auto diag_pos = diag_idxs.get_const_data()[row]; + const auto begin_pos = + this->combined->get_const_row_ptrs()[row]; + const auto end_pos = + this->combined->get_const_row_ptrs()[row + 1]; + ASSERT_GE(diag_pos, begin_pos); + ASSERT_LT(diag_pos, end_pos); + ASSERT_EQ(this->combined->get_const_col_idxs()[diag_pos], row); + for (auto nz = begin_pos; nz < end_pos; nz++) { + const auto trans_pos = transpose_idxs.get_const_data()[nz]; + const auto col = this->combined->get_const_col_idxs()[nz]; + 
ASSERT_GE(trans_pos, + this->combined->get_const_row_ptrs()[col]); + ASSERT_LT(trans_pos, + this->combined->get_const_row_ptrs()[col + 1]); + ASSERT_EQ(this->combined->get_const_col_idxs()[trans_pos], + row); + ASSERT_EQ(transpose_idxs.get_const_data()[trans_pos], nz); + } } - } - }); + }, + true); } @@ -550,26 +440,30 @@ TYPED_TEST(Cholesky, KernelFactorizeWorks) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - this->forall_matrices([this] { - gko::array diag_idxs{this->ref, this->num_rows}; - gko::array transpose_idxs{ - this->ref, this->combined->get_num_stored_elements()}; - gko::array tmp{this->ref}; - gko::kernels::reference::cholesky::initialize( - this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), - this->row_descs.get_const_data(), this->storage.get_const_data(), - diag_idxs.get_data(), transpose_idxs.get_data(), - this->combined.get()); - - gko::kernels::reference::cholesky::factorize( - this->ref, this->storage_offsets.get_const_data(), - this->row_descs.get_const_data(), this->storage.get_const_data(), - diag_idxs.get_data(), transpose_idxs.get_data(), *this->forest, - this->combined.get(), tmp); - - GKO_ASSERT_MTX_NEAR(this->combined, this->combined_ref, - r::value); - }); + this->forall_matrices( + [this] { + gko::array diag_idxs{this->ref, this->num_rows}; + gko::array transpose_idxs{ + this->ref, this->combined->get_num_stored_elements()}; + gko::array tmp{this->ref}; + gko::kernels::reference::cholesky::initialize( + this->ref, this->mtx.get(), + this->storage_offsets.get_const_data(), + this->row_descs.get_const_data(), + this->storage.get_const_data(), diag_idxs.get_data(), + transpose_idxs.get_data(), this->combined.get()); + + gko::kernels::reference::cholesky::factorize( + this->ref, this->storage_offsets.get_const_data(), + this->row_descs.get_const_data(), + this->storage.get_const_data(), diag_idxs.get_data(), + transpose_idxs.get_data(), *this->forest, this->combined.get(), + tmp); + + GKO_ASSERT_MTX_NEAR(this->combined, this->combined_ref, + r::value); + }, + false); } @@ -577,23 +471,25 @@ TYPED_TEST(Cholesky, FactorizeWorks) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - this->forall_matrices([this] { - auto factory = - gko::experimental::factorization::Cholesky::build() - .on(this->ref); - - auto cholesky = factory->generate(this->mtx); - - GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref, - r::value); - ASSERT_EQ(cholesky->get_storage_type(), - gko::experimental::factorization::storage_type:: - symm_combined_cholesky); - ASSERT_EQ(cholesky->get_lower_factor(), nullptr); - ASSERT_EQ(cholesky->get_upper_factor(), nullptr); - ASSERT_EQ(cholesky->get_diagonal(), nullptr); - }); + this->forall_matrices( + [this] { + auto factory = + gko::experimental::factorization::Cholesky::build() + .on(this->ref); + + auto cholesky = factory->generate(this->mtx); + + GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref, + r::value); + ASSERT_EQ(cholesky->get_storage_type(), + gko::experimental::factorization::storage_type:: + symm_combined_cholesky); + ASSERT_EQ(cholesky->get_lower_factor(), nullptr); + ASSERT_EQ(cholesky->get_upper_factor(), nullptr); + ASSERT_EQ(cholesky->get_diagonal(), nullptr); + }, + false); } @@ -601,28 +497,30 @@ TYPED_TEST(Cholesky, FactorizeWithKnownSparsityWorks) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - 
this->forall_matrices([this] { - auto pattern = - gko::share(gko::matrix::SparsityCsr::create( - this->ref)); - pattern->copy_from(this->combined_ref.get()); - auto factory = - gko::experimental::factorization::Cholesky::build() - .with_symbolic_factorization(pattern) - .on(this->ref); - - auto cholesky = factory->generate(this->mtx); - - GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref, - r::value); - ASSERT_EQ(cholesky->get_storage_type(), - gko::experimental::factorization::storage_type:: - symm_combined_cholesky); - ASSERT_EQ(cholesky->get_lower_factor(), nullptr); - ASSERT_EQ(cholesky->get_upper_factor(), nullptr); - ASSERT_EQ(cholesky->get_diagonal(), nullptr); - }); + this->forall_matrices( + [this] { + auto pattern = gko::share( + gko::matrix::SparsityCsr::create( + this->ref)); + pattern->copy_from(this->combined_ref.get()); + auto factory = + gko::experimental::factorization::Cholesky::build() + .with_symbolic_factorization(pattern) + .on(this->ref); + + auto cholesky = factory->generate(this->mtx); + + GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref, + r::value); + ASSERT_EQ(cholesky->get_storage_type(), + gko::experimental::factorization::storage_type:: + symm_combined_cholesky); + ASSERT_EQ(cholesky->get_lower_factor(), nullptr); + ASSERT_EQ(cholesky->get_upper_factor(), nullptr); + ASSERT_EQ(cholesky->get_diagonal(), nullptr); + }, + false); } From 93a3ef84a20bd212436050d641a51ef0fb722196 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 6 Jun 2023 15:02:49 +0200 Subject: [PATCH 086/583] use column Cholesky for GPU --- .../factorization/cholesky_kernels.hpp.inc | 47 +++++++------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc index f87969a7ad0..eb90127a8ca 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc +++ b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc @@ -149,8 +149,6 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize( template __global__ __launch_bounds__(default_block_size) void factorize( const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols, - const IndexType* __restrict__ elim_tree_child_ptrs, - const IndexType* __restrict__ elim_tree_children, const IndexType* __restrict__ storage_offsets, const int32* __restrict__ storage, const int64* __restrict__ row_descs, const IndexType* __restrict__ diag_idxs, @@ -171,32 +169,21 @@ __global__ __launch_bounds__(default_block_size) void factorize( const auto row_begin = row_ptrs[row]; const auto row_diag = diag_idxs[row]; const auto row_end = row_ptrs[row + 1]; - const auto child_begin = elim_tree_child_ptrs[row]; - const auto child_end = elim_tree_child_ptrs[row + 1]; gko::matrix::csr::device_sparsity_lookup lookup{ row_ptrs, cols, storage_offsets, storage, row_descs, static_cast(row)}; - for (auto child = child_begin; child < child_end; child++) { - const auto dep = elim_tree_children[child]; - scheduler.wait(dep); - // TODO evaluate parallel waiting with __all_sync - } - // for each lower triangular entry: eliminate with corresponding row + // for each lower triangular entry: eliminate with corresponding column for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) { const auto dep = cols[lower_nz]; - auto val = vals[lower_nz]; + scheduler.wait(dep); + const auto scale = vals[lower_nz]; const auto diag_idx = diag_idxs[dep]; const auto dep_end = row_ptrs[dep + 1]; - const 
auto diag = vals[diag_idx]; - const auto scale = val / diag; - if (lane == 0) { - vals[lower_nz] = scale; - } - // subtract all entries past the diagonal - for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end; + // subtract column dep from current column + for (auto upper_nz = diag_idx + lane; upper_nz < dep_end; upper_nz += config::warp_size) { const auto upper_col = cols[upper_nz]; - if (upper_col < row) { + if (upper_col >= row) { const auto upper_val = vals[upper_nz]; const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin; @@ -204,17 +191,16 @@ __global__ __launch_bounds__(default_block_size) void factorize( } } } - ValueType sum{}; - for (auto lower_nz = row_begin + lane; lower_nz < row_diag; - lower_nz += config::warp_size) { - sum += squared_norm(vals[lower_nz]); - // copy the lower triangular entries to the transpose - vals[transpose_idxs[lower_nz]] = conj(vals[lower_nz]); + auto diag_val = sqrt(vals[row_diag]); + for (auto upper_nz = row_diag + 1 + lane; upper_nz < row_end; + upper_nz += config::warp_size) { + vals[upper_nz] /= diag_val; + // copy the upper triangular entries to the transpose + vals[transpose_idxs[upper_nz]] = conj(vals[upper_nz]); } - sum = reduce(warp, sum, thrust::plus{}); if (lane == 0) { // store computed diagonal - vals[row_diag] = sqrt(vals[row_diag] - sum); + vals[row_diag] = diag_val; } scheduler.mark_ready(); } @@ -365,10 +351,9 @@ void factorize(std::shared_ptr exec, kernel::factorize<<get_stream()>>>( factors->get_const_row_ptrs(), factors->get_const_col_idxs(), - forest.child_ptrs.get_const_data(), - forest.children.get_const_data(), lookup_offsets, lookup_storage, - lookup_descs, diag_idxs, transpose_idxs, - as_device_type(factors->get_values()), storage, num_rows); + lookup_offsets, lookup_storage, lookup_descs, diag_idxs, + transpose_idxs, as_device_type(factors->get_values()), storage, + num_rows); } } From 4d28ade0809b89d7ae7fa8a99d2de1d91485f1f4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 21 Jul 2023 17:59:18 +0200 Subject: [PATCH 087/583] adds check that downstream compiler match the ginkgo compiler --- cmake/GinkgoConfig.cmake.in | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 44aaf34fc3f..5194d76b5af 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -63,7 +63,6 @@ set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) set(GINKGO_CUDA_ARCHITECTURES "@GINKGO_CUDA_ARCHITECTURES@") set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@) -set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") set(GINKGO_CUDA_ARCH_FLAGS "@GINKGO_CUDA_ARCH_FLAGS@") set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") @@ -91,6 +90,15 @@ set(GINKGO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@) set(GINKGO_HAVE_ROCTX @GINKGO_HAVE_ROCTX@) +# Ginkgo compiler information +set(GINKGO_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +set(GINKGO_CXX_COMPILER_SHORT "@CMAKE_CXX_COMPILER_ID@:@CMAKE_CXX_COMPILER_VERSION@") +set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") +set(GINKGO_CUDA_COMPILER_SHORT "@CMAKE_CUDA_COMPILER_ID@:@CMAKE_CUDA_COMPILER_VERSION@") +set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") +set(GINKGO_CUDA_HOST_COMPILER_SHORT "") # dummy value to stay consistent +set(GINKGO_HIP_COMPILER "@HIP_HIPCC@") + # Ginkgo installation configuration set(GINKGO_INSTALL_PREFIX "@PACKAGE_CMAKE_INSTALL_PREFIX@") set(GINKGO_INSTALL_INCLUDE_DIR 
"@PACKAGE_CMAKE_INSTALL_FULL_INCLUDEDIR@") @@ -107,7 +115,6 @@ if(GINKGO_BUILD_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${GINKGO_INSTALL_PREFIX}") - set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@") set(GINKGO_INTERFACE_LINK_FLAGS "@GINKGO_INTERFACE_LINK_FLAGS@") set(GINKGO_INTERFACE_CXX_FLAGS "@GINKGO_INTERFACE_CXX_FLAGS@") @@ -207,4 +214,26 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_HAVE_TAU) find_package(PerfStubs REQUIRED) endif() +# Check that the same compilers as for Ginkgo are used +function(_ginkgo_check_compiler lang) + if(NOT ${CMAKE_${lang}_COMPILER} STREQUAL ${GINKGO_${lang}_COMPILER}) + set(_compiler_short "${CMAKE_${lang}_COMPILER_ID}:${CMAKE_${lang}_COMPILER_VERSION}") + if(NOT _compiler_short STREQUAL ${GINKGO_${lang}_COMPILER_SHORT}) + message(WARNING "The currently used ${lang} compiler: ${CMAKE_${lang}_COMPILER} does not match the compiler used to " + "build Ginkgo: ${GINKGO_${lang}_COMPILER}. It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.") + endif() + endif() +endfunction() +_ginkgo_check_compiler(CXX) +if(GINKGO_BUILD_CUDA) + _ginkgo_check_compiler(CUDA) +endif() +if(GINKGO_BUILD_HIP) + _ginkgo_check_compiler(HIP) + if(NOT HIP_HIPCC STREQUAL ${GINKGO_HIP_COMPILER}) + message(WARNING "The currently used HIP compiler: ${HIP_HIPCC} does not match the compiler used to " + "build Ginkgo: ${GINKGO_HIP_COMPILER}. It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.") + endif() +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) From d40aca8108b0610a760acb61ab7161e34de58838 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 10:29:11 +0200 Subject: [PATCH 088/583] don't check hip compiler until cmake update --- cmake/GinkgoConfig.cmake.in | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 5194d76b5af..ea251a64b86 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -97,7 +97,6 @@ set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(GINKGO_CUDA_COMPILER_SHORT "@CMAKE_CUDA_COMPILER_ID@:@CMAKE_CUDA_COMPILER_VERSION@") set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") set(GINKGO_CUDA_HOST_COMPILER_SHORT "") # dummy value to stay consistent -set(GINKGO_HIP_COMPILER "@HIP_HIPCC@") # Ginkgo installation configuration set(GINKGO_INSTALL_PREFIX "@PACKAGE_CMAKE_INSTALL_PREFIX@") @@ -228,12 +227,5 @@ _ginkgo_check_compiler(CXX) if(GINKGO_BUILD_CUDA) _ginkgo_check_compiler(CUDA) endif() -if(GINKGO_BUILD_HIP) - _ginkgo_check_compiler(HIP) - if(NOT HIP_HIPCC STREQUAL ${GINKGO_HIP_COMPILER}) - message(WARNING "The currently used HIP compiler: ${HIP_HIPCC} does not match the compiler used to " - "build Ginkgo: ${GINKGO_HIP_COMPILER}. 
It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.") - endif() -endif() include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) From 00d5cf6519730688c2d036af481aa5d7e9bc3306 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 10:29:23 +0200 Subject: [PATCH 089/583] also check cuda host compiler --- cmake/GinkgoConfig.cmake.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index ea251a64b86..fe2ac05d7e5 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -217,7 +217,7 @@ endif() function(_ginkgo_check_compiler lang) if(NOT ${CMAKE_${lang}_COMPILER} STREQUAL ${GINKGO_${lang}_COMPILER}) set(_compiler_short "${CMAKE_${lang}_COMPILER_ID}:${CMAKE_${lang}_COMPILER_VERSION}") - if(NOT _compiler_short STREQUAL ${GINKGO_${lang}_COMPILER_SHORT}) + if(NOT _compiler_short STREQUAL "${GINKGO_${lang}_COMPILER_SHORT}") message(WARNING "The currently used ${lang} compiler: ${CMAKE_${lang}_COMPILER} does not match the compiler used to " "build Ginkgo: ${GINKGO_${lang}_COMPILER}. It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.") endif() @@ -226,6 +226,7 @@ endfunction() _ginkgo_check_compiler(CXX) if(GINKGO_BUILD_CUDA) _ginkgo_check_compiler(CUDA) + _ginkgo_check_compiler(CUDA_HOST) endif() include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) From e9b9f68bb8d89148a2f83c542b69a738b03cdaa4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 31 Jul 2023 16:51:33 +0200 Subject: [PATCH 090/583] messy approach to splitting up files --- CMakeLists.txt | 2 + cmake/template_instantiation.cmake | 60 ++++++++++ common/CMakeLists.txt | 34 +----- common/unified/CMakeLists.txt | 34 ++++++ .../matrix/dense_kernels.instantiate.cpp | 108 ++++++++++++++++++ .../{dense_kernels.cpp => dense_kernels.tpp} | 83 -------------- omp/CMakeLists.txt | 1 + 7 files changed, 208 insertions(+), 114 deletions(-) create mode 100644 cmake/template_instantiation.cmake create mode 100644 common/unified/CMakeLists.txt create mode 100644 common/unified/matrix/dense_kernels.instantiate.cpp rename common/unified/matrix/{dense_kernels.cpp => dense_kernels.tpp} (87%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6351ce98bfa..809c39991bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,6 +304,8 @@ configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in # propagated to the other parts of Ginkgo in case of building as static libraries add_subdirectory(devices) # Basic device functionalities. Always compiled. 
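# The files listed in GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES are generated at build time
# by add_instantiation_files() (cmake/template_instantiation.cmake), which is why they are
# marked as GENERATED below before the backend directories compile them; the message()
# call below is presumably only a temporary debug print.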
add_subdirectory(common) # Import list of unified kernel source files +set_source_files_properties(${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} PROPERTIES GENERATED 1) +message("${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES}") if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() diff --git a/cmake/template_instantiation.cmake b/cmake/template_instantiation.cmake new file mode 100644 index 00000000000..af5c395279c --- /dev/null +++ b/cmake/template_instantiation.cmake @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.13) +function(add_instantiation_files source_file output_files_var) + file(READ "${source_file}" file_contents) + string(REPLACE ";" "" file_contents "${file_contents}") + string(REGEX REPLACE "[\r\n]" ";" file_contents "${file_contents}") + set(begin_location) + set(end_location) + set(split_locations) + list(LENGTH file_contents total_length) + set(counter 0) + foreach(line IN LISTS file_contents) + if(line MATCHES "// begin") + set(begin_location ${counter}) + elseif(line MATCHES "// split") + list(APPEND split_locations ${counter}) + elseif(line MATCHES "// end") + set(end_location ${counter}) + endif() + math(EXPR counter "${counter} + 1") + endforeach() + if (NOT (begin_location AND end_location AND split_locations)) + message(FATAL_ERROR "Nothing to split") + endif() + if (begin_location GREATER_EQUAL end_location) + message(FATAL_ERROR "Incorrect begin/end order") + endif() + set(range_begins ${begin_location} ${split_locations}) + set(range_ends ${split_locations} ${end_location}) + list(LENGTH begin_locations range_count) + list(LENGTH split_locations range_count_minus_one) + math(EXPR length_header "${begin_location}") + math(EXPR end_location_past "${end_location} + 1") + math(EXPR length_footer "${total_length} - ${end_location_past}") + list(SUBLIST file_contents 0 ${length_header} header) + list(SUBLIST file_contents ${end_location_past} ${length_footer} footer) + set(output_files) + foreach(range RANGE 0 ${range_count_minus_one}) + set(filename "${source_file}.${range}.cpp") + list(APPEND output_files "${filename}") + list(GET range_begins ${range} begin) + list(GET range_ends ${range} end) + math(EXPR begin "${begin} + 1") + math(EXPR length "${end} - ${begin}") + list(SUBLIST file_contents ${begin} ${length} content) + string(REPLACE ";" "\n" content "${header};${content};${footer}") + string(REPLACE "" ";" content "${content}") + # create a .tmp file, but only copy it over if source file changed + # this way, we don't rebuild unnecessarily + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${filename}.tmp" "${content}") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${filename}" + COMMAND ${CMAKE_COMMAND} + -E copy "${CMAKE_CURRENT_BINARY_DIR}/${filename}.tmp" + "${CMAKE_CURRENT_BINARY_DIR}/${filename}" + MAIN_DEPENDENCY "${source_file}") + endforeach() + # lazy workaround to make cmake generation depend on the source file + configure_file("${source_file}", "${source_file}.tmp" COPYONLY) + set(${output_files_var} ${output_files} PARENT_SCOPE) +endfunction() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3a7cb1ceb15..8512e05d07a 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,31 +1,3 @@ -set(UNIFIED_SOURCES - base/device_matrix_data_kernels.cpp - base/index_set_kernels.cpp - components/absolute_array_kernels.cpp - components/fill_array_kernels.cpp - components/format_conversion_kernels.cpp - components/precision_conversion_kernels.cpp - components/reduce_array_kernels.cpp - 
distributed/partition_kernels.cpp - matrix/coo_kernels.cpp - matrix/csr_kernels.cpp - matrix/dense_kernels.cpp - matrix/ell_kernels.cpp - matrix/hybrid_kernels.cpp - matrix/sellp_kernels.cpp - matrix/sparsity_csr_kernels.cpp - matrix/diagonal_kernels.cpp - multigrid/pgm_kernels.cpp - preconditioner/jacobi_kernels.cpp - solver/bicg_kernels.cpp - solver/bicgstab_kernels.cpp - solver/cg_kernels.cpp - solver/cgs_kernels.cpp - solver/common_gmres_kernels.cpp - solver/fcg_kernels.cpp - solver/gcr_kernels.cpp - solver/gmres_kernels.cpp - solver/ir_kernels.cpp - ) -list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/unified/) -set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) +add_subdirectory(unified) +set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) +set(GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES ${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} PARENT_SCOPE) diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt new file mode 100644 index 00000000000..a9f45c63f13 --- /dev/null +++ b/common/unified/CMakeLists.txt @@ -0,0 +1,34 @@ +include(../../cmake/template_instantiation.cmake) +add_instantiation_files(matrix/dense_kernels.instantiate.cpp UNIFIED_INSTANTIATE_SOURCES) +set(UNIFIED_SOURCES + base/device_matrix_data_kernels.cpp + base/index_set_kernels.cpp + components/absolute_array_kernels.cpp + components/fill_array_kernels.cpp + components/format_conversion_kernels.cpp + components/precision_conversion_kernels.cpp + components/reduce_array_kernels.cpp + distributed/partition_kernels.cpp + matrix/coo_kernels.cpp + matrix/csr_kernels.cpp + matrix/ell_kernels.cpp + matrix/hybrid_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + matrix/diagonal_kernels.cpp + multigrid/pgm_kernels.cpp + preconditioner/jacobi_kernels.cpp + solver/bicg_kernels.cpp + solver/bicgstab_kernels.cpp + solver/cg_kernels.cpp + solver/cgs_kernels.cpp + solver/common_gmres_kernels.cpp + solver/fcg_kernels.cpp + solver/gcr_kernels.cpp + solver/gmres_kernels.cpp + solver/ir_kernels.cpp + ) +list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) +list(TRANSFORM UNIFIED_INSTANTIATE_SOURCES PREPEND ${CMAKE_CURRENT_BINARY_DIR}/) +set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) +set(GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES ${UNIFIED_INSTANTIATE_SOURCES} PARENT_SCOPE) \ No newline at end of file diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp new file mode 100644 index 00000000000..92d9fa26a00 --- /dev/null +++ b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -0,0 +1,108 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "common/unified/matrix/dense_kernels.tpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace dense { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( + GKO_DECLARE_DENSE_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_INV_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + 
GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +// end + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.tpp similarity index 87% rename from common/unified/matrix/dense_kernels.cpp rename to common/unified/matrix/dense_kernels.tpp index 18d2fbabe6c..b6ed5fb37e0 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.tpp @@ -67,9 +67,6 @@ void copy(std::shared_ptr exec, input->get_size(), input, output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( - GKO_DECLARE_DENSE_COPY_KERNEL); - template void fill(std::shared_ptr exec, @@ -83,8 +80,6 @@ void fill(std::shared_ptr exec, mat->get_size(), mat, value); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); - template void fill_in_matrix_data(std::shared_ptr exec, @@ -100,9 +95,6 @@ void fill_in_matrix_data(std::shared_ptr exec, data.get_const_col_idxs(), data.get_const_values(), output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); - template void scale(std::shared_ptr exec, @@ -125,8 +117,6 @@ void scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); - template void inv_scale(std::shared_ptr exec, @@ -150,9 +140,6 @@ void inv_scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_INV_SCALE_KERNEL); - template void add_scaled(std::shared_ptr exec, @@ -176,9 +163,6 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); - template void sub_scaled(std::shared_ptr exec, @@ -202,9 +186,6 @@ void sub_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); - template void add_scaled_diag(std::shared_ptr exec, @@ -221,8 +202,6 @@ void add_scaled_diag(std::shared_ptr exec, x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); - template void sub_scaled_diag(std::shared_ptr exec, @@ -239,8 +218,6 @@ void sub_scaled_diag(std::shared_ptr exec, x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); - template void compute_dot(std::shared_ptr exec, @@ -257,8 +234,6 @@ void compute_dot(std::shared_ptr exec, tmp, x, y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - template void compute_conj_dot(std::shared_ptr exec, @@ -275,8 +250,6 @@ void compute_conj_dot(std::shared_ptr exec, tmp, x, y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - template void compute_norm2(std::shared_ptr exec, @@ -292,8 +265,6 @@ void compute_norm2(std::shared_ptr exec, result->get_values(), x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, @@ -306,8 
+277,6 @@ void compute_norm1(std::shared_ptr exec, x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); - template void compute_max_nnz_per_row(std::shared_ptr exec, @@ -325,9 +294,6 @@ void compute_max_nnz_per_row(std::shared_ptr exec, source->get_size()[0]); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); - template void compute_slice_sets(std::shared_ptr exec, @@ -357,9 +323,6 @@ void compute_slice_sets(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); - template void count_nonzeros_per_row(std::shared_ptr exec, @@ -374,11 +337,6 @@ void count_nonzeros_per_row(std::shared_ptr exec, GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); - template void compute_squared_norm2(std::shared_ptr exec, @@ -393,9 +351,6 @@ void compute_squared_norm2(std::shared_ptr exec, x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); - template void compute_sqrt(std::shared_ptr exec, @@ -409,8 +364,6 @@ void compute_sqrt(std::shared_ptr exec, x->get_size(), x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); - template void symm_permute(std::shared_ptr exec, @@ -426,9 +379,6 @@ void symm_permute(std::shared_ptr exec, orig->get_size(), orig, *permutation_indices, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); - template void inv_symm_permute(std::shared_ptr exec, @@ -444,9 +394,6 @@ void inv_symm_permute(std::shared_ptr exec, orig->get_size(), orig, *permutation_indices, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); - template void row_gather(std::shared_ptr exec, @@ -463,9 +410,6 @@ void row_gather(std::shared_ptr exec, row_collection); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); - template void advanced_row_gather(std::shared_ptr exec, @@ -490,9 +434,6 @@ void advanced_row_gather(std::shared_ptr exec, row_collection); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); - template void column_permute(std::shared_ptr exec, @@ -508,9 +449,6 @@ void column_permute(std::shared_ptr exec, orig->get_size(), orig, *permutation_indices, column_permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); - template void inverse_row_permute(std::shared_ptr exec, @@ -526,9 +464,6 @@ void inverse_row_permute(std::shared_ptr exec, orig->get_size(), orig, *permutation_indices, row_permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); - template void inverse_column_permute(std::shared_ptr exec, @@ -544,9 +479,6 @@ void inverse_column_permute(std::shared_ptr exec, orig->get_size(), orig, *permutation_indices, column_permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); - template void extract_diagonal(std::shared_ptr exec, @@ -559,8 +491,6 @@ void extract_diagonal(std::shared_ptr exec, diag->get_size()[0], orig, diag->get_values()); 
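    // the explicit instantiation that used to follow this kernel definition now lives in
    // dense_kernels.instantiate.cpp together with the other instantiation macros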
} -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); - template void inplace_absolute_dense(std::shared_ptr exec, @@ -574,8 +504,6 @@ void inplace_absolute_dense(std::shared_ptr exec, source->get_size(), source); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); - template void outplace_absolute_dense(std::shared_ptr exec, @@ -590,8 +518,6 @@ void outplace_absolute_dense(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); - template void make_complex(std::shared_ptr exec, @@ -606,8 +532,6 @@ void make_complex(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); - template void get_real(std::shared_ptr exec, @@ -622,8 +546,6 @@ void get_real(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); - template void get_imag(std::shared_ptr exec, @@ -638,8 +560,6 @@ void get_imag(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); - template void add_scaled_identity(std::shared_ptr exec, @@ -659,9 +579,6 @@ void add_scaled_identity(std::shared_ptr exec, mtx); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); - } // namespace dense } // namespace GKO_DEVICE_NAMESPACE diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 6499e3b49d4..74e5e5b8806 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(ginkgo_omp stop/criterion_kernels.cpp stop/residual_norm_kernels.cpp ${GKO_UNIFIED_COMMON_SOURCES} + ${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} ) ginkgo_compile_features(ginkgo_omp) From 72367a856a612d29e988bada2592646020ed7597 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 31 Jul 2023 19:27:28 +0200 Subject: [PATCH 091/583] clean up dependency structure --- CMakeLists.txt | 1 - cmake/template_instantiation.cmake | 34 ++++++++++++++++++------------ common/CMakeLists.txt | 1 - common/unified/CMakeLists.txt | 6 +----- dpcpp/CMakeLists.txt | 3 +++ omp/CMakeLists.txt | 2 +- 6 files changed, 26 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 809c39991bb..9e625892c3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,7 +304,6 @@ configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in # propagated to the other parts of Ginkgo in case of building as static libraries add_subdirectory(devices) # Basic device functionalities. Always compiled. 
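# The OpenMP backend now compiles the full instantiate.cpp directly and the DPC++ backend
# generates its split copies via add_instantiation_files() in its own directory (see the
# dpcpp/CMakeLists.txt hunk below), so the GENERATED property is no longer set at the top
# level.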
add_subdirectory(common) # Import list of unified kernel source files -set_source_files_properties(${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} PROPERTIES GENERATED 1) message("${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES}") if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs diff --git a/cmake/template_instantiation.cmake b/cmake/template_instantiation.cmake index af5c395279c..bc37d895537 100644 --- a/cmake/template_instantiation.cmake +++ b/cmake/template_instantiation.cmake @@ -1,8 +1,11 @@ -cmake_minimum_required(VERSION 3.13) -function(add_instantiation_files source_file output_files_var) - file(READ "${source_file}" file_contents) +function(add_instantiation_files source_dir source_file output_files_var) + # read full file into variable + set(source_path "${source_dir}/${source_file}") + file(READ "${source_path}" file_contents) + # escape semicolons and use them for line separation string(REPLACE ";" "" file_contents "${file_contents}") string(REGEX REPLACE "[\r\n]" ";" file_contents "${file_contents}") + # find location of // begin|split|end comments set(begin_location) set(end_location) set(split_locations) @@ -24,6 +27,7 @@ function(add_instantiation_files source_file output_files_var) if (begin_location GREATER_EQUAL end_location) message(FATAL_ERROR "Incorrect begin/end order") endif() + # determine which lines belong to the header and footer set(range_begins ${begin_location} ${split_locations}) set(range_ends ${split_locations} ${end_location}) list(LENGTH begin_locations range_count) @@ -34,27 +38,31 @@ function(add_instantiation_files source_file output_files_var) list(SUBLIST file_contents 0 ${length_header} header) list(SUBLIST file_contents ${end_location_past} ${length_footer} footer) set(output_files) + # for each range between // begin|split|end pairs foreach(range RANGE 0 ${range_count_minus_one}) - set(filename "${source_file}.${range}.cpp") - list(APPEND output_files "${filename}") + # create an output filename + string(REGEX REPLACE "(\.hip\.cpp|\.dp\.cpp|\.cpp|\.cu)$" ".${range}\\1" target_file "${source_file}") + set(target_path "${CMAKE_CURRENT_BINARY_DIR}/${target_file}") + list(APPEND output_files "${target_path}") + # extract the range between the comments list(GET range_begins ${range} begin) list(GET range_ends ${range} end) math(EXPR begin "${begin} + 1") math(EXPR length "${end} - ${begin}") list(SUBLIST file_contents ${begin} ${length} content) + # concatenate header, content and footer and turn semicolons into newlines string(REPLACE ";" "\n" content "${header};${content};${footer}") + # and escaped semicolons into regular semicolons again string(REPLACE "" ";" content "${content}") # create a .tmp file, but only copy it over if source file changed # this way, we don't rebuild unnecessarily - file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${filename}.tmp" "${content}") + file(WRITE "${target_path}.tmp" "${content}") add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${filename}" - COMMAND ${CMAKE_COMMAND} - -E copy "${CMAKE_CURRENT_BINARY_DIR}/${filename}.tmp" - "${CMAKE_CURRENT_BINARY_DIR}/${filename}" - MAIN_DEPENDENCY "${source_file}") + OUTPUT "${target_path}" + COMMAND ${CMAKE_COMMAND} -E copy "${target_path}.tmp" "${target_path}" + MAIN_DEPENDENCY "${source_path}") endforeach() - # lazy workaround to make cmake generation depend on the source file - configure_file("${source_file}", "${source_file}.tmp" COPYONLY) + # make sure cmake gets called when the source file was updated + set_property(DIRECTORY APPEND PROPERTY 
CMAKE_CONFIGURE_DEPENDS "${source_path}") set(${output_files_var} ${output_files} PARENT_SCOPE) endfunction() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 8512e05d07a..77bdd7230b9 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(unified) set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) -set(GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES ${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} PARENT_SCOPE) diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt index a9f45c63f13..5a37eb022f9 100644 --- a/common/unified/CMakeLists.txt +++ b/common/unified/CMakeLists.txt @@ -1,5 +1,3 @@ -include(../../cmake/template_instantiation.cmake) -add_instantiation_files(matrix/dense_kernels.instantiate.cpp UNIFIED_INSTANTIATE_SOURCES) set(UNIFIED_SOURCES base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp @@ -29,6 +27,4 @@ set(UNIFIED_SOURCES solver/ir_kernels.cpp ) list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) -list(TRANSFORM UNIFIED_INSTANTIATE_SOURCES PREPEND ${CMAKE_CURRENT_BINARY_DIR}/) -set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) -set(GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES ${UNIFIED_INSTANTIATE_SOURCES} PARENT_SCOPE) \ No newline at end of file +set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) \ No newline at end of file diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 31b5e0543ba..b33b63d4af9 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -3,6 +3,8 @@ set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) +include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE) add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -55,6 +57,7 @@ target_sources(ginkgo_dpcpp stop/criterion_kernels.dp.cpp stop/residual_norm_kernels.dp.cpp ${GKO_UNIFIED_COMMON_SOURCES} + ${DENSE_INSTANTIATE} ) # TODO: adjust it when dpcpp jacobi supports more block size diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 74e5e5b8806..50f46cd23cd 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -39,7 +39,7 @@ target_sources(ginkgo_omp stop/criterion_kernels.cpp stop/residual_norm_kernels.cpp ${GKO_UNIFIED_COMMON_SOURCES} - ${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES} + ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp ) ginkgo_compile_features(ginkgo_omp) From 08202a03b48b67467aa33e6a5f8d6c531b4ac5e3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 31 Jul 2023 20:07:37 +0000 Subject: [PATCH 092/583] split CUDA and HIP (fb)csr_kernels compilation --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 5 - common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 18 ---- cuda/CMakeLists.txt | 9 +- cuda/matrix/csr_kernels.instantiate.cu | 99 +++++++++++++++++++ ...csr_kernels.cu => csr_kernels.template.cu} | 56 ----------- cuda/matrix/fbcsr_kernels.instantiate.cu | 75 ++++++++++++++ ...r_kernels.cu => fbcsr_kernels.template.cu} | 11 --- hip/CMakeLists.txt | 9 +- hip/matrix/csr_kernels.instantiate.hip.cpp | 99 +++++++++++++++++++ ...s.hip.cpp => csr_kernels.template.hip.cpp} | 56 ----------- hip/matrix/fbcsr_kernels.instantiate.hip.cpp | 75 ++++++++++++++ ...hip.cpp => fbcsr_kernels.template.hip.cpp} | 11 --- omp/CMakeLists.txt | 3 +- 13 files changed, 364 insertions(+), 162 deletions(-) 
create mode 100644 cuda/matrix/csr_kernels.instantiate.cu rename cuda/matrix/{csr_kernels.cu => csr_kernels.template.cu} (96%) create mode 100644 cuda/matrix/fbcsr_kernels.instantiate.cu rename cuda/matrix/{fbcsr_kernels.cu => fbcsr_kernels.template.cu} (97%) create mode 100644 hip/matrix/csr_kernels.instantiate.hip.cpp rename hip/matrix/{csr_kernels.hip.cpp => csr_kernels.template.hip.cpp} (96%) create mode 100644 hip/matrix/fbcsr_kernels.instantiate.hip.cpp rename hip/matrix/{fbcsr_kernels.hip.cpp => fbcsr_kernels.template.hip.cpp} (96%) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 1fca1ee7215..c370075c8a8 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -937,9 +937,6 @@ void convert_to_fbcsr(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - namespace kernel { @@ -1122,8 +1119,6 @@ void build_lookup(std::shared_ptr exec, storage); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); - template void fallback_transpose(std::shared_ptr exec, diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 27314c06a59..d71d593b0a2 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -238,9 +238,6 @@ void fill_in_matrix_data(std::shared_ptr exec, }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); - namespace kernel { @@ -323,9 +320,6 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); - template void convert_to_csr(const std::shared_ptr exec, @@ -345,9 +339,6 @@ void convert_to_csr(const std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); - template void is_sorted_by_column_index( @@ -372,23 +363,14 @@ void is_sorted_by_column_index( *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); - template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); - template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Diagonal* diag) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index aecf4e1c2f2..6cfb83a59e8 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,4 +1,9 @@ add_library(ginkgo_cuda $ "") +include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) +add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) +# we don't split up the dense kernels into distinct compliations +list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE base/device.cpp @@ -31,11 +36,11 @@ target_sources(ginkgo_cuda factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu matrix/coo_kernels.cu - matrix/csr_kernels.cu + ${CSR_INSTANTIATE} matrix/dense_kernels.cu matrix/diagonal_kernels.cu matrix/ell_kernels.cu - matrix/fbcsr_kernels.cu + ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu matrix/sellp_kernels.cu matrix/sparsity_csr_kernels.cu diff --git a/cuda/matrix/csr_kernels.instantiate.cu b/cuda/matrix/csr_kernels.instantiate.cu new file mode 100644 index 00000000000..75747bf074b --- /dev/null +++ b/cuda/matrix/csr_kernels.instantiate.cu @@ -0,0 +1,99 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/matrix/csr_kernels.template.cu" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Compressed sparse row matrix format namespace. 
+ * + * @ingroup csr + */ +namespace csr { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +// end + + +} // namespace csr +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.template.cu similarity index 96% rename from cuda/matrix/csr_kernels.cu rename to cuda/matrix/csr_kernels.template.cu index 619ead5bbbb..1b4b20a1e75 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -533,9 +533,6 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SPMV_KERNEL); - template @@ -598,9 +595,6 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); - template void spgemm(std::shared_ptr exec, @@ -724,8 +718,6 @@ void spgemm(std::shared_ptr exec, #endif // CUDA_VERSION >= 11000 } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); - namespace { @@ -920,9 +912,6 @@ void advanced_spgemm(std::shared_ptr exec, #endif // CUDA_VERSION >= 11000 } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); - template void spgeam(std::shared_ptr exec, @@ -948,8 +937,6 @@ void spgeam(std::shared_ptr exec, b->get_const_col_idxs(), b->get_const_values(), c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); - template void fill_in_dense(std::shared_ptr exec, @@ -972,9 +959,6 @@ void 
fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); - template void transpose(std::shared_ptr exec, @@ -1024,8 +1008,6 @@ void transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); - template void conj_transpose(std::shared_ptr exec, @@ -1083,9 +1065,6 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); - template void inv_symm_permute(std::shared_ptr exec, @@ -1116,9 +1095,6 @@ void inv_symm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); - template void row_permute(std::shared_ptr exec, @@ -1149,9 +1125,6 @@ void row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); - template void inverse_row_permute(std::shared_ptr exec, @@ -1182,9 +1155,6 @@ void inverse_row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); - template void calculate_nonzeros_per_row_in_span( @@ -1204,9 +1174,6 @@ void calculate_nonzeros_per_row_in_span( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); - template void compute_submatrix(std::shared_ptr exec, @@ -1233,9 +1200,6 @@ void compute_submatrix(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); - template void calculate_nonzeros_per_row_in_index_set( @@ -1245,9 +1209,6 @@ void calculate_nonzeros_per_row_in_index_set( const gko::index_set& col_index_set, IndexType* row_nnz) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); - template void compute_submatrix_from_index_set( @@ -1257,9 +1218,6 @@ void compute_submatrix_from_index_set( const gko::index_set& col_index_set, matrix::Csr* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); - template void sort_by_column_index(std::shared_ptr exec, @@ -1312,9 +1270,6 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); - template void is_sorted_by_column_index( @@ -1336,9 +1291,6 @@ void is_sorted_by_column_index( cpu_array = gpu_array; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); - template void extract_diagonal(std::shared_ptr exec, @@ -1364,8 +1316,6 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); - template void check_diagonal_entries_exist( @@ -1389,9 +1339,6 @@ void check_diagonal_entries_exist( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); - template void add_scaled_identity(std::shared_ptr exec, @@ -1413,9 +1360,6 @@ void add_scaled_identity(std::shared_ptr exec, as_device_type(mtx->get_values())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); - } // namespace csr } // namespace cuda diff --git a/cuda/matrix/fbcsr_kernels.instantiate.cu b/cuda/matrix/fbcsr_kernels.instantiate.cu new file mode 100644 index 00000000000..73c3fc136ba --- /dev/null +++ 
b/cuda/matrix/fbcsr_kernels.instantiate.cu @@ -0,0 +1,75 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/matrix/fbcsr_kernels.template.cu" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The fixed-size block compressed sparse row matrix format namespace. 
+ * + * @ingroup fbcsr + */ +namespace fbcsr { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); +// end + + +} // namespace fbcsr +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.template.cu similarity index 97% rename from cuda/matrix/fbcsr_kernels.cu rename to cuda/matrix/fbcsr_kernels.template.cu index 8160a0ac5a5..c629b292bfb 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.template.cu @@ -180,8 +180,6 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); - template void advanced_spmv(std::shared_ptr exec, @@ -240,9 +238,6 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); - namespace { @@ -305,9 +300,6 @@ void transpose(const std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); - template void conj_transpose(std::shared_ptr exec, @@ -325,9 +317,6 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); - } // namespace fbcsr } // namespace cuda diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 61b06ad4058..7e0558844cf 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,3 +1,8 @@ +include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) +add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) +# we don't split up the dense kernels into distinct compliations +list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES base/device.hip.cpp base/device_matrix_data_kernels.hip.cpp @@ -29,11 +34,11 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp matrix/coo_kernels.hip.cpp - matrix/csr_kernels.hip.cpp + ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp matrix/diagonal_kernels.hip.cpp matrix/ell_kernels.hip.cpp - matrix/fbcsr_kernels.hip.cpp + ${FBCSR_INSTANTIATE} matrix/sellp_kernels.hip.cpp matrix/sparsity_csr_kernels.hip.cpp multigrid/pgm_kernels.hip.cpp diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp new file mode 100644 index 00000000000..498f3ec1795 --- /dev/null +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -0,0 +1,99 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "hip/matrix/csr_kernels.template.hip.cpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Compressed sparse row matrix format namespace. 
+ * + * @ingroup csr + */ +namespace csr { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +// end + + +} // namespace csr +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp similarity index 96% rename from hip/matrix/csr_kernels.hip.cpp rename to hip/matrix/csr_kernels.template.hip.cpp index b18cfa0f12b..e6a4fb64041 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -493,9 +493,6 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SPMV_KERNEL); - template @@ -558,9 +555,6 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); - template void spgemm(std::shared_ptr exec, @@ -634,8 +628,6 @@ void spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); - namespace { @@ -775,9 +767,6 @@ void advanced_spgemm(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); - template void spgeam(std::shared_ptr exec, @@ -803,8 +792,6 @@ void spgeam(std::shared_ptr exec, b->get_const_col_idxs(), b->get_const_values(), c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); - template void fill_in_dense(std::shared_ptr exec, @@ -827,9 +814,6 @@ void fill_in_dense(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); - template void transpose(std::shared_ptr exec, @@ -854,8 +838,6 @@ void transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); - template void conj_transpose(std::shared_ptr exec, @@ -888,9 +870,6 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); - template void inv_symm_permute(std::shared_ptr exec, @@ -921,9 +900,6 @@ void inv_symm_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); - template void row_permute(std::shared_ptr exec, const IndexType* perm, @@ -953,9 +929,6 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); - template void inverse_row_permute(std::shared_ptr exec, @@ -986,9 +959,6 @@ void inverse_row_permute(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); - template void calculate_nonzeros_per_row_in_span( @@ -1009,9 +979,6 @@ void calculate_nonzeros_per_row_in_span( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); - template void compute_submatrix(std::shared_ptr exec, @@ -1038,9 +1005,6 @@ void compute_submatrix(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); - template void calculate_nonzeros_per_row_in_index_set( @@ -1050,9 +1014,6 @@ void calculate_nonzeros_per_row_in_index_set( const gko::index_set& col_index_set, IndexType* row_nnz) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); - template void compute_submatrix_from_index_set( @@ -1062,9 +1023,6 @@ void compute_submatrix_from_index_set( const gko::index_set& col_index_set, matrix::Csr* result) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); - template void sort_by_column_index(std::shared_ptr exec, @@ -1110,9 +1068,6 @@ void sort_by_column_index(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); - template void is_sorted_by_column_index( @@ -1134,9 +1089,6 @@ void is_sorted_by_column_index( cpu_array = gpu_array; } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); - template void extract_diagonal(std::shared_ptr exec, @@ -1161,8 +1113,6 @@ void extract_diagonal(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); - template void check_diagonal_entries_exist( @@ -1186,9 +1136,6 @@ void check_diagonal_entries_exist( } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); - template void add_scaled_identity(std::shared_ptr exec, @@ -1210,9 +1157,6 @@ void add_scaled_identity(std::shared_ptr exec, as_device_type(mtx->get_values())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); - } // namespace csr } // namespace hip diff --git a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp new file mode 100644 index 00000000000..8cf4944e08a --- /dev/null +++ 
b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp @@ -0,0 +1,75 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "hip/matrix/fbcsr_kernels.template.hip.cpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The fixed-size block compressed sparse row matrix format namespace. 
+ * + * @ingroup fbcsr + */ +namespace fbcsr { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); +// end + + +} // namespace fbcsr +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp similarity index 96% rename from hip/matrix/fbcsr_kernels.hip.cpp rename to hip/matrix/fbcsr_kernels.template.hip.cpp index 8a4d78e7e40..88cad66753c 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.template.hip.cpp @@ -182,8 +182,6 @@ void spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); - template void advanced_spmv(std::shared_ptr exec, @@ -242,9 +240,6 @@ void advanced_spmv(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); - template void transpose(const std::shared_ptr exec, @@ -254,9 +249,6 @@ void transpose(const std::shared_ptr exec, fallback_transpose(exec, input, output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); - template void conj_transpose(std::shared_ptr exec, @@ -274,9 +266,6 @@ void conj_transpose(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); - } // namespace fbcsr } // namespace hip diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 50f46cd23cd..d552cc612bf 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -1,4 +1,6 @@ add_library(ginkgo_omp $ "") +# we don't split up the dense kernels into distinct compliations +list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_omp PRIVATE base/device_matrix_data_kernels.cpp @@ -39,7 +41,6 @@ target_sources(ginkgo_omp stop/criterion_kernels.cpp stop/residual_norm_kernels.cpp ${GKO_UNIFIED_COMMON_SOURCES} - ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp ) ginkgo_compile_features(ginkgo_omp) From d83d35cee09f7074526afb4c522d4526a05998f4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 1 Aug 2023 16:02:54 +0200 Subject: [PATCH 093/583] improve formatting --- CMakeLists.txt | 1 - common/unified/matrix/dense_kernels.instantiate.cpp | 4 ++-- .../{dense_kernels.tpp => dense_kernels.template.cpp} | 0 dev_tools/scripts/config | 6 ++++++ 4 files changed, 8 insertions(+), 3 deletions(-) rename common/unified/matrix/{dense_kernels.tpp => dense_kernels.template.cpp} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e625892c3d..6351ce98bfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -304,7 +304,6 @@ configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in # propagated to the other parts of Ginkgo in case of building as static libraries add_subdirectory(devices) # Basic device functionalities. Always compiled. add_subdirectory(common) # Import list of unified kernel source files -message("${GKO_UNIFIED_COMMON_INSTANTIATE_SOURCES}") if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp index 92d9fa26a00..bf20c8a19b6 100644 --- a/common/unified/matrix/dense_kernels.instantiate.cpp +++ b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "common/unified/matrix/dense_kernels.tpp" +#include "common/unified/matrix/dense_kernels.template.cpp" namespace gko { @@ -105,4 +105,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( } // namespace dense } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/unified/matrix/dense_kernels.tpp b/common/unified/matrix/dense_kernels.template.cpp similarity index 100% rename from common/unified/matrix/dense_kernels.tpp rename to common/unified/matrix/dense_kernels.template.cpp diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 03b160e3656..937af4a31d1 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -32,6 +32,12 @@ - FixInclude: "common/unified/base/kernel_launch_solver.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "dense_kernels.template.cpp" + - FixInclude: "core/matrix/dense_kernels.hpp" +- "/csr_kernels.template.*" + - FixInclude: "core/matrix/csr_kernels.hpp" +- "/fbcsr_kernels.template.*" + - FixInclude: "core/matrix/fbcsr_kernels.hpp" - "test/base/kernel_launch_generic.cpp" - FixInclude: "common/unified/base/kernel_launch.hpp" - "^test/solver/(lower|upper)_trs_kernels.cpp" From bb065af2e0c7b0073d090c8ddd4ff91589f4bdd3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 1 Aug 2023 16:03:45 +0200 Subject: [PATCH 094/583] fix typos --- cuda/CMakeLists.txt | 2 +- hip/CMakeLists.txt | 2 +- omp/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 6cfb83a59e8..37d56e5855f 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -2,7 +2,7 @@ add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) -# we don't split up the dense kernels into distinct compliations +# we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 7e0558844cf..e433322e644 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,7 +1,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. 
matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) -# we don't split up the dense kernels into distinct compliations +# we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES base/device.hip.cpp diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index d552cc612bf..bda26ad63d3 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -1,5 +1,5 @@ add_library(ginkgo_omp $ "") -# we don't split up the dense kernels into distinct compliations +# we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_omp PRIVATE From 62d8f2abf6aaea7092d63d05f5cafd4baa5ab57c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 10:30:49 +0200 Subject: [PATCH 095/583] review updates - remove unused variables - warn on incorrect instantiation file format - allow disabling the template split - simpler format_header config entries Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Marcel Koch --- CMakeLists.txt | 1 + cmake/template_instantiation.cmake | 15 ++++++++++++++- dev_tools/scripts/config | 8 ++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6351ce98bfa..32552a77d6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,7 @@ set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING "Set the required HIP CLANG compiler flags. Current default is an empty string.") set(GINKGO_HIP_AMDGPU "" CACHE STRING "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).") +option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. 
This improves parallel build performance" ON) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) if(MSVC OR WIN32 OR CYGWIN OR APPLE) diff --git a/cmake/template_instantiation.cmake b/cmake/template_instantiation.cmake index bc37d895537..f77527e0092 100644 --- a/cmake/template_instantiation.cmake +++ b/cmake/template_instantiation.cmake @@ -1,4 +1,9 @@ function(add_instantiation_files source_dir source_file output_files_var) + # if instantiation is disabled, compile the file directly + if(NOT GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) + set(${output_files_var} "${source_dir}/${source_file}" PARENT_SCOPE) + return() + endif() # read full file into variable set(source_path "${source_dir}/${source_file}") file(READ "${source_path}" file_contents) @@ -13,10 +18,19 @@ function(add_instantiation_files source_dir source_file output_files_var) set(counter 0) foreach(line IN LISTS file_contents) if(line MATCHES "// begin") + if(begin_location) + message(FATAL_ERROR "Duplicate begin in line ${counter}, first found in ${begin_location}") + endif() set(begin_location ${counter}) elseif(line MATCHES "// split") + if((NOT begin_location) OR end_location) + message(FATAL_ERROR "Found split outside begin/end in line ${counter}") + endif() list(APPEND split_locations ${counter}) elseif(line MATCHES "// end") + if(end_location) + message(FATAL_ERROR "Duplicate end in line ${counter}, first found in ${end_location}") + endif() set(end_location ${counter}) endif() math(EXPR counter "${counter} + 1") @@ -30,7 +44,6 @@ function(add_instantiation_files source_dir source_file output_files_var) # determine which lines belong to the header and footer set(range_begins ${begin_location} ${split_locations}) set(range_ends ${split_locations} ${end_location}) - list(LENGTH begin_locations range_count) list(LENGTH split_locations range_count_minus_one) math(EXPR length_header "${begin_location}") math(EXPR end_location_past "${end_location} + 1") diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 937af4a31d1..79e6a227530 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -32,12 +32,6 @@ - FixInclude: "common/unified/base/kernel_launch_solver.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." 
- FixInclude: "common/unified/base/kernel_launch_solver.hpp" -- "dense_kernels.template.cpp" - - FixInclude: "core/matrix/dense_kernels.hpp" -- "/csr_kernels.template.*" - - FixInclude: "core/matrix/csr_kernels.hpp" -- "/fbcsr_kernels.template.*" - - FixInclude: "core/matrix/fbcsr_kernels.hpp" - "test/base/kernel_launch_generic.cpp" - FixInclude: "common/unified/base/kernel_launch.hpp" - "^test/solver/(lower|upper)_trs_kernels.cpp" @@ -57,6 +51,7 @@ - "common/unified/.*.cpp" - PathIgnore: "2" - PathPrefix: "core" + - CoreSuffix: "\.template" - "core/test/base/(extended_float|iterator_factory)" - RemoveTest: "true" - "core/test/base/allocator" @@ -102,3 +97,4 @@ - ".*" - PathPrefix: "core" - PathIgnore: "1" + - CoreSuffix: "\.template" From 4bebc266ef076af3f27035d813d38f6622bc644d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 11:16:45 +0200 Subject: [PATCH 096/583] mark switch as advanced --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32552a77d6a..4d70ac404ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,7 @@ set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING set(GINKGO_HIP_AMDGPU "" CACHE STRING "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).") option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) +mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) if(MSVC OR WIN32 OR CYGWIN OR APPLE) From 6a3ac15aa1850a856af2751afe47e39e147a7d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 11 Jul 2023 19:22:36 +0200 Subject: [PATCH 097/583] Fix OMP row reduction kernel The kernel requested more memory than necessary in most scenarios because of a faulty temporary storage estimation. --- omp/base/kernel_launch_reduction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d8d081e323b..a46ce970421 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -327,7 +327,7 @@ void run_kernel_col_reduction_sized_impl( const auto reduction_size = ceildiv(reduction_kernel_oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); - const auto required_storage = sizeof(ValueType) * rows * reduction_size; + const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } From 2c9332182305cbf29fe44035df9e039c3b6406b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Thu, 13 Jul 2023 16:01:44 +0200 Subject: [PATCH 098/583] Add specific tests for OMP reductions --- test/base/kernel_launch_generic.cpp | 168 ++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 3dd1570c5f8..cf07f867c82 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include #include #include @@ -364,6 +365,39 @@ void run1d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } +void run1d_reduction_cached(std::shared_ptr exec, + std::vector sizes) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& size : sizes) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + size, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(size)); + // The temporary storage (used for partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + } +} + +TEST_F(KernelLaunch, Reduction1DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); +} + + void run2d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; @@ -432,6 +466,47 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& dim : dims) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(dim[0] + dim[1])); + // The temporary storage (used for partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + } +} + +TEST_F(KernelLaunch, Reduction2DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_row_reduction(std::shared_ptr exec) { for (auto num_rows : {0, 100, 1000, 10000}) { @@ -481,6 +556,53 @@ void run2d_row_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } +void run2d_row_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + // This assumes at most 256 OpenMP Threads + constexpr int64_t max_tmp_elems = 4 * 256; + const size_type result_stride = 1; + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[0]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[1] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + result_stride, dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + 
ASSERT_LT(temp.get_num_elems() / sizeof(int64), + max_tmp_elems * max_tmp_elems); + } +} + +TEST_F(KernelLaunch, ReductionRowCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_row_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_col_reduction(std::shared_ptr exec) { // empty, most threads idle, most threads busy, multiple blocks @@ -530,3 +652,49 @@ void run2d_col_reduction(std::shared_ptr exec) } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + +void run2d_col_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[1]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[0] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + // This assumes at most 256 OpenMP Threads + const size_type temp_elem_limit = + std::max(size_type{4 * 256}, dim[0] * dim[1]); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + } +} + +TEST_F(KernelLaunch, ReductionColCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_col_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} From 585eea1273684024d0c852000bf5536326c3e43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Fri, 21 Jul 2023 15:26:07 +0200 Subject: [PATCH 099/583] Update reduction tests to all scale with size --- test/base/kernel_launch_generic.cpp | 68 +++++++++++++---------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index cf07f867c82..57bab96d9c0 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -373,11 +373,7 @@ void run1d_reduction_cached(std::shared_ptr exec, for (const auto& size : sizes) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i) { - static_assert(is_same::value, "index"); - return i + 1; - }, + exec, [] GKO_KERNEL(auto i) { return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), size, temp); @@ -469,17 +465,13 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { 
return j; }, int64{}, output.get_data(), dim, temp); @@ -487,19 +479,23 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array - ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, Reduction2DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -559,8 +555,9 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // This assumes at most 256 OpenMP Threads - constexpr int64_t max_tmp_elems = 4 * 256; + // The 2D row reduction potentially needs a lot of memory for small input + // sizes + constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -572,30 +569,30 @@ void run2d_row_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), result_stride, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), - max_tmp_elems * max_tmp_elems); + // The temporary storage (used for partial sums) must be smaller than + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, ReductionRowCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_row_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -657,6 +654,7 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), dim[1]}; @@ -667,32 +665,28 @@ void run2d_col_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - // This assumes at most 256 OpenMP Threads const size_type temp_elem_limit = - std::max(size_type{4 * 256}, 
dim[0] * dim[1]); + std::max(min_allowed_tmp_elems, dim[0] * dim[1]); ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); } } TEST_F(KernelLaunch, ReductionColCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_col_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, From b7c8c15f0ee49814d4b1a4a07b03e5d1a9118f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 25 Jul 2023 18:02:46 +0200 Subject: [PATCH 100/583] Change OMP reduction implementation At most allocate as much as the input vector for OMP reductions. --- omp/base/kernel_launch_reduction.hpp | 45 +++++++++++++++++----------- test/base/kernel_launch_generic.cpp | 27 +++++------------ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index a46ce970421..5dfbd5ba6c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -62,8 +62,9 @@ void run_kernel_reduction_impl(std::shared_ptr exec, ValueType* result, size_type size, array& tmp, MappedKernelArgs... args) { - const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), ssize); const auto work_per_thread = ceildiv(ssize, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -82,8 +83,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } @@ -99,7 +100,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), rows); const auto work_per_thread = ceildiv(rows, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -109,7 +111,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); -#pragma omp parallel +#pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); const auto begin = thread_id * work_per_thread; @@ -147,8 +149,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -210,12 +212,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, constexpr int block_size = 8; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto 
available_threads = static_cast(omp_get_max_threads()); if (rows <= 0) { return; } // enough work to keep all threads busy or only very small reduction sizes - if (rows >= reduction_kernel_oversubscription * num_threads || + if (rows >= reduction_kernel_oversubscription * available_threads || cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { @@ -229,8 +231,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } } else { // small number of rows and large reduction sizes: do partial sum first + const auto num_threads = std::min(available_threads, cols); const auto work_per_thread = ceildiv(cols, num_threads); - const auto required_storage = sizeof(ValueType) * rows * num_threads; + const auto temp_elems_per_row = num_threads; + const auto required_storage = + sizeof(ValueType) * rows * temp_elems_per_row; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -247,7 +252,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return fn(row, col, args...); }()); } - partial[row * num_threads + thread_id] = local_partial; + partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -255,10 +260,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, for (int64 row = 0; row < rows; row++) { [&] { auto local_partial = identity; - for (int64 thread_id = 0; thread_id < num_threads; + for (int64 thread_id = 0; thread_id < temp_elems_per_row; thread_id++) { - local_partial = op(local_partial, - partial[row * num_threads + thread_id]); + local_partial = + op(local_partial, + partial[row * temp_elems_per_row + thread_id]); } result[row * result_stride] = finalize(local_partial); }(); @@ -302,12 +308,12 @@ void run_kernel_col_reduction_sized_impl( { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto available_threads = static_cast(omp_get_max_threads()); static_assert(remainder_cols < block_size, "remainder too large"); GKO_ASSERT(cols % block_size == remainder_cols); const auto num_col_blocks = ceildiv(cols, block_size); // enough work to keep all threads busy or only very small reduction sizes - if (cols >= reduction_kernel_oversubscription * num_threads || + if (cols >= reduction_kernel_oversubscription * available_threads || rows < cols) { #pragma omp parallel for for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { @@ -324,8 +330,11 @@ void run_kernel_col_reduction_sized_impl( } } else { // number of blocks that need to be reduced afterwards - const auto reduction_size = - ceildiv(reduction_kernel_oversubscription * num_threads, cols); + // This reduction_size definition ensures we don't use more temporary + // storage than the input vector + const auto reduction_size = std::min( + rows, ceildiv(reduction_kernel_oversubscription * available_threads, + cols)); const auto rows_per_thread = ceildiv(rows, reduction_size); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 57bab96d9c0..bc4119d2806 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,15 +382,13 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - 
ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), size); } } TEST_F(KernelLaunch, Reduction1DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores - run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); + run1d_reduction_cached(exec, {10, 1000, 1000000, 1234567, 7654321}); } @@ -465,7 +463,6 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { @@ -479,10 +476,8 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -555,9 +550,6 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // The 2D row reduction potentially needs a lot of memory for small input - // sizes - constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -576,10 +568,8 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -654,7 +644,6 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), dim[1]}; @@ -671,9 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - const size_type temp_elem_limit = - std::max(min_allowed_tmp_elems, dim[0] * dim[1]); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } From bc0adb04a9e0a44226c95a6c27764fb8b71dbd7b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:14:33 +0200 Subject: [PATCH 101/583] fix warning --- include/ginkgo/core/base/range.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 5ba07aa834f..1e4c7a5d00e 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -864,7 +864,7 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); #define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \ - struct [[deprecated("Please use " #_name)]] _deprecated_name : _name{}; + struct [[deprecated("Please use " #_name)]] _deprecated_name : _name {} #define 
GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...) \ struct _name { \ From 46c8bbd9539bc58b4f65dc2271ab956b66ee62e8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:03 +0200 Subject: [PATCH 102/583] fix divisions by zero and num_threads == 0 --- omp/base/kernel_launch_reduction.hpp | 112 +++++++++++++++------------ 1 file changed, 62 insertions(+), 50 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 5dfbd5ba6c0..ef57803ad31 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -65,7 +65,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, const auto ssize = static_cast(size); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), ssize); - const auto work_per_thread = ceildiv(ssize, num_threads); + const auto work_per_thread = + ceildiv(ssize, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -74,17 +75,20 @@ void run_kernel_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(ssize, begin + work_per_thread); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); - auto local_partial = identity; - for (auto i = begin; i < end; i++) { - local_partial = op(local_partial, fn(i, map_to_device(args)...)); + auto local_partial = identity; + for (auto i = begin; i < end; i++) { + local_partial = + op(local_partial, fn(i, map_to_device(args)...)); + } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } @@ -102,7 +106,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, const auto cols = static_cast(size[1]); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), rows); - const auto work_per_thread = ceildiv(rows, num_threads); + const auto work_per_thread = ceildiv(rows, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -114,43 +118,46 @@ void run_kernel_reduction_sized_impl(syn::value_list, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(rows, begin + work_per_thread); - - auto local_partial = identity; - if (rounded_cols == 0 || cols == block_size) { - // we group all sizes <= block_size here and unroll explicitly - constexpr auto local_cols = - remainder_cols == 0 ? 
block_size : remainder_cols; - for (auto row = begin; row < end; row++) { -#pragma unroll - for (int64 col = 0; col < local_cols; col++) { - local_partial = op(local_partial, fn(row, col, args...)); - } - } - } else { - // we operate in block_size blocks plus an explicitly unrolled - // remainder - for (auto row = begin; row < end; row++) { - for (int64 base_col = 0; base_col < rounded_cols; - base_col += block_size) { + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = identity; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; + for (auto row = begin; row < end; row++) { #pragma unroll - for (int64 i = 0; i < block_size; i++) { + for (int64 col = 0; col < local_cols; col++) { local_partial = - op(local_partial, fn(row, base_col + i, args...)); + op(local_partial, fn(row, col, args...)); } } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { #pragma unroll - for (int64 i = 0; i < remainder_cols; i++) { - local_partial = - op(local_partial, fn(row, rounded_cols + i, args...)); + for (int64 i = 0; i < block_size; i++) { + local_partial = op(local_partial, + fn(row, base_col + i, args...)); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = op(local_partial, + fn(row, rounded_cols + i, args...)); + } } } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -232,7 +239,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } else { // small number of rows and large reduction sizes: do partial sum first const auto num_threads = std::min(available_threads, cols); - const auto work_per_thread = ceildiv(cols, num_threads); + const auto work_per_thread = + ceildiv(cols, std::max(num_threads, 1)); const auto temp_elems_per_row = num_threads; const auto required_storage = sizeof(ValueType) * rows * temp_elems_per_row; @@ -243,16 +251,19 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = static_cast(omp_get_thread_num()); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(begin + work_per_thread, cols); - for (int64 row = 0; row < rows; row++) { - auto local_partial = identity; - for (int64 col = begin; col < end; col++) { - local_partial = op(local_partial, [&]() { - return fn(row, col, args...); - }()); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = identity; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial[row * temp_elems_per_row + thread_id] = + local_partial; } - partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to 
result @@ -334,8 +345,9 @@ void run_kernel_col_reduction_sized_impl( // storage than the input vector const auto reduction_size = std::min( rows, ceildiv(reduction_kernel_oversubscription * available_threads, - cols)); - const auto rows_per_thread = ceildiv(rows, reduction_size); + std::max(cols, 1))); + const auto rows_per_thread = + ceildiv(rows, std::max(reduction_size, 1)); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); From 8cb678ac36cff7bdc95bd5a0605c3d794ea71343 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:16 +0200 Subject: [PATCH 103/583] simplify size calculations --- test/base/kernel_launch_generic.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index bc4119d2806..d4a0f83c819 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,7 +382,7 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems(), size * sizeof(int64)); } } @@ -477,7 +477,7 @@ void run2d_reduction_cached(std::shared_ptr exec, static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -569,7 +569,7 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -660,7 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } From ff410a8e88707162c735e8d05968b403ff55ab21 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 30 Jun 2023 15:31:12 +0200 Subject: [PATCH 104/583] Add BatchDense class, kernels and tests. 
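
A minimal usage sketch of the new batched type (illustrative only; it relies
solely on the batch_initialize helper, the at() accessor and the unbatch()
method exercised by the tests added below, plus the existing umbrella
ginkgo.hpp header):

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto exec = gko::ReferenceExecutor::create();
        // Two independent 2x3 dense entries stored in a single batched object.
        auto b = gko::batch_initialize<gko::matrix::BatchDense<double>>(
            {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
             {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
            exec);
        // Entries are addressed as (batch entry, row, column).
        auto value = b->at(1, 0, 2);  // 3.0
        // Each batch entry can be extracted again as a regular Dense matrix.
        auto dense_entries = b->unbatch();
    }
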
Co-authored-by: Aditya Kashi Co-authored-by: Isha Aggarwal --- .../matrix/batch_vector_kernels.hpp.inc | 196 +++ core/CMakeLists.txt | 1 + core/matrix/batch_vector.cpp | 469 +++++++ core/matrix/batch_vector_kernels.hpp | 284 +++++ core/test/matrix/batch_dense.cpp | 520 ++++++++ cuda/CMakeLists.txt | 1 + cuda/matrix/batch_vector_kernels.cu | 434 +++++++ hip/CMakeLists.txt | 1 + hip/matrix/batch_vector_kernels.hip.cpp | 449 +++++++ include/ginkgo/core/base/dim.hpp | 186 +++ include/ginkgo/core/matrix/batch_vector.hpp | 1093 +++++++++++++++++ omp/CMakeLists.txt | 1 + omp/matrix/batch_vector_kernels.cpp | 614 +++++++++ reference/CMakeLists.txt | 1 + reference/matrix/batch_vector_kernels.cpp | 580 +++++++++ reference/matrix/batch_vector_kernels.hpp.inc | 392 ++++++ .../test/matrix/batch_vector_kernels.cpp | 1023 +++++++++++++++ test/matrix/batch_vector_kernels.cpp | 433 +++++++ 18 files changed, 6678 insertions(+) create mode 100644 common/cuda_hip/matrix/batch_vector_kernels.hpp.inc create mode 100644 core/matrix/batch_vector.cpp create mode 100644 core/matrix/batch_vector_kernels.hpp create mode 100644 core/test/matrix/batch_dense.cpp create mode 100644 cuda/matrix/batch_vector_kernels.cu create mode 100644 hip/matrix/batch_vector_kernels.hip.cpp create mode 100644 include/ginkgo/core/matrix/batch_vector.hpp create mode 100644 omp/matrix/batch_vector_kernels.cpp create mode 100644 reference/matrix/batch_vector_kernels.cpp create mode 100644 reference/matrix/batch_vector_kernels.hpp.inc create mode 100644 reference/test/matrix/batch_vector_kernels.cpp create mode 100644 test/matrix/batch_vector_kernels.cpp diff --git a/common/cuda_hip/matrix/batch_vector_kernels.hpp.inc b/common/cuda_hip/matrix/batch_vector_kernels.hpp.inc new file mode 100644 index 00000000000..0eb86996c81 --- /dev/null +++ b/common/cuda_hip/matrix/batch_vector_kernels.hpp.inc @@ -0,0 +1,196 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +/** + * Copies the values of vector into another. + * + * @param num_rows Length of vector. + * @param in Vector to copy from. + * @param out Vector to copy into. + */ +template +__device__ __forceinline__ void single_copy(const int num_rows, + const ValueType* const in, + ValueType* const out) +{ + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + out[iz] = in[iz]; + } +} + +template +__global__ __launch_bounds__(default_block_size) void single_copy( + const size_type num_batch, const int num_rows, + const ValueType* const __restrict__ in, ValueType* const __restrict__ out) +{ + for (size_type ibatch = blockIdx.x; ibatch < num_batch; + ibatch += gridDim.x) { + const auto in_b = gko::batch::batch_entry_ptr(in, 1, num_rows, ibatch); + const auto out_b = + gko::batch::batch_entry_ptr(out, 1, num_rows, ibatch); + single_copy(num_rows, in_b, out_b); + } +} + + +/** + * Adds a scaled vector to another. + * + * @param num_rows Common length of both vectors. + * @param alpha Scaling factor. + * @param[in] x Vector to scale and add. + * @param[in,out] y Vector to add to. + */ +template +__device__ __forceinline__ void single_add_scaled(const int num_rows, + const ValueType alpha, + const ValueType* const x, + ValueType* const y) +{ + for (int li = threadIdx.x; li < num_rows; li += blockDim.x) { + y[li] += alpha * x[li]; + } +} + +template +__global__ __launch_bounds__(default_block_size) void single_add_scaled( + const size_type num_batch, const int num_rows, + const ValueType* const __restrict__ alpha, + const ValueType* const __restrict__ x, ValueType* const __restrict__ y) +{ + for (size_type ibatch = blockIdx.x; ibatch < num_batch; + ibatch += gridDim.x) { + const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); + const auto y_b = gko::batch::batch_entry_ptr(y, 1, num_rows, ibatch); + single_add_scaled(num_rows, alpha[0], x_b, y_b); + } +} + + +/** + * Computes the 2-norm of a vector in global or shared memory. + * + * @param x A row-major vector (only 1 column). + * @param result Norm value. + */ +template +__device__ __forceinline__ void single_compute_norm2( + group::thread_block_tile& warp_grp, const int num_rows, + const ValueType* const x, remove_complex& result) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = warp_grp.thread_rank(); r < num_rows; r += warp_grp.size()) { + val += squared_norm(x[r]); + } + + // warp level reduction +#pragma unroll + for (int j = config::warp_size / 2; j > 0; j /= 2) { + val += warp_grp.shfl_down(val, j); + } + + if (warp_grp.thread_rank() == 0) { + result = sqrt(val); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void single_compute_norm2( + const size_type num_batch, const int num_rows, + const ValueType* const __restrict__ x, + remove_complex* const __restrict__ result) +{ + auto warp_grp = + group::tiled_partition(group::this_thread_block()); + for (size_type ibatch = blockIdx.x; ibatch < num_batch; + ibatch += gridDim.x) { + const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); + const auto r_b = gko::batch::batch_entry_ptr(result, 1, 1, ibatch); + if (threadIdx.x / config::warp_size == 0) { + single_compute_norm2(warp_grp, num_rows, x_b, r_b[0]); + } + } +} + + +/** + * Computes the dot product of some column vectors in global or shared memory. + * + * @param result Holds dot product value for vector in x and y. 
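+ * @param warp_grp Warp-tile group whose threads cooperate on the reduction.
+ * @param num_rows Common length of both vectors.
+ * @param x First vector; its entries are conjugated in the product.
+ * @param y Second vector.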
+ */ +template +__device__ __forceinline__ void single_compute_dot_product( + group::thread_block_tile& warp_grp, const int num_rows, + const ValueType* const x, const ValueType* const y, ValueType& result) +{ + ValueType val = zero(); + + for (int r = warp_grp.thread_rank(); r < num_rows; r += warp_grp.size()) { + val += conj(x[r]) * y[r]; + } + + // warp level reduction +#pragma unroll + for (int j = config::warp_size / 2; j > 0; j /= 2) { + val += warp_grp.shfl_down(val, j); + } + + if (warp_grp.thread_rank() == 0) { + result = val; + } +} + + +// clang-format off +template +__global__ __launch_bounds__(default_block_size) +void single_compute_dot_product(const size_type num_batch, + const int num_rows, + const ValueType *const __restrict__ x, + const ValueType *const __restrict__ y, + ValueType *const __restrict__ result) +// clang-format on +{ + auto warp_grp = + group::tiled_partition(group::this_thread_block()); + for (size_type ibatch = blockIdx.x; ibatch < num_batch; + ibatch += gridDim.x) { + const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); + const auto y_b = gko::batch::batch_entry_ptr(y, 1, num_rows, ibatch); + const auto r_b = gko::batch::batch_entry_ptr(result, 1, 1, ibatch); + single_compute_dot_product(warp_grp, num_rows, x_b, y_b, r_b[0]); + } +} diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 49cf89b66d6..03d558562dc 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -38,6 +38,7 @@ target_sources(ginkgo log/vtune.cpp log/record.cpp log/stream.cpp + matrix/batch_vector.cpp matrix/coo.cpp matrix/csr.cpp matrix/dense.cpp diff --git a/core/matrix/batch_vector.cpp b/core/matrix/batch_vector.cpp new file mode 100644 index 00000000000..4449516d5a1 --- /dev/null +++ b/core/matrix/batch_vector.cpp @@ -0,0 +1,469 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_dense_kernels.hpp" + + +namespace gko { +namespace matrix { +namespace batch_dense { + + +GKO_REGISTER_OPERATION(simple_apply, batch_dense::simple_apply); +GKO_REGISTER_OPERATION(apply, batch_dense::apply); +GKO_REGISTER_OPERATION(scale, batch_dense::scale); +GKO_REGISTER_OPERATION(add_scaled, batch_dense::add_scaled); +GKO_REGISTER_OPERATION(add_scale, batch_dense::add_scale); +GKO_REGISTER_OPERATION(convergence_add_scaled, + batch_dense::convergence_add_scaled); +GKO_REGISTER_OPERATION(add_scaled_diag, batch_dense::add_scaled_diag); +GKO_REGISTER_OPERATION(compute_dot, batch_dense::compute_dot); +GKO_REGISTER_OPERATION(convergence_compute_dot, + batch_dense::convergence_compute_dot); +GKO_REGISTER_OPERATION(compute_norm2, batch_dense::compute_norm2); +GKO_REGISTER_OPERATION(convergence_compute_norm2, + batch_dense::convergence_compute_norm2); +GKO_REGISTER_OPERATION(copy, batch_dense::copy); +GKO_REGISTER_OPERATION(convergence_copy, batch_dense::convergence_copy); +GKO_REGISTER_OPERATION(convert_to_batch_csr, batch_dense::convert_to_batch_csr); +GKO_REGISTER_OPERATION(count_nonzeros, batch_dense::count_nonzeros); +GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, + batch_dense::calculate_max_nnz_per_row); +GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, + batch_dense::calculate_nonzeros_per_row); +GKO_REGISTER_OPERATION(calculate_total_cols, batch_dense::calculate_total_cols); +GKO_REGISTER_OPERATION(transpose, batch_dense::transpose); +GKO_REGISTER_OPERATION(conj_transpose, batch_dense::conj_transpose); +GKO_REGISTER_OPERATION(add_scaled_identity, batch_dense::add_scaled_identity); + + +} // namespace batch_dense + + +template +void BatchDense::apply_impl(const BatchLinOp* b, BatchLinOp* x) const +{ + // TODO: Remove this when non-uniform batching kernels have been + // implemented + if (!this->get_size().stores_equal_sizes() || + !this->get_stride().stores_equal_strides()) { + GKO_NOT_IMPLEMENTED; + } + this->get_executor()->run(batch_dense::make_simple_apply( + this, as>(b), as>(x))); +} + + +template +void BatchDense::apply_impl(const BatchLinOp* alpha, + const BatchLinOp* b, + const BatchLinOp* beta, + BatchLinOp* x) const +{ + if (!this->get_size().stores_equal_sizes() || + !this->get_stride().stores_equal_strides()) { + GKO_NOT_IMPLEMENTED; + } + if (auto bid = dynamic_cast*>(b)) { + if (auto xdense = dynamic_cast*>(x)) { + xdense->add_scale(alpha, this, beta); + } else { + GKO_NOT_SUPPORTED(x); + } + } else { + this->get_executor()->run(batch_dense::make_apply( + as>(alpha), this, + as>(b), as>(beta), + as>(x))); + } +} + + +template +void BatchDense::scale_impl(const BatchLinOp* alpha) +{ + auto batch_alpha = as>(alpha); + GKO_ASSERT_BATCH_EQUAL_ROWS( + batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { + if (batch_alpha->get_size().at(b)[1] != 1) { + // different alpha for each column + GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); + } + } + auto exec = this->get_executor(); + exec->run(batch_dense::make_scale(batch_alpha, this)); +} + + +template +void BatchDense::add_scaled_impl(const BatchLinOp* alpha, + const BatchLinOp* b) +{ + auto batch_alpha = as>(alpha); + auto batch_b = as>(b); + GKO_ASSERT_BATCH_EQUAL_ROWS( + batch_alpha, 
batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { + if (batch_alpha->get_size().at(b)[1] != 1) { + // different alpha for each column + GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); + } + } + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); + auto exec = this->get_executor(); + + exec->run(batch_dense::make_add_scaled(batch_alpha, batch_b, this)); +} + + +template +void BatchDense::add_scale(const BatchLinOp* const alpha, + const BatchLinOp* const a, + const BatchLinOp* const beta) +{ + auto batch_alpha = as>(alpha); + auto batch_beta = as>(beta); + auto batch_a = as>(a); + GKO_ASSERT_BATCH_EQUAL_ROWS( + batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + if (batch_alpha->get_size().stores_equal_sizes()) { + if (batch_alpha->get_size().at(0)[1] != 1) { + // different alpha for each column + GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); + } + } else { + for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { + if (batch_alpha->get_size().at(b)[1] != 1) { + GKO_ASSERT(this->get_size().at(b)[1] == + batch_alpha->get_size().at(b)[1]); + } + } + } + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_a); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_alpha, batch_beta); + this->get_executor()->run( + batch_dense::make_add_scale(batch_alpha, batch_a, batch_beta, this)); +} + + +inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) +{ + auto col_sizes = std::vector>(sizes.get_num_batch_entries()); + for (size_type i = 0; i < col_sizes.size(); ++i) { + col_sizes[i] = dim<2>(1, sizes.at(i)[1]); + } + return batch_dim<2>(col_sizes); +} + + +template +void BatchDense::compute_dot_impl(const BatchLinOp* b, + BatchLinOp* result) const +{ + auto batch_result = as>(result); + auto batch_b = as>(b); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, + get_col_sizes(this->get_size())); + auto exec = this->get_executor(); + exec->run(batch_dense::make_compute_dot(this, batch_b, batch_result)); +} + + +template +void BatchDense::compute_norm2_impl(BatchLinOp* result) const +{ + using NormVector = BatchDense>; + auto batch_result = as(result); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, + get_col_sizes(this->get_size())); + auto exec = this->get_executor(); + exec->run(batch_dense::make_compute_norm2(as>(this), + batch_result)); +} + + +template +void BatchDense::convert_to( + BatchDense>* result) const +{ + result->values_ = this->values_; + result->stride_ = this->stride_; + result->num_elems_per_batch_cumul_ = this->num_elems_per_batch_cumul_; + result->set_size(this->get_size()); +} + + +template +void BatchDense::move_to( + BatchDense>* result) +{ + this->convert_to(result); +} + + +template +void BatchDense::convert_to(BatchCsr* result) const +{ + auto exec = this->get_executor(); + + auto batch_size = this->get_size(); + if (!batch_size.stores_equal_sizes()) { + GKO_NOT_IMPLEMENTED; + } + + auto num_stored_nonzeros = + array{exec->get_master(), this->get_num_batch_entries()}; + exec->run( + batch_dense::make_count_nonzeros(this, num_stored_nonzeros.get_data())); + gko::dim<2> main_size = this->get_size().at(0); + const size_type num_nnz = + num_stored_nonzeros.get_data() ? 
num_stored_nonzeros.get_data()[0] : 0; + auto tmp = BatchCsr::create( + exec, this->get_num_batch_entries(), main_size, num_nnz); + exec->run(batch_dense::make_convert_to_batch_csr(this, tmp.get())); + tmp->move_to(result); +} + + +template +void BatchDense::move_to(BatchCsr* result) +{ + this->convert_to(result); +} + + +template +void BatchDense::convert_to( + BatchDiagonal* const result) const +{ + auto exec = this->get_executor(); + + auto batch_size = this->get_size(); + if (!batch_size.stores_equal_sizes()) { + GKO_NOT_IMPLEMENTED; + } + GKO_ASSERT_BATCH_HAS_SINGLE_COLUMN(this); + if (this->get_stride().at(0) != 1) { + GKO_NOT_IMPLEMENTED; + } + auto temp = BatchDiagonal::create( + exec, batch_dim<2>{batch_size.get_num_batch_entries(), + dim<2>{batch_size.at(0)[0]}}); + exec->copy(this->get_num_stored_elements(), this->get_const_values(), + temp->get_values()); + result->copy_from(temp.get()); +} + + +template +void BatchDense::move_to(BatchDiagonal* const result) +{ + auto exec = this->get_executor(); + + auto batch_size = this->get_size(); + if (!batch_size.stores_equal_sizes()) { + GKO_NOT_IMPLEMENTED; + } + GKO_ASSERT_BATCH_HAS_SINGLE_COLUMN(this); + if (this->get_stride().at(0) != 1) { + GKO_NOT_IMPLEMENTED; + } + auto temp = BatchDiagonal::create( + exec, + batch_dim<2>{batch_size.get_num_batch_entries(), + dim<2>{batch_size.at(0)[0]}}, + std::move(this->values_)); + *result = std::move(*temp); + // set the size of this to 0 + this->set_size(batch_dim<2>()); +} + + +namespace { + + +template +inline void read_impl(MatrixType* mtx, const std::vector& data) +{ + auto batch_sizes = std::vector>(data.size()); + size_type ind = 0; + for (const auto& b : data) { + batch_sizes[ind] = b.size; + ++ind; + } + auto tmp = MatrixType::create(mtx->get_executor()->get_master(), + batch_dim<2>(batch_sizes)); + for (size_type b = 0; b < data.size(); ++b) { + size_type ind = 0; + for (size_type row = 0; row < data[b].size[0]; ++row) { + for (size_type col = 0; col < data[b].size[1]; ++col) { + if (ind < data[b].nonzeros.size() && + data[b].nonzeros[ind].row == row && + data[b].nonzeros[ind].column == col) { + tmp->at(b, row, col) = data[b].nonzeros[ind].value; + ++ind; + } else { + tmp->at(b, row, col) = + zero(); + } + } + } + } + tmp->move_to(mtx); +} + + +} // namespace + + +template +void BatchDense::read(const std::vector& data) +{ + read_impl(this, data); +} + + +template +void BatchDense::read(const std::vector& data) +{ + read_impl(this, data); +} + + +namespace { + + +template +inline void write_impl(const MatrixType* mtx, std::vector& data) +{ + std::unique_ptr op{}; + const MatrixType* tmp{}; + if (mtx->get_executor()->get_master() != mtx->get_executor()) { + op = mtx->clone(mtx->get_executor()->get_master()); + tmp = static_cast(op.get()); + } else { + tmp = mtx; + } + + data = std::vector(mtx->get_num_batch_entries()); + for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) { + data[b] = {mtx->get_size().at(b), {}}; + for (size_type row = 0; row < data[b].size[0]; ++row) { + for (size_type col = 0; col < data[b].size[1]; ++col) { + if (tmp->at(b, row, col) != + zero()) { + data[b].nonzeros.emplace_back(row, col, + tmp->at(b, row, col)); + } + } + } + } +} + + +} // namespace + + +template +void BatchDense::write(std::vector& data) const +{ + write_impl(this, data); +} + + +template +void BatchDense::write(std::vector& data) const +{ + write_impl(this, data); +} + + +template +std::unique_ptr BatchDense::transpose() const +{ + auto exec = this->get_executor(); + auto trans_cpy 
= BatchDense::create(exec, gko::transpose(this->get_size())); + + exec->run(batch_dense::make_transpose(this, trans_cpy.get())); + + return std::move(trans_cpy); +} + + +template +std::unique_ptr BatchDense::conj_transpose() const +{ + auto exec = this->get_executor(); + auto trans_cpy = BatchDense::create(exec, gko::transpose(this->get_size())); + + exec->run(batch_dense::make_conj_transpose(this, trans_cpy.get())); + return std::move(trans_cpy); +} + + +template +void BatchDense::add_scaled_identity_impl(const BatchLinOp* const a, + const BatchLinOp* const b) +{ + this->get_executor()->run(batch_dense::make_add_scaled_identity( + as>(a), as>(b), this)); +} + + +#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class BatchDense<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX); + + +} // namespace matrix + + +} // namespace gko diff --git a/core/matrix/batch_vector_kernels.hpp b/core/matrix/batch_vector_kernels.hpp new file mode 100644 index 00000000000..91dd3e6f5b7 --- /dev/null +++ b/core/matrix/batch_vector_kernels.hpp @@ -0,0 +1,284 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(_type) \ + void simple_apply(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* a, \ + const matrix::BatchDense<_type>* b, \ + matrix::BatchDense<_type>* c) + +#define GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL(_type) \ + void apply(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + const matrix::BatchDense<_type>* a, \ + const matrix::BatchDense<_type>* b, \ + const matrix::BatchDense<_type>* beta, \ + matrix::BatchDense<_type>* c) + +#define GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL(_type) \ + void scale(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + matrix::BatchDense<_type>* x) + +#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL(_type) \ + void add_scaled(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense<_type>* y) + +#define GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL(_type) \ + void add_scale(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + const matrix::BatchDense<_type>* x, \ + const matrix::BatchDense<_type>* beta, \ + matrix::BatchDense<_type>* y) + +#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL(_type) \ + void convergence_add_scaled(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense<_type>* y, \ + const uint32& converged) + +#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL(_type) \ + void add_scaled_diag(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* alpha, \ + const matrix::Diagonal<_type>* x, \ + matrix::BatchDense<_type>* y) + +#define GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL(_type) \ + void compute_dot(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + const matrix::BatchDense<_type>* y, \ + matrix::BatchDense<_type>* result) + + +#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(_type) \ + void convergence_compute_dot(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + const matrix::BatchDense<_type>* y, \ + matrix::BatchDense<_type>* result, \ + const uint32& converged) + +#define GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL(_type) \ + void compute_norm2(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense>* result) + +#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL(_type) \ + void convergence_compute_norm2( \ + std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense>* result, \ + const uint32& converged) + + +#define GKO_DECLARE_BATCH_DENSE_COPY_KERNEL(_type) \ + void copy(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense<_type>* result) + +#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL(_type) \ + void convergence_copy(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* x, \ + matrix::BatchDense<_type>* result, \ + const uint32& converged) + +#define GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL(_type, _prec) \ + void convert_to_batch_csr(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* source, \ + matrix::BatchCsr<_type, _prec>* other) + +#define GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL(_type) \ + void 
count_nonzeros(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* source, \ + size_type* result) + +#define GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(_type) \ + void calculate_max_nnz_per_row( \ + std::shared_ptr exec, \ + const matrix::BatchDense<_type>* source, size_type* result) + +#define GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(_type) \ + void calculate_nonzeros_per_row( \ + std::shared_ptr exec, \ + const matrix::BatchDense<_type>* source, array* result) + +#define GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL(_type) \ + void calculate_total_cols( \ + std::shared_ptr exec, \ + const matrix::BatchDense<_type>* source, size_type* result, \ + const size_type* stride_factor, const size_type* slice_size) + +#define GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL(_type) \ + void transpose(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* orig, \ + matrix::BatchDense<_type>* trans) + +#define GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL(_type) \ + void conj_transpose(std::shared_ptr exec, \ + const matrix::BatchDense<_type>* orig, \ + matrix::BatchDense<_type>* trans) + +#define GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType) \ + void batch_scale(std::shared_ptr exec, \ + const matrix::BatchDiagonal* left_scale, \ + const matrix::BatchDiagonal* right_scale, \ + matrix::BatchDense* vec_to_scale) + +#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType) \ + void add_scaled_identity(std::shared_ptr exec, \ + const matrix::BatchDense* a, \ + const matrix::BatchDense* b, \ + matrix::BatchDense* mtx) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_COPY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType) + + +namespace omp { +namespace batch_dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // 
namespace batch_dense +} // namespace omp + + +namespace cuda { +namespace batch_dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace batch_dense +} // namespace cuda + + +namespace reference { +namespace batch_dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace batch_dense +} // namespace reference + + +namespace hip { +namespace batch_dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace batch_dense +} // namespace hip + + +namespace dpcpp { +namespace batch_dense { + +GKO_DECLARE_ALL_AS_TEMPLATES; + +} // namespace batch_dense +} // namespace dpcpp + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp new file mode 100644 index 00000000000..7db7469baf6 --- /dev/null +++ b/core/test/matrix/batch_dense.cpp @@ -0,0 +1,520 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class BatchDense : public ::testing::Test { +protected: + using value_type = T; + using DenseMtx = gko::matrix::Dense; + using size_type = gko::size_type; + BatchDense() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::batch_initialize>( + std::vector{4, 3}, + {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)) + {} + + + static void assert_equal_to_original_mtx( + gko::matrix::BatchDense* m) + { + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_stride().at(1), 3); + ASSERT_EQ(m->get_num_stored_elements(), (2 * 4) + (2 * 3)); + ASSERT_EQ(m->get_num_stored_elements(0), 2 * 4); + ASSERT_EQ(m->get_num_stored_elements(1), 2 * 3); + EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5}); + EXPECT_EQ(m->at(0, 1, 1), value_type{2.5}); + ASSERT_EQ(m->at(0, 1, 2), value_type{3.5}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.5}); + EXPECT_EQ(m->at(1, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(1, 1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{2.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + } + + static void assert_empty(gko::matrix::BatchDense* m) + { + ASSERT_EQ(m->get_num_batch_entries(), 0); + ASSERT_EQ(m->get_num_stored_elements(), 0); + } + + std::shared_ptr exec; + std::unique_ptr> mtx; +}; + +TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); + + +TYPED_TEST(BatchDense, CanBeEmpty) +{ + auto empty = gko::matrix::BatchDense::create(this->exec); + this->assert_empty(empty.get()); +} + + +TYPED_TEST(BatchDense, ReturnsNullValuesArrayWhenEmpty) +{ + auto empty = gko::matrix::BatchDense::create(this->exec); + ASSERT_EQ(empty->get_const_values(), nullptr); +} + + +TYPED_TEST(BatchDense, CanBeConstructedWithSize) +{ + using size_type = gko::size_type; + auto m = gko::matrix::BatchDense::create( + this->exec, + std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 4)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 3)); + EXPECT_EQ(m->get_stride().at(0), 4); + EXPECT_EQ(m->get_stride().at(1), 3); + ASSERT_EQ(m->get_num_stored_elements(), 14); + ASSERT_EQ(m->get_num_stored_elements(0), 8); + ASSERT_EQ(m->get_num_stored_elements(1), 6); +} + + +TYPED_TEST(BatchDense, CanBeConstructedWithSizeAndStride) +{ + using size_type = gko::size_type; + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector>{gko::dim<2>{2, 3}}, + std::vector{4}); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + EXPECT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_num_stored_elements(), 8); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + value_type data[] = { + 1.0, 2.0, -1.0, + 3.0, 4.0, -1.0, + 3.0, 5.0, 1.0, + 5.0, 6.0, -3.0}; + // clang-format on + + auto m = gko::matrix::BatchDense::create( + this->exec, + std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, + gko::array::view(this->exec, 12, data), + 
std::vector{3, 3}); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + const value_type data[] = { + 1.0, 2.0, -1.0, + 3.0, 4.0, -1.0, + 3.0, 5.0, 1.0, + 5.0, 6.0, -3.0}; + // clang-format on + + auto m = gko::matrix::BatchDense::create_const( + this->exec, + std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, + gko::array::const_view(this->exec, 12, data), + std::vector{3, 3}); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 3, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); + auto m2 = + gko::matrix::BatchDense::create(this->exec, 3, m.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto bat_m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = + gko::matrix::BatchDense::create(this->exec, 3, mat1.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get()}); + + this->assert_equal_to_original_mtx(m.get()); +} + + +TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto dense_mats = this->mtx->unbatch(); + + + GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); + GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); +} + + +TYPED_TEST(BatchDense, KnowsItsSizeAndValues) +{ + 
this->assert_equal_to_original_mtx(this->mtx.get()); +} + + +TYPED_TEST(BatchDense, CanBeListConstructed) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + {{1.0, 2.0}, {1.0, 3.0}}, this->exec); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0), value_type{1}); + EXPECT_EQ(m->at(0, 1), value_type{2}); + EXPECT_EQ(m->at(1, 0), value_type{1}); + EXPECT_EQ(m->at(1, 1), value_type{3}); +} + + +TYPED_TEST(BatchDense, CanBeListConstructedWithstride) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + std::vector{2}, {{1.0, 2.0}}, this->exec); + ASSERT_EQ(m->get_num_batch_entries(), 1); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{2.0}); +} + + +TYPED_TEST(BatchDense, CanBeListConstructedByCopies) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + 2, I({1.0, 2.0}), this->exec); + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.0}); +} + + +TYPED_TEST(BatchDense, CanBeDoubleListConstructed) +{ + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::batch_initialize>( + {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, + {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + this->exec); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); + ASSERT_EQ(m->get_stride().at(0), 3); + ASSERT_EQ(m->get_stride().at(1), 2); + EXPECT_EQ(m->get_num_stored_elements(), 15); + ASSERT_EQ(m->get_num_stored_elements(0), 9); + ASSERT_EQ(m->get_num_stored_elements(1), 6); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{1.0}); + EXPECT_EQ(m->at(0, 2), value_type{0.0}); + ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 2), value_type{3.0}); + ASSERT_EQ(m->at(1, 3), value_type{4.0}); + EXPECT_EQ(m->at(1, 4), value_type{5.0}); +} + + +TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) +{ + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::batch_initialize>( + {4, 3}, + {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, + {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + this->exec); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); + ASSERT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_stride().at(1), 3); + EXPECT_EQ(m->get_num_stored_elements(), 21); + ASSERT_EQ(m->get_num_stored_elements(0), 12); + ASSERT_EQ(m->get_num_stored_elements(1), 9); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{1.0}); + EXPECT_EQ(m->at(0, 2), value_type{0.0}); + ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.0}); + 
    EXPECT_EQ(m->at(1, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 2), value_type{3.0});
+    ASSERT_EQ(m->at(1, 3), value_type{4.0});
+    EXPECT_EQ(m->at(1, 4), value_type{5.0});
+}
+
+
+TYPED_TEST(BatchDense, CanBeCopied)
+{
+    auto mtx_copy = gko::matrix::BatchDense<TypeParam>::create(this->exec);
+    mtx_copy->copy_from(this->mtx.get());
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->at(0, 0, 0) = 7;
+    this->mtx->at(0, 1) = 7;
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(BatchDense, CanBeMoved)
+{
+    auto mtx_copy = gko::matrix::BatchDense<TypeParam>::create(this->exec);
+    mtx_copy->copy_from(std::move(this->mtx));
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(BatchDense, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(BatchDense, CanBeCleared)
+{
+    this->mtx->clear();
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(BatchDense, CanBeReadFromMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::matrix::BatchDense<value_type>::create(this->exec);
+    // clang-format off
+    m->read({gko::matrix_data<value_type>{{2, 3},
+                                          {{0, 0, 1.0},
+                                           {0, 1, 3.0},
+                                           {0, 2, 2.0},
+                                           {1, 0, 0.0},
+                                           {1, 1, 5.0},
+                                           {1, 2, 0.0}}},
+             gko::matrix_data<value_type>{{2, 2},
+                                          {{0, 0, -1.0},
+                                           {0, 1, 0.5},
+                                           {1, 0, 0.0},
+                                           {1, 1, 9.0}}}});
+    // clang-format on
+
+    ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3));
+    ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 2));
+    ASSERT_EQ(m->get_num_stored_elements(), 10);
+    ASSERT_EQ(m->get_num_stored_elements(0), 6);
+    ASSERT_EQ(m->get_num_stored_elements(1), 4);
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(0, 0, 2), value_type{2.0});
+    EXPECT_EQ(m->at(0, 1, 2), value_type{0.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(BatchDense, GeneratesCorrectMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using tpl = typename gko::matrix_data<value_type>::nonzero_type;
+    std::vector<gko::matrix_data<value_type>> data;
+
+    this->mtx->write(data);
+
+    ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[0].nonzeros.size(), 6);
+    EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5}));
+    EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5}));
+    ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[1].nonzeros.size(), 6);
+    EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0}));
+}
+
+
+TYPED_TEST(BatchDense, CanBeReadFromMatrixAssemblyData)
+{
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::matrix::BatchDense<value_type>::create(this->exec);
+    gko::matrix_assembly_data<value_type> data1(gko::dim<2>{2, 3});
+    data1.set_value(0, 0, 1.0);
+    data1.set_value(0, 1, 3.0);
+
data1.set_value(0, 2, 2.0); + data1.set_value(1, 0, 0.0); + data1.set_value(1, 1, 5.0); + data1.set_value(1, 2, 0.0); + gko::matrix_assembly_data data2(gko::dim<2>{2, 1}); + data2.set_value(0, 0, 2.0); + data2.set_value(1, 0, 5.0); + auto data = std::vector>{data1, data2}; + + m->read(data); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 8); + ASSERT_EQ(m->get_num_stored_elements(0), 6); + ASSERT_EQ(m->get_num_stored_elements(1), 2); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{0.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); + EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); +} + + +} // namespace diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 37d56e5855f..d630fb9a92a 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -35,6 +35,7 @@ target_sources(ginkgo_cuda factorization/par_ilut_select_kernel.cu factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu + matrix/batch_vector_kernels.cu matrix/coo_kernels.cu ${CSR_INSTANTIATE} matrix/dense_kernels.cu diff --git a/cuda/matrix/batch_vector_kernels.cu b/cuda/matrix/batch_vector_kernels.cu new file mode 100644 index 00000000000..af67fa1597a --- /dev/null +++ b/cuda/matrix/batch_vector_kernels.cu @@ -0,0 +1,434 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include +#include +#include + + +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The BatchDense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_multiplier = 4; + + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const matrix::BatchDense* a, + const matrix::BatchDense* b, + matrix::BatchDense* c) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto a_ub = get_batch_struct(a); + const auto b_ub = get_batch_struct(b); + const auto c_ub = get_batch_struct(c); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + mv<<>>(a_ub, b_ub, c_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::BatchDense* a, + const matrix::BatchDense* b, + const matrix::BatchDense* beta, + matrix::BatchDense* c) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto a_ub = get_batch_struct(a); + const auto b_ub = get_batch_struct(b); + const auto c_ub = get_batch_struct(c); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + advanced_mv<<>>(alpha_ub, a_ub, b_ub, + beta_ub, c_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); + + +template +void scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + matrix::BatchDense* const x) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + scale<<>>(alpha_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const size_type nrhs = x->get_size().at(0)[1]; + if (nrhs == 1) { + const auto num_batch = x->get_num_batch_entries(); + const auto num_rows = x->get_size().at(0)[0]; + single_add_scaled<<>>( + num_batch, num_rows, as_cuda_type(alpha->get_const_values()), + as_cuda_type(x->get_const_values()), as_cuda_type(y->get_values())); + } else { + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + add_scaled<<>>(alpha_ub, x_ub, y_ub); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); + + +template +void add_scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + const matrix::BatchDense* const beta, + 
matrix::BatchDense* const y) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const size_type nrhs = x->get_size().at(0)[1]; + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + add_scale<<>>(alpha_ub, x_ub, beta_ub, + y_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); + + +template +void convergence_add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::Diagonal* x, + matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_size().at()[1]; + if (num_rhs == 1) { + const auto num_rows = x->get_size().at()[0]; + single_compute_dot_product<<>>( + num_blocks, num_rows, as_cuda_type(x->get_const_values()), + as_cuda_type(y->get_const_values()), + as_cuda_type(result->get_values())); + } else { + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_dot_product<<>>(x_ub, y_ub, + res_ub); + } +} + + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); + + +template +void convergence_compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_size().at()[1]; + if (num_rhs == 1) { + const auto num_rows = x->get_size().at()[0]; + single_compute_norm2<<>>( + num_blocks, num_rows, as_cuda_type(x->get_const_values()), + as_cuda_type(result->get_values())); + } else { + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + compute_norm2<<>>(x_ub, res_ub); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); + + +template +void convergence_compute_norm2( + std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); + + +template +void convert_to_batch_csr(std::shared_ptr exec, + const matrix::BatchDense* source, + matrix::BatchCsr* other) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); + + +template +void count_nonzeros(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + 
GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); + + +template +void calculate_max_nnz_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void calculate_nonzeros_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + array* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result, const size_type* stride_factor, + const size_type* slice_size) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::BatchDense* const orig, + matrix::BatchDense* const trans) +{ + using cu_val_type = cuda_type; + const size_type nbatch = orig->get_num_batch_entries(); + const size_type orig_stride = orig->get_stride().at(); + const size_type trans_stride = trans->get_stride().at(); + const int nrows = orig->get_size().at()[0]; + const int ncols = orig->get_size().at()[1]; + transpose<<>>( + nrows, ncols, orig_stride, as_cuda_type(orig->get_const_values()), + trans_stride, as_cuda_type(trans->get_values()), + [] __device__(cu_val_type x) { return x; }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::BatchDense* orig, + matrix::BatchDense* trans) +{ + using cu_val_type = cuda_type; + const size_type nbatch = orig->get_num_batch_entries(); + const size_type orig_stride = orig->get_stride().at(); + const size_type trans_stride = trans->get_stride().at(); + const int nrows = orig->get_size().at()[0]; + const int ncols = orig->get_size().at()[1]; + transpose<<>>( + nrows, ncols, orig_stride, as_cuda_type(orig->get_const_values()), + trans_stride, as_cuda_type(trans->get_values()), + [] __device__(cu_val_type x) { return conj(x); }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + + +template +void copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto result_ub = get_batch_struct(result); + const auto x_ub = get_batch_struct(x); + copy<<>>(x_ub, result_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); + + +template +void convergence_copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); + + +template +void batch_scale(std::shared_ptr exec, + const matrix::BatchDiagonal* const left_scale, + const matrix::BatchDiagonal* const rght_scale, + matrix::BatchDense* const vec_to_scale) +{ + if (!left_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + if (!rght_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + if (!vec_to_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + + const auto stride = vec_to_scale->get_stride().at(); + const auto nrows = static_cast(vec_to_scale->get_size().at()[0]); + const auto nrhs = static_cast(vec_to_scale->get_size().at()[1]); + const auto nbatch = 
vec_to_scale->get_num_batch_entries(); + + const int num_blocks = vec_to_scale->get_num_batch_entries(); + uniform_batch_scale<<>>( + nrows, stride, nrhs, nbatch, + as_cuda_type(left_scale->get_const_values()), + as_cuda_type(rght_scale->get_const_values()), + as_cuda_type(vec_to_scale->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const mtx) +{ + if (!mtx->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + const auto num_blocks = mtx->get_num_batch_entries(); + const auto nrows = static_cast(mtx->get_size().at(0)[0]); + const auto ncols = static_cast(mtx->get_size().at(0)[1]); + const auto stride = mtx->get_stride().at(0); + const auto values = mtx->get_values(); + const auto alpha = a->get_const_values(); + const auto a_stride = a->get_stride().at(0); + const auto b_stride = b->get_stride().at(0); + const auto beta = b->get_const_values(); + add_scaled_identity<<>>( + num_blocks, nrows, ncols, stride, as_cuda_type(values), a_stride, + as_cuda_type(alpha), b_stride, as_cuda_type(beta)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index e433322e644..fea0dec5c8c 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -33,6 +33,7 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_select_kernel.hip.cpp factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp + matrix/batch_vector_kernels.hip.cpp matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp diff --git a/hip/matrix/batch_vector_kernels.hip.cpp b/hip/matrix/batch_vector_kernels.hip.cpp new file mode 100644 index 00000000000..32665e31191 --- /dev/null +++ b/hip/matrix/batch_vector_kernels.hip.cpp @@ -0,0 +1,449 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/matrix/batch_struct.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The BatchDense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_multiplier = 4; + + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const matrix::BatchDense* a, + const matrix::BatchDense* b, + matrix::BatchDense* c) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto a_ub = get_batch_struct(a); + const auto b_ub = get_batch_struct(b); + const auto c_ub = get_batch_struct(c); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + hipLaunchKernelGGL(mv, num_blocks, default_block_size, 0, 0, a_ub, b_ub, + c_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::BatchDense* a, + const matrix::BatchDense* b, + const matrix::BatchDense* beta, + matrix::BatchDense* c) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto a_ub = get_batch_struct(a); + const auto b_ub = get_batch_struct(b); + const auto c_ub = get_batch_struct(c); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + hipLaunchKernelGGL(advanced_mv, num_blocks, default_block_size, 0, 0, + alpha_ub, a_ub, b_ub, beta_ub, c_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); + + +template +void scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + matrix::BatchDense* const x) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + hipLaunchKernelGGL(scale, dim3(num_blocks), dim3(default_block_size), 0, 0, + alpha_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const size_type nrhs = 
x->get_size().at(0)[1]; + if (nrhs == 1) { + const auto num_batch = x->get_num_batch_entries(); + const auto num_rows = x->get_size().at(0)[0]; + hipLaunchKernelGGL( + single_add_scaled, dim3(num_blocks), dim3(default_block_size), 0, 0, + num_batch, num_rows, as_hip_type(alpha->get_const_values()), + as_hip_type(x->get_const_values()), as_hip_type(y->get_values())); + } else { + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + hipLaunchKernelGGL(add_scaled, dim3(num_blocks), + dim3(default_block_size), 0, 0, alpha_ub, x_ub, + y_ub); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); + + +template +void add_scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + const matrix::BatchDense* const beta, + matrix::BatchDense* const y) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const size_type nrhs = x->get_size().at(0)[1]; + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + hipLaunchKernelGGL(add_scale, num_blocks, default_block_size, 0, 0, + alpha_ub, x_ub, beta_ub, y_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); + + +template +void convergence_add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::Diagonal* x, + matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_size().at()[1]; + if (num_rhs == 1) { + const auto num_rows = x->get_size().at()[0]; + hipLaunchKernelGGL(single_compute_dot_product, dim3(num_blocks), + dim3(default_block_size), 0, 0, num_blocks, num_rows, + as_hip_type(x->get_const_values()), + as_hip_type(y->get_const_values()), + as_hip_type(result->get_values())); + } else { + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + hipLaunchKernelGGL(compute_dot_product, dim3(num_blocks), + dim3(default_block_size), 0, 0, x_ub, y_ub, res_ub); + } +} + + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); + + +template +void convergence_compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_size().at()[1]; + if (num_rhs == 1) { + const auto num_rows = x->get_size().at()[0]; + hipLaunchKernelGGL(single_compute_norm2, 
dim3(num_blocks), + dim3(default_block_size), 0, 0, num_blocks, num_rows, + as_hip_type(x->get_const_values()), + as_hip_type(result->get_values())); + } else { + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + hipLaunchKernelGGL(compute_norm2, dim3(num_blocks), + dim3(default_block_size), 0, 0, x_ub, res_ub); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); + + +template +void convergence_compute_norm2( + std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); + + +template +void convert_to_batch_csr(std::shared_ptr exec, + const matrix::BatchDense* source, + matrix::BatchCsr* other) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); + + +template +void count_nonzeros(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); + + +template +void calculate_max_nnz_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void calculate_nonzeros_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + array* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result, const size_type* stride_factor, + const size_type* slice_size) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::BatchDense* const orig, + matrix::BatchDense* const trans) +{ + using hip_val_type = hip_type; + const size_type nbatch = orig->get_num_batch_entries(); + const size_type orig_stride = orig->get_stride().at(); + const size_type trans_stride = trans->get_stride().at(); + const int nrows = orig->get_size().at()[0]; + const int ncols = orig->get_size().at()[1]; + hipLaunchKernelGGL(transpose, dim3(nbatch), dim3(default_block_size), 0, 0, + nrows, ncols, orig_stride, + as_hip_type(orig->get_const_values()), trans_stride, + as_hip_type(trans->get_values()), + [] __device__(hip_val_type x) { return x; }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::BatchDense* orig, + matrix::BatchDense* trans) +{ + using hip_val_type = hip_type; + const size_type nbatch = orig->get_num_batch_entries(); + const size_type orig_stride = orig->get_stride().at(); + const size_type trans_stride = trans->get_stride().at(); + const int nrows = orig->get_size().at()[0]; + const int ncols = orig->get_size().at()[1]; + hipLaunchKernelGGL(transpose, dim3(nbatch), dim3(default_block_size), 0, 0, + nrows, ncols, orig_stride, + as_hip_type(orig->get_const_values()), trans_stride, + as_hip_type(trans->get_values()), + [] __device__(hip_val_type x) { return conj(x); }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + + 
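The two kernel wrappers above (transpose and conj_transpose) launch the same device kernel and differ only in the element-wise functor passed as the last argument. A minimal host-side sketch of that pattern follows; the name transpose_entry and its parameters are illustrative only and are not part of Ginkgo's API or of this patch.

#include <complex>

// Applies op to each element while writing the transposed entry:
// out(c, r) = op(in(r, c)) for one batch entry with the given strides.
template <typename ValueType, typename UnaryOp>
void transpose_entry(const ValueType* in, ValueType* out, int nrows,
                     int ncols, int in_stride, int out_stride, UnaryOp op)
{
    for (int r = 0; r < nrows; ++r) {
        for (int c = 0; c < ncols; ++c) {
            out[c * out_stride + r] = op(in[r * in_stride + c]);
        }
    }
}

// transpose:      transpose_entry(in, out, m, n, ld_in, ld_out,
//                                 [](auto x) { return x; });
// conj_transpose: transpose_entry(in, out, m, n, ld_in, ld_out,
//                                 [](auto x) { return std::conj(x); });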
+template +void copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto result_ub = get_batch_struct(result); + const auto x_ub = get_batch_struct(x); + hipLaunchKernelGGL(copy, dim3(num_blocks), dim3(default_block_size), 0, 0, + x_ub, result_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); + + +template +void convergence_copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result, + const uint32& converged) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); + + +template +void batch_scale(std::shared_ptr exec, + const matrix::BatchDiagonal* const left_scale, + const matrix::BatchDiagonal* const rght_scale, + matrix::BatchDense* const vec_to_scale) +{ + if (!left_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + if (!rght_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + if (!vec_to_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + + const auto stride = vec_to_scale->get_stride().at(); + const auto nrows = static_cast(vec_to_scale->get_size().at()[0]); + const auto nrhs = static_cast(vec_to_scale->get_size().at()[1]); + const auto nbatch = vec_to_scale->get_num_batch_entries(); + + const int num_blocks = vec_to_scale->get_num_batch_entries(); + hipLaunchKernelGGL(uniform_batch_scale, dim3(num_blocks), + dim3(default_block_size), 0, 0, nrows, stride, nrhs, + nbatch, as_hip_type(left_scale->get_const_values()), + as_hip_type(rght_scale->get_const_values()), + as_hip_type(vec_to_scale->get_values())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const mtx) +{ + if (!mtx->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; + const auto num_blocks = mtx->get_num_batch_entries(); + const auto nrows = static_cast(mtx->get_size().at(0)[0]); + const auto ncols = static_cast(mtx->get_size().at(0)[1]); + const auto stride = mtx->get_stride().at(0); + const auto values = mtx->get_values(); + const auto alpha = a->get_const_values(); + const auto a_stride = a->get_stride().at(0); + const auto b_stride = b->get_stride().at(0); + const auto beta = b->get_const_values(); + hipLaunchKernelGGL(add_scaled_identity, num_blocks, default_block_size, 0, + 0, num_blocks, nrows, ncols, stride, as_hip_type(values), + a_stride, as_hip_type(alpha), b_stride, + as_hip_type(beta)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index c70c5f054ec..ae13290cdd2 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -243,6 +243,144 @@ struct dim<1u, DimensionType> { }; +/** + * A type representing the dimensions of a multidimensional batch object. 
+ * + * @tparam Dimensionality number of dimensions of the object + * @tparam DimensionType datatype used to represent each dimension + * + * @ingroup batch_dim + */ +template +struct batch_dim { + static constexpr size_type dimensionality = Dimensionality; + using dimension_type = DimensionType; + + /** + * Checks if the batch_dim object stores equal sizes. + * + * @return bool representing whether equal sizes are being stored + */ + bool stores_equal_sizes() const { return equal_sizes_; } + + /** + * Get the number of batch entries stored + * + * @return num_batch_entries + */ + size_type get_num_batch_entries() const { return num_batch_entries_; } + + /** + * Get the sizes of all entries as a std::vector. + * + * @return the std::vector of batch sizes + */ + std::vector> get_batch_sizes() const + { + if (equal_sizes_) { + if (num_batch_entries_ > 0) { + return std::vector>( + num_batch_entries_, common_size_); + } else { + return std::vector>{ + common_size_}; + } + } else { + return sizes_; + } + } + + /** + * Get the batch size at a particular index. + * + * @param batch_entry the index of the entry whose size is needed + * + * @return the size of the batch entry at the requested batch-index + */ + const dim& at( + const size_type batch_entry = 0) const + { + if (equal_sizes_) { + return common_size_; + } else { + GKO_ASSERT(batch_entry < num_batch_entries_); + return sizes_[batch_entry]; + } + } + + /** + * Checks if two batch_dim objects are equal. + * + * @param x first object + * @param y second object + * + * @return true if and only if all dimensions of both objects are equal. + */ + friend bool operator==(const batch_dim& x, const batch_dim& y) + { + if (x.equal_sizes_ && y.equal_sizes_) { + return x.num_batch_entries_ == y.num_batch_entries_ && + x.common_size_ == y.common_size_; + } else { + return x.sizes_ == y.sizes_; + } + } + + /** + * Creates a batch_dim object which stores a uniform size for all batch + * entries. + * + * @param num_batch_entries number of batch entries to be stored + * @param common_size the common size of all the batch entries stored + * + * @note Use this constructor when uniform batches need to be stored. + */ + explicit batch_dim(const size_type num_batch_entries = 0, + const dim& common_size = + dim{}) + : equal_sizes_(true), + common_size_(common_size), + num_batch_entries_(num_batch_entries), + sizes_() + {} + + /** + * Creates a batch_dim object which stores possibly non-uniform sizes for + * the different batch entries. + * + * @param batch_sizes the std::vector object that stores the batch_sizes + * + * @note Use this constructor when non-uniform batches need to be stored. + */ + batch_dim( + const std::vector>& batch_sizes) + : equal_sizes_(false), + common_size_(dim{}), + num_batch_entries_(batch_sizes.size()), + sizes_(batch_sizes) + { + check_size_equality(); + } + +private: + void check_size_equality() + { + for (size_type i = 0; i < num_batch_entries_; ++i) { + if (!(sizes_[i] == sizes_[0])) { + return; + } + } + common_size_ = sizes_[0]; + equal_sizes_ = true; + } + + bool equal_sizes_{}; + size_type num_batch_entries_{}; + dim common_size_{}; + std::vector> sizes_{}; +}; + + /** * Checks if two dim objects are different. * @@ -280,6 +418,54 @@ constexpr GKO_ATTRIBUTES GKO_INLINE dim<2, DimensionType> transpose( } +/** + * Checks if two batch dim objects are different. 
+ * + * @tparam Dimensionality number of dimensions of the dim objects + * @tparam DimensionType datatype used to represent each dimension + * + * @param x first object + * @param y second object + * + * @return `!(x == y)` + */ +template +inline bool operator!=(const batch_dim& x, + const batch_dim& y) +{ + return !(x == y); +} + + +/** + * Returns a batch_dim object with its dimensions swapped for batched operators + * + * @tparam DimensionType datatype used to represent each dimension + * + * @param dimensions original object + * + * @return a batch_dim object with the individual batches having their + * dimensions swapped + */ +template +inline batch_dim<2, DimensionType> transpose( + const batch_dim<2, DimensionType>& input) +{ + batch_dim<2, DimensionType> out{}; + if (input.stores_equal_sizes()) { + out = batch_dim<2, DimensionType>(input.get_num_batch_entries(), + gko::transpose(input.at(0))); + return out; + } + auto trans = + std::vector>(input.get_num_batch_entries()); + for (size_type i = 0; i < trans.size(); ++i) { + trans[i] = transpose(input.at(i)); + } + return batch_dim<2, DimensionType>(trans); +} + + } // namespace gko diff --git a/include/ginkgo/core/matrix/batch_vector.hpp b/include/ginkgo/core/matrix/batch_vector.hpp new file mode 100644 index 00000000000..f4061114052 --- /dev/null +++ b/include/ginkgo/core/matrix/batch_vector.hpp @@ -0,0 +1,1093 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace matrix { + + +template +class BatchDiagonal; + + +template +class BatchCsr; + + +/** + * BatchDense is a batch matrix format which explicitly stores all values of the + * matrix in each of the batches. 
+ * + * The values in each of the batches are stored in row-major format (values + * belonging to the same row appear consecutive in the memory). Optionally, rows + * can be padded for better memory access. + * + * @tparam ValueType precision of matrix elements + * + * @note While this format is not very useful for storing sparse matrices, it + * is often suitable to store vectors, and sets of vectors. + * @ingroup batch_dense + * @ingroup mat_formats + * @ingroup BatchLinOp + */ +template +class BatchDense : public EnableBatchLinOp>, + public EnableCreateMethod>, + public ConvertibleTo>>, + public ConvertibleTo>, + public ConvertibleTo>, + public BatchReadableFromMatrixData, + public BatchReadableFromMatrixData, + public BatchWritableToMatrixData, + public BatchWritableToMatrixData, + public BatchTransposable, + public BatchScaledIdentityAddable { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + friend class BatchDense>; + +public: + using EnableBatchLinOp::convert_to; + using EnableBatchLinOp::move_to; + using BatchReadableFromMatrixData::read; + using BatchReadableFromMatrixData::read; + + using value_type = ValueType; + using index_type = int32; + using transposed_type = BatchDense; + using unbatch_type = Dense; + using mat_data = gko::matrix_data; + using mat_data32 = gko::matrix_data; + using absolute_type = remove_complex; + using complex_type = to_complex; + + using row_major_range = gko::range>; + + /** + * Creates a BatchDense matrix with the configuration of another BatchDense + * matrix. + * + * @param other The other matrix whose configuration needs to copied. + */ + static std::unique_ptr create_with_config_of( + const BatchDense* other) + { + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. + return (*other).create_with_same_config(); + } + + friend class BatchDense>; + + void convert_to( + BatchDense>* result) const override; + + void move_to(BatchDense>* result) override; + + void convert_to(BatchCsr* result) const override; + + void move_to(BatchCsr* result) override; + + void convert_to(BatchDiagonal* result) const override; + + void move_to(BatchDiagonal* result) override; + + void read(const std::vector& data) override; + + void read(const std::vector& data) override; + + void write(std::vector& data) const override; + + void write(std::vector& data) const override; + + std::unique_ptr transpose() const override; + + std::unique_ptr conj_transpose() const override; + + /** + * Unbatches the batched dense and creates a std::vector of Dense matrices + * + * @return a std::vector containing the Dense matrices. + */ + std::vector> unbatch() const + { + auto exec = this->get_executor(); + auto unbatch_mats = std::vector>{}; + for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { + auto mat = unbatch_type::create(exec, this->get_size().at(b), + this->get_stride().at(b)); + exec->copy_from(exec.get(), mat->get_num_stored_elements(), + this->get_const_values() + + num_elems_per_batch_cumul_.get_const_data()[b], + mat->get_values()); + unbatch_mats.emplace_back(std::move(mat)); + } + return unbatch_mats; + } + + /** + * Returns a pointer to the array of values of the matrix. + * + * @return the pointer to the array of values + */ + value_type* get_values() noexcept { return values_.get_data(); } + + /** + * Returns a pointer to the array of values of the matrix. 
+ * + * @return the pointer to the array of values + */ + value_type* get_values(size_type batch) noexcept + { + GKO_ASSERT(batch < this->get_num_batch_entries()); + return values_.get_data() + + num_elems_per_batch_cumul_.get_const_data()[batch]; + } + + /** + * @copydoc get_values() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values() const noexcept + { + return values_.get_const_data(); + } + + /** + * @copydoc get_values(size_type) + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values(size_type batch) const noexcept + { + GKO_ASSERT(batch < this->get_num_batch_entries()); + return values_.get_const_data() + + num_elems_per_batch_cumul_.get_const_data()[batch]; + } + + /** + * Returns the batch_stride of the matrix. + * + * @return the batch_stride of the matrix. + */ + const batch_stride& get_stride() const noexcept { return stride_; } + + /** + * Returns the number of elements explicitly stored in the batch matrix, + * cumulative across all the batches. + * + * @return the number of elements explicitly stored in the matrix, + * cumulative across all the batches + */ + size_type get_num_stored_elements() const noexcept + { + return values_.get_num_elems(); + } + + /** + * Returns the number of elements explicitly stored at a specific batch + * index. + * + * @param batch the batch index to be queried + * + * @return the number of elements explicitly stored in the matrix + */ + size_type get_num_stored_elements(size_type batch) const noexcept + { + GKO_ASSERT(batch < this->get_num_batch_entries()); + return num_elems_per_batch_cumul_.get_const_data()[batch + 1] - + num_elems_per_batch_cumul_.get_const_data()[batch]; + } + + /** + * Returns a single element for a particular batch. + * + * @param batch the batch index to be queried + * @param row the row of the requested element + * @param col the column of the requested element + * + * @note the method has to be called on the same Executor the matrix is + * stored at (e.g. trying to call this method on a GPU matrix from + * the OMP results in a runtime error) + */ + value_type& at(size_type batch, size_type row, size_type col) noexcept + { + GKO_ASSERT(batch < this->get_num_batch_entries()); + return values_.get_data()[linearize_index(batch, row, col)]; + } + + /** + * @copydoc BatchDense::at(size_type, size_type, size_type) + */ + value_type at(size_type batch, size_type row, size_type col) const noexcept + { + GKO_ASSERT(batch < this->get_num_batch_entries()); + return values_.get_const_data()[linearize_index(batch, row, col)]; + } + + /** + * Returns a single element for a particular batch entry. + * + * Useful for iterating across all elements of the matrix. + * However, it is less efficient than the two-parameter variant of this + * method. + * + * @param batch the batch index to be queried + * @param idx a linear index of the requested element + * (ignoring the stride) + * + * @note the method has to be called on the same Executor the matrix is + * stored at (e.g. 
trying to call this method on a GPU matrix from + * the OMP results in a runtime error) + */ + ValueType& at(size_type batch, size_type idx) noexcept + { + return values_.get_data()[linearize_index(batch, idx)]; + } + + /** + * @copydoc BatchDense::at(size_type, size_type, size_type) + */ + ValueType at(size_type batch, size_type idx) const noexcept + { + return values_.get_const_data()[linearize_index(batch, idx)]; + } + + /** + * Scales the matrix with a scalar (aka: BLAS scal). + * + * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix (all + * batches) is scaled by alpha. If it is a BatchDense row vector of values, + * then i-th column of the matrix is scaled with the i-th element of alpha + * (the number of columns of alpha has to match the number of columns of the + * matrix). + */ + void scale(const BatchLinOp* alpha) + { + auto exec = this->get_executor(); + this->scale_impl(make_temporary_clone(exec, alpha).get()); + } + + /** + * Adds `b` scaled by `alpha` to the matrix (aka: BLAS axpy). + * + * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix is + * scaled by alpha. If it is a BatchDense row vector of values, then i-th + * column of the matrix is scaled with the i-th element of alpha (the number + * of columns of alpha has to match the number of columns of the matrix). + * @param b a matrix of the same dimension as this + */ + void add_scaled(const BatchLinOp* alpha, const BatchLinOp* b) + { + auto exec = this->get_executor(); + this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get()); + } + + /** + * Adds `a` scaled by `alpha` to the matrix scaled by `beta`: + * this <- alpha * a + beta * this. + * + * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix a is + * scaled by alpha. If it is a BatchDense row vector of + * values, then i-th column of a is scaled with the i-th + * element of alpha (the number of columns of alpha has to + * match the number of columns of a). + * @param a a matrix of the same dimension as this. + * @param beta Scalar(s), of the same size as alpha, to multiply this + * matrix. + */ + void add_scale(const BatchLinOp* alpha, const BatchLinOp* a, + const BatchLinOp* beta); + + /** + * Computes the column-wise dot product of each matrix in this batch and its + * corresponding entry in `b`. If the matrix has complex value_type, then + * the conjugate of this is taken. + * + * @param b a BatchDense matrix of same dimension as this + * @param result a BatchDense row vector, used to store the dot product + * (the number of column in the vector must match the number + * of columns of this) + */ + void compute_dot(const BatchLinOp* b, BatchLinOp* result) const + { + auto exec = this->get_executor(); + this->compute_dot_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, result).get()); + } + + /** + * Computes the Euclidean (L^2) norm of each matrix in this batch. + * + * @param result a BatchDense row vector, used to store the norm + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_norm2(BatchLinOp* result) const + { + auto exec = this->get_executor(); + this->compute_norm2_impl(make_temporary_clone(exec, result).get()); + } + + /** + * Creates a constant (immutable) batch dense matrix from a constant array. 
+ * + * @param exec the executor to create the matrix on + * @param size the dimensions of the matrix + * @param values the value array of the matrix + * @param stride the row-stride of the matrix + * @returns A smart pointer to the constant matrix wrapping the input array + * (if it resides on the same executor as the matrix) or a copy of + * the array on the correct executor. + */ + static std::unique_ptr create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + gko::detail::const_array_view&& values, + const batch_stride& strides) + { + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr(new BatchDense{ + exec, sizes, gko::detail::array_const_cast(std::move(values)), + strides}); + } + +private: + /** + * Compute the memory required for the values array from the sizes and the + * strides. + */ + inline size_type compute_batch_mem(const batch_dim<2>& sizes, + const batch_stride& strides) + { + GKO_ASSERT(sizes.get_num_batch_entries() == + strides.get_num_batch_entries()); + if (sizes.stores_equal_sizes() && strides.stores_equal_strides()) { + return (sizes.at(0))[0] * strides.at(0) * + sizes.get_num_batch_entries(); + } + size_type mem_req = 0; + for (auto i = 0; i < sizes.get_num_batch_entries(); ++i) { + mem_req += (sizes.at(i))[0] * strides.at(i); + } + return mem_req; + } + + /** + * Extract the nth dim of the batch sizes from the input batch_dim object. + */ + inline batch_stride extract_nth_dim(const int dim, const batch_dim<2>& size) + { + if (size.stores_equal_sizes()) { + return batch_stride(size.get_num_batch_entries(), size.at(0)[dim]); + } + std::vector stride(size.get_num_batch_entries()); + for (auto i = 0; i < size.get_num_batch_entries(); ++i) { + stride[i] = (size.at(i))[dim]; + } + return batch_stride(stride); + } + + /** + * Extract strides from the vector of the distinct Dense matrices. + */ + inline batch_stride get_strides_from_mtxs( + const std::vector*> mtxs) + { + auto strides = std::vector(mtxs.size()); + for (auto i = 0; i < mtxs.size(); ++i) { + strides[i] = mtxs[i]->get_stride(); + } + return batch_stride(strides); + } + + /** + * Extract sizes from the vector of the distinct Dense matrices. + */ + inline batch_dim<2> get_sizes_from_mtxs( + const std::vector*> mtxs) + { + auto sizes = std::vector>(mtxs.size()); + for (auto i = 0; i < mtxs.size(); ++i) { + sizes[i] = mtxs[i]->get_size(); + } + return batch_dim<2>(sizes); + } + + /** + * Compute the number of elements stored in each batch and store it in a + * prefixed sum fashion + */ + inline array compute_num_elems_per_batch_cumul( + std::shared_ptr exec, const batch_dim<2>& sizes, + const batch_stride& strides) + { + auto num_elems = array(exec->get_master(), + sizes.get_num_batch_entries() + 1); + num_elems.get_data()[0] = 0; + for (auto i = 0; i < sizes.get_num_batch_entries(); ++i) { + num_elems.get_data()[i + 1] = + num_elems.get_data()[i] + (sizes.at(i))[0] * strides.at(i); + } + num_elems.set_executor(exec); + return num_elems; + } + +protected: + /** + * Creates an uninitialized BatchDense matrix of the specified size. + * + * @param exec Executor associated to the matrix + * @param size size of the matrix + */ + BatchDense(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}) + : BatchDense(std::move(exec), size, + size.get_num_batch_entries() > 0 ? extract_nth_dim(1, size) + : batch_stride{}) + {} + + /** + * Creates an uninitialized BatchDense matrix of the specified size. 
+ * + * @param exec Executor associated to the matrix + * @param size size of the batch matrices in a batch_dim object + * @param stride stride of the rows (i.e. offset between the first + * elements of two consecutive rows, expressed as the + * number of matrix elements) + */ + BatchDense(std::shared_ptr exec, const batch_dim<2>& size, + const batch_stride& stride) + : EnableBatchLinOp(exec, size), + values_(exec, compute_batch_mem(size, stride)), + stride_(stride) + { + num_elems_per_batch_cumul_ = + compute_num_elems_per_batch_cumul(exec, this->get_size(), stride); + } + + /** + * Creates a BatchDense matrix from an already allocated (and initialized) + * array. + * + * @tparam ValuesArray type of array of values + * + * @param exec Executor associated to the matrix + * @param size sizes of the batch matrices in a batch_dim object + * @param values array of matrix values + * @param strides stride of the rows (i.e. offset between the first + * elements of two consecutive rows, expressed as the + * number of matrix elements) + * + * @note If `values` is not an rvalue, not an array of ValueType, or is on + * the wrong executor, an internal copy will be created, and the + * original array data will not be used in the matrix. + */ + template + BatchDense(std::shared_ptr exec, const batch_dim<2>& size, + ValuesArray&& values, const batch_stride& stride) + : EnableBatchLinOp(exec, size), + values_{exec, std::forward(values)}, + stride_{stride}, + num_elems_per_batch_cumul_( + exec->get_master(), + compute_num_elems_per_batch_cumul(exec->get_master(), + this->get_size(), stride)) + { + auto num_elems = + num_elems_per_batch_cumul_ + .get_const_data()[num_elems_per_batch_cumul_.get_num_elems() - + 1] - + 1; + GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems()); + } + + /** + * Creates a BatchDense matrix from a vector of matrices + * + * @param exec Executor associated to the matrix + * @param matrices The matrices that need to be batched. + */ + BatchDense(std::shared_ptr exec, + const std::vector*>& matrices) + : EnableBatchLinOp(exec, get_sizes_from_mtxs(matrices)), + stride_{get_strides_from_mtxs(matrices)}, + values_(exec, compute_batch_mem(this->get_size(), stride_)) + { + num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( + exec->get_master(), this->get_size(), stride_); + for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { + auto local_exec = matrices[i]->get_executor(); + exec->copy_from(local_exec.get(), + matrices[i]->get_num_stored_elements(), + matrices[i]->get_const_values(), + this->get_values() + + num_elems_per_batch_cumul_.get_const_data()[i]); + } + } + + /** + * Creates a BatchDense matrix by duplicating BatchDense matrix + * + * @param exec Executor associated to the matrix + * @param num_duplications The number of times to duplicate + * @param input The matrix to be duplicated. 
+ */ + BatchDense(std::shared_ptr exec, size_type num_duplications, + const BatchDense* input) + : EnableBatchLinOp( + exec, gko::batch_dim<2>( + input->get_num_batch_entries() * num_duplications, + input->get_size().at(0))), + stride_{gko::batch_stride( + input->get_num_batch_entries() * num_duplications, + input->get_stride().at(0))}, + values_(exec, compute_batch_mem(this->get_size(), stride_)) + { + // Check if it works when stride neq num_cols + num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( + exec->get_master(), this->get_size(), stride_); + size_type offset = 0; + for (size_type i = 0; i < num_duplications; ++i) { + exec->copy_from( + input->get_executor().get(), input->get_num_stored_elements(), + input->get_const_values(), this->get_values() + offset); + offset += input->get_num_stored_elements(); + } + } + + /** + * Creates a BatchDense matrix by duplicating Dense matrix + * + * @param exec Executor associated to the matrix + * @param num_duplications The number of times to duplicate + * @param input The matrix to be duplicated. + */ + BatchDense(std::shared_ptr exec, size_type num_duplications, + const Dense* input) + : EnableBatchLinOp( + exec, gko::batch_dim<2>(num_duplications, input->get_size())), + stride_{gko::batch_stride(num_duplications, input->get_stride())}, + values_(exec, compute_batch_mem(this->get_size(), stride_)) + { + // Check if it works when stride neq num_cols + num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( + exec->get_master(), this->get_size(), stride_); + size_type offset = 0; + for (size_type i = 0; i < num_duplications; ++i) { + exec->copy_from( + input->get_executor().get(), input->get_num_stored_elements(), + input->get_const_values(), this->get_values() + offset); + offset += input->get_num_stored_elements(); + } + } + + /** + * Creates a BatchDense matrix with the same configuration as the callers + * matrix. + * + * @returns a BatchDense matrix with the same configuration as the caller. + */ + virtual std::unique_ptr create_with_same_config() const + { + return BatchDense::create(this->get_executor(), this->get_size(), + this->get_stride()); + } + + /** + * @copydoc scale(const BatchLinOp *) + * + * @note Other implementations of batch_dense should override this function + * instead of scale(const BatchLinOp *alpha). + */ + virtual void scale_impl(const BatchLinOp* alpha); + + /** + * @copydoc add_scaled(const BatchLinOp *, const BatchLinOp *) + * + * @note Other implementations of batch_dense should override this function + * instead of add_scale(const BatchLinOp *alpha, const BatchLinOp + * *b). + */ + virtual void add_scaled_impl(const BatchLinOp* alpha, const BatchLinOp* b); + + /** + * @copydoc compute_dot(const BatchLinOp *, BatchLinOp *) const + * + * @note Other implementations of batch_dense should override this function + * instead of compute_dot(const BatchLinOp *b, BatchLinOp *result). + */ + virtual void compute_dot_impl(const BatchLinOp* b, + BatchLinOp* result) const; + + /** + * @copydoc compute_norm2(BatchLinOp *) const + * + * @note Other implementations of batch_dense should override this function + * instead of compute_norm2(BatchLinOp *result). 
+ */ + virtual void compute_norm2_impl(BatchLinOp* result) const; + + void apply_impl(const BatchLinOp* b, BatchLinOp* x) const override; + + void apply_impl(const BatchLinOp* alpha, const BatchLinOp* b, + const BatchLinOp* beta, BatchLinOp* x) const override; + + size_type linearize_index(size_type batch, size_type row, + size_type col) const noexcept + { + return num_elems_per_batch_cumul_.get_const_data()[batch] + + row * stride_.at(batch) + col; + } + + size_type linearize_index(size_type batch, size_type idx) const noexcept + { + return linearize_index(batch, idx / this->get_size().at(batch)[1], + idx % this->get_size().at(batch)[1]); + } + +private: + batch_stride stride_; + array num_elems_per_batch_cumul_; + array values_; + + void add_scaled_identity_impl(const BatchLinOp* a, + const BatchLinOp* b) override; +}; + + +} // namespace matrix + + +/** + * Creates and initializes a batch of column-vectors. + * + * This function first creates a temporary Dense matrix, fills it with passed in + * values, and then converts the matrix to the requested type. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param stride row stride for the temporary Dense matrix + * @param vals values used to initialize the batch vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::vector stride, + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using batch_dense = matrix::BatchDense; + size_type num_batch_entries = vals.size(); + std::vector num_rows(num_batch_entries); + std::vector> sizes(num_batch_entries); + auto vals_begin = begin(vals); + for (size_type b = 0; b < num_batch_entries; ++b) { + num_rows[b] = vals_begin->size(); + sizes[b] = dim<2>(num_rows[b], 1); + vals_begin++; + } + auto b_size = batch_dim<2>(sizes); + auto b_stride = batch_stride(stride); + auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + size_type batch = 0; + for (const auto& b : vals) { + size_type idx = 0; + for (const auto& elem : b) { + tmp->at(batch, idx) = elem; + ++idx; + } + ++batch; + } + auto mtx = Matrix::create(exec, std::forward(create_args)...); + tmp->move_to(mtx.get()); + return mtx; +} + +/** + * Creates and initializes a batch of column-vectors. + * + * This function first creates a temporary Dense matrix, fills it with passed in + * values, and then converts the matrix to the requested type. The stride of + * the intermediate Dense matrix is set to 1. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... 
create_args) +{ + return batch_initialize(std::vector(vals.size(), 1), + vals, std::move(exec), + std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch of matrices. + * + * This function first creates a temporary Dense matrix, fills it with passed in + * values, and then converts the matrix to the requested type. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param stride row stride for the temporary Dense matrix + * @param vals values used to initialize the matrix + * @param exec Executor associated to the matrix + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::vector stride, + std::initializer_list>> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using batch_dense = matrix::BatchDense; + size_type num_batch_entries = vals.size(); + std::vector num_rows(num_batch_entries); + std::vector num_cols(num_batch_entries); + std::vector> sizes(num_batch_entries); + size_type ind = 0; + for (const auto& b : vals) { + num_rows[ind] = b.size(); + num_cols[ind] = num_rows[ind] > 0 ? begin(b)->size() : 1; + sizes[ind] = dim<2>(num_rows[ind], num_cols[ind]); + ++ind; + } + auto b_size = batch_dim<2>(sizes); + auto b_stride = batch_stride(stride); + auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + size_type batch = 0; + for (const auto& b : vals) { + size_type ridx = 0; + for (const auto& row : b) { + size_type cidx = 0; + for (const auto& elem : row) { + tmp->at(batch, ridx, cidx) = elem; + ++cidx; + } + ++ridx; + } + ++batch; + } + auto mtx = Matrix::create(exec, std::forward(create_args)...); + tmp->move_to(mtx.get()); + return mtx; +} + + +/** + * Creates and initializes a batch of matrices. + * + * This function first creates a temporary Dense matrix, fills it with passed in + * values, and then converts the matrix to the requested type. The stride of + * the intermediate Dense matrix is set to the number of columns of the + * initializer list. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the matrix + * @param exec Executor associated to the matrix + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::initializer_list>> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + auto strides = std::vector(vals.size(), 0); + size_type ind = 0; + for (const auto& b : vals) { + strides[ind] = begin(b)->size(); + ++ind; + } + return batch_initialize(strides, vals, std::move(exec), + std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch column-vector by making copies of the single + * input column vector. + * + * This function first creates a temporary batch dense matrix, fills it with + * passed in values, and then converts the matrix to the requested type. 
+ * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo + * interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param stride row strides for the temporary batch dense matrix + * @param num_vectors The number of times the input vector is copied into + * the final output + * @param vals values used to initialize each vector in the temp. batch + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::vector stride, const size_type num_vectors, + std::initializer_list vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using batch_dense = matrix::BatchDense; + std::vector num_rows(num_vectors); + std::vector> sizes(num_vectors); + for (size_type b = 0; b < num_vectors; ++b) { + num_rows[b] = vals.size(); + sizes[b] = dim<2>(vals.size(), 1); + } + auto b_size = batch_dim<2>(sizes); + auto b_stride = batch_stride(stride); + auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + for (size_type batch = 0; batch < num_vectors; batch++) { + size_type idx = 0; + for (const auto& elem : vals) { + tmp->at(batch, idx) = elem; + ++idx; + } + } + auto mtx = Matrix::create(exec, std::forward(create_args)...); + tmp->move_to(mtx.get()); + return mtx; +} + + +/** + * Creates and initializes a column-vector from copies of a given vector. + * + * This function first creates a temporary Dense matrix, fills it with passed + * in values, and then converts the matrix to the requested type. The stride of + * the intermediate Dense matrix is set to 1. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo + * interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_vectors The number of times the input vector is copied into + * the final output + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup BatchLinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + const size_type num_vectors, + std::initializer_list vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + return batch_initialize(std::vector(num_vectors, 1), + num_vectors, vals, std::move(exec), + std::forward(create_args)...); +} + +/** + * Creates and initializes a matrix from copies of a given matrix. + * + * This function first creates a temporary batch dense matrix, fills it with + * passed in values, and then converts the matrix to the requested type. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param stride row strides for the temporary batch dense matrix + * @param num_matrices The number of times the input matrix is copied into + * the final output + * @param vals values used to initialize each vector in the temp. 
batch + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup LinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + std::vector stride, const size_type num_matrices, + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using batch_dense = matrix::BatchDense; + std::vector> sizes(num_matrices); + const size_type num_rows = vals.size(); + for (size_type b = 0; b < num_matrices; ++b) { + const size_type num_cols = begin(vals)->size(); + sizes[b] = dim<2>(num_rows, num_cols); + for (auto blockit = begin(vals); blockit != end(vals); ++blockit) { + GKO_ASSERT(blockit->size() == num_cols); + } + } + auto tmp = batch_dense::create(exec->get_master(), sizes, stride); + for (size_type batch = 0; batch < num_matrices; batch++) { + size_type ridx = 0; + for (const auto& row : vals) { + size_type cidx = 0; + for (const auto& elem : row) { + tmp->at(batch, ridx, cidx) = elem; + ++cidx; + } + ++ridx; + } + } + auto mtx = Matrix::create(exec, std::forward(create_args)...); + tmp->move_to(mtx.get()); + return mtx; +} + +/** + * Creates and initializes a matrix from copies of a given matrix. + * + * This function first creates a temporary Dense matrix, fills it with passed in + * values, and then converts the matrix to the requested type. The stride of + * the intermediate Dense matrix is set to 1. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_vectors The number of times the input vector is copied into + * the final output + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup LinOp + * @ingroup mat_formats + */ +template +std::unique_ptr batch_initialize( + const size_type num_matrices, + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + auto strides = std::vector(num_matrices, begin(vals)->size()); + return batch_initialize(strides, num_matrices, vals, + std::move(exec), + std::forward(create_args)...); +} + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index bda26ad63d3..abb50ffc09f 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -20,6 +20,7 @@ target_sources(ginkgo_omp factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp + matrix/batch_vector_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/omp/matrix/batch_vector_kernels.cpp b/omp/matrix/batch_vector_kernels.cpp new file mode 100644 index 00000000000..70c0794f4a8 --- /dev/null +++ b/omp/matrix/batch_vector_kernels.cpp @@ -0,0 +1,614 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/components/prefix_sum_kernels.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The BatchDense matrix format namespace. + * @ref BatchDense + * @ingroup batch_dense + */ +namespace batch_dense { + + +#include "reference/matrix/batch_dense_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const c) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto c_ub = host::get_batch_struct(c); +#pragma omp parallel for + for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { + const auto a_b = gko::batch::batch_entry(a_ub, batch); + const auto b_b = gko::batch::batch_entry(b_ub, batch); + const auto c_b = gko::batch::batch_entry(c_ub, batch); + matvec_kernel(a_b, b_b, c_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + const matrix::BatchDense* const beta, + matrix::BatchDense* const c) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto c_ub = host::get_batch_struct(c); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); +#pragma omp parallel for + for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { + const auto a_b = gko::batch::batch_entry(a_ub, batch); + const auto b_b = gko::batch::batch_entry(b_ub, batch); + const auto c_b = gko::batch::batch_entry(c_ub, batch); + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto beta_b = gko::batch::batch_entry(beta_ub, batch); + advanced_matvec_kernel(alpha_b.values[0], a_b, b_b, beta_b.values[0], + c_b); + } +} + 
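// Editorial sketch (not part of this patch): the advanced apply kernel above
// computes c = alpha * a * b + beta * c independently for every batch entry.
// The self-contained snippet below illustrates that per-entry arithmetic on
// plain std::vector storage; the `entry` struct and the function name
// `batched_advanced_apply` are hypothetical and are not Ginkgo API.
#include <cassert>
#include <cstddef>
#include <vector>

struct entry {
    std::size_t rows;            // number of rows of this batch entry
    std::size_t cols;            // number of columns (stride taken equal to cols)
    std::vector<double> values;  // row-major values, rows * cols elements
};

// c[b] = alpha[b] * a[b] * x[b] + beta[b] * c[b] for every batch entry b
inline void batched_advanced_apply(const std::vector<double>& alpha,
                                   const std::vector<entry>& a,
                                   const std::vector<entry>& x,
                                   const std::vector<double>& beta,
                                   std::vector<entry>& c)
{
    assert(a.size() == x.size() && a.size() == c.size());
    for (std::size_t b = 0; b < a.size(); ++b) {
        for (std::size_t row = 0; row < c[b].rows; ++row) {
            for (std::size_t col = 0; col < c[b].cols; ++col) {
                double acc = 0.0;
                // accumulate the (row, col) entry of a[b] * x[b]
                for (std::size_t inner = 0; inner < a[b].cols; ++inner) {
                    acc += a[b].values[row * a[b].cols + inner] *
                           x[b].values[inner * x[b].cols + col];
                }
                auto& out = c[b].values[row * c[b].cols + col];
                out = alpha[b] * acc + beta[b] * out;
            }
        }
    }
}
// Unlike this sketch, the kernel above branches on beta == 0 before scaling c,
// and the OMP backend parallelizes the outer loop over batch entries.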
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); + + +template +void scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + matrix::BatchDense* const x) +{ + const auto x_ub = host::get_batch_struct(x); + const auto alpha_ub = host::get_batch_struct(alpha); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + scale(alpha_b, x_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); +#pragma omp parallel for + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scaled(alpha_b, x_b, y_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); + + +template +void add_scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + const matrix::BatchDense* const beta, + matrix::BatchDense* const y) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); +#pragma omp parallel for + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto beta_b = gko::batch::batch_entry(beta_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scale(alpha_b, x_b, beta_b, y_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); + + +template +void convergence_add_scaled(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + matrix::BatchDense* const y, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); +#pragma omp parallel for + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scaled(alpha_b, x_b, y_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr, + const matrix::BatchDense*, + const matrix::Diagonal*, + matrix::BatchDense*) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const matrix::BatchDense* const x, + const matrix::BatchDense* const y, + matrix::BatchDense* const result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = 
host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_dot_product(x_b, y_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); + + +template +void convergence_compute_dot(std::shared_ptr exec, + const matrix::BatchDense* const x, + const matrix::BatchDense* const y, + matrix::BatchDense* const result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_dot_product(x_b, y_b, res_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto res_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + compute_norm2(x_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); + + +template +void convergence_compute_norm2( + std::shared_ptr exec, + const matrix::BatchDense* const x, + matrix::BatchDense>* const result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto res_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + compute_norm2(x_b, res_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); + + +template +void convert_to_batch_csr(std::shared_ptr exec, + const matrix::BatchDense* const source, + matrix::BatchCsr* const result) +{ + GKO_ASSERT(source->get_size().stores_equal_sizes() == true); + auto num_rows = result->get_size().at(0)[0]; + auto num_cols = result->get_size().at(0)[1]; + auto num_batches = result->get_num_batch_entries(); + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + IndexType row_nnz{}; + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(0, row, col); + row_nnz += static_cast(val != zero()); + } + row_ptrs[row] = row_nnz; + } + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto cur_ptr = row_ptrs[row]; + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(0, row, col); + if (val != zero()) { + col_idxs[cur_ptr] = static_cast(col); + ++cur_ptr; 
+ } + } + } + +#pragma omp parallel for + for (size_type batch = 0; batch < num_batches; ++batch) { + size_type cur_ptr = + batch * row_ptrs[num_rows]; // as row_ptrs[num_rows] is the num of + // non zero elements in the matrix + for (size_type row = 0; row < num_rows; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(batch, row, col); + if (val != zero()) { + values[cur_ptr] = val; + ++cur_ptr; + } + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_AND_INT32_INDEX( + GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); + + +template +void count_nonzeros(std::shared_ptr exec, + const matrix::BatchDense* const source, + size_type* const result) +{ +#pragma omp parallel for + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + size_type num_nonzeros = 0; + + for (size_type row = 0; row < num_rows; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += static_cast( + source->at(batch, row, col) != zero()); + } + } + result[batch] = num_nonzeros; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); + + +template +void calculate_max_nnz_per_row( + std::shared_ptr, + const matrix::BatchDense* const source, size_type* const result) +{ +#pragma omp parallel for + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + size_type num_stored_elements_per_row = 0; + size_type num_nonzeros = 0; + + for (size_type row = 0; row < num_rows; ++row) { + num_nonzeros = 0; + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += static_cast( + source->at(batch, row, col) != zero()); + } + num_stored_elements_per_row = + std::max(num_nonzeros, num_stored_elements_per_row); + } + result[batch] = num_stored_elements_per_row; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void calculate_nonzeros_per_row( + std::shared_ptr, + const matrix::BatchDense* const source, + array* const result) +{ + size_type cumul_prev_rows = 0; + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + auto row_nnz_val = result->get_data() + cumul_prev_rows; + +#pragma omp parallel for reduction(+ : cumul_prev_rows) + for (size_type row = 0; row < num_rows; ++row) { + size_type num_nonzeros = 0; + + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += static_cast( + source->at(batch, row, col) != zero()); + } + row_nnz_val[row] = num_nonzeros; + ++cumul_prev_rows; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr, + const matrix::BatchDense* const source, + size_type* const result, + const size_type* const stride_factor, + const size_type* const slice_size) +{ +#pragma omp parallel for + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + auto slice_num = ceildiv(num_rows, slice_size[batch]); + size_type total_cols = 0; + size_type temp = 0; + size_type slice_temp = 0; + + for (size_type slice = 0; slice < 
slice_num; slice++) { + slice_temp = 0; + for (size_type row = 0; row < slice_size[batch] && + row + slice * slice_size[batch] < num_rows; + row++) { + temp = 0; + for (size_type col = 0; col < num_cols; col++) { + temp += static_cast( + source->at(batch, row + slice * slice_size[batch], + col) != zero()); + } + slice_temp = (slice_temp < temp) ? temp : slice_temp; + } + slice_temp = ceildiv(slice_temp, stride_factor[batch]) * + stride_factor[batch]; + total_cols += slice_temp; + } + result[batch] = total_cols; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + + +template +void transpose(std::shared_ptr, + const matrix::BatchDense* const orig, + matrix::BatchDense* const trans) +{ +#pragma omp parallel for + for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { + for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { + for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { + trans->at(batch, j, i) = orig->at(batch, i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr, + const matrix::BatchDense* const orig, + matrix::BatchDense* const trans) +{ +#pragma omp parallel for + for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { + for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { + for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { + trans->at(batch, j, i) = conj(orig->at(batch, i, j)); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + + +template +void copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto result_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto result_b = gko::batch::batch_entry(result_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + copy(x_b, result_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); + + +template +void convergence_copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto result_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto result_b = gko::batch::batch_entry(result_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + copy(x_b, result_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); + + +template +void batch_scale(std::shared_ptr exec, + const matrix::BatchDiagonal* const left, + const matrix::BatchDiagonal* const rght, + matrix::BatchDense* const vecs) +{ + const auto left_vals = left->get_const_values(); + const auto rght_vals = rght->get_const_values(); + const auto v_vals = vecs->get_values(); + const auto nrows = static_cast(vecs->get_size().at(0)[0]); + const auto ncols = static_cast(vecs->get_size().at(0)[1]); + const auto vstride = vecs->get_stride().at(0); +#pragma omp parallel for + for (size_type batch = 0; batch < vecs->get_num_batch_entries(); ++batch) { + const auto left_b = + gko::batch::batch_entry_ptr(left_vals, 1, nrows, batch); + const auto rght_b = + gko::batch::batch_entry_ptr(rght_vals, 
1, ncols, batch); + const auto v_b = + gko::batch::batch_entry_ptr(v_vals, vstride, nrows, batch); + batch_scale(nrows, ncols, vstride, left_b, rght_b, v_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const mtx) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto mtx_ub = host::get_batch_struct(mtx); +#pragma omp parallel for + for (size_type batch = 0; batch < mtx->get_num_batch_entries(); ++batch) { + auto a_b = gko::batch::batch_entry(a_ub, batch); + auto b_b = gko::batch::batch_entry(b_ub, batch); + auto mtx_b = gko::batch::batch_entry(mtx_ub, batch); + add_scaled_identity(a_b.values[0], b_b.values[0], mtx_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index ab04aec75a1..224fb70dc0e 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -23,6 +23,7 @@ target_sources(ginkgo_reference factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp + matrix/batch_vector_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/reference/matrix/batch_vector_kernels.cpp b/reference/matrix/batch_vector_kernels.cpp new file mode 100644 index 00000000000..8e9e857cc5b --- /dev/null +++ b/reference/matrix/batch_vector_kernels.cpp @@ -0,0 +1,580 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The BatchDense matrix format namespace. + * @ref BatchDense + * @ingroup batch_dense + */ +namespace batch_dense { + + +#include "reference/matrix/batch_dense_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const c) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto c_ub = host::get_batch_struct(c); + for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { + const auto a_b = gko::batch::batch_entry(a_ub, batch); + const auto b_b = gko::batch::batch_entry(b_ub, batch); + const auto c_b = gko::batch::batch_entry(c_ub, batch); + matvec_kernel(a_b, b_b, c_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + const matrix::BatchDense* const beta, + matrix::BatchDense* const c) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto c_ub = host::get_batch_struct(c); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); + for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { + const auto a_b = gko::batch::batch_entry(a_ub, batch); + const auto b_b = gko::batch::batch_entry(b_ub, batch); + const auto c_b = gko::batch::batch_entry(c_ub, batch); + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto beta_b = gko::batch::batch_entry(beta_ub, batch); + advanced_matvec_kernel(alpha_b.values[0], a_b, b_b, beta_b.values[0], + c_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); + + +template +void scale(std::shared_ptr exec, + const matrix::BatchDense* alpha, + matrix::BatchDense* x) +{ + const auto x_ub = host::get_batch_struct(x); + const auto alpha_ub = host::get_batch_struct(alpha); + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + scale(alpha_b, x_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::BatchDense* x, + matrix::BatchDense* y) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scaled(alpha_b, x_b, y_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); + + +template +void add_scale(std::shared_ptr exec, + const matrix::BatchDense* const alpha, + const matrix::BatchDense* const x, + 
const matrix::BatchDense* const beta, + matrix::BatchDense* const y) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto beta_b = gko::batch::batch_entry(beta_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scale(alpha_b, x_b, beta_b, y_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); + + +template +void convergence_add_scaled(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::BatchDense* x, + matrix::BatchDense* y, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto alpha_ub = host::get_batch_struct(alpha); + for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { + const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + add_scaled(alpha_b, x_b, y_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::BatchDense* alpha, + const matrix::Diagonal* x, + matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; +// { +// for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { +// const auto diag_values = x->get_const_values(); +// for (size_type i = 0; i < x->get_size().at(batch)[0]; i++) { +// y->at(batch,i, i) += alpha->at(batch,0, 0) * diag_values[i]; +// } +// } +// } + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_dot_product(x_b, y_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); + + +template +void convergence_compute_dot(std::shared_ptr exec, + const matrix::BatchDense* x, + const matrix::BatchDense* y, + matrix::BatchDense* result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_dot_product(x_b, y_b, res_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense>* result) 
+{ + const auto x_ub = host::get_batch_struct(x); + const auto res_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + compute_norm2(x_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); + + +template +void convergence_compute_norm2( + std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense>* result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto res_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + compute_norm2(x_b, res_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); + + +template +void convert_to_batch_csr(std::shared_ptr exec, + const matrix::BatchDense* source, + matrix::BatchCsr* result) +{ + GKO_ASSERT(source->get_size().stores_equal_sizes() == true); + auto num_rows = result->get_size().at(0)[0]; + auto num_cols = result->get_size().at(0)[1]; + auto num_batch_entries = result->get_num_batch_entries(); + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + size_type cur_ptr = 0; + row_ptrs[0] = cur_ptr; + for (size_type row = 0; row < num_rows; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(0, row, col); + if (val != zero()) { + col_idxs[cur_ptr] = col; + ++cur_ptr; + } + } + row_ptrs[row + 1] = cur_ptr; + } + + cur_ptr = 0; + for (size_type batch = 0; batch < num_batch_entries; ++batch) { + for (size_type row = 0; row < num_rows; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + auto val = source->at(batch, row, col); + if (val != zero()) { + values[cur_ptr] = val; + ++cur_ptr; + } + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_AND_INT32_INDEX( + GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); + + +template +void count_nonzeros(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) +{ + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + auto num_nonzeros = 0; + + for (size_type row = 0; row < num_rows; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += + (source->at(batch, row, col) != zero()); + } + } + result[batch] = num_nonzeros; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); + + +template +void calculate_max_nnz_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + size_type* result) +{ + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + size_type num_stored_elements_per_row = 0; + size_type num_nonzeros = 0; + for (size_type row = 0; row < num_rows; ++row) { + num_nonzeros = 0; + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += + (source->at(batch, row, col) != zero()); + } + num_stored_elements_per_row = + std::max(num_nonzeros, num_stored_elements_per_row); + } + result[batch] = 
num_stored_elements_per_row; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); + + +template +void calculate_nonzeros_per_row(std::shared_ptr exec, + const matrix::BatchDense* source, + array* result) +{ + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + auto row_nnz_val = result->get_data(); + size_type offset = 0; + for (size_type row = 0; row < num_rows; ++row) { + size_type num_nonzeros = 0; + for (size_type col = 0; col < num_cols; ++col) { + num_nonzeros += + (source->at(batch, row, col) != zero()); + } + row_nnz_val[offset + row] = num_nonzeros; + ++offset; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); + + +template +void calculate_total_cols(std::shared_ptr exec, + const matrix::BatchDense* const source, + size_type* const result, + const size_type* const stride_factor, + const size_type* const slice_size) +{ + for (size_type batch = 0; batch < source->get_num_batch_entries(); + ++batch) { + auto num_rows = source->get_size().at(batch)[0]; + auto num_cols = source->get_size().at(batch)[1]; + auto slice_num = ceildiv(num_rows, slice_size[batch]); + auto total_cols = 0; + auto temp = 0, slice_temp = 0; + for (size_type slice = 0; slice < slice_num; slice++) { + slice_temp = 0; + for (size_type row = 0; row < slice_size[batch] && + row + slice * slice_size[batch] < num_rows; + row++) { + temp = 0; + for (size_type col = 0; col < num_cols; col++) { + temp += (source->at(batch, row + slice * slice_size[batch], + col) != zero()); + } + slice_temp = (slice_temp < temp) ? temp : slice_temp; + } + slice_temp = ceildiv(slice_temp, stride_factor[batch]) * + stride_factor[batch]; + total_cols += slice_temp; + } + result[batch] = total_cols; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::BatchDense* const orig, + matrix::BatchDense* const trans) +{ + for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { + for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { + for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { + trans->at(batch, j, i) = orig->at(batch, i, j); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::BatchDense* orig, + matrix::BatchDense* trans) +{ + for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { + for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { + for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { + trans->at(batch, j, i) = conj(orig->at(batch, i, j)); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + + +template +void copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto result_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto result_b = gko::batch::batch_entry(result_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + copy(x_b, result_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); + + +template +void 
convergence_copy(std::shared_ptr exec, + const matrix::BatchDense* x, + matrix::BatchDense* result, + const uint32& converged) +{ + const auto x_ub = host::get_batch_struct(x); + const auto result_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { + const auto result_b = gko::batch::batch_entry(result_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + copy(x_b, result_b, converged); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); + + +template +void batch_scale(std::shared_ptr exec, + const matrix::BatchDiagonal* const left, + const matrix::BatchDiagonal* const rght, + matrix::BatchDense* const vecs) +{ + const auto left_vals = left->get_const_values(); + const auto rght_vals = rght->get_const_values(); + const auto v_vals = vecs->get_values(); + const auto nrows = static_cast(vecs->get_size().at(0)[0]); + const auto ncols = static_cast(vecs->get_size().at(0)[1]); + const auto vstride = vecs->get_stride().at(0); + for (size_type batch = 0; batch < vecs->get_num_batch_entries(); ++batch) { + const auto left_b = + gko::batch::batch_entry_ptr(left_vals, 1, nrows, batch); + const auto rght_b = + gko::batch::batch_entry_ptr(rght_vals, 1, ncols, batch); + const auto v_b = + gko::batch::batch_entry_ptr(v_vals, vstride, nrows, batch); + batch_scale(nrows, ncols, vstride, left_b, rght_b, v_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::BatchDense* const a, + const matrix::BatchDense* const b, + matrix::BatchDense* const mtx) +{ + const auto a_ub = host::get_batch_struct(a); + const auto b_ub = host::get_batch_struct(b); + const auto mtx_ub = host::get_batch_struct(mtx); + for (size_type batch = 0; batch < mtx->get_num_batch_entries(); ++batch) { + auto a_b = gko::batch::batch_entry(a_ub, batch); + auto b_b = gko::batch::batch_entry(b_ub, batch); + auto mtx_b = gko::batch::batch_entry(mtx_ub, batch); + add_scaled_identity(a_b.values[0], b_b.values[0], mtx_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_vector_kernels.hpp.inc b/reference/matrix/batch_vector_kernels.hpp.inc new file mode 100644 index 00000000000..db828206239 --- /dev/null +++ b/reference/matrix/batch_vector_kernels.hpp.inc @@ -0,0 +1,392 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +inline void matvec_kernel( + const gko::batch_dense::BatchEntry& a, + const gko::batch_dense::BatchEntry& b, + const gko::batch_dense::BatchEntry& c) +{ + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] = gko::zero(); + } + } + + for (int row = 0; row < c.num_rows; ++row) { + for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] += + a.values[row * a.stride + inner] * + b.values[inner * b.stride + col]; + } + } + } +} + + +template +inline void advanced_matvec_kernel( + const ValueType alpha, + const gko::batch_dense::BatchEntry& a, + const gko::batch_dense::BatchEntry& b, + const ValueType beta, const gko::batch_dense::BatchEntry& c) +{ + if (beta != gko::zero()) { + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] *= beta; + } + } + } else { + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] *= gko::zero(); + } + } + } + + for (int row = 0; row < c.num_rows; ++row) { + for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] += + alpha * a.values[row * a.stride + inner] * + b.values[inner * b.stride + col]; + } + } + } +} + + +template +inline void scale(const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x) +{ + if (alpha.num_rhs == 1) { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + x.values[i * x.stride + j] *= alpha.values[0]; + } + } + } else { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + x.values[i * x.stride + j] *= alpha.values[j]; + } + } + } +} + + +template +inline void add_scaled( + const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& y) +{ + if (alpha.num_rhs == 1) { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + y.values[i * y.stride + j] += + alpha.values[0] * x.values[i * x.stride + j]; + } + } + } else { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + y.values[i * y.stride + j] += + alpha.values[j] * x.values[i * x.stride + j]; + } + } + } +} + + +template +inline void add_scale( + const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& beta, + const gko::batch_dense::BatchEntry& y) +{ + if (alpha.num_rhs == 1) { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j 
< x.num_rhs; ++j) { + y.values[i * y.stride + j] = + alpha.values[0] * x.values[i * x.stride + j] + + beta.values[0] * y.values[i * y.stride + j]; + } + } + } else { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + y.values[i * y.stride + j] = + alpha.values[j] * x.values[i * x.stride + j] + + beta.values[j] * y.values[i * y.stride + j]; + } + } + } +} + + +template +inline void compute_norm2( + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry>& result) +{ + for (int j = 0; j < x.num_rhs; ++j) { + result.values[j] = gko::zero>(); + } + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + result.values[j] += squared_norm(x.values[i * x.stride + j]); + } + } + for (int j = 0; j < x.num_rhs; ++j) { + result.values[j] = sqrt(result.values[j]); + } +} + + +/** + * Multiplies with a diagonal matrix represented as a dense vector. + * + * @param[in] diag_vec The entries of the diagonal matrix. + * @param[in,out] a The dense matrix or vectors to scale. + */ +template +inline void batch_scale( + const gko::batch_dense::BatchEntry& diag_vec, + const gko::batch_dense::BatchEntry& a) +{ + for (int i_row = 0; i_row < a.num_rows; i_row++) { + const ValueType scale = diag_vec.values[i_row]; + for (int j = 0; j < a.num_rhs; j++) { + a.values[i_row * a.stride + j] *= scale; + } + } +} + +template +inline void batch_scale(const int nrows, const int ncols, + const size_type a_stride, const ValueType* const left, + const ValueType* const right, ValueType* const a) +{ + for (int i_row = 0; i_row < nrows; i_row++) { + const ValueType scale = left[i_row]; + for (int j = 0; j < ncols; j++) { + a[i_row * a_stride + j] *= scale * right[j]; + } + } +} + + +/** + * Copies the values of one multi-vector into another. + * + * Note that the output multi-vector should already have memory allocated + * and stride set. 
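+ *
+ * @param[in]  in   the batch entry (multi-vector view) to copy from
+ * @param[out] out  the batch entry to copy into; it is assumed to have the
+ *                  same number of rows and right-hand sides as `in`, while
+ *                  the strides of the two views may differ.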
+ */ +template +inline void copy(const gko::batch_dense::BatchEntry& in, + const gko::batch_dense::BatchEntry& out) +{ + for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; + } +} + + +template +inline void compute_dot_product( + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& y, + const gko::batch_dense::BatchEntry& result) +{ + for (int c = 0; c < result.num_rhs; c++) { + result.values[c] = gko::zero(); + } + + for (int r = 0; r < x.num_rows; r++) { + for (int c = 0; c < x.num_rhs; c++) { + result.values[c] += + conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c]; + } + } +} + + +template +inline void copy( + const gko::batch_dense::BatchEntry& source_entry, + const gko::batch_dense::BatchEntry& destination_entry, + const gko::uint32& converged) +{ + for (int r = 0; r < source_entry.num_rows; r++) { + for (int c = 0; c < source_entry.num_rhs; c++) { + const gko::uint32 conv = converged & (1 << c); + + if (conv) { + continue; + } + + destination_entry.values[r * destination_entry.stride + c] = + source_entry.values[r * source_entry.stride + c]; + } + } +} + + +template +inline void add_scaled( + const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& y, + const gko::uint32& converged) +{ + if (alpha.num_rhs == 1) { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + const gko::uint32 conv = converged & (1 << j); + + if (conv) { + continue; + } + + y.values[i * y.stride + j] += + alpha.values[0] * x.values[i * x.stride + j]; + } + } + } else { + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + const gko::uint32 conv = converged & (1 << j); + + if (conv) { + continue; + } + + + y.values[i * y.stride + j] += + alpha.values[j] * x.values[i * x.stride + j]; + } + } + } +} + + +template +inline void compute_norm2( + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry>& result, + const gko::uint32& converged) +{ + for (int j = 0; j < x.num_rhs; ++j) { + const gko::uint32 conv = converged & (1 << j); + + if (conv) { + continue; + } + + result.values[j] = gko::zero>(); + } + for (int i = 0; i < x.num_rows; ++i) { + for (int j = 0; j < x.num_rhs; ++j) { + const gko::uint32 conv = converged & (1 << j); + + if (conv) { + continue; + } + + result.values[j] += squared_norm(x.values[i * x.stride + j]); + } + } + for (int j = 0; j < x.num_rhs; ++j) { + const gko::uint32 conv = converged & (1 << j); + + if (conv) { + continue; + } + + result.values[j] = sqrt(result.values[j]); + } +} + + +template +inline void compute_dot_product( + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& y, + const gko::batch_dense::BatchEntry& result, + const gko::uint32& converged) +{ + for (int c = 0; c < result.num_rhs; c++) { + const gko::uint32 conv = converged & (1 << c); + + if (conv) { + continue; + } + + result.values[c] = gko::zero(); + } + + for (int r = 0; r < x.num_rows; r++) { + for (int c = 0; c < x.num_rhs; c++) { + const gko::uint32 conv = converged & (1 << c); + + if (conv) { + continue; + } + + result.values[c] += + conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c]; + } + } +} + + +template +inline void add_scaled_identity( + const ValueType& a, const ValueType& b, + const gko::batch_dense::BatchEntry& mat) +{ + for (int i = 0; i < mat.num_rows; 
i++) { + for (int j = 0; j < mat.num_rhs; j++) { + mat.values[i * mat.stride + j] *= b; + if (i == j) { + mat.values[i * mat.stride + i] += a; + } + } + } +} diff --git a/reference/test/matrix/batch_vector_kernels.cpp b/reference/test/matrix/batch_vector_kernels.cpp new file mode 100644 index 00000000000..6e1a6c2f8e1 --- /dev/null +++ b/reference/test/matrix/batch_vector_kernels.cpp @@ -0,0 +1,1023 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +template +class BatchDense : public ::testing::Test { +protected: + using value_type = T; + using size_type = gko::size_type; + using Mtx = gko::matrix::BatchDense; + using DenseMtx = gko::matrix::Dense; + using ComplexMtx = gko::to_complex; + using RealMtx = gko::remove_complex; + BatchDense() + : exec(gko::ReferenceExecutor::create()), + mtx_0(gko::batch_initialize( + {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, + {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, + exec)), + mtx_00(gko::initialize( + {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), + mtx_01(gko::initialize( + {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), + mtx_1( + gko::batch_initialize(std::vector{4, 4}, + {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)), + mtx_10(gko::initialize( + {I({1.0, -1.0, 2.2}), I({-2.0, 2.0, -0.5})}, exec)), + mtx_11(gko::initialize( + 4, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)), + mtx_2(gko::batch_initialize( + std::vector{2, 2}, + {{{1.0, 1.5}, {6.0, 1.0}, {-0.25, 1.0}}, + {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}}, + exec)), + mtx_20(gko::initialize( + 4, {I({1.0, 1.5}), I({6.0, 1.0}), I({-0.25, 1.0})}, + exec)), + mtx_21(gko::initialize( + {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}, exec)), + mtx_3(gko::batch_initialize( + std::vector{4, 4}, + {{I({1.0, 1.5}), I({6.0, 1.0})}, {{2.0, -2.0}, {1.0, 3.0}}}, + exec)), + mtx_30(gko::initialize({I({1.0, 1.5}), I({6.0, 1.0})}, + exec)), + mtx_31(gko::initialize( + {I({2.0, -2.0}), I({1.0, 3.0})}, exec)), + mtx_4(gko::batch_initialize( + {{{1.0, 1.5, 3.0}, {6.0, 1.0, 5.0}, {6.0, 1.0, 5.5}}, + {{2.0, -2.0, 1.5}, {4.0, 3.0, 2.2}, {-1.25, 3.0, 0.5}}}, + exec)), + mtx_5(gko::batch_initialize( + {{{1.0, 1.5}, {6.0, 1.0}, {7.0, -4.5}}, + {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}}, + exec)), + mtx_6(gko::batch_initialize( + {{{1.0, 0.0, 3.0}, {0.0, 3.0, 0.0}, {0.0, 1.0, 5.0}}, + {{2.0, 0.0, 5.0}, {0.0, 1.0, 0.0}, {0.0, -1.0, 8.0}}}, + exec)) + {} + + std::shared_ptr exec; + std::unique_ptr mtx_0; + std::unique_ptr mtx_00; + std::unique_ptr mtx_01; + std::unique_ptr mtx_1; + std::unique_ptr mtx_10; + std::unique_ptr mtx_11; + std::unique_ptr mtx_2; + std::unique_ptr mtx_20; + std::unique_ptr mtx_21; + std::unique_ptr mtx_3; + std::unique_ptr mtx_30; + std::unique_ptr mtx_31; + std::unique_ptr mtx_4; + std::unique_ptr mtx_5; + std::unique_ptr mtx_6; + + std::ranlux48 rand_engine; +}; + + +TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); + + +TYPED_TEST(BatchDense, AppliesToBatchDense) +{ + using T = typename TestFixture::value_type; + this->mtx_1->apply(this->mtx_2.get(), this->mtx_3.get()); + this->mtx_10->apply(this->mtx_20.get(), this->mtx_30.get()); + this->mtx_11->apply(this->mtx_21.get(), this->mtx_31.get()); + + + auto res = this->mtx_3->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_30.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_31.get(), 0.); +} + + +TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchDense) +{ + using Mtx = typename TestFixture::Mtx; + using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch_initialize({{2.5}, {-4.0}}, 
this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({-1.0}, this->exec); + auto beta0 = gko::initialize({2.5}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + this->mtx_1->apply(alpha.get(), this->mtx_2.get(), beta.get(), + this->mtx_3.get()); + this->mtx_10->apply(alpha0.get(), this->mtx_20.get(), beta0.get(), + this->mtx_30.get()); + this->mtx_11->apply(alpha1.get(), this->mtx_21.get(), beta1.get(), + this->mtx_31.get()); + + auto res = this->mtx_3->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_30.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_31.get(), 0.); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) +{ + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create( + this->exec, std::vector>{gko::dim<2>{2}, gko::dim<2>{2}}); + + ASSERT_THROW(this->mtx_2->apply(this->mtx_1.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ApplyFailsForNonUniformBatches) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto mat1 = gko::batch_initialize( + std::vector{4, 4}, + {{I({1.0, -1.0}), I({1.0, -1.0}), I({2.0, -0.5})}, + {{1.0, 2.5, 3.0}, {1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + this->exec); + auto mat2 = gko::batch_initialize( + std::vector{4, 4}, + {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, + {{1.0, 2.5, -3.0}, {1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + this->exec); + auto res = Mtx::create( + this->exec, std::vector>{gko::dim<2>{2}, gko::dim<2>{3}}); + + ASSERT_THROW(mat2->apply(mat1.get(), res.get()), gko::NotImplemented); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfRows) +{ + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create( + this->exec, std::vector>{gko::dim<2>{3}, gko::dim<2>{3}}); + + ASSERT_THROW(this->mtx_1->apply(this->mtx_2.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfCols) +{ + using Mtx = typename TestFixture::Mtx; + auto res = Mtx::create( + this->exec, + std::vector>{gko::dim<2>{2, 1}, gko::dim<2>{2, 1}}, + std::vector{3, 3}); + + + ASSERT_THROW(this->mtx_1->apply(this->mtx_2.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ScalesData) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize( + std::vector{3, 3}, + {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); + + auto ualpha = alpha->unbatch(); + + this->mtx_0->scale(alpha.get()); + this->mtx_00->scale(ualpha[0].get()); + this->mtx_01->scale(ualpha[1].get()); + + auto res = this->mtx_0->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_01.get(), 0.); +} + + +TYPED_TEST(BatchDense, ScalesDataWithScalar) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + + auto ualpha = alpha->unbatch(); + + this->mtx_1->scale(alpha.get()); + this->mtx_10->scale(ualpha[0].get()); + this->mtx_11->scale(ualpha[1].get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, ScalesDataWithStride) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize( + {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); + + auto 
ualpha = alpha->unbatch(); + + this->mtx_1->scale(alpha.get()); + this->mtx_10->scale(ualpha[0].get()); + this->mtx_11->scale(ualpha[1].get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, AddsScaled) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize( + {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); + + auto ualpha = alpha->unbatch(); + + this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); + this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, AddsScale) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize( + {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); + auto beta = gko::batch_initialize( + {{{-1.0, 3.0, 0.5}}, {{1.5, 0.5, -4.0}}}, this->exec); + + auto ualpha = alpha->unbatch(); + auto ubeta = beta->unbatch(); + + this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()); + this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), + ubeta[0].get()); + this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), + ubeta[1].get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, ConvergenceAddScaled) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize( + {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); + + auto ualpha = alpha->unbatch(); + + + const int num_rhs = 3; + const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); + + gko::kernels::reference::batch_dense::convergence_add_scaled( + this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), + converged); + + auto mtx_10_clone = gko::clone(this->mtx_10); + auto mtx_11_clone = gko::clone(this->mtx_11); + + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); + this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); + + auto res = this->mtx_1->unbatch(); + + EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); + EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); + EXPECT_EQ(res[0]->at(0, 1), this->mtx_10->at(0, 1)); + EXPECT_EQ(res[0]->at(1, 1), this->mtx_10->at(1, 1)); + EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); + EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); + + EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); + EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); + EXPECT_EQ(res[1]->at(0, 1), this->mtx_11->at(0, 1)); + EXPECT_EQ(res[1]->at(1, 1), this->mtx_11->at(1, 1)); + EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); + EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); +} + + +TYPED_TEST(BatchDense, AddsScaledWithScalar) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + + auto ualpha = alpha->unbatch(); + + this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); + 
this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, AddsScaleWithScalar) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto beta = gko::batch_initialize({{-0.5}, {3.0}}, this->exec); + + auto ualpha = alpha->unbatch(); + auto ubeta = beta->unbatch(); + + this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()); + this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), + ubeta[0].get()); + this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), + ubeta[1].get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, AddScaleWithScalarViaApply) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto beta = gko::batch_initialize({{-0.5}, {3.0}}, this->exec); + auto id = gko::matrix::BatchIdentity::create( + this->exec, gko::batch_dim<2>(2, gko::dim<2>(3, 3))); + auto ualpha = alpha->unbatch(); + auto ubeta = beta->unbatch(); + + this->mtx_0->apply(alpha.get(), id.get(), beta.get(), this->mtx_1.get()); + this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), + ubeta[0].get()); + this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), + ubeta[1].get()); + + auto res = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); +} + + +TYPED_TEST(BatchDense, ConvergenceAddScaledWithScalar) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + + auto ualpha = alpha->unbatch(); + + + const int num_rhs = 3; + const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); + + gko::kernels::reference::batch_dense::convergence_add_scaled( + this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), + converged); + + auto mtx_10_clone = gko::clone(this->mtx_10); + auto mtx_11_clone = gko::clone(this->mtx_11); + + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); + this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); + + auto res = this->mtx_1->unbatch(); + + EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); + EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); + EXPECT_EQ(res[0]->at(0, 1), this->mtx_10->at(0, 1)); + EXPECT_EQ(res[0]->at(1, 1), this->mtx_10->at(1, 1)); + EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); + EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); + + EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); + EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); + EXPECT_EQ(res[1]->at(0, 1), this->mtx_11->at(0, 1)); + EXPECT_EQ(res[1]->at(1, 1), this->mtx_11->at(1, 1)); + EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); + EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); +} + + +TYPED_TEST(BatchDense, AddScaledFailsOnWrongSizes) +{ + using Mtx = typename TestFixture::Mtx; + auto alpha = + gko::batch_initialize({{2.0, 3.0, 4.0, 5.0}, {-2.0}}, this->exec); + + ASSERT_THROW(this->mtx_1->add_scaled(alpha.get(), this->mtx_2.get()), + gko::DimensionMismatch); +} + + 
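+// Note on the Convergence* tests in this file: the gko::uint32 mask passed to
+// the convergence_* kernels marks right-hand side (column) c as converged when
+// bit c is set, and the kernels then leave that column untouched, e.g.
+//
+//     for (int c = 0; c < num_rhs; c++) {
+//         if (converged & (1 << c)) {
+//             continue;  // column c already converged, keep its old value
+//         }
+//         /* update column c */
+//     }
+//
+// With num_rhs = 3, the mask 0xfffffffd | (0 - (1 << num_rhs)) clears only
+// bit 1, so column 1 is the single column still being updated, while columns
+// 0 and 2 keep their previous values, which is what the EXPECT_EQ checks
+// against the cloned reference results verify.
+
+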
+TYPED_TEST(BatchDense, AddScaleFailsOnWrongSizes) +{ + using Mtx = typename TestFixture::Mtx; + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto beta = gko::batch_initialize({{2.0}, {3.0}}, this->exec); + + ASSERT_THROW( + this->mtx_1->add_scale(alpha.get(), this->mtx_2.get(), beta.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, AddScaleFailsOnWrongScalarSizes) +{ + using Mtx = typename TestFixture::Mtx; + auto alpha = gko::batch_initialize( + {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); + auto beta = gko::batch_initialize({{3.0}, {1.5}}, this->exec); + + ASSERT_THROW( + this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ComputesDot) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); + + auto ures = result->unbatch(); + + this->mtx_0->compute_dot(this->mtx_1.get(), result.get()); + this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); + this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); + + auto res = result->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); +} + + +TYPED_TEST(BatchDense, ConvergenceComputeDot) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); + + for (int ibatch = 0; ibatch < result->get_size().get_batch_sizes().size(); + ibatch++) { + for (int icol = 0; icol < result->get_size().at()[1]; icol++) { + result->at(ibatch, 0, icol) = gko::zero(); + } + } + + auto ures = result->unbatch(); + + const int num_rhs = 3; + const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); + + gko::kernels::reference::batch_dense::convergence_compute_dot( + this->exec, this->mtx_0.get(), this->mtx_1.get(), result.get(), + converged); + + auto ures_00_clone = gko::clone(ures[0]); + auto ures_01_clone = gko::clone(ures[1]); + + this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); + this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); + + auto res = result->unbatch(); + + EXPECT_EQ(res[0]->at(0, 0), ures_00_clone->at(0, 0)); + EXPECT_EQ(res[0]->at(0, 1), ures[0]->at(0, 1)); + EXPECT_EQ(res[0]->at(0, 2), ures_00_clone->at(0, 2)); + + EXPECT_EQ(res[1]->at(0, 0), ures_01_clone->at(0, 0)); + EXPECT_EQ(res[1]->at(0, 1), ures[1]->at(0, 1)); + EXPECT_EQ(res[1]->at(0, 2), ures_01_clone->at(0, 2)); +} + + +TYPED_TEST(BatchDense, ComputesNorm2) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using T_nc = gko::remove_complex; + using NormVector = gko::matrix::BatchDense; + auto mtx(gko::batch_initialize( + {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, + {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, + this->exec)); + auto batch_size = gko::batch_dim<2>( + std::vector>{gko::dim<2>{1, 2}, gko::dim<2>{1, 2}}); + auto result = + NormVector::create(this->exec, batch_size, gko::batch_stride(2, 2)); + + mtx->compute_norm2(result.get()); + + EXPECT_EQ(result->at(0, 0, 0), T_nc{3.0}); + EXPECT_EQ(result->at(0, 0, 1), T_nc{5.0}); + EXPECT_EQ(result->at(1, 0, 0), T_nc{5.0}); + EXPECT_EQ(result->at(1, 0, 1), T_nc{3.0}); +} + + +TYPED_TEST(BatchDense, ConvergenceComputeNorm2) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using T_nc = gko::remove_complex; 
+ using NormVector = gko::matrix::BatchDense; + auto mtx(gko::batch_initialize( + {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, + {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, + this->exec)); + auto batch_size = gko::batch_dim<2>( + std::vector>{gko::dim<2>{1, 2}, gko::dim<2>{1, 2}}); + auto result = + NormVector::create(this->exec, batch_size, gko::batch_stride(2, 2)); + + for (int ibatch = 0; ibatch < result->get_size().get_batch_sizes().size(); + ibatch++) { + for (int icol = 0; icol < result->get_size().at()[1]; icol++) { + result->at(ibatch, 0, icol) = gko::zero(); + } + } + + auto result_clone = gko::clone(result); + + const int num_rhs = 2; + const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); + + gko::kernels::reference::batch_dense::convergence_compute_norm2( + this->exec, mtx.get(), result.get(), converged); + + EXPECT_EQ(result->at(0, 0, 0), result_clone->at(0, 0, 0)); + EXPECT_EQ(result->at(0, 0, 1), T_nc{5.0}); + + EXPECT_EQ(result->at(1, 0, 0), result_clone->at(1, 0, 0)); + EXPECT_EQ(result->at(1, 0, 1), T_nc{3.0}); +} + + +TYPED_TEST(BatchDense, ComputDotFailsOnWrongInputSize) +{ + using Mtx = typename TestFixture::Mtx; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ + gko::dim<2>{1, 2}, gko::dim<2>{1, 3}})); + + ASSERT_THROW(this->mtx_1->compute_dot(this->mtx_2.get(), result.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ComputDotFailsOnWrongResultSize) +{ + using Mtx = typename TestFixture::Mtx; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ + gko::dim<2>{1, 2}, gko::dim<2>{1, 2}})); + auto result2 = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); + + ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result.get()), + gko::DimensionMismatch); + ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result2.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, CopiesData) +{ + gko::kernels::reference::batch_dense::copy(this->exec, this->mtx_0.get(), + this->mtx_1.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(this->mtx_1.get(), this->mtx_0.get(), 0.); +} + + +TYPED_TEST(BatchDense, ConvergenceCopyData) +{ + auto umtx_0 = this->mtx_0->unbatch(); + + const int num_rhs = 3; + const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); + gko::kernels::reference::batch_dense::convergence_copy( + this->exec, this->mtx_0.get(), this->mtx_1.get(), converged); + + auto mtx_10_clone = gko::clone(this->mtx_10); + auto mtx_11_clone = gko::clone(this->mtx_11); + + auto res = this->mtx_1->unbatch(); + + EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); + EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); + EXPECT_EQ(res[0]->at(0, 1), this->mtx_0->at(0, 0, 1)); + EXPECT_EQ(res[0]->at(1, 1), this->mtx_0->at(0, 1, 1)); + EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); + EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); + + EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); + EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); + EXPECT_EQ(res[1]->at(0, 1), this->mtx_0->at(1, 0, 1)); + EXPECT_EQ(res[1]->at(1, 1), this->mtx_0->at(1, 1, 1)); + EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); + EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); +} + + +TYPED_TEST(BatchDense, BatchScale) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using BDiag = gko::matrix::BatchDiagonal; + + auto mtx(gko::batch_initialize( + {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, + {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, + this->exec)); + 
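+    // batch_scale applies the two diagonal matrices from both sides: for each
+    // batch entry, row r of mtx is multiplied by left[r] and column c by
+    // rght[c], i.e. mtx_i <- diag(left_i) * mtx_i * diag(rght_i), matching the
+    // reference kernel that computes a[r][c] *= left[r] * rght[c].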
+ auto left(gko::batch_diagonal_initialize( + I>{I{1.0, 2.0, 3.0}, I{-1.0, -2.0, -3.0}}, this->exec)); + auto rght(gko::batch_diagonal_initialize( + I>{I{-0.5, -2.0}, I{2.0, 0.25}}, this->exec)); + + gko::kernels::reference::batch_dense::batch_scale(this->exec, left.get(), + rght.get(), mtx.get()); + + EXPECT_EQ(mtx->at(0, 0, 0), T{-0.5}); + EXPECT_EQ(mtx->at(0, 1, 0), T{-2.0}); + EXPECT_EQ(mtx->at(0, 2, 0), T{-3.0}); + EXPECT_EQ(mtx->at(0, 0, 1), T{0.0}); + EXPECT_EQ(mtx->at(0, 1, 1), T{-12.0}); + EXPECT_EQ(mtx->at(0, 2, 1), T{-24.0}); + + EXPECT_EQ(mtx->at(1, 0, 0), T{8.0}); + EXPECT_EQ(mtx->at(1, 1, 0), T{12.0}); + EXPECT_EQ(mtx->at(1, 2, 0), T{0.0}); + EXPECT_EQ(mtx->at(1, 0, 1), T{-0.5}); + EXPECT_EQ(mtx->at(1, 1, 1), T{1.0}); + EXPECT_EQ(mtx->at(1, 2, 1), T{-0.75}); +} + + +TYPED_TEST(BatchDense, ConvertsToPrecision) +{ + using BatchDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherBatchDense = typename gko::matrix::BatchDense; + auto tmp = OtherBatchDense::create(this->exec); + auto res = BatchDense::create(this->exec); + // If OtherT is more precise: 0, otherwise r + auto residual = r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx_1->convert_to(tmp.get()); + tmp->convert_to(res.get()); + + auto ures = res->unbatch(); + auto umtx = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); + GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); +} + + +TYPED_TEST(BatchDense, MovesToPrecision) +{ + using BatchDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherBatchDense = typename gko::matrix::BatchDense; + auto tmp = OtherBatchDense::create(this->exec); + auto res = BatchDense::create(this->exec); + // If OtherT is more precise: 0, otherwise r + auto residual = r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{r::value}; + + this->mtx_1->move_to(tmp.get()); + tmp->move_to(res.get()); + + auto ures = res->unbatch(); + auto umtx = this->mtx_1->unbatch(); + GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); + GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); +} + + +TYPED_TEST(BatchDense, ConvertsToCsr32) +{ + using T = typename TestFixture::value_type; + using BatchCsr = typename gko::matrix::BatchCsr; + auto batch_csr_mtx = BatchCsr::create(this->mtx_6->get_executor()); + + this->mtx_6->convert_to(batch_csr_mtx.get()); + + auto v = batch_csr_mtx->get_const_values(); + auto c = batch_csr_mtx->get_const_col_idxs(); + auto r = batch_csr_mtx->get_const_row_ptrs(); + ASSERT_EQ(batch_csr_mtx->get_num_batch_entries(), 2); + ASSERT_EQ(batch_csr_mtx->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(batch_csr_mtx->get_size().at(1), gko::dim<2>(3, 3)); + ASSERT_EQ(batch_csr_mtx->get_num_stored_elements(), 10); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 2); + EXPECT_EQ(r[2], 3); + EXPECT_EQ(r[3], 5); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 2); + EXPECT_EQ(c[2], 1); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(c[4], 2); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{3.0}); + EXPECT_EQ(v[3], T{1.0}); + EXPECT_EQ(v[4], T{5.0}); + EXPECT_EQ(v[5], T{2.0}); + EXPECT_EQ(v[6], T{5.0}); + EXPECT_EQ(v[7], T{1.0}); + EXPECT_EQ(v[8], T{-1.0}); + EXPECT_EQ(v[9], T{8.0}); +} + + +TYPED_TEST(BatchDense, MovesToCsr32) +{ + using T = typename TestFixture::value_type; + using BatchCsr = typename gko::matrix::BatchCsr; + auto batch_csr_mtx = BatchCsr::create(this->mtx_6->get_executor()); + + this->mtx_6->move_to(batch_csr_mtx.get()); + + auto v = batch_csr_mtx->get_const_values(); + auto c = batch_csr_mtx->get_const_col_idxs(); + auto r = batch_csr_mtx->get_const_row_ptrs(); + ASSERT_EQ(batch_csr_mtx->get_num_batch_entries(), 2); + ASSERT_EQ(batch_csr_mtx->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(batch_csr_mtx->get_size().at(1), gko::dim<2>(3, 3)); + ASSERT_EQ(batch_csr_mtx->get_num_stored_elements(), 10); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 2); + EXPECT_EQ(r[2], 3); + EXPECT_EQ(r[3], 5); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 2); + EXPECT_EQ(c[2], 1); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(c[4], 2); + EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[1], T{3.0}); + EXPECT_EQ(v[2], T{3.0}); + EXPECT_EQ(v[3], T{1.0}); + EXPECT_EQ(v[4], T{5.0}); + EXPECT_EQ(v[5], T{2.0}); + EXPECT_EQ(v[6], T{5.0}); + EXPECT_EQ(v[7], T{1.0}); + EXPECT_EQ(v[8], T{-1.0}); + EXPECT_EQ(v[9], T{8.0}); +} + + +TYPED_TEST(BatchDense, ConvertsEmptyToPrecision) +{ + using BatchDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherBatchDense = typename gko::matrix::BatchDense; + auto empty = OtherBatchDense::create(this->exec); + auto res = BatchDense::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_FALSE(res->get_num_batch_entries()); +} + + +TYPED_TEST(BatchDense, MovesEmptyToPrecision) +{ + using BatchDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using OtherT = typename gko::next_precision; + using OtherBatchDense = typename gko::matrix::BatchDense; + auto empty = OtherBatchDense::create(this->exec); + auto res = BatchDense::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_FALSE(res->get_num_batch_entries()); +} + + +TYPED_TEST(BatchDense, ConvertsEmptyMatrixToCsr) +{ + using BatchDense = typename TestFixture::Mtx; + using 
T = typename TestFixture::value_type; + using BatchCsr = typename gko::matrix::BatchCsr; + auto empty = BatchDense::create(this->exec); + auto res = BatchCsr::create(this->exec); + + empty->convert_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_num_batch_entries()); +} + + +TYPED_TEST(BatchDense, MovesEmptyMatrixToCsr) +{ + using BatchDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using BatchCsr = typename gko::matrix::BatchCsr; + auto empty = BatchDense::create(this->exec); + auto res = BatchCsr::create(this->exec); + + empty->move_to(res.get()); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_num_batch_entries()); +} + + +TYPED_TEST(BatchDense, ConvertsToBatchDiagonal) +{ + using BDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using BDiag = gko::matrix::BatchDiagonal; + auto vec = gko::batch_initialize( + {I({2.0, 3.0, -1.0}), I({1.0, -2.0, 8.0})}, this->exec); + auto diag = BDiag::create(this->exec); + + vec->convert_to(diag.get()); + + auto check_sz = gko::batch_dim<2>{2, gko::dim<2>{3}}; + ASSERT_EQ(diag->get_size(), check_sz); + auto diag_vals = diag->get_const_values(); + ASSERT_EQ(diag_vals[0], T{2.0}); + ASSERT_EQ(diag_vals[1], T{3.0}); + ASSERT_EQ(diag_vals[2], T{-1.0}); + ASSERT_EQ(diag_vals[3], T{1.0}); + ASSERT_EQ(diag_vals[4], T{-2.0}); + ASSERT_EQ(diag_vals[5], T{8.0}); +} + + +TYPED_TEST(BatchDense, MovesToBatchDiagonal) +{ + using BDense = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using BDiag = gko::matrix::BatchDiagonal; + auto vec = gko::batch_initialize( + {I({2.0, 3.0, -1.0}), I({1.0, -2.0, 8.0})}, this->exec); + auto vec_ptr = vec->get_const_values(); + auto diag = BDiag::create(this->exec); + + vec->move_to(diag.get()); + + auto check_sz = gko::batch_dim<2>{2, gko::dim<2>{3}}; + ASSERT_EQ(diag->get_size(), check_sz); + auto diag_vals = diag->get_const_values(); + ASSERT_EQ(diag_vals, vec_ptr); + ASSERT_NE(diag_vals, vec->get_const_values()); + ASSERT_EQ(vec->get_num_batch_entries(), 0); +} + + +TYPED_TEST(BatchDense, SquareMatrixIsTransposable) +{ + using Mtx = typename TestFixture::Mtx; + auto trans = this->mtx_4->transpose(); + auto trans_as_batch_dense = static_cast(trans.get()); + + auto utb = trans_as_batch_dense->unbatch(); + GKO_ASSERT_MTX_NEAR(utb[0].get(), + l({{1.0, 6.0, 6.0}, {1.5, 1.0, 1.0}, {3.0, 5.0, 5.5}}), + r::value); + GKO_ASSERT_MTX_NEAR( + utb[1].get(), l({{2.0, 4.0, -1.25}, {-2.0, 3.0, 3.0}, {1.5, 2.2, 0.5}}), + r::value); +} + + +TYPED_TEST(BatchDense, NonSquareMatrixIsTransposable) +{ + using Mtx = typename TestFixture::Mtx; + auto trans = this->mtx_5->transpose(); + auto trans_as_batch_dense = static_cast(trans.get()); + + auto utb = trans_as_batch_dense->unbatch(); + GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 7.0}, {1.5, 1.0, -4.5}}), + r::value); + GKO_ASSERT_MTX_NEAR(utb[1].get(), l({{2.0, 1.0, 4.0}, {-2.0, 3.0, 3.0}}), + r::value); +} + + +TYPED_TEST(BatchDense, SquareMatrixAddScaledIdentity) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + auto mtx = gko::batch_initialize( + {{I({1.0, -1.0, 1.5}), I({-2.0, 0.0, 3.0}), + I({1.2, -0.5, 1.0})}, + {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}, {3.0, 0.0, -1.5}}}, + this->exec); + auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto beta = gko::batch_initialize({{3.0}, {-1.0}}, 
this->exec); + auto sol_mtx = gko::batch_initialize( + {{I({5.0, -3.0, 4.5}), I({-6.0, 2.0, 9.0}), + I({3.6, -1.5, 5.0})}, + {{-3.0, 2.0, 0.5}, {-1.0, 0.5, -4.0}, {-3.0, 0.0, -0.5}}}, + this->exec); + + mtx->add_scaled_identity(alpha.get(), beta.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(mtx, sol_mtx, r::value); +} + + +} // namespace diff --git a/test/matrix/batch_vector_kernels.cpp b/test/matrix/batch_vector_kernels.cpp new file mode 100644 index 00000000000..5d275dbea5b --- /dev/null +++ b/test/matrix/batch_vector_kernels.cpp @@ -0,0 +1,433 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/test/utils.hpp" +#include "core/test/utils/batch.hpp" +#include "test/utils/executor.hpp" + + +#ifndef GKO_COMPILING_DPCPP + + +class BatchDense : public CommonTestFixture { +protected: + using vtype = double; + using Mtx = gko::matrix::BatchDense; + using NormVector = gko::matrix::BatchDense>; + using ComplexMtx = gko::matrix::BatchDense>; + + BatchDense() : rand_engine(15) {} + + template + std::unique_ptr gen_mtx(const size_t batchsize, int num_rows, + int num_cols) + { + return gko::test::generate_uniform_batch_random_matrix( + batchsize, num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, false, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + const int num_rows = 252; + x = gen_mtx(batch_size, num_rows, num_vecs); + y = gen_mtx(batch_size, num_rows, num_vecs); + if (different_alpha) { + alpha = gen_mtx(batch_size, 1, num_vecs); + beta = gen_mtx(batch_size, 1, num_vecs); + } else { + alpha = gko::batch_initialize(batch_size, {2.0}, ref); + beta = gko::batch_initialize(batch_size, {-0.5}, ref); + } + dx = Mtx::create(exec); + dx->copy_from(x.get()); + dy = Mtx::create(exec); + dy->copy_from(y.get()); + dalpha = Mtx::create(exec); + dalpha->copy_from(alpha.get()); + dbeta = gko::clone(exec, beta.get()); + expected = Mtx::create( + ref, gko::batch_dim<>(batch_size, gko::dim<2>{1, num_vecs})); + dresult = Mtx::create( + exec, gko::batch_dim<>(batch_size, gko::dim<2>{1, num_vecs})); + } + + void set_up_apply_data(const int p = 1) + { + const int m = 35, n = 15; + x = gen_mtx(batch_size, m, n); + c_x = gen_mtx(batch_size, m, n); + y = gen_mtx(batch_size, n, p); + expected = gen_mtx(batch_size, m, p); + alpha = gko::batch_initialize(batch_size, {2.0}, ref); + beta = gko::batch_initialize(batch_size, {-1.0}, ref); + square = gen_mtx(batch_size, x->get_size().at()[0], + x->get_size().at()[0]); + dx = Mtx::create(exec); + dx->copy_from(x.get()); + dc_x = ComplexMtx::create(exec); + dc_x->copy_from(c_x.get()); + dy = Mtx::create(exec); + dy->copy_from(y.get()); + dresult = Mtx::create(exec); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(exec); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(exec); + dbeta->copy_from(beta.get()); + dsquare = Mtx::create(exec); + dsquare->copy_from(square.get()); + } + + std::ranlux48 rand_engine; + + const size_t batch_size = 11; + std::unique_ptr x; + std::unique_ptr c_x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr square; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dc_x; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr dsquare; +}; + + +TEST_F(BatchDense, SingleVectorAppyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(BatchDense, SingleVectorAdvancedAppyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(BatchDense, 
SingleVectorAddScaledIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, SingleVectorAddScaleIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->add_scale(alpha.get(), y.get(), beta.get()); + dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorAddScaledIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorAddScaleIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scale(alpha.get(), y.get(), beta.get()); + dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->add_scale(alpha.get(), y.get(), beta.get()); + dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, SingleVectorScaleIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorScaleIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, ComputeNorm2SingleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto norm_size = + gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->exec, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 1e-14); +} + + +TEST_F(BatchDense, ComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = + gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->exec, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 1e-14); +} + + +TEST_F(BatchDense, ComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + auto dot_size = + gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + auto dot_expected = Mtx::create(this->ref, dot_size); + auto ddot = Mtx::create(this->exec, dot_size); + + x->compute_dot(y.get(), dot_expected.get()); + dx->compute_dot(dy.get(), ddot.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 1e-14); +} + + +TEST_F(BatchDense, ComputeDotSingleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto dot_size = + gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + auto dot_expected = 
Mtx::create(this->ref, dot_size); + auto ddot = Mtx::create(this->exec, dot_size); + + x->compute_dot(y.get(), dot_expected.get()); + dx->compute_dot(dy.get(), ddot.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 1e-14); +} + + +TEST_F(BatchDense, CopySingleIsEquivalentToRef) +{ + set_up_vector_data(1); + + gko::kernels::reference::batch_dense::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_dense::copy(this->exec, dx.get(), + dy.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); +} + + +TEST_F(BatchDense, CopyIsEquivalentToRef) +{ + set_up_vector_data(20); + + gko::kernels::reference::batch_dense::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_dense::copy(this->exec, dx.get(), + dy.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); +} + + +TEST_F(BatchDense, BatchScaleIsEquivalentToRef) +{ + using BDiag = gko::matrix::BatchDiagonal; + const int num_rhs = 20; + set_up_vector_data(num_rhs); + + const int num_rows_in_mat = x->get_size().at(0)[0]; + const auto left = + gen_mtx(batch_size, num_rows_in_mat, num_rows_in_mat); + const auto rght = gen_mtx(batch_size, num_rhs, num_rhs); + auto dleft = BDiag::create(this->exec); + dleft->copy_from(left.get()); + auto drght = BDiag::create(this->exec); + drght->copy_from(rght.get()); + + gko::kernels::reference::batch_dense::batch_scale(this->ref, left.get(), + rght.get(), x.get()); + gko::kernels::EXEC_NAMESPACE::batch_dense::batch_scale( + this->exec, dleft.get(), drght.get(), dx.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(BatchDense, TransposeIsEquivalentToRef) +{ + const int nrows = 11; + const int ncols = 6; + const size_t nbatch = 5; + const auto orig = gen_mtx(nbatch, nrows, ncols); + auto corig = Mtx::create(exec); + corig->copy_from(orig.get()); + + auto trans = orig->transpose(); + auto ctrans = corig->transpose(); + + auto dtrans = static_cast(trans.get()); + auto dctrans = static_cast(ctrans.get()); + GKO_ASSERT_BATCH_MTX_NEAR(dtrans, dctrans, 0.0); +} + + +TEST_F(BatchDense, ConjugateTransposeIsEquivalentToRef) +{ + const int nrows = 11; + const int ncols = 6; + const size_t nbatch = 5; + const auto orig = gen_mtx(nbatch, nrows, ncols); + auto corig = Mtx::create(exec); + corig->copy_from(orig.get()); + + auto trans = orig->conj_transpose(); + auto ctrans = corig->conj_transpose(); + + auto dtrans = static_cast(trans.get()); + auto dctrans = static_cast(ctrans.get()); + GKO_ASSERT_BATCH_MTX_NEAR(dtrans, dctrans, 0.0); +} + + +TEST_F(BatchDense, AddScaledIdentityNonSquareIsEquivalentToReference) +{ + set_up_apply_data(); + const gko::size_type batchsize = 10; + const gko::size_type num_rows = 62; + const gko::size_type num_cols = 51; + auto rmtx = gko::test::generate_uniform_batch_random_matrix( + batchsize, num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, true, ref); + auto dmtx = Mtx::create(exec); + dmtx->copy_from(rmtx.get()); + + rmtx->add_scaled_identity(alpha.get(), beta.get()); + dmtx->add_scaled_identity(dalpha.get(), dbeta.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(rmtx, dmtx, 1e-15) +} + + +#endif From f74a8b90986590153f82bfe8fd6ac8c348b4f586 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 30 Jun 2023 18:13:59 +0200 Subject: [PATCH 105/583] Move batch_dim to separate class and simplify --- core/test/base/batch_dim.cpp | 92 ++++++++++++ include/ginkgo/core/base/batch_dim.hpp | 152 ++++++++++++++++++++ include/ginkgo/core/base/dim.hpp | 186 ------------------------- 
3 files changed, 244 insertions(+), 186 deletions(-) create mode 100644 core/test/base/batch_dim.cpp create mode 100644 include/ginkgo/core/base/batch_dim.hpp diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp new file mode 100644 index 00000000000..f4361195d7c --- /dev/null +++ b/core/test/base/batch_dim.cpp @@ -0,0 +1,92 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +TEST(BatchDim, ConstructsCorrectUniformObject) +{ + gko::batch_dim<2> d{4, gko::dim<2>(5)}; + + ASSERT_EQ(d.get_num_batch_entries(), 4); + ASSERT_EQ(d.get_common_size(), gko::dim<2>(5)); +} + + +TEST(BatchDim, ConstructsNullObject) +{ + gko::batch_dim<2> d{}; + + ASSERT_EQ(d.get_num_batch_entries(), 0); + ASSERT_EQ(d.get_common_size(), gko::dim<2>{}); +} + + +TEST(BatchDim, EqualityReturnsTrueWhenEqual) +{ + ASSERT_TRUE(gko::batch_dim<2>(2, gko::dim<2>{3}) == + gko::batch_dim<2>(2, gko::dim<2>{3})); +} + + +TEST(BatchDim, EqualityReturnsFalseWhenDifferentNumBatches) +{ + ASSERT_FALSE(gko::batch_dim<2>(3, gko::dim<2>{3}) == + gko::batch_dim<2>(2, gko::dim<2>{3})); +} + + +TEST(BatchDim, EqualityReturnsFalseWhenDifferentBatchSizes) +{ + ASSERT_FALSE(gko::batch_dim<2>(3, gko::dim<2>{3}) == + gko::batch_dim<2>(3, gko::dim<2>{4})); +} + + +TEST(BatchDim, NotEqualWorks) +{ + ASSERT_TRUE(gko::batch_dim<2>(3, gko::dim<2>{3}) != + gko::batch_dim<2>(3, gko::dim<2>{4})); +} + + +TEST(BatchDim, TransposesBatchDimensions) +{ + ASSERT_EQ(gko::transpose(gko::batch_dim<2>(2, gko::dim<2>{4, 2})), + gko::batch_dim<2>(2, gko::dim<2>{2, 4})); +} diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp new file mode 100644 index 00000000000..211225d7df2 --- /dev/null +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -0,0 +1,152 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_
+#define GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_
+
+
+#include
+
+
+#include
+#include
+
+
+namespace gko {
+
+
+/**
+ * A type representing the dimensions of a multidimensional batch object.
+ *
+ * @tparam Dimensionality number of dimensions of the object
+ * @tparam DimensionType datatype used to represent each dimension
+ *
+ * @ingroup batch_dim
+ */
+template <size_type Dimensionality = 2, typename DimensionType = size_type>
+struct batch_dim {
+    static constexpr size_type dimensionality = Dimensionality;
+    using dimension_type = DimensionType;
+
+    /**
+     * Get the number of batch entries stored
+     *
+     * @return num_batch_entries
+     */
+    size_type get_num_batch_entries() const { return num_batch_entries_; }
+
+    /**
+     * Get the common size of the batches
+     *
+     * @return common_size
+     */
+    dim<Dimensionality, DimensionType> get_common_size() const
+    {
+        return common_size_;
+    }
+
+    /**
+     * Checks if two batch_dim objects are equal.
+     *
+     * @param x first object
+     * @param y second object
+     *
+     * @return true if and only if all dimensions of both objects are equal.
+     */
+    friend bool operator==(const batch_dim& x, const batch_dim& y)
+    {
+        return x.num_batch_entries_ == y.num_batch_entries_ &&
+               x.common_size_ == y.common_size_;
+    }
+
+    /**
+     * Creates a batch_dim object which stores a uniform size for all batch
+     * entries.
+     *
+     * @param num_batch_entries number of batch entries to be stored
+     * @param common_size the common size of all the batch entries stored
+     *
+     * @note Use this constructor when uniform batches need to be stored.
+     */
+    explicit batch_dim(const size_type num_batch_entries = 0,
+                       const dim<Dimensionality, DimensionType>& common_size =
+                           dim<Dimensionality, DimensionType>{})
+        : common_size_(common_size), num_batch_entries_(num_batch_entries)
+    {}
+
+private:
+    size_type num_batch_entries_{};
+    dim<Dimensionality, DimensionType> common_size_{};
+};
+
+
+/**
+ * Checks if two batch dim objects are different.
+ * + * @tparam Dimensionality number of dimensions of the dim objects + * @tparam DimensionType datatype used to represent each dimension + * + * @param x first object + * @param y second object + * + * @return `!(x == y)` + */ +template +inline bool operator!=(const batch_dim& x, + const batch_dim& y) +{ + return !(x == y); +} + + +/** + * Returns a batch_dim object with its dimensions swapped for batched operators + * + * @tparam DimensionType datatype used to represent each dimension + * + * @param dimensions original object + * + * @return a batch_dim object with dimensions swapped + */ +template +inline batch_dim<2, DimensionType> transpose( + const batch_dim<2, DimensionType>& input) +{ + return batch_dim<2, DimensionType>(input.get_num_batch_entries(), + gko::transpose(input.get_common_size())); +} + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_ diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index ae13290cdd2..c70c5f054ec 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -243,144 +243,6 @@ struct dim<1u, DimensionType> { }; -/** - * A type representing the dimensions of a multidimensional batch object. - * - * @tparam Dimensionality number of dimensions of the object - * @tparam DimensionType datatype used to represent each dimension - * - * @ingroup batch_dim - */ -template -struct batch_dim { - static constexpr size_type dimensionality = Dimensionality; - using dimension_type = DimensionType; - - /** - * Checks if the batch_dim object stores equal sizes. - * - * @return bool representing whether equal sizes are being stored - */ - bool stores_equal_sizes() const { return equal_sizes_; } - - /** - * Get the number of batch entries stored - * - * @return num_batch_entries - */ - size_type get_num_batch_entries() const { return num_batch_entries_; } - - /** - * Get the sizes of all entries as a std::vector. - * - * @return the std::vector of batch sizes - */ - std::vector> get_batch_sizes() const - { - if (equal_sizes_) { - if (num_batch_entries_ > 0) { - return std::vector>( - num_batch_entries_, common_size_); - } else { - return std::vector>{ - common_size_}; - } - } else { - return sizes_; - } - } - - /** - * Get the batch size at a particular index. - * - * @param batch_entry the index of the entry whose size is needed - * - * @return the size of the batch entry at the requested batch-index - */ - const dim& at( - const size_type batch_entry = 0) const - { - if (equal_sizes_) { - return common_size_; - } else { - GKO_ASSERT(batch_entry < num_batch_entries_); - return sizes_[batch_entry]; - } - } - - /** - * Checks if two batch_dim objects are equal. - * - * @param x first object - * @param y second object - * - * @return true if and only if all dimensions of both objects are equal. - */ - friend bool operator==(const batch_dim& x, const batch_dim& y) - { - if (x.equal_sizes_ && y.equal_sizes_) { - return x.num_batch_entries_ == y.num_batch_entries_ && - x.common_size_ == y.common_size_; - } else { - return x.sizes_ == y.sizes_; - } - } - - /** - * Creates a batch_dim object which stores a uniform size for all batch - * entries. - * - * @param num_batch_entries number of batch entries to be stored - * @param common_size the common size of all the batch entries stored - * - * @note Use this constructor when uniform batches need to be stored. 
- */ - explicit batch_dim(const size_type num_batch_entries = 0, - const dim& common_size = - dim{}) - : equal_sizes_(true), - common_size_(common_size), - num_batch_entries_(num_batch_entries), - sizes_() - {} - - /** - * Creates a batch_dim object which stores possibly non-uniform sizes for - * the different batch entries. - * - * @param batch_sizes the std::vector object that stores the batch_sizes - * - * @note Use this constructor when non-uniform batches need to be stored. - */ - batch_dim( - const std::vector>& batch_sizes) - : equal_sizes_(false), - common_size_(dim{}), - num_batch_entries_(batch_sizes.size()), - sizes_(batch_sizes) - { - check_size_equality(); - } - -private: - void check_size_equality() - { - for (size_type i = 0; i < num_batch_entries_; ++i) { - if (!(sizes_[i] == sizes_[0])) { - return; - } - } - common_size_ = sizes_[0]; - equal_sizes_ = true; - } - - bool equal_sizes_{}; - size_type num_batch_entries_{}; - dim common_size_{}; - std::vector> sizes_{}; -}; - - /** * Checks if two dim objects are different. * @@ -418,54 +280,6 @@ constexpr GKO_ATTRIBUTES GKO_INLINE dim<2, DimensionType> transpose( } -/** - * Checks if two batch dim objects are different. - * - * @tparam Dimensionality number of dimensions of the dim objects - * @tparam DimensionType datatype used to represent each dimension - * - * @param x first object - * @param y second object - * - * @return `!(x == y)` - */ -template -inline bool operator!=(const batch_dim& x, - const batch_dim& y) -{ - return !(x == y); -} - - -/** - * Returns a batch_dim object with its dimensions swapped for batched operators - * - * @tparam DimensionType datatype used to represent each dimension - * - * @param dimensions original object - * - * @return a batch_dim object with the individual batches having their - * dimensions swapped - */ -template -inline batch_dim<2, DimensionType> transpose( - const batch_dim<2, DimensionType>& input) -{ - batch_dim<2, DimensionType> out{}; - if (input.stores_equal_sizes()) { - out = batch_dim<2, DimensionType>(input.get_num_batch_entries(), - gko::transpose(input.at(0))); - return out; - } - auto trans = - std::vector>(input.get_num_batch_entries()); - for (size_type i = 0; i < trans.size(); ++i) { - trans[i] = transpose(input.at(i)); - } - return batch_dim<2, DimensionType>(trans); -} - - } // namespace gko From bf211ef2fa7a08c3fe10de65770e40d527ac69a8 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 3 Jul 2023 14:27:28 +0200 Subject: [PATCH 106/583] WIP batch_vector updates --- core/device_hooks/common_kernels.inc.cpp | 13 + core/matrix/batch_struct.hpp | 143 +++++ core/matrix/batch_vector.cpp | 287 ++-------- core/matrix/batch_vector_kernels.hpp | 251 ++------- core/test/matrix/CMakeLists.txt | 1 + .../{batch_dense.cpp => batch_vector.cpp} | 110 ++-- cuda/matrix/batch_struct.hpp | 118 +++++ cuda/matrix/batch_vector_kernels.cu | 303 +---------- hip/matrix/batch_struct.hip.hpp | 120 +++++ hip/matrix/batch_vector_kernels.hip.cpp | 306 +---------- include/ginkgo/core/matrix/batch_vector.hpp | 297 +++++------ omp/matrix/batch_vector_kernels.cpp | 497 +----------------- reference/matrix/batch_struct.hpp | 120 +++++ reference/matrix/batch_vector_kernels.cpp | 476 +---------------- reference/matrix/batch_vector_kernels.hpp.inc | 70 +-- .../test/matrix/batch_vector_kernels.cpp | 164 +++--- test/matrix/batch_vector_kernels.cpp | 68 +-- 17 files changed, 1012 insertions(+), 2332 deletions(-) create mode 100644 core/matrix/batch_struct.hpp rename core/test/matrix/{batch_dense.cpp => 
batch_vector.cpp} (84%) create mode 100644 cuda/matrix/batch_struct.hpp create mode 100644 hip/matrix/batch_struct.hip.hpp create mode 100644 reference/matrix/batch_struct.hpp diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index f1af9318f9f..a5aa43100a3 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -272,6 +272,19 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL); } // namespace distributed_matrix +namespace batch_vector { + + +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); + + +} // namespace batch_vector + + namespace dense { diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp new file mode 100644 index 00000000000..01092f0e4d0 --- /dev/null +++ b/core/matrix/batch_struct.hpp @@ -0,0 +1,143 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include +#include + + +namespace gko { +namespace batch_vector { + + +/** + * Encapsulates one matrix from a batch of dense matrices (vectors). + */ +template +struct BatchEntry { + using value_type = ValueType; + ValueType* values; + size_type stride; + int num_rows; + int num_rhs; +}; + +/** + * A 'simple' structure to store a global uniform batch of dense matrices. + * + * It is uniform in the sense that all matrices in the batch have common sizes. 
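+ *
+ * As an illustrative sketch, using only the members and helpers defined in
+ * this file: a uniform batch b holding 10 vectors of size 32x1 would have
+ * num_batch = 10, num_rows = 32, num_rhs = 1 and stride >= num_rhs, and a
+ * view of the k-th entry can be obtained with
+ *
+ *     auto entry = gko::batch::batch_entry(b, k);
+ *     // entry.values == b.values + k * b.stride * b.num_rows
+ *
+ * which is exactly what the batch_entry() helper further down in this file
+ * computes.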
+ */ +template +struct UniformBatch { + using value_type = ValueType; + using entry_type = BatchEntry; + + ValueType* values; ///< Concatenated values of all matrices in the batch + size_type num_batch; ///< Number of matrices in the batch + size_type stride; ///< Common stride of each dense matrix + int num_rows; ///< Common number of rows in each matrix + int num_rhs; ///< Common number of columns of each matrix + int num_nnz; ///< Common number of non-zeros of each matrix, ie., + ///< the number or rows times the number of columns + + size_type get_entry_storage() const { return num_nnz * sizeof(value_type); } +}; + + +} // namespace batch_vector + + +namespace batch { + + +template +GKO_ATTRIBUTES GKO_INLINE gko::batch_vector::BatchEntry +to_const(const gko::batch_vector::BatchEntry& b) +{ + return {b.values, b.stride, b.num_rows, b.num_rhs}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE gko::batch_vector::UniformBatch +to_const(const gko::batch_vector::UniformBatch& ub) +{ + return {ub.values, ub.num_batch, ub.stride, ub.num_rows, ub.num_rhs}; +} + + +/** + * Extract one object (matrix, vector etc.) from a batch of objects + * + * This overload is for batch dense matrices. + * These overloads are intended to be called from within a kernel. + * + * @param batch The batch of objects to extract from + * @param batch_idx The position of the desired object in the batch + */ +template +GKO_ATTRIBUTES GKO_INLINE batch_vector::BatchEntry batch_entry( + const batch_vector::UniformBatch& batch, + const size_type batch_idx) +{ + return {batch.values + batch_idx * batch.stride * batch.num_rows, + batch.stride, batch.num_rows, batch.num_rhs}; +} + +template +GKO_ATTRIBUTES GKO_INLINE batch_vector::BatchEntry batch_entry( + ValueType* const batch_values, const size_type stride, const int num_rows, + const int num_rhs, const size_type batch_idx) +{ + return {batch_values + batch_idx * stride * num_rows, stride, num_rows, + num_rhs}; +} + +template +GKO_ATTRIBUTES GKO_INLINE ValueType* batch_entry_ptr( + ValueType* const batch_start, const size_type stride, const int num_rows, + const size_type batch_idx) +{ + return batch_start + batch_idx * stride * num_rows; +} + + +} // namespace batch + + +} // namespace gko + +#endif // GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ diff --git a/core/matrix/batch_vector.cpp b/core/matrix/batch_vector.cpp index 4449516d5a1..abacd9b1cd8 100644 --- a/core/matrix/batch_vector.cpp +++ b/core/matrix/batch_vector.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -43,93 +43,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include -#include -#include -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" namespace gko { namespace matrix { -namespace batch_dense { - - -GKO_REGISTER_OPERATION(simple_apply, batch_dense::simple_apply); -GKO_REGISTER_OPERATION(apply, batch_dense::apply); -GKO_REGISTER_OPERATION(scale, batch_dense::scale); -GKO_REGISTER_OPERATION(add_scaled, batch_dense::add_scaled); -GKO_REGISTER_OPERATION(add_scale, batch_dense::add_scale); -GKO_REGISTER_OPERATION(convergence_add_scaled, - batch_dense::convergence_add_scaled); -GKO_REGISTER_OPERATION(add_scaled_diag, batch_dense::add_scaled_diag); -GKO_REGISTER_OPERATION(compute_dot, batch_dense::compute_dot); -GKO_REGISTER_OPERATION(convergence_compute_dot, - batch_dense::convergence_compute_dot); -GKO_REGISTER_OPERATION(compute_norm2, batch_dense::compute_norm2); -GKO_REGISTER_OPERATION(convergence_compute_norm2, - batch_dense::convergence_compute_norm2); -GKO_REGISTER_OPERATION(copy, batch_dense::copy); -GKO_REGISTER_OPERATION(convergence_copy, batch_dense::convergence_copy); -GKO_REGISTER_OPERATION(convert_to_batch_csr, batch_dense::convert_to_batch_csr); -GKO_REGISTER_OPERATION(count_nonzeros, batch_dense::count_nonzeros); -GKO_REGISTER_OPERATION(calculate_max_nnz_per_row, - batch_dense::calculate_max_nnz_per_row); -GKO_REGISTER_OPERATION(calculate_nonzeros_per_row, - batch_dense::calculate_nonzeros_per_row); -GKO_REGISTER_OPERATION(calculate_total_cols, batch_dense::calculate_total_cols); -GKO_REGISTER_OPERATION(transpose, batch_dense::transpose); -GKO_REGISTER_OPERATION(conj_transpose, batch_dense::conj_transpose); -GKO_REGISTER_OPERATION(add_scaled_identity, batch_dense::add_scaled_identity); - - -} // namespace batch_dense +namespace batch_vector { -template -void BatchDense::apply_impl(const BatchLinOp* b, BatchLinOp* x) const -{ - // TODO: Remove this when non-uniform batching kernels have been - // implemented - if (!this->get_size().stores_equal_sizes() || - !this->get_stride().stores_equal_strides()) { - GKO_NOT_IMPLEMENTED; - } - this->get_executor()->run(batch_dense::make_simple_apply( - this, as>(b), as>(x))); -} +GKO_REGISTER_OPERATION(scale, batch_vector::scale); +GKO_REGISTER_OPERATION(add_scaled, batch_vector::add_scaled); +GKO_REGISTER_OPERATION(compute_dot, batch_vector::compute_dot); +GKO_REGISTER_OPERATION(compute_norm2, batch_vector::compute_norm2); +GKO_REGISTER_OPERATION(copy, batch_vector::copy); -template -void BatchDense::apply_impl(const BatchLinOp* alpha, - const BatchLinOp* b, - const BatchLinOp* beta, - BatchLinOp* x) const -{ - if (!this->get_size().stores_equal_sizes() || - !this->get_stride().stores_equal_strides()) { - GKO_NOT_IMPLEMENTED; - } - if (auto bid = dynamic_cast*>(b)) { - if (auto xdense = dynamic_cast*>(x)) { - xdense->add_scale(alpha, this, beta); - } else { - GKO_NOT_SUPPORTED(x); - } - } else { - this->get_executor()->run(batch_dense::make_apply( - as>(alpha), this, - as>(b), as>(beta), - as>(x))); - } -} +} // namespace batch_vector template -void BatchDense::scale_impl(const BatchLinOp* alpha) +void BatchVector::scale_impl(const BatchLinOp* alpha) { - auto batch_alpha = as>(alpha); + auto batch_alpha = as>(alpha); GKO_ASSERT_BATCH_EQUAL_ROWS( batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { @@ -139,16 +76,16 @@ void BatchDense::scale_impl(const BatchLinOp* alpha) } } auto exec = this->get_executor(); - 
exec->run(batch_dense::make_scale(batch_alpha, this)); + exec->run(batch_vector::make_scale(batch_alpha, this)); } template -void BatchDense::add_scaled_impl(const BatchLinOp* alpha, - const BatchLinOp* b) +void BatchVector::add_scaled_impl(const BatchLinOp* alpha, + const BatchLinOp* b) { - auto batch_alpha = as>(alpha); - auto batch_b = as>(b); + auto batch_alpha = as>(alpha); + auto batch_b = as>(b); GKO_ASSERT_BATCH_EQUAL_ROWS( batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { @@ -160,37 +97,7 @@ void BatchDense::add_scaled_impl(const BatchLinOp* alpha, GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); auto exec = this->get_executor(); - exec->run(batch_dense::make_add_scaled(batch_alpha, batch_b, this)); -} - - -template -void BatchDense::add_scale(const BatchLinOp* const alpha, - const BatchLinOp* const a, - const BatchLinOp* const beta) -{ - auto batch_alpha = as>(alpha); - auto batch_beta = as>(beta); - auto batch_a = as>(a); - GKO_ASSERT_BATCH_EQUAL_ROWS( - batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); - if (batch_alpha->get_size().stores_equal_sizes()) { - if (batch_alpha->get_size().at(0)[1] != 1) { - // different alpha for each column - GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); - } - } else { - for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { - if (batch_alpha->get_size().at(b)[1] != 1) { - GKO_ASSERT(this->get_size().at(b)[1] == - batch_alpha->get_size().at(b)[1]); - } - } - } - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_a); - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_alpha, batch_beta); - this->get_executor()->run( - batch_dense::make_add_scale(batch_alpha, batch_a, batch_beta, this)); + exec->run(batch_vector::make_add_scaled(batch_alpha, batch_b, this)); } @@ -205,35 +112,35 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template -void BatchDense::compute_dot_impl(const BatchLinOp* b, - BatchLinOp* result) const +void BatchVector::compute_dot_impl(const BatchLinOp* b, + BatchLinOp* result) const { - auto batch_result = as>(result); - auto batch_b = as>(b); + auto batch_result = as>(result); + auto batch_b = as>(b); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, get_col_sizes(this->get_size())); auto exec = this->get_executor(); - exec->run(batch_dense::make_compute_dot(this, batch_b, batch_result)); + exec->run(batch_vector::make_compute_dot(this, batch_b, batch_result)); } template -void BatchDense::compute_norm2_impl(BatchLinOp* result) const +void BatchVector::compute_norm2_impl(BatchLinOp* result) const { - using NormVector = BatchDense>; + using NormVector = BatchVector>; auto batch_result = as(result); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, get_col_sizes(this->get_size())); auto exec = this->get_executor(); - exec->run(batch_dense::make_compute_norm2(as>(this), - batch_result)); + exec->run(batch_vector::make_compute_norm2(as>(this), + batch_result)); } template -void BatchDense::convert_to( - BatchDense>* result) const +void BatchVector::convert_to( + BatchVector>* result) const { result->values_ = this->values_; result->stride_ = this->stride_; @@ -243,94 +150,13 @@ void BatchDense::convert_to( template -void BatchDense::move_to( - BatchDense>* result) +void BatchVector::move_to( + BatchVector>* result) { this->convert_to(result); } -template -void BatchDense::convert_to(BatchCsr* result) const -{ - auto exec = this->get_executor(); - - 
auto batch_size = this->get_size(); - if (!batch_size.stores_equal_sizes()) { - GKO_NOT_IMPLEMENTED; - } - - auto num_stored_nonzeros = - array{exec->get_master(), this->get_num_batch_entries()}; - exec->run( - batch_dense::make_count_nonzeros(this, num_stored_nonzeros.get_data())); - gko::dim<2> main_size = this->get_size().at(0); - const size_type num_nnz = - num_stored_nonzeros.get_data() ? num_stored_nonzeros.get_data()[0] : 0; - auto tmp = BatchCsr::create( - exec, this->get_num_batch_entries(), main_size, num_nnz); - exec->run(batch_dense::make_convert_to_batch_csr(this, tmp.get())); - tmp->move_to(result); -} - - -template -void BatchDense::move_to(BatchCsr* result) -{ - this->convert_to(result); -} - - -template -void BatchDense::convert_to( - BatchDiagonal* const result) const -{ - auto exec = this->get_executor(); - - auto batch_size = this->get_size(); - if (!batch_size.stores_equal_sizes()) { - GKO_NOT_IMPLEMENTED; - } - GKO_ASSERT_BATCH_HAS_SINGLE_COLUMN(this); - if (this->get_stride().at(0) != 1) { - GKO_NOT_IMPLEMENTED; - } - auto temp = BatchDiagonal::create( - exec, batch_dim<2>{batch_size.get_num_batch_entries(), - dim<2>{batch_size.at(0)[0]}}); - exec->copy(this->get_num_stored_elements(), this->get_const_values(), - temp->get_values()); - result->copy_from(temp.get()); -} - - -template -void BatchDense::move_to(BatchDiagonal* const result) -{ - auto exec = this->get_executor(); - - auto batch_size = this->get_size(); - if (!batch_size.stores_equal_sizes()) { - GKO_NOT_IMPLEMENTED; - } - GKO_ASSERT_BATCH_HAS_SINGLE_COLUMN(this); - if (this->get_stride().at(0) != 1) { - GKO_NOT_IMPLEMENTED; - } - auto temp = BatchDiagonal::create( - exec, - batch_dim<2>{batch_size.get_num_batch_entries(), - dim<2>{batch_size.at(0)[0]}}, - std::move(this->values_)); - *result = std::move(*temp); - // set the size of this to 0 - this->set_size(batch_dim<2>()); -} - - -namespace { - - template inline void read_impl(MatrixType* mtx, const std::vector& data) { @@ -362,26 +188,20 @@ inline void read_impl(MatrixType* mtx, const std::vector& data) } -} // namespace - - template -void BatchDense::read(const std::vector& data) +void BatchVector::read(const std::vector& data) { read_impl(this, data); } template -void BatchDense::read(const std::vector& data) +void BatchVector::read(const std::vector& data) { read_impl(this, data); } -namespace { - - template inline void write_impl(const MatrixType* mtx, std::vector& data) { @@ -410,57 +230,22 @@ inline void write_impl(const MatrixType* mtx, std::vector& data) } -} // namespace - - template -void BatchDense::write(std::vector& data) const +void BatchVector::write(std::vector& data) const { write_impl(this, data); } template -void BatchDense::write(std::vector& data) const +void BatchVector::write(std::vector& data) const { write_impl(this, data); } -template -std::unique_ptr BatchDense::transpose() const -{ - auto exec = this->get_executor(); - auto trans_cpy = BatchDense::create(exec, gko::transpose(this->get_size())); - - exec->run(batch_dense::make_transpose(this, trans_cpy.get())); - - return std::move(trans_cpy); -} - - -template -std::unique_ptr BatchDense::conj_transpose() const -{ - auto exec = this->get_executor(); - auto trans_cpy = BatchDense::create(exec, gko::transpose(this->get_size())); - - exec->run(batch_dense::make_conj_transpose(this, trans_cpy.get())); - return std::move(trans_cpy); -} - - -template -void BatchDense::add_scaled_identity_impl(const BatchLinOp* const a, - const BatchLinOp* const b) -{ - 
this->get_executor()->run(batch_dense::make_add_scaled_identity( - as>(a), as>(b), this)); -} - - -#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class BatchDense<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX); +#define GKO_DECLARE_BATCH_VECTOR_MATRIX(_type) class BatchVector<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_MATRIX); } // namespace matrix diff --git a/core/matrix/batch_vector_kernels.hpp b/core/matrix/batch_vector_kernels.hpp index 91dd3e6f5b7..6ddfc9e2676 100644 --- a/core/matrix/batch_vector_kernels.hpp +++ b/core/matrix/batch_vector_kernels.hpp @@ -30,11 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ -#define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#ifndef GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ +#define GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ -#include +#include #include @@ -46,232 +46,49 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(_type) \ - void simple_apply(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* a, \ - const matrix::BatchDense<_type>* b, \ - matrix::BatchDense<_type>* c) - -#define GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL(_type) \ - void apply(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - const matrix::BatchDense<_type>* a, \ - const matrix::BatchDense<_type>* b, \ - const matrix::BatchDense<_type>* beta, \ - matrix::BatchDense<_type>* c) - -#define GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL(_type) \ void scale(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - matrix::BatchDense<_type>* x) + const matrix::BatchVector<_type>* alpha, \ + matrix::BatchVector<_type>* x) -#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL(_type) \ void add_scaled(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense<_type>* y) - -#define GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL(_type) \ - void add_scale(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - const matrix::BatchDense<_type>* x, \ - const matrix::BatchDense<_type>* beta, \ - matrix::BatchDense<_type>* y) - -#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL(_type) \ - void convergence_add_scaled(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense<_type>* y, \ - const uint32& converged) - -#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL(_type) \ - void add_scaled_diag(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* alpha, \ - const matrix::Diagonal<_type>* x, \ - matrix::BatchDense<_type>* y) + const matrix::BatchVector<_type>* alpha, \ + const matrix::BatchVector<_type>* x, \ + matrix::BatchVector<_type>* y) -#define GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL(_type) \ void compute_dot(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - const matrix::BatchDense<_type>* y, \ - matrix::BatchDense<_type>* result) + const matrix::BatchVector<_type>* x, \ + const matrix::BatchVector<_type>* y, \ + matrix::BatchVector<_type>* result) - -#define 
GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(_type) \ - void convergence_compute_dot(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - const matrix::BatchDense<_type>* y, \ - matrix::BatchDense<_type>* result, \ - const uint32& converged) - -#define GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ void compute_norm2(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense>* result) - -#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL(_type) \ - void convergence_compute_norm2( \ - std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense>* result, \ - const uint32& converged) + const matrix::BatchVector<_type>* x, \ + matrix::BatchVector>* result) - -#define GKO_DECLARE_BATCH_DENSE_COPY_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL(_type) \ void copy(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense<_type>* result) - -#define GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL(_type) \ - void convergence_copy(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* x, \ - matrix::BatchDense<_type>* result, \ - const uint32& converged) - -#define GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL(_type, _prec) \ - void convert_to_batch_csr(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* source, \ - matrix::BatchCsr<_type, _prec>* other) - -#define GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL(_type) \ - void count_nonzeros(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* source, \ - size_type* result) - -#define GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(_type) \ - void calculate_max_nnz_per_row( \ - std::shared_ptr exec, \ - const matrix::BatchDense<_type>* source, size_type* result) - -#define GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(_type) \ - void calculate_nonzeros_per_row( \ - std::shared_ptr exec, \ - const matrix::BatchDense<_type>* source, array* result) - -#define GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL(_type) \ - void calculate_total_cols( \ - std::shared_ptr exec, \ - const matrix::BatchDense<_type>* source, size_type* result, \ - const size_type* stride_factor, const size_type* slice_size) - -#define GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL(_type) \ - void transpose(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* orig, \ - matrix::BatchDense<_type>* trans) - -#define GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL(_type) \ - void conj_transpose(std::shared_ptr exec, \ - const matrix::BatchDense<_type>* orig, \ - matrix::BatchDense<_type>* trans) - -#define GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType) \ - void batch_scale(std::shared_ptr exec, \ - const matrix::BatchDiagonal* left_scale, \ - const matrix::BatchDiagonal* right_scale, \ - matrix::BatchDense* vec_to_scale) - -#define GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType) \ - void add_scaled_identity(std::shared_ptr exec, \ - const matrix::BatchDense* a, \ - const matrix::BatchDense* b, \ - matrix::BatchDense* mtx) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL(ValueType); \ - template \ - 
GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_COPY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType) - - -namespace omp { -namespace batch_dense { - -GKO_DECLARE_ALL_AS_TEMPLATES; - -} // namespace batch_dense -} // namespace omp - - -namespace cuda { -namespace batch_dense { - -GKO_DECLARE_ALL_AS_TEMPLATES; - -} // namespace batch_dense -} // namespace cuda - - -namespace reference { -namespace batch_dense { - -GKO_DECLARE_ALL_AS_TEMPLATES; - -} // namespace batch_dense -} // namespace reference - - -namespace hip { -namespace batch_dense { - -GKO_DECLARE_ALL_AS_TEMPLATES; - -} // namespace batch_dense -} // namespace hip + const matrix::BatchVector<_type>* x, \ + matrix::BatchVector<_type>* result) -namespace dpcpp { -namespace batch_dense { +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL(ValueType) -GKO_DECLARE_ALL_AS_TEMPLATES; -} // namespace batch_dense -} // namespace dpcpp +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_vector, + GKO_DECLARE_ALL_AS_TEMPLATES); #undef GKO_DECLARE_ALL_AS_TEMPLATES @@ -281,4 +98,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES; } // namespace gko -#endif // GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#endif // GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index 433361a054f..fbfe5f95e3f 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_test(batch_vector) ginkgo_create_test(coo) ginkgo_create_test(coo_builder) ginkgo_create_test(csr) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_vector.cpp similarity index 84% rename from core/test/matrix/batch_dense.cpp rename to core/test/matrix/batch_vector.cpp index 7db7469baf6..4735d5eead2 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_vector.cpp @@ -30,7 +30,7 @@ THEORY OF 
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -44,18 +44,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" -namespace { - - template -class BatchDense : public ::testing::Test { +class BatchVector : public ::testing::Test { protected: using value_type = T; using DenseMtx = gko::matrix::Dense; using size_type = gko::size_type; - BatchDense() + BatchVector() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch_initialize>( + mtx(gko::batch_initialize>( std::vector{4, 3}, {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, @@ -64,7 +61,7 @@ class BatchDense : public ::testing::Test { static void assert_equal_to_original_mtx( - gko::matrix::BatchDense* m) + gko::matrix::BatchVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); @@ -88,37 +85,37 @@ class BatchDense : public ::testing::Test { ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } - static void assert_empty(gko::matrix::BatchDense* m) + static void assert_empty(gko::matrix::BatchVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; -TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); +TYPED_TEST_SUITE(BatchVector, gko::test::ValueTypes); -TYPED_TEST(BatchDense, CanBeEmpty) +TYPED_TEST(BatchVector, CanBeEmpty) { - auto empty = gko::matrix::BatchDense::create(this->exec); + auto empty = gko::matrix::BatchVector::create(this->exec); this->assert_empty(empty.get()); } -TYPED_TEST(BatchDense, ReturnsNullValuesArrayWhenEmpty) +TYPED_TEST(BatchVector, ReturnsNullValuesArrayWhenEmpty) { - auto empty = gko::matrix::BatchDense::create(this->exec); + auto empty = gko::matrix::BatchVector::create(this->exec); ASSERT_EQ(empty->get_const_values(), nullptr); } -TYPED_TEST(BatchDense, CanBeConstructedWithSize) +TYPED_TEST(BatchVector, CanBeConstructedWithSize) { using size_type = gko::size_type; - auto m = gko::matrix::BatchDense::create( + auto m = gko::matrix::BatchVector::create( this->exec, std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); @@ -133,10 +130,10 @@ TYPED_TEST(BatchDense, CanBeConstructedWithSize) } -TYPED_TEST(BatchDense, CanBeConstructedWithSizeAndStride) +TYPED_TEST(BatchVector, CanBeConstructedWithSizeAndStride) { using size_type = gko::size_type; - auto m = gko::matrix::BatchDense::create( + auto m = gko::matrix::BatchVector::create( this->exec, std::vector>{gko::dim<2>{2, 3}}, std::vector{4}); @@ -146,7 +143,7 @@ TYPED_TEST(BatchDense, CanBeConstructedWithSizeAndStride) } -TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) +TYPED_TEST(BatchVector, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -158,7 +155,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) 5.0, 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchDense::create( + auto m = gko::matrix::BatchVector::create( this->exec, std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, gko::array::view(this->exec, 12, data), @@ -172,7 +169,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) } -TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) +TYPED_TEST(BatchVector, CanBeConstructedFromExistingConstData) { using value_type = 
typename TestFixture::value_type; using size_type = gko::size_type; @@ -184,7 +181,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) 5.0, 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchDense::create_const( + auto m = gko::matrix::BatchVector::create_const( this->exec, std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, gko::array::const_view(this->exec, 12, data), @@ -198,7 +195,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) +TYPED_TEST(BatchVector, CanBeConstructedFromBatchVectorMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -208,19 +205,19 @@ TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchDense::create( + auto m = gko::matrix::BatchVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::matrix::BatchDense::create( + auto m_ref = gko::matrix::BatchVector::create( this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), mat1.get(), mat2.get()}); auto m2 = - gko::matrix::BatchDense::create(this->exec, 3, m.get()); + gko::matrix::BatchVector::create(this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } -TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) +TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatricesByDuplication) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -230,16 +227,16 @@ TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::matrix::BatchDense::create( + auto bat_m = gko::matrix::BatchVector::create( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); auto m = - gko::matrix::BatchDense::create(this->exec, 3, mat1.get()); + gko::matrix::BatchVector::create(this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) +TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -249,14 +246,14 @@ TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchDense::create( + auto m = gko::matrix::BatchVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); } -TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) +TYPED_TEST(BatchVector, CanBeUnbatchedIntoDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -274,16 +271,16 @@ TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) } -TYPED_TEST(BatchDense, KnowsItsSizeAndValues) +TYPED_TEST(BatchVector, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); } -TYPED_TEST(BatchDense, CanBeListConstructed) +TYPED_TEST(BatchVector, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); @@ -297,10 +294,10 @@ TYPED_TEST(BatchDense, CanBeListConstructed) } 
-TYPED_TEST(BatchDense, CanBeListConstructedWithstride) +TYPED_TEST(BatchVector, CanBeListConstructedWithstride) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( std::vector{2}, {{1.0, 2.0}}, this->exec); ASSERT_EQ(m->get_num_batch_entries(), 1); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); @@ -310,10 +307,10 @@ TYPED_TEST(BatchDense, CanBeListConstructedWithstride) } -TYPED_TEST(BatchDense, CanBeListConstructedByCopies) +TYPED_TEST(BatchVector, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); @@ -326,11 +323,11 @@ TYPED_TEST(BatchDense, CanBeListConstructedByCopies) } -TYPED_TEST(BatchDense, CanBeDoubleListConstructed) +TYPED_TEST(BatchVector, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, this->exec); @@ -355,11 +352,11 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructed) } -TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) +TYPED_TEST(BatchVector, CanBeDoubleListConstructedWithstride) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {4, 3}, {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, @@ -385,9 +382,9 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) } -TYPED_TEST(BatchDense, CanBeCopied) +TYPED_TEST(BatchVector, CanBeCopied) { - auto mtx_copy = gko::matrix::BatchDense::create(this->exec); + auto mtx_copy = gko::matrix::BatchVector::create(this->exec); mtx_copy->copy_from(this->mtx.get()); this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0, 0, 0) = 7; @@ -396,15 +393,15 @@ TYPED_TEST(BatchDense, CanBeCopied) } -TYPED_TEST(BatchDense, CanBeMoved) +TYPED_TEST(BatchVector, CanBeMoved) { - auto mtx_copy = gko::matrix::BatchDense::create(this->exec); + auto mtx_copy = gko::matrix::BatchVector::create(this->exec); mtx_copy->copy_from(std::move(this->mtx)); this->assert_equal_to_original_mtx(mtx_copy.get()); } -TYPED_TEST(BatchDense, CanBeCloned) +TYPED_TEST(BatchVector, CanBeCloned) { auto mtx_clone = this->mtx->clone(); this->assert_equal_to_original_mtx( @@ -412,17 +409,17 @@ TYPED_TEST(BatchDense, CanBeCloned) } -TYPED_TEST(BatchDense, CanBeCleared) +TYPED_TEST(BatchVector, CanBeCleared) { this->mtx->clear(); this->assert_empty(this->mtx.get()); } -TYPED_TEST(BatchDense, CanBeReadFromMatrixData) +TYPED_TEST(BatchVector, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchDense::create(this->exec); + auto m = gko::matrix::BatchVector::create(this->exec); // clang-format off m->read({gko::matrix_data{{2, 3}, {{0, 0, 1.0}, @@ -456,7 +453,7 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixData) } -TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) +TYPED_TEST(BatchVector, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; using tpl = typename gko::matrix_data::nonzero_type; @@ -483,10 +480,10 @@ TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) } -TYPED_TEST(BatchDense, 
CanBeReadFromMatrixAssemblyData) +TYPED_TEST(BatchVector, CanBeReadFromMatrixAssemblyData) { using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchDense::create(this->exec); + auto m = gko::matrix::BatchVector::create(this->exec); gko::matrix_assembly_data data1(gko::dim<2>{2, 3}); data1.set_value(0, 0, 1.0); data1.set_value(0, 1, 3.0); @@ -515,6 +512,3 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixAssemblyData) EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); } - - -} // namespace diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp new file mode 100644 index 00000000000..104286f66b9 --- /dev/null +++ b/cuda/matrix/batch_struct.hpp @@ -0,0 +1,118 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ + + +#include "core/matrix/batch_struct.hpp" + + +#include +#include + + +#include "cuda/base/config.hpp" +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the requried CUDA scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_vector::UniformBatch> +get_batch_struct(const matrix::BatchVector* const op) +{ + return { + as_cuda_type(op->get_const_values()), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + +/** + * Generates a uniform batch struct from a batch of dense matrices. 
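+ *
+ * An illustrative usage sketch, mirroring the kernel launches in
+ * cuda/matrix/batch_vector_kernels.cu: the struct is built on the host and
+ * then passed by value to a device kernel, e.g.
+ *
+ *     const auto alpha_ub = get_batch_struct(alpha);
+ *     const auto x_ub = get_batch_struct(x);
+ *     scale<<<num_blocks, default_block_size>>>(alpha_ub, x_ub);
+ *
+ * where scale, num_blocks and default_block_size are the kernel and launch
+ * parameters defined in that file (the launch configuration shown here is
+ * only a sketch).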
+ */ +template +inline gko::batch_vector::UniformBatch> get_batch_struct( + matrix::BatchVector* const op) +{ + return { + as_cuda_type(op->get_values()), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices + * that may be null. + */ +template +inline gko::batch_vector::UniformBatch> +maybe_null_batch_struct(const matrix::BatchVector* const op) +{ + if (op) { + return {as_cuda_type(op->get_const_values()), + op->get_num_batch_entries(), op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1])}; + } else { + return {nullptr, 0, 0, 0, 0}; + } +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko +#endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ diff --git a/cuda/matrix/batch_vector_kernels.cu b/cuda/matrix/batch_vector_kernels.cu index af67fa1597a..9ceca9e2b3a 100644 --- a/cuda/matrix/batch_vector_kernels.cu +++ b/cuda/matrix/batch_vector_kernels.cu @@ -30,12 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include #include -#include #include "core/matrix/batch_struct.hpp" @@ -53,69 +52,24 @@ namespace gko { namespace kernels { namespace cuda { /** - * @brief The BatchDense matrix format namespace. + * @brief The BatchVector matrix format namespace. * - * @ingroup batch_dense + * @ingroup batch_vector */ -namespace batch_dense { +namespace batch_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" -template -void simple_apply(std::shared_ptr exec, - const matrix::BatchDense* a, - const matrix::BatchDense* b, - matrix::BatchDense* c) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto a_ub = get_batch_struct(a); - const auto b_ub = get_batch_struct(b); - const auto c_ub = get_batch_struct(c); - if (b_ub.num_rhs > 1) { - GKO_NOT_IMPLEMENTED; - } - mv<<>>(a_ub, b_ub, c_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::BatchDense* a, - const matrix::BatchDense* b, - const matrix::BatchDense* beta, - matrix::BatchDense* c) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto a_ub = get_batch_struct(a); - const auto b_ub = get_batch_struct(b); - const auto c_ub = get_batch_struct(c); - const auto alpha_ub = get_batch_struct(alpha); - const auto beta_ub = get_batch_struct(beta); - if (b_ub.num_rhs > 1) { - GKO_NOT_IMPLEMENTED; - } - advanced_mv<<>>(alpha_ub, a_ub, b_ub, - beta_ub, c_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); - - template void scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - matrix::BatchDense* const x) + const matrix::BatchVector* const alpha, + matrix::BatchVector* const x) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); @@ 
-123,14 +77,14 @@ void scale(std::shared_ptr exec, scale<<>>(alpha_ub, x_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - matrix::BatchDense* const y) + const matrix::BatchVector* const alpha, + const matrix::BatchVector* const x, + matrix::BatchVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_size().at(0)[1]; @@ -148,55 +102,14 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); - - -template -void add_scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - const matrix::BatchDense* const beta, - matrix::BatchDense* const y) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const size_type nrhs = x->get_size().at(0)[1]; - const auto alpha_ub = get_batch_struct(alpha); - const auto beta_ub = get_batch_struct(beta); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - add_scale<<>>(alpha_ub, x_ub, beta_ub, - y_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); - - -template -void convergence_add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - matrix::BatchDense* const y, - const uint32& converged) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::Diagonal* x, - matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result) + const matrix::BatchVector* x, + const matrix::BatchVector* y, + matrix::BatchVector* result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -215,26 +128,14 @@ void compute_dot(std::shared_ptr exec, } } - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); - - -template -void convergence_compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result) + const matrix::BatchVector* const x, + matrix::BatchVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -251,112 +152,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); - - -template -void convergence_compute_norm2( - std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); - - -template -void convert_to_batch_csr(std::shared_ptr exec, - const matrix::BatchDense* source, - matrix::BatchCsr* other) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result, const size_type* stride_factor, - const size_type* slice_size) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::BatchDense* const orig, - matrix::BatchDense* const trans) -{ - using cu_val_type = cuda_type; - const size_type nbatch = orig->get_num_batch_entries(); - const size_type orig_stride = orig->get_stride().at(); - const size_type trans_stride = trans->get_stride().at(); - const int nrows = orig->get_size().at()[0]; - const int ncols = orig->get_size().at()[1]; - transpose<<>>( - nrows, ncols, orig_stride, as_cuda_type(orig->get_const_values()), - trans_stride, as_cuda_type(trans->get_values()), - [] __device__(cu_val_type x) { return x; }); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::BatchDense* orig, - matrix::BatchDense* trans) -{ - using cu_val_type = cuda_type; - const size_type nbatch = orig->get_num_batch_entries(); - const size_type orig_stride = orig->get_stride().at(); - const size_type trans_stride = trans->get_stride().at(); - const int nrows = orig->get_size().at()[0]; - const int ncols = orig->get_size().at()[1]; - transpose<<>>( - nrows, ncols, orig_stride, as_cuda_type(orig->get_const_values()), - trans_stride, as_cuda_type(trans->get_values()), - [] __device__(cu_val_type x) { return conj(x); }); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result) + const matrix::BatchVector* x, + matrix::BatchVector* result) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); @@ -364,71 +166,10 @@ void copy(std::shared_ptr exec, copy<<>>(x_ub, result_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); - - -template -void convergence_copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); - - -template -void 
batch_scale(std::shared_ptr exec, - const matrix::BatchDiagonal* const left_scale, - const matrix::BatchDiagonal* const rght_scale, - matrix::BatchDense* const vec_to_scale) -{ - if (!left_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - if (!rght_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - if (!vec_to_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - - const auto stride = vec_to_scale->get_stride().at(); - const auto nrows = static_cast(vec_to_scale->get_size().at()[0]); - const auto nrhs = static_cast(vec_to_scale->get_size().at()[1]); - const auto nbatch = vec_to_scale->get_num_batch_entries(); - - const int num_blocks = vec_to_scale->get_num_batch_entries(); - uniform_batch_scale<<>>( - nrows, stride, nrhs, nbatch, - as_cuda_type(left_scale->get_const_values()), - as_cuda_type(rght_scale->get_const_values()), - as_cuda_type(vec_to_scale->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const mtx) -{ - if (!mtx->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - const auto num_blocks = mtx->get_num_batch_entries(); - const auto nrows = static_cast(mtx->get_size().at(0)[0]); - const auto ncols = static_cast(mtx->get_size().at(0)[1]); - const auto stride = mtx->get_stride().at(0); - const auto values = mtx->get_values(); - const auto alpha = a->get_const_values(); - const auto a_stride = a->get_stride().at(0); - const auto b_stride = b->get_stride().at(0); - const auto beta = b->get_const_values(); - add_scaled_identity<<>>( - num_blocks, nrows, ncols, stride, as_cuda_type(values), a_stride, - as_cuda_type(alpha), b_stride, as_cuda_type(beta)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); -} // namespace batch_dense +} // namespace batch_vector } // namespace cuda } // namespace kernels } // namespace gko diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp new file mode 100644 index 00000000000..e2648ba4a25 --- /dev/null +++ b/hip/matrix/batch_struct.hip.hpp @@ -0,0 +1,120 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ +#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ + + +#include "core/matrix/batch_struct.hpp" + + +#include +#include + + +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the requried Hip scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_vector::UniformBatch> +get_batch_struct(const matrix::BatchVector* const op) +{ + return { + as_hip_type(op->get_const_values()), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + +/** + * Generates a uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_vector::UniformBatch> get_batch_struct( + matrix::BatchVector* const op) +{ + return { + as_hip_type(op->get_values()), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices + * that may be null. + */ +template +inline gko::batch_vector::UniformBatch> +maybe_null_batch_struct(const matrix::BatchVector* const op) +{ + if (op) { + return {as_hip_type(op->get_const_values()), + op->get_num_batch_entries(), op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1])}; + } else { + return {nullptr, 0, 0, 0, 0}; + } +} + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ diff --git a/hip/matrix/batch_vector_kernels.hip.cpp b/hip/matrix/batch_vector_kernels.hip.cpp index 32665e31191..97bbaf50440 100644 --- a/hip/matrix/batch_vector_kernels.hip.cpp +++ b/hip/matrix/batch_vector_kernels.hip.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include #include "core/matrix/batch_struct.hpp" @@ -56,70 +55,24 @@ namespace gko { namespace kernels { namespace hip { /** - * @brief The BatchDense matrix format namespace. 
+ * @brief The BatchVector matrix format namespace. * - * @ingroup batch_dense + * @ingroup batch_vector */ -namespace batch_dense { +namespace batch_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" -template -void simple_apply(std::shared_ptr exec, - const matrix::BatchDense* a, - const matrix::BatchDense* b, - matrix::BatchDense* c) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto a_ub = get_batch_struct(a); - const auto b_ub = get_batch_struct(b); - const auto c_ub = get_batch_struct(c); - if (b_ub.num_rhs > 1) { - GKO_NOT_IMPLEMENTED; - } - hipLaunchKernelGGL(mv, num_blocks, default_block_size, 0, 0, a_ub, b_ub, - c_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::BatchDense* a, - const matrix::BatchDense* b, - const matrix::BatchDense* beta, - matrix::BatchDense* c) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto a_ub = get_batch_struct(a); - const auto b_ub = get_batch_struct(b); - const auto c_ub = get_batch_struct(c); - const auto alpha_ub = get_batch_struct(alpha); - const auto beta_ub = get_batch_struct(beta); - if (b_ub.num_rhs > 1) { - GKO_NOT_IMPLEMENTED; - } - hipLaunchKernelGGL(advanced_mv, num_blocks, default_block_size, 0, 0, - alpha_ub, a_ub, b_ub, beta_ub, c_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); - - template void scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - matrix::BatchDense* const x) + const matrix::BatchVector* const alpha, + matrix::BatchVector* const x) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); @@ -128,14 +81,14 @@ void scale(std::shared_ptr exec, alpha_ub, x_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - matrix::BatchDense* const y) + const matrix::BatchVector* const alpha, + const matrix::BatchVector* const x, + matrix::BatchVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_size().at(0)[1]; @@ -156,55 +109,14 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); - - -template -void add_scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - const matrix::BatchDense* const beta, - matrix::BatchDense* const y) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const size_type nrhs = x->get_size().at(0)[1]; - const auto alpha_ub = get_batch_struct(alpha); - const auto beta_ub = get_batch_struct(beta); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - hipLaunchKernelGGL(add_scale, num_blocks, default_block_size, 0, 0, - alpha_ub, x_ub, beta_ub, y_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); - - -template -void convergence_add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const 
matrix::BatchDense* const x, - matrix::BatchDense* const y, - const uint32& converged) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::Diagonal* x, - matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result) + const matrix::BatchVector* x, + const matrix::BatchVector* y, + matrix::BatchVector* result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -225,25 +137,14 @@ void compute_dot(std::shared_ptr exec, } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); - - -template -void convergence_compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result) + const matrix::BatchVector* const x, + matrix::BatchVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -262,114 +163,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); - - -template -void convergence_compute_norm2( - std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); - - -template -void convert_to_batch_csr(std::shared_ptr exec, - const matrix::BatchDense* source, - matrix::BatchCsr* other) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - array* result) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result, const size_type* stride_factor, - const size_type* slice_size) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::BatchDense* const orig, - matrix::BatchDense* const trans) -{ - using hip_val_type = 
hip_type; - const size_type nbatch = orig->get_num_batch_entries(); - const size_type orig_stride = orig->get_stride().at(); - const size_type trans_stride = trans->get_stride().at(); - const int nrows = orig->get_size().at()[0]; - const int ncols = orig->get_size().at()[1]; - hipLaunchKernelGGL(transpose, dim3(nbatch), dim3(default_block_size), 0, 0, - nrows, ncols, orig_stride, - as_hip_type(orig->get_const_values()), trans_stride, - as_hip_type(trans->get_values()), - [] __device__(hip_val_type x) { return x; }); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::BatchDense* orig, - matrix::BatchDense* trans) -{ - using hip_val_type = hip_type; - const size_type nbatch = orig->get_num_batch_entries(); - const size_type orig_stride = orig->get_stride().at(); - const size_type trans_stride = trans->get_stride().at(); - const int nrows = orig->get_size().at()[0]; - const int ncols = orig->get_size().at()[1]; - hipLaunchKernelGGL(transpose, dim3(nbatch), dim3(default_block_size), 0, 0, - nrows, ncols, orig_stride, - as_hip_type(orig->get_const_values()), trans_stride, - as_hip_type(trans->get_values()), - [] __device__(hip_val_type x) { return conj(x); }); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result) + const matrix::BatchVector* x, + matrix::BatchVector* result) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); @@ -378,72 +178,10 @@ void copy(std::shared_ptr exec, x_ub, result_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); - - -template -void convergence_copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result, - const uint32& converged) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); - - -template -void batch_scale(std::shared_ptr exec, - const matrix::BatchDiagonal* const left_scale, - const matrix::BatchDiagonal* const rght_scale, - matrix::BatchDense* const vec_to_scale) -{ - if (!left_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - if (!rght_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - if (!vec_to_scale->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - - const auto stride = vec_to_scale->get_stride().at(); - const auto nrows = static_cast(vec_to_scale->get_size().at()[0]); - const auto nrhs = static_cast(vec_to_scale->get_size().at()[1]); - const auto nbatch = vec_to_scale->get_num_batch_entries(); - - const int num_blocks = vec_to_scale->get_num_batch_entries(); - hipLaunchKernelGGL(uniform_batch_scale, dim3(num_blocks), - dim3(default_block_size), 0, 0, nrows, stride, nrhs, - nbatch, as_hip_type(left_scale->get_const_values()), - as_hip_type(rght_scale->get_const_values()), - as_hip_type(vec_to_scale->get_values())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const mtx) -{ - if (!mtx->get_size().stores_equal_sizes()) GKO_NOT_IMPLEMENTED; - const auto num_blocks = mtx->get_num_batch_entries(); - const auto nrows = 
static_cast(mtx->get_size().at(0)[0]); - const auto ncols = static_cast(mtx->get_size().at(0)[1]); - const auto stride = mtx->get_stride().at(0); - const auto values = mtx->get_values(); - const auto alpha = a->get_const_values(); - const auto a_stride = a->get_stride().at(0); - const auto b_stride = b->get_stride().at(0); - const auto beta = b->get_const_values(); - hipLaunchKernelGGL(add_scaled_identity, num_blocks, default_block_size, 0, - 0, num_blocks, nrows, ncols, stride, as_hip_type(values), - a_stride, as_hip_type(alpha), b_stride, - as_hip_type(beta)); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); -} // namespace batch_dense +} // namespace batch_vector } // namespace hip } // namespace kernels } // namespace gko diff --git a/include/ginkgo/core/matrix/batch_vector.hpp b/include/ginkgo/core/matrix/batch_vector.hpp index f4061114052..aee16bbc27b 100644 --- a/include/ginkgo/core/matrix/batch_vector.hpp +++ b/include/ginkgo/core/matrix/batch_vector.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ -#define GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ +#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ #include @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -52,17 +51,9 @@ namespace gko { namespace matrix { -template -class BatchDiagonal; - - -template -class BatchCsr; - - /** - * BatchDense is a batch matrix format which explicitly stores all values of the - * matrix in each of the batches. + * BatchVector is a batch matrix format which explicitly stores all values of + * the vector in each of the batches. * * The values in each of the batches are stored in row-major format (values * belonging to the same row appear consecutive in the memory). Optionally, rows @@ -72,51 +63,44 @@ class BatchCsr; * * @note While this format is not very useful for storing sparse matrices, it * is often suitable to store vectors, and sets of vectors. 
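 *
 * A minimal usage sketch (illustrative only; it assumes an executor handle
 * `exec` is already available and uses the batch_initialize helper declared
 * further down in this header):
 *
 *     auto x = gko::batch_initialize<gko::matrix::BatchVector<double>>(
 *         {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}, exec);
 *     // x now holds two batch entries, each a column vector of length 3;
 *     // x->at(1, 2) reads the last value of the second entry.
 *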
- * @ingroup batch_dense + * @ingroup batch_vector * @ingroup mat_formats * @ingroup BatchLinOp */ template -class BatchDense : public EnableBatchLinOp>, - public EnableCreateMethod>, - public ConvertibleTo>>, - public ConvertibleTo>, - public ConvertibleTo>, - public BatchReadableFromMatrixData, - public BatchReadableFromMatrixData, - public BatchWritableToMatrixData, - public BatchWritableToMatrixData, - public BatchTransposable, - public BatchScaledIdentityAddable { - friend class EnableCreateMethod; - friend class EnablePolymorphicObject; - friend class BatchDense>; +class BatchVector + : public EnableAbstractPolymorphicObject>, + public EnableCreateMethod>, + public ConvertibleTo>>, + public BatchReadableFromMatrixData, + public BatchReadableFromMatrixData, + public BatchWritableToMatrixData, + public BatchWritableToMatrixData { + friend class EnableCreateMethod; + friend class BatchVector>; public: - using EnableBatchLinOp::convert_to; - using EnableBatchLinOp::move_to; using BatchReadableFromMatrixData::read; using BatchReadableFromMatrixData::read; using value_type = ValueType; using index_type = int32; - using transposed_type = BatchDense; using unbatch_type = Dense; using mat_data = gko::matrix_data; using mat_data32 = gko::matrix_data; - using absolute_type = remove_complex; - using complex_type = to_complex; + using absolute_type = remove_complex; + using complex_type = to_complex; using row_major_range = gko::range>; /** - * Creates a BatchDense matrix with the configuration of another BatchDense - * matrix. + * Creates a BatchVector matrix with the configuration of another + * BatchVector matrix. * * @param other The other matrix whose configuration needs to copied. */ - static std::unique_ptr create_with_config_of( - const BatchDense* other) + static std::unique_ptr create_with_config_of( + const BatchVector* other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with @@ -125,20 +109,12 @@ class BatchDense : public EnableBatchLinOp>, return (*other).create_with_same_config(); } - friend class BatchDense>; + friend class BatchVector>; void convert_to( - BatchDense>* result) const override; - - void move_to(BatchDense>* result) override; + BatchVector>* result) const override; - void convert_to(BatchCsr* result) const override; - - void move_to(BatchCsr* result) override; - - void convert_to(BatchDiagonal* result) const override; - - void move_to(BatchDiagonal* result) override; + void move_to(BatchVector>* result) override; void read(const std::vector& data) override; @@ -148,10 +124,6 @@ class BatchDense : public EnableBatchLinOp>, void write(std::vector& data) const override; - std::unique_ptr transpose() const override; - - std::unique_ptr conj_transpose() const override; - /** * Unbatches the batched dense and creates a std::vector of Dense matrices * @@ -174,14 +146,14 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Returns a pointer to the array of values of the matrix. + * Returns a pointer to the array of values of the vector. * * @return the pointer to the array of values */ value_type* get_values() noexcept { return values_.get_data(); } /** - * Returns a pointer to the array of values of the matrix. + * Returns a pointer to the array of values of the vector. 
* * @return the pointer to the array of values */ @@ -218,18 +190,11 @@ class BatchDense : public EnableBatchLinOp>, num_elems_per_batch_cumul_.get_const_data()[batch]; } - /** - * Returns the batch_stride of the matrix. - * - * @return the batch_stride of the matrix. - */ - const batch_stride& get_stride() const noexcept { return stride_; } - /** * Returns the number of elements explicitly stored in the batch matrix, * cumulative across all the batches. * - * @return the number of elements explicitly stored in the matrix, + * @return the number of elements explicitly stored in the vector, * cumulative across all the batches */ size_type get_num_stored_elements() const noexcept @@ -243,7 +208,7 @@ class BatchDense : public EnableBatchLinOp>, * * @param batch the batch index to be queried * - * @return the number of elements explicitly stored in the matrix + * @return the number of elements explicitly stored in the vector */ size_type get_num_stored_elements(size_type batch) const noexcept { @@ -259,7 +224,7 @@ class BatchDense : public EnableBatchLinOp>, * @param row the row of the requested element * @param col the column of the requested element * - * @note the method has to be called on the same Executor the matrix is + * @note the method has to be called on the same Executor the vector is * stored at (e.g. trying to call this method on a GPU matrix from * the OMP results in a runtime error) */ @@ -270,7 +235,7 @@ class BatchDense : public EnableBatchLinOp>, } /** - * @copydoc BatchDense::at(size_type, size_type, size_type) + * @copydoc BatchVector::at(size_type, size_type, size_type) */ value_type at(size_type batch, size_type row, size_type col) const noexcept { @@ -281,7 +246,7 @@ class BatchDense : public EnableBatchLinOp>, /** * Returns a single element for a particular batch entry. * - * Useful for iterating across all elements of the matrix. + * Useful for iterating across all elements of the vector. * However, it is less efficient than the two-parameter variant of this * method. * @@ -289,7 +254,7 @@ class BatchDense : public EnableBatchLinOp>, * @param idx a linear index of the requested element * (ignoring the stride) * - * @note the method has to be called on the same Executor the matrix is + * @note the method has to be called on the same Executor the vector is * stored at (e.g. trying to call this method on a GPU matrix from * the OMP results in a runtime error) */ @@ -299,7 +264,7 @@ class BatchDense : public EnableBatchLinOp>, } /** - * @copydoc BatchDense::at(size_type, size_type, size_type) + * @copydoc BatchVector::at(size_type, size_type, size_type) */ ValueType at(size_type batch, size_type idx) const noexcept { @@ -307,11 +272,11 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Scales the matrix with a scalar (aka: BLAS scal). + * Scales the vector with a scalar (aka: BLAS scal). * - * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix (all - * batches) is scaled by alpha. If it is a BatchDense row vector of values, - * then i-th column of the matrix is scaled with the i-th element of alpha + * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix (all + * batches) is scaled by alpha. If it is a BatchVector row vector of values, + * then i-th column of the vector is scaled with the i-th element of alpha * (the number of columns of alpha has to match the number of columns of the * matrix). */ @@ -322,12 +287,12 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Adds `b` scaled by `alpha` to the matrix (aka: BLAS axpy). 
+ * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy). * - * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix is - * scaled by alpha. If it is a BatchDense row vector of values, then i-th - * column of the matrix is scaled with the i-th element of alpha (the number - * of columns of alpha has to match the number of columns of the matrix). + * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix is + * scaled by alpha. If it is a BatchVector row vector of values, then i-th + * column of the vector is scaled with the i-th element of alpha (the number + * of columns of alpha has to match the number of columns of the vector). * @param b a matrix of the same dimension as this */ void add_scaled(const BatchLinOp* alpha, const BatchLinOp* b) @@ -338,11 +303,11 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Adds `a` scaled by `alpha` to the matrix scaled by `beta`: + * Adds `a` scaled by `alpha` to the vector scaled by `beta`: * this <- alpha * a + beta * this. * - * @param alpha If alpha is 1x1 BatchDense matrix, the entire matrix a is - * scaled by alpha. If it is a BatchDense row vector of + * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix a is + * scaled by alpha. If it is a BatchVector row vector of * values, then i-th column of a is scaled with the i-th * element of alpha (the number of columns of alpha has to * match the number of columns of a). @@ -355,11 +320,11 @@ class BatchDense : public EnableBatchLinOp>, /** * Computes the column-wise dot product of each matrix in this batch and its - * corresponding entry in `b`. If the matrix has complex value_type, then + * corresponding entry in `b`. If the vector has complex value_type, then * the conjugate of this is taken. * - * @param b a BatchDense matrix of same dimension as this - * @param result a BatchDense row vector, used to store the dot product + * @param b a BatchVector matrix of same dimension as this + * @param result a BatchVector row vector, used to store the dot product * (the number of column in the vector must match the number * of columns of this) */ @@ -373,7 +338,7 @@ class BatchDense : public EnableBatchLinOp>, /** * Computes the Euclidean (L^2) norm of each matrix in this batch. * - * @param result a BatchDense row vector, used to store the norm + * @param result a BatchVector row vector, used to store the norm * (the number of columns in the vector must match the number * of columns of this) */ @@ -386,22 +351,22 @@ class BatchDense : public EnableBatchLinOp>, /** * Creates a constant (immutable) batch dense matrix from a constant array. * - * @param exec the executor to create the matrix on - * @param size the dimensions of the matrix - * @param values the value array of the matrix - * @param stride the row-stride of the matrix + * @param exec the executor to create the vector on + * @param size the dimensions of the vector + * @param values the value array of the vector + * @param stride the row-stride of the vector * @returns A smart pointer to the constant matrix wrapping the input array - * (if it resides on the same executor as the matrix) or a copy of + * (if it resides on the same executor as the vector) or a copy of * the array on the correct executor. 
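 *
 * A minimal sketch of wrapping existing data (illustrative only; `exec`,
 * `num_entries`, `num_rows` and `data` are placeholders, and it assumes
 * gko::array<double>::const_view is available to build the constant view):
 *
 *     auto view = gko::array<double>::const_view(
 *         exec, num_entries * num_rows, data);
 *     auto x = gko::matrix::BatchVector<double>::create_const(
 *         exec, gko::batch_dim<2>(num_entries, gko::dim<2>(num_rows, 1)),
 *         std::move(view), gko::batch_stride(num_entries, 1));
 *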
*/ - static std::unique_ptr create_const( + static std::unique_ptr create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values, const batch_stride& strides) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. - return std::unique_ptr(new BatchDense{ + return std::unique_ptr(new BatchVector{ exec, sizes, gko::detail::array_const_cast(std::move(values)), strides}); } @@ -489,30 +454,31 @@ class BatchDense : public EnableBatchLinOp>, protected: /** - * Creates an uninitialized BatchDense matrix of the specified size. + * Creates an uninitialized BatchVector matrix of the specified size. * - * @param exec Executor associated to the matrix - * @param size size of the matrix + * @param exec Executor associated to the vector + * @param size size of the vector */ - BatchDense(std::shared_ptr exec, - const batch_dim<2>& size = batch_dim<2>{}) - : BatchDense(std::move(exec), size, - size.get_num_batch_entries() > 0 ? extract_nth_dim(1, size) - : batch_stride{}) + BatchVector(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}) + : BatchVector(std::move(exec), size, + size.get_num_batch_entries() > 0 + ? extract_nth_dim(1, size) + : batch_stride{}) {} /** - * Creates an uninitialized BatchDense matrix of the specified size. + * Creates an uninitialized BatchVector matrix of the specified size. * - * @param exec Executor associated to the matrix + * @param exec Executor associated to the vector * @param size size of the batch matrices in a batch_dim object * @param stride stride of the rows (i.e. offset between the first * elements of two consecutive rows, expressed as the * number of matrix elements) */ - BatchDense(std::shared_ptr exec, const batch_dim<2>& size, - const batch_stride& stride) - : EnableBatchLinOp(exec, size), + BatchVector(std::shared_ptr exec, const batch_dim<2>& size, + const batch_stride& stride) + : EnableBatchLinOp(exec, size), values_(exec, compute_batch_mem(size, stride)), stride_(stride) { @@ -521,12 +487,12 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix from an already allocated (and initialized) + * Creates a BatchVector matrix from an already allocated (and initialized) * array. * * @tparam ValuesArray type of array of values * - * @param exec Executor associated to the matrix + * @param exec Executor associated to the vector * @param size sizes of the batch matrices in a batch_dim object * @param values array of matrix values * @param strides stride of the rows (i.e. offset between the first @@ -535,12 +501,12 @@ class BatchDense : public EnableBatchLinOp>, * * @note If `values` is not an rvalue, not an array of ValueType, or is on * the wrong executor, an internal copy will be created, and the - * original array data will not be used in the matrix. + * original array data will not be used in the vector. 
*/ template - BatchDense(std::shared_ptr exec, const batch_dim<2>& size, - ValuesArray&& values, const batch_stride& stride) - : EnableBatchLinOp(exec, size), + BatchVector(std::shared_ptr exec, const batch_dim<2>& size, + ValuesArray&& values, const batch_stride& stride) + : EnableBatchLinOp(exec, size), values_{exec, std::forward(values)}, stride_{stride}, num_elems_per_batch_cumul_( @@ -557,14 +523,14 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix from a vector of matrices + * Creates a BatchVector matrix from a vector of matrices * - * @param exec Executor associated to the matrix + * @param exec Executor associated to the vector * @param matrices The matrices that need to be batched. */ - BatchDense(std::shared_ptr exec, - const std::vector*>& matrices) - : EnableBatchLinOp(exec, get_sizes_from_mtxs(matrices)), + BatchVector(std::shared_ptr exec, + const std::vector*>& matrices) + : EnableBatchLinOp(exec, get_sizes_from_mtxs(matrices)), stride_{get_strides_from_mtxs(matrices)}, values_(exec, compute_batch_mem(this->get_size(), stride_)) { @@ -581,15 +547,16 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix by duplicating BatchDense matrix + * Creates a BatchVector matrix by duplicating BatchVector matrix * - * @param exec Executor associated to the matrix + * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate - * @param input The matrix to be duplicated. + * @param input the vector to be duplicated. */ - BatchDense(std::shared_ptr exec, size_type num_duplications, - const BatchDense* input) - : EnableBatchLinOp( + BatchVector(std::shared_ptr exec, + size_type num_duplications, + const BatchVector* input) + : EnableBatchLinOp( exec, gko::batch_dim<2>( input->get_num_batch_entries() * num_duplications, input->get_size().at(0))), @@ -611,15 +578,15 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix by duplicating Dense matrix + * Creates a BatchVector matrix by duplicating Dense matrix * - * @param exec Executor associated to the matrix + * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate - * @param input The matrix to be duplicated. + * @param input the vector to be duplicated. */ - BatchDense(std::shared_ptr exec, size_type num_duplications, - const Dense* input) - : EnableBatchLinOp( + BatchVector(std::shared_ptr exec, + size_type num_duplications, const Dense* input) + : EnableBatchLinOp( exec, gko::batch_dim<2>(num_duplications, input->get_size())), stride_{gko::batch_stride(num_duplications, input->get_stride())}, values_(exec, compute_batch_mem(this->get_size(), stride_)) @@ -637,30 +604,30 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix with the same configuration as the callers + * Creates a BatchVector matrix with the same configuration as the callers * matrix. * - * @returns a BatchDense matrix with the same configuration as the caller. + * @returns a BatchVector matrix with the same configuration as the caller. 
*/ - virtual std::unique_ptr create_with_same_config() const + virtual std::unique_ptr create_with_same_config() const { - return BatchDense::create(this->get_executor(), this->get_size(), - this->get_stride()); + return BatchVector::create(this->get_executor(), this->get_size(), + this->get_stride()); } /** * @copydoc scale(const BatchLinOp *) * - * @note Other implementations of batch_dense should override this function - * instead of scale(const BatchLinOp *alpha). + * @note Other implementations of batch_vector should override this + * function instead of scale(const BatchLinOp *alpha). */ virtual void scale_impl(const BatchLinOp* alpha); /** * @copydoc add_scaled(const BatchLinOp *, const BatchLinOp *) * - * @note Other implementations of batch_dense should override this function - * instead of add_scale(const BatchLinOp *alpha, const BatchLinOp + * @note Other implementations of batch_vector should override this + * function instead of add_scale(const BatchLinOp *alpha, const BatchLinOp * *b). */ virtual void add_scaled_impl(const BatchLinOp* alpha, const BatchLinOp* b); @@ -668,8 +635,8 @@ class BatchDense : public EnableBatchLinOp>, /** * @copydoc compute_dot(const BatchLinOp *, BatchLinOp *) const * - * @note Other implementations of batch_dense should override this function - * instead of compute_dot(const BatchLinOp *b, BatchLinOp *result). + * @note Other implementations of batch_vector should override this + * function instead of compute_dot(const BatchLinOp *b, BatchLinOp *result). */ virtual void compute_dot_impl(const BatchLinOp* b, BatchLinOp* result) const; @@ -677,16 +644,11 @@ class BatchDense : public EnableBatchLinOp>, /** * @copydoc compute_norm2(BatchLinOp *) const * - * @note Other implementations of batch_dense should override this function - * instead of compute_norm2(BatchLinOp *result). + * @note Other implementations of batch_vector should override this + * function instead of compute_norm2(BatchLinOp *result). */ virtual void compute_norm2_impl(BatchLinOp* result) const; - void apply_impl(const BatchLinOp* b, BatchLinOp* x) const override; - - void apply_impl(const BatchLinOp* alpha, const BatchLinOp* b, - const BatchLinOp* beta, BatchLinOp* x) const override; - size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept { @@ -704,9 +666,6 @@ class BatchDense : public EnableBatchLinOp>, batch_stride stride_; array num_elems_per_batch_cumul_; array values_; - - void add_scaled_identity_impl(const BatchLinOp* a, - const BatchLinOp* b) override; }; @@ -717,7 +676,7 @@ class BatchDense : public EnableBatchLinOp>, * Creates and initializes a batch of column-vectors. * * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the matrix to the requested type. + * values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize * (Dense has to implement the ConvertibleTo interface) @@ -741,7 +700,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... 
create_args) { - using batch_dense = matrix::BatchDense; + using batch_vector = matrix::BatchVector; size_type num_batch_entries = vals.size(); std::vector num_rows(num_batch_entries); std::vector> sizes(num_batch_entries); @@ -753,7 +712,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); size_type batch = 0; for (const auto& b : vals) { size_type idx = 0; @@ -772,7 +731,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a batch of column-vectors. * * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the matrix to the requested type. The stride of + * values, and then converts the vector to the requested type. The stride of * the intermediate Dense matrix is set to 1. * * @tparam Matrix matrix type to initialize @@ -805,7 +764,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a batch of matrices. * * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the matrix to the requested type. + * values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize * (Dense has to implement the ConvertibleTo interface) @@ -813,8 +772,8 @@ std::unique_ptr batch_initialize( * (not including the implied Executor as the first argument) * * @param stride row stride for the temporary Dense matrix - * @param vals values used to initialize the matrix - * @param exec Executor associated to the matrix + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not * including the Executor, which is passed as the first * argument @@ -830,7 +789,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_dense = matrix::BatchDense; + using batch_vector = matrix::BatchVector; size_type num_batch_entries = vals.size(); std::vector num_rows(num_batch_entries); std::vector num_cols(num_batch_entries); @@ -844,7 +803,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); size_type batch = 0; for (const auto& b : vals) { size_type ridx = 0; @@ -868,7 +827,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a batch of matrices. * * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the matrix to the requested type. The stride of + * values, and then converts the vector to the requested type. The stride of * the intermediate Dense matrix is set to the number of columns of the * initializer list. 
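 *
 * A minimal sketch (illustrative only; assumes an executor handle `exec`):
 * two batch entries, each a 2x2 matrix, can be set up as
 *
 *     auto m = gko::batch_initialize<gko::matrix::BatchVector<double>>(
 *         {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}}, exec);
 *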
* @@ -877,8 +836,8 @@ std::unique_ptr batch_initialize( * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param vals values used to initialize the matrix - * @param exec Executor associated to the matrix + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not * including the Executor, which is passed as the first * argument @@ -909,7 +868,7 @@ std::unique_ptr batch_initialize( * input column vector. * * This function first creates a temporary batch dense matrix, fills it with - * passed in values, and then converts the matrix to the requested type. + * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize * (Dense has to implement the ConvertibleTo @@ -935,7 +894,7 @@ std::unique_ptr batch_initialize( std::initializer_list vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_dense = matrix::BatchDense; + using batch_vector = matrix::BatchVector; std::vector num_rows(num_vectors); std::vector> sizes(num_vectors); for (size_type b = 0; b < num_vectors; ++b) { @@ -944,7 +903,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_dense::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; for (const auto& elem : vals) { @@ -962,7 +921,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a column-vector from copies of a given vector. * * This function first creates a temporary Dense matrix, fills it with passed - * in values, and then converts the matrix to the requested type. The stride of + * in values, and then converts the vector to the requested type. The stride of * the intermediate Dense matrix is set to 1. * * @tparam Matrix matrix type to initialize @@ -997,7 +956,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a matrix from copies of a given matrix. * * This function first creates a temporary batch dense matrix, fills it with - * passed in values, and then converts the matrix to the requested type. + * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize * (Dense has to implement the ConvertibleTo interface) @@ -1023,7 +982,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_dense = matrix::BatchDense; + using batch_vector = matrix::BatchVector; std::vector> sizes(num_matrices); const size_type num_rows = vals.size(); for (size_type b = 0; b < num_matrices; ++b) { @@ -1033,7 +992,7 @@ std::unique_ptr batch_initialize( GKO_ASSERT(blockit->size() == num_cols); } } - auto tmp = batch_dense::create(exec->get_master(), sizes, stride); + auto tmp = batch_vector::create(exec->get_master(), sizes, stride); for (size_type batch = 0; batch < num_matrices; batch++) { size_type ridx = 0; for (const auto& row : vals) { @@ -1054,7 +1013,7 @@ std::unique_ptr batch_initialize( * Creates and initializes a matrix from copies of a given matrix. * * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the matrix to the requested type. The stride of + * values, and then converts the vector to the requested type. 
The stride of * the intermediate Dense matrix is set to 1. * * @tparam Matrix matrix type to initialize @@ -1090,4 +1049,4 @@ std::unique_ptr batch_initialize( } // namespace gko -#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ +#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ diff --git a/omp/matrix/batch_vector_kernels.cpp b/omp/matrix/batch_vector_kernels.cpp index 70c0794f4a8..7ade2fcca23 100644 --- a/omp/matrix/batch_vector_kernels.cpp +++ b/omp/matrix/batch_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include @@ -39,8 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include -#include #include "core/components/prefix_sum_kernels.hpp" @@ -51,70 +49,20 @@ namespace gko { namespace kernels { namespace omp { /** - * @brief The BatchDense matrix format namespace. - * @ref BatchDense - * @ingroup batch_dense + * @brief The BatchVector matrix format namespace. + * @ref BatchVector + * @ingroup batch_vector */ -namespace batch_dense { +namespace batch_vector { -#include "reference/matrix/batch_dense_kernels.hpp.inc" - - -template -void simple_apply(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const c) -{ - const auto a_ub = host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto c_ub = host::get_batch_struct(c); -#pragma omp parallel for - for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { - const auto a_b = gko::batch::batch_entry(a_ub, batch); - const auto b_b = gko::batch::batch_entry(b_ub, batch); - const auto c_b = gko::batch::batch_entry(c_ub, batch); - matvec_kernel(a_b, b_b, c_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - const matrix::BatchDense* const beta, - matrix::BatchDense* const c) -{ - const auto a_ub = host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto c_ub = host::get_batch_struct(c); - const auto alpha_ub = host::get_batch_struct(alpha); - const auto beta_ub = host::get_batch_struct(beta); -#pragma omp parallel for - for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { - const auto a_b = gko::batch::batch_entry(a_ub, batch); - const auto b_b = gko::batch::batch_entry(b_ub, batch); - const auto c_b = gko::batch::batch_entry(c_ub, batch); - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto beta_b = gko::batch::batch_entry(beta_ub, batch); - advanced_matvec_kernel(alpha_b.values[0], a_b, b_b, beta_b.values[0], - c_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); +#include "reference/matrix/batch_vector_kernels.hpp.inc" template void scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - matrix::BatchDense* const x) + const matrix::BatchVector* const alpha, + matrix::BatchVector* const x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); @@ -126,14 +74,14 @@ void scale(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - matrix::BatchDense* const y) + const matrix::BatchVector* const alpha, + const matrix::BatchVector* const x, + matrix::BatchVector* const y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -147,71 +95,14 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); - - -template -void add_scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - const matrix::BatchDense* const beta, - matrix::BatchDense* const y) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto alpha_ub = host::get_batch_struct(alpha); - const auto beta_ub = host::get_batch_struct(beta); -#pragma omp parallel for - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto beta_b = gko::batch::batch_entry(beta_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scale(alpha_b, x_b, beta_b, y_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); - - -template -void convergence_add_scaled(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - matrix::BatchDense* const y, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto alpha_ub = host::get_batch_struct(alpha); -#pragma omp parallel for - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scaled(alpha_b, x_b, y_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr, - const matrix::BatchDense*, - const matrix::Diagonal*, - matrix::BatchDense*) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchDense* const x, - const matrix::BatchDense* const y, - matrix::BatchDense* const result) + const matrix::BatchVector* const x, + const matrix::BatchVector* const y, + matrix::BatchVector* const result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -226,37 +117,14 @@ void compute_dot(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); - - -template -void convergence_compute_dot(std::shared_ptr exec, - const matrix::BatchDense* const x, - const matrix::BatchDense* const y, - matrix::BatchDense* const result, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto res_ub = host::get_batch_struct(result); -#pragma omp parallel for - for (size_type 
batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - compute_dot_product(x_b, y_b, res_b, converged); - } -} - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result) + const matrix::BatchVector* const x, + matrix::BatchVector>* const result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); @@ -270,261 +138,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); - - -template -void convergence_compute_norm2( - std::shared_ptr exec, - const matrix::BatchDense* const x, - matrix::BatchDense>* const result, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto res_ub = host::get_batch_struct(result); -#pragma omp parallel for - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - compute_norm2(x_b, res_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); - - -template -void convert_to_batch_csr(std::shared_ptr exec, - const matrix::BatchDense* const source, - matrix::BatchCsr* const result) -{ - GKO_ASSERT(source->get_size().stores_equal_sizes() == true); - auto num_rows = result->get_size().at(0)[0]; - auto num_cols = result->get_size().at(0)[1]; - auto num_batches = result->get_num_batch_entries(); - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - IndexType row_nnz{}; - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(0, row, col); - row_nnz += static_cast(val != zero()); - } - row_ptrs[row] = row_nnz; - } - - components::prefix_sum(exec, row_ptrs, num_rows + 1); - -#pragma omp parallel for - for (size_type row = 0; row < num_rows; ++row) { - auto cur_ptr = row_ptrs[row]; - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(0, row, col); - if (val != zero()) { - col_idxs[cur_ptr] = static_cast(col); - ++cur_ptr; - } - } - } - -#pragma omp parallel for - for (size_type batch = 0; batch < num_batches; ++batch) { - size_type cur_ptr = - batch * row_ptrs[num_rows]; // as row_ptrs[num_rows] is the num of - // non zero elements in the matrix - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(batch, row, col); - if (val != zero()) { - values[cur_ptr] = val; - ++cur_ptr; - } - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_AND_INT32_INDEX( - GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::BatchDense* const source, - size_type* const result) -{ -#pragma omp parallel for - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - size_type num_nonzeros = 
0; - - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += static_cast( - source->at(batch, row, col) != zero()); - } - } - result[batch] = num_nonzeros; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row( - std::shared_ptr, - const matrix::BatchDense* const source, size_type* const result) -{ -#pragma omp parallel for - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - size_type num_stored_elements_per_row = 0; - size_type num_nonzeros = 0; - - for (size_type row = 0; row < num_rows; ++row) { - num_nonzeros = 0; - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += static_cast( - source->at(batch, row, col) != zero()); - } - num_stored_elements_per_row = - std::max(num_nonzeros, num_stored_elements_per_row); - } - result[batch] = num_stored_elements_per_row; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row( - std::shared_ptr, - const matrix::BatchDense* const source, - array* const result) -{ - size_type cumul_prev_rows = 0; - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - auto row_nnz_val = result->get_data() + cumul_prev_rows; - -#pragma omp parallel for reduction(+ : cumul_prev_rows) - for (size_type row = 0; row < num_rows; ++row) { - size_type num_nonzeros = 0; - - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += static_cast( - source->at(batch, row, col) != zero()); - } - row_nnz_val[row] = num_nonzeros; - ++cumul_prev_rows; - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr, - const matrix::BatchDense* const source, - size_type* const result, - const size_type* const stride_factor, - const size_type* const slice_size) -{ -#pragma omp parallel for - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - auto slice_num = ceildiv(num_rows, slice_size[batch]); - size_type total_cols = 0; - size_type temp = 0; - size_type slice_temp = 0; - - for (size_type slice = 0; slice < slice_num; slice++) { - slice_temp = 0; - for (size_type row = 0; row < slice_size[batch] && - row + slice * slice_size[batch] < num_rows; - row++) { - temp = 0; - for (size_type col = 0; col < num_cols; col++) { - temp += static_cast( - source->at(batch, row + slice * slice_size[batch], - col) != zero()); - } - slice_temp = (slice_temp < temp) ? 
temp : slice_temp; - } - slice_temp = ceildiv(slice_temp, stride_factor[batch]) * - stride_factor[batch]; - total_cols += slice_temp; - } - result[batch] = total_cols; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - -template -void transpose(std::shared_ptr, - const matrix::BatchDense* const orig, - matrix::BatchDense* const trans) -{ -#pragma omp parallel for - for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { - for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { - for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { - trans->at(batch, j, i) = orig->at(batch, i, j); - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr, - const matrix::BatchDense* const orig, - matrix::BatchDense* const trans) -{ -#pragma omp parallel for - for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { - for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { - for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { - trans->at(batch, j, i) = conj(orig->at(batch, i, j)); - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result) + const matrix::BatchVector* x, + matrix::BatchVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); @@ -536,79 +156,10 @@ void copy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); - - -template -void convergence_copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto result_ub = host::get_batch_struct(result); -#pragma omp parallel for - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto result_b = gko::batch::batch_entry(result_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - copy(x_b, result_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); - - -template -void batch_scale(std::shared_ptr exec, - const matrix::BatchDiagonal* const left, - const matrix::BatchDiagonal* const rght, - matrix::BatchDense* const vecs) -{ - const auto left_vals = left->get_const_values(); - const auto rght_vals = rght->get_const_values(); - const auto v_vals = vecs->get_values(); - const auto nrows = static_cast(vecs->get_size().at(0)[0]); - const auto ncols = static_cast(vecs->get_size().at(0)[1]); - const auto vstride = vecs->get_stride().at(0); -#pragma omp parallel for - for (size_type batch = 0; batch < vecs->get_num_batch_entries(); ++batch) { - const auto left_b = - gko::batch::batch_entry_ptr(left_vals, 1, nrows, batch); - const auto rght_b = - gko::batch::batch_entry_ptr(rght_vals, 1, ncols, batch); - const auto v_b = - gko::batch::batch_entry_ptr(v_vals, vstride, nrows, batch); - batch_scale(nrows, ncols, vstride, left_b, rght_b, v_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const mtx) -{ - const auto a_ub = 
host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto mtx_ub = host::get_batch_struct(mtx); -#pragma omp parallel for - for (size_type batch = 0; batch < mtx->get_num_batch_entries(); ++batch) { - auto a_b = gko::batch::batch_entry(a_ub, batch); - auto b_b = gko::batch::batch_entry(b_ub, batch); - auto mtx_b = gko::batch::batch_entry(mtx_ub, batch); - add_scaled_identity(a_b.values[0], b_b.values[0], mtx_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); -} // namespace batch_dense +} // namespace batch_vector } // namespace omp } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp new file mode 100644 index 00000000000..0c07956d9d6 --- /dev/null +++ b/reference/matrix/batch_struct.hpp @@ -0,0 +1,120 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ + + +#include "core/matrix/batch_struct.hpp" + + +#include +#include + + +namespace gko { +namespace kernels { +/** + * @brief A namespace for shared functionality between omp and reference + * executors. + */ +namespace host { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices. 
+ */ +template +inline gko::batch_vector::UniformBatch get_batch_struct( + const matrix::BatchVector* const op) +{ + return { + op->get_const_values(), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_vector::UniformBatch get_batch_struct( + matrix::BatchVector* const op) +{ + return { + op->get_values(), + op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1]), + static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; +} + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices + * that may be null. + */ +template +inline gko::batch_vector::UniformBatch maybe_null_batch_struct( + const matrix::BatchVector* const op) +{ + if (op) { + return {op->get_const_values(), op->get_num_batch_entries(), + op->get_stride().at(0), + static_cast(op->get_size().at(0)[0]), + static_cast(op->get_size().at(0)[1])}; + } else { + return {nullptr, 0, 0, 0, 0}; + } +} + + +} // namespace host +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ diff --git a/reference/matrix/batch_vector_kernels.cpp b/reference/matrix/batch_vector_kernels.cpp index 8e9e857cc5b..01748c6e524 100644 --- a/reference/matrix/batch_vector_kernels.cpp +++ b/reference/matrix/batch_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include @@ -39,8 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include -#include #include "core/matrix/batch_struct.hpp" @@ -51,68 +49,20 @@ namespace gko { namespace kernels { namespace reference { /** - * @brief The BatchDense matrix format namespace. - * @ref BatchDense - * @ingroup batch_dense + * @brief The BatchVector matrix format namespace. 
+ * @ref BatchVector + * @ingroup batch_vector */ -namespace batch_dense { +namespace batch_vector { -#include "reference/matrix/batch_dense_kernels.hpp.inc" +#include "reference/matrix/batch_vector_kernels.hpp.inc" template -void simple_apply(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const c) -{ - const auto a_ub = host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto c_ub = host::get_batch_struct(c); - for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { - const auto a_b = gko::batch::batch_entry(a_ub, batch); - const auto b_b = gko::batch::batch_entry(b_ub, batch); - const auto c_b = gko::batch::batch_entry(c_ub, batch); - matvec_kernel(a_b, b_b, c_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - const matrix::BatchDense* const beta, - matrix::BatchDense* const c) -{ - const auto a_ub = host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto c_ub = host::get_batch_struct(c); - const auto alpha_ub = host::get_batch_struct(alpha); - const auto beta_ub = host::get_batch_struct(beta); - for (size_type batch = 0; batch < c->get_num_batch_entries(); ++batch) { - const auto a_b = gko::batch::batch_entry(a_ub, batch); - const auto b_b = gko::batch::batch_entry(b_ub, batch); - const auto c_b = gko::batch::batch_entry(c_ub, batch); - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto beta_b = gko::batch::batch_entry(beta_ub, batch); - advanced_matvec_kernel(alpha_b.values[0], a_b, b_b, beta_b.values[0], - c_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_APPLY_KERNEL); - - -template -void scale(std::shared_ptr exec, - const matrix::BatchDense* alpha, - matrix::BatchDense* x) +void scale(std::shared_ptr exec, + const matrix::BatchVector* alpha, + matrix::BatchVector* x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); @@ -123,14 +73,14 @@ void scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); template -void add_scaled(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::BatchDense* x, - matrix::BatchDense* y) +void add_scaled(std::shared_ptr exec, + const matrix::BatchVector* alpha, + const matrix::BatchVector* x, + matrix::BatchVector* y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -143,77 +93,14 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALED_KERNEL); - - -template -void add_scale(std::shared_ptr exec, - const matrix::BatchDense* const alpha, - const matrix::BatchDense* const x, - const matrix::BatchDense* const beta, - matrix::BatchDense* const y) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto alpha_ub = host::get_batch_struct(alpha); - const auto beta_ub = host::get_batch_struct(beta); - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto beta_b = gko::batch::batch_entry(beta_ub, batch); - const 
auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scale(alpha_b, x_b, beta_b, y_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADD_SCALE_KERNEL); - - -template -void convergence_add_scaled(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::BatchDense* x, - matrix::BatchDense* y, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto alpha_ub = host::get_batch_struct(alpha); - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scaled(alpha_b, x_b, y_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); template -void add_scaled_diag(std::shared_ptr exec, - const matrix::BatchDense* alpha, - const matrix::Diagonal* x, - matrix::BatchDense* y) GKO_NOT_IMPLEMENTED; -// { -// for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { -// const auto diag_values = x->get_const_values(); -// for (size_type i = 0; i < x->get_size().at(batch)[0]; i++) { -// y->at(batch,i, i) += alpha->at(batch,0, 0) * diag_values[i]; -// } -// } -// } - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_DIAG_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result) +void compute_dot(std::shared_ptr exec, + const matrix::BatchVector* x, + const matrix::BatchVector* y, + matrix::BatchVector* result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -227,36 +114,14 @@ void compute_dot(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COMPUTE_DOT_KERNEL); - - -template -void convergence_compute_dot(std::shared_ptr exec, - const matrix::BatchDense* x, - const matrix::BatchDense* y, - matrix::BatchDense* result, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto y_ub = host::get_batch_struct(y); - const auto res_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); - compute_dot_product(x_b, y_b, res_b, converged); - } -} - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); template -void compute_norm2(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense>* result) +void compute_norm2(std::shared_ptr exec, + const matrix::BatchVector* x, + matrix::BatchVector>* result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); @@ -269,232 +134,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COMPUTE_NORM2_KERNEL); - - -template -void convergence_compute_norm2( - std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense>* result, - const uint32& converged) -{ - const auto 
x_ub = host::get_batch_struct(x); - const auto res_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - compute_norm2(x_b, res_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COMPUTE_NORM2_KERNEL); - - -template -void convert_to_batch_csr(std::shared_ptr exec, - const matrix::BatchDense* source, - matrix::BatchCsr* result) -{ - GKO_ASSERT(source->get_size().stores_equal_sizes() == true); - auto num_rows = result->get_size().at(0)[0]; - auto num_cols = result->get_size().at(0)[1]; - auto num_batch_entries = result->get_num_batch_entries(); - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - size_type cur_ptr = 0; - row_ptrs[0] = cur_ptr; - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(0, row, col); - if (val != zero()) { - col_idxs[cur_ptr] = col; - ++cur_ptr; - } - } - row_ptrs[row + 1] = cur_ptr; - } - - cur_ptr = 0; - for (size_type batch = 0; batch < num_batch_entries; ++batch) { - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - auto val = source->at(batch, row, col); - if (val != zero()) { - values[cur_ptr] = val; - ++cur_ptr; - } - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_AND_INT32_INDEX( - GKO_DECLARE_BATCH_DENSE_CONVERT_TO_BATCH_CSR_KERNEL); - - -template -void count_nonzeros(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) -{ - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - auto num_nonzeros = 0; - - for (size_type row = 0; row < num_rows; ++row) { - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += - (source->at(batch, row, col) != zero()); - } - } - result[batch] = num_nonzeros; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_COUNT_NONZEROS_KERNEL); - - -template -void calculate_max_nnz_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - size_type* result) -{ - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - size_type num_stored_elements_per_row = 0; - size_type num_nonzeros = 0; - for (size_type row = 0; row < num_rows; ++row) { - num_nonzeros = 0; - for (size_type col = 0; col < num_cols; ++col) { - num_nonzeros += - (source->at(batch, row, col) != zero()); - } - num_stored_elements_per_row = - std::max(num_nonzeros, num_stored_elements_per_row); - } - result[batch] = num_stored_elements_per_row; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void calculate_nonzeros_per_row(std::shared_ptr exec, - const matrix::BatchDense* source, - array* result) -{ - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - auto row_nnz_val = result->get_data(); - size_type offset = 0; - for (size_type row = 0; row < num_rows; ++row) { - size_type num_nonzeros = 0; - for (size_type col = 0; 
col < num_cols; ++col) { - num_nonzeros += - (source->at(batch, row, col) != zero()); - } - row_nnz_val[offset + row] = num_nonzeros; - ++offset; - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); - - -template -void calculate_total_cols(std::shared_ptr exec, - const matrix::BatchDense* const source, - size_type* const result, - const size_type* const stride_factor, - const size_type* const slice_size) -{ - for (size_type batch = 0; batch < source->get_num_batch_entries(); - ++batch) { - auto num_rows = source->get_size().at(batch)[0]; - auto num_cols = source->get_size().at(batch)[1]; - auto slice_num = ceildiv(num_rows, slice_size[batch]); - auto total_cols = 0; - auto temp = 0, slice_temp = 0; - for (size_type slice = 0; slice < slice_num; slice++) { - slice_temp = 0; - for (size_type row = 0; row < slice_size[batch] && - row + slice * slice_size[batch] < num_rows; - row++) { - temp = 0; - for (size_type col = 0; col < num_cols; col++) { - temp += (source->at(batch, row + slice * slice_size[batch], - col) != zero()); - } - slice_temp = (slice_temp < temp) ? temp : slice_temp; - } - slice_temp = ceildiv(slice_temp, stride_factor[batch]) * - stride_factor[batch]; - total_cols += slice_temp; - } - result[batch] = total_cols; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CALCULATE_TOTAL_COLS_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::BatchDense* const orig, - matrix::BatchDense* const trans) -{ - for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { - for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { - for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { - trans->at(batch, j, i) = orig->at(batch, i, j); - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::BatchDense* orig, - matrix::BatchDense* trans) -{ - for (size_type batch = 0; batch < orig->get_num_batch_entries(); ++batch) { - for (size_type i = 0; i < orig->get_size().at(batch)[0]; ++i) { - for (size_type j = 0; j < orig->get_size().at(batch)[1]; ++j) { - trans->at(batch, j, i) = conj(orig->at(batch, i, j)); - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONJ_TRANSPOSE_KERNEL); + GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result) + const matrix::BatchVector* x, + matrix::BatchVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); @@ -505,76 +151,10 @@ void copy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_COPY_KERNEL); - - -template -void convergence_copy(std::shared_ptr exec, - const matrix::BatchDense* x, - matrix::BatchDense* result, - const uint32& converged) -{ - const auto x_ub = host::get_batch_struct(x); - const auto result_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto result_b = gko::batch::batch_entry(result_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - copy(x_b, result_b, converged); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_CONVERGENCE_COPY_KERNEL); - - -template -void batch_scale(std::shared_ptr exec, - const matrix::BatchDiagonal* const left, - const matrix::BatchDiagonal* 
const rght, - matrix::BatchDense* const vecs) -{ - const auto left_vals = left->get_const_values(); - const auto rght_vals = rght->get_const_values(); - const auto v_vals = vecs->get_values(); - const auto nrows = static_cast(vecs->get_size().at(0)[0]); - const auto ncols = static_cast(vecs->get_size().at(0)[1]); - const auto vstride = vecs->get_stride().at(0); - for (size_type batch = 0; batch < vecs->get_num_batch_entries(); ++batch) { - const auto left_b = - gko::batch::batch_entry_ptr(left_vals, 1, nrows, batch); - const auto rght_b = - gko::batch::batch_entry_ptr(rght_vals, 1, ncols, batch); - const auto v_b = - gko::batch::batch_entry_ptr(v_vals, vstride, nrows, batch); - batch_scale(nrows, ncols, vstride, left_b, rght_b, v_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_BATCH_SCALE_KERNEL); - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::BatchDense* const a, - const matrix::BatchDense* const b, - matrix::BatchDense* const mtx) -{ - const auto a_ub = host::get_batch_struct(a); - const auto b_ub = host::get_batch_struct(b); - const auto mtx_ub = host::get_batch_struct(mtx); - for (size_type batch = 0; batch < mtx->get_num_batch_entries(); ++batch) { - auto a_b = gko::batch::batch_entry(a_ub, batch); - auto b_b = gko::batch::batch_entry(b_ub, batch); - auto mtx_b = gko::batch::batch_entry(mtx_ub, batch); - add_scaled_identity(a_b.values[0], b_b.values[0], mtx_b); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); -} // namespace batch_dense +} // namespace batch_vector } // namespace reference } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_vector_kernels.hpp.inc b/reference/matrix/batch_vector_kernels.hpp.inc index db828206239..eb4a8cfab2a 100644 --- a/reference/matrix/batch_vector_kernels.hpp.inc +++ b/reference/matrix/batch_vector_kernels.hpp.inc @@ -32,9 +32,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
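// The per-entry kernels in the hunk that follows all operate on a small
// BatchEntry-style view (a values pointer plus stride, num_rows and num_rhs).
// As a rough, self-contained sketch of the layout these kernels appear to
// assume (row-major entries stored back to back); the names batch_entry_view,
// entry_at and uniform_batch_entry are illustrative only, not Ginkgo types:

#include <vector>

struct batch_entry_view {
    double* values;  // first element of this batch entry
    int stride;      // distance between consecutive rows
    int num_rows;
    int num_rhs;     // number of columns / right-hand sides
};

// Element (row, col) of one entry under the assumed row-major layout.
inline double& entry_at(const batch_entry_view& e, int row, int col)
{
    return e.values[row * e.stride + col];
}

// View onto entry `b` of a uniform batch stored contiguously, mirroring the
// batch_entry_ptr(values, stride, num_rows, batch) calls used above.
inline batch_entry_view uniform_batch_entry(std::vector<double>& batch_values,
                                            int stride, int num_rows,
                                            int num_rhs, int b)
{
    return {batch_values.data() + b * stride * num_rows, stride, num_rows,
            num_rhs};
}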
template inline void matvec_kernel( - const gko::batch_dense::BatchEntry& a, - const gko::batch_dense::BatchEntry& b, - const gko::batch_dense::BatchEntry& c) + const gko::batch_vector::BatchEntry& a, + const gko::batch_vector::BatchEntry& b, + const gko::batch_vector::BatchEntry& c) { for (int row = 0; row < c.num_rows; ++row) { for (int col = 0; col < c.num_rhs; ++col) { @@ -57,9 +57,9 @@ inline void matvec_kernel( template inline void advanced_matvec_kernel( const ValueType alpha, - const gko::batch_dense::BatchEntry& a, - const gko::batch_dense::BatchEntry& b, - const ValueType beta, const gko::batch_dense::BatchEntry& c) + const gko::batch_vector::BatchEntry& a, + const gko::batch_vector::BatchEntry& b, + const ValueType beta, const gko::batch_vector::BatchEntry& c) { if (beta != gko::zero()) { for (int row = 0; row < c.num_rows; ++row) { @@ -88,8 +88,8 @@ inline void advanced_matvec_kernel( template -inline void scale(const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x) +inline void scale(const gko::batch_vector::BatchEntry& alpha, + const gko::batch_vector::BatchEntry& x) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -109,9 +109,9 @@ inline void scale(const gko::batch_dense::BatchEntry& alpha, template inline void add_scaled( - const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& y) + const gko::batch_vector::BatchEntry& alpha, + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -133,10 +133,10 @@ inline void add_scaled( template inline void add_scale( - const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& beta, - const gko::batch_dense::BatchEntry& y) + const gko::batch_vector::BatchEntry& alpha, + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry& beta, + const gko::batch_vector::BatchEntry& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -160,8 +160,8 @@ inline void add_scale( template inline void compute_norm2( - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry>& result) + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry>& result) { for (int j = 0; j < x.num_rhs; ++j) { result.values[j] = gko::zero>(); @@ -185,8 +185,8 @@ inline void compute_norm2( */ template inline void batch_scale( - const gko::batch_dense::BatchEntry& diag_vec, - const gko::batch_dense::BatchEntry& a) + const gko::batch_vector::BatchEntry& diag_vec, + const gko::batch_vector::BatchEntry& a) { for (int i_row = 0; i_row < a.num_rows; i_row++) { const ValueType scale = diag_vec.values[i_row]; @@ -217,8 +217,8 @@ inline void batch_scale(const int nrows, const int ncols, * and stride set. 
*/ template -inline void copy(const gko::batch_dense::BatchEntry& in, - const gko::batch_dense::BatchEntry& out) +inline void copy(const gko::batch_vector::BatchEntry& in, + const gko::batch_vector::BatchEntry& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; @@ -230,9 +230,9 @@ inline void copy(const gko::batch_dense::BatchEntry& in, template inline void compute_dot_product( - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& y, - const gko::batch_dense::BatchEntry& result) + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry& y, + const gko::batch_vector::BatchEntry& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -249,8 +249,8 @@ inline void compute_dot_product( template inline void copy( - const gko::batch_dense::BatchEntry& source_entry, - const gko::batch_dense::BatchEntry& destination_entry, + const gko::batch_vector::BatchEntry& source_entry, + const gko::batch_vector::BatchEntry& destination_entry, const gko::uint32& converged) { for (int r = 0; r < source_entry.num_rows; r++) { @@ -270,9 +270,9 @@ inline void copy( template inline void add_scaled( - const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& y, + const gko::batch_vector::BatchEntry& alpha, + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry& y, const gko::uint32& converged) { if (alpha.num_rhs == 1) { @@ -308,8 +308,8 @@ inline void add_scaled( template inline void compute_norm2( - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry>& result, + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry>& result, const gko::uint32& converged) { for (int j = 0; j < x.num_rhs; ++j) { @@ -346,9 +346,9 @@ inline void compute_norm2( template inline void compute_dot_product( - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& y, - const gko::batch_dense::BatchEntry& result, + const gko::batch_vector::BatchEntry& x, + const gko::batch_vector::BatchEntry& y, + const gko::batch_vector::BatchEntry& result, const gko::uint32& converged) { for (int c = 0; c < result.num_rhs; c++) { @@ -379,7 +379,7 @@ inline void compute_dot_product( template inline void add_scaled_identity( const ValueType& a, const ValueType& b, - const gko::batch_dense::BatchEntry& mat) + const gko::batch_vector::BatchEntry& mat) { for (int i = 0; i < mat.num_rows; i++) { for (int j = 0; j < mat.num_rhs; j++) { diff --git a/reference/test/matrix/batch_vector_kernels.cpp b/reference/test/matrix/batch_vector_kernels.cpp index 6e1a6c2f8e1..e8aaad8d584 100644 --- a/reference/test/matrix/batch_vector_kernels.cpp +++ b/reference/test/matrix/batch_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
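// The convergence_* kernels above carry an extra 32-bit mask; assuming, as the
// kernel names and the Convergence* tests in the file below suggest, that a
// set bit marks a right-hand side that has already converged and must not be
// updated, the masking logic can be sketched as follows (masked_add_scaled is
// an illustrative name, not a library function):

#include <cstdint>
#include <vector>

void masked_add_scaled(const double alpha, const std::vector<double>& x,
                       std::vector<double>& y, const int num_rows,
                       const int num_rhs, const std::uint32_t converged)
{
    for (int row = 0; row < num_rows; ++row) {
        for (int col = 0; col < num_rhs; ++col) {
            if (converged & (1u << col)) {
                continue;  // right-hand side `col` already converged: skip it
            }
            y[row * num_rhs + col] += alpha * x[row * num_rhs + col];
        }
    }
}

// The tests below build their masks as 0xfffffffd | (0 - (1u << num_rhs));
// for the small num_rhs values used there (2 and 3) this clears only bit 1,
// i.e. only the second right-hand side is still treated as unconverged.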
#include -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include "core/test/utils.hpp" @@ -58,15 +58,15 @@ namespace { template -class BatchDense : public ::testing::Test { +class BatchVector : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::matrix::BatchDense; + using Mtx = gko::matrix::BatchVector; using DenseMtx = gko::matrix::Dense; using ComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; - BatchDense() + BatchVector() : exec(gko::ReferenceExecutor::create()), mtx_0(gko::batch_initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, @@ -138,10 +138,10 @@ class BatchDense : public ::testing::Test { }; -TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); +TYPED_TEST_SUITE(BatchVector, gko::test::ValueTypes); -TYPED_TEST(BatchDense, AppliesToBatchDense) +TYPED_TEST(BatchVector, AppliesToBatchVector) { using T = typename TestFixture::value_type; this->mtx_1->apply(this->mtx_2.get(), this->mtx_3.get()); @@ -155,7 +155,7 @@ TYPED_TEST(BatchDense, AppliesToBatchDense) } -TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchDense) +TYPED_TEST(BatchVector, AppliesLinearCombinationToBatchVector) { using Mtx = typename TestFixture::Mtx; using DenseMtx = typename TestFixture::DenseMtx; @@ -180,7 +180,7 @@ TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchDense) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(BatchVector, ApplyFailsOnWrongInnerDimension) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -191,7 +191,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) } -TYPED_TEST(BatchDense, ApplyFailsForNonUniformBatches) +TYPED_TEST(BatchVector, ApplyFailsForNonUniformBatches) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -212,7 +212,7 @@ TYPED_TEST(BatchDense, ApplyFailsForNonUniformBatches) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfRows) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -223,7 +223,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfRows) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfCols) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -237,7 +237,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfCols) } -TYPED_TEST(BatchDense, ScalesData) +TYPED_TEST(BatchVector, ScalesData) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -257,7 +257,7 @@ TYPED_TEST(BatchDense, ScalesData) } -TYPED_TEST(BatchDense, ScalesDataWithScalar) +TYPED_TEST(BatchVector, ScalesDataWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -275,7 +275,7 @@ TYPED_TEST(BatchDense, ScalesDataWithScalar) } -TYPED_TEST(BatchDense, ScalesDataWithStride) +TYPED_TEST(BatchVector, ScalesDataWithStride) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -294,7 +294,7 @@ TYPED_TEST(BatchDense, ScalesDataWithStride) } -TYPED_TEST(BatchDense, AddsScaled) +TYPED_TEST(BatchVector, AddsScaled) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -313,7 +313,7 @@ TYPED_TEST(BatchDense, AddsScaled) } -TYPED_TEST(BatchDense, AddsScale) +TYPED_TEST(BatchVector, AddsScale) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -337,7 +337,7 @@ 
TYPED_TEST(BatchDense, AddsScale) } -TYPED_TEST(BatchDense, ConvergenceAddScaled) +TYPED_TEST(BatchVector, ConvergenceAddScaled) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -350,7 +350,7 @@ TYPED_TEST(BatchDense, ConvergenceAddScaled) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_dense::convergence_add_scaled( + gko::kernels::reference::batch_vector::convergence_add_scaled( this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), converged); @@ -378,7 +378,7 @@ TYPED_TEST(BatchDense, ConvergenceAddScaled) } -TYPED_TEST(BatchDense, AddsScaledWithScalar) +TYPED_TEST(BatchVector, AddsScaledWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -396,7 +396,7 @@ TYPED_TEST(BatchDense, AddsScaledWithScalar) } -TYPED_TEST(BatchDense, AddsScaleWithScalar) +TYPED_TEST(BatchVector, AddsScaleWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -418,7 +418,7 @@ TYPED_TEST(BatchDense, AddsScaleWithScalar) } -TYPED_TEST(BatchDense, AddScaleWithScalarViaApply) +TYPED_TEST(BatchVector, AddScaleWithScalarViaApply) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -441,7 +441,7 @@ TYPED_TEST(BatchDense, AddScaleWithScalarViaApply) } -TYPED_TEST(BatchDense, ConvergenceAddScaledWithScalar) +TYPED_TEST(BatchVector, ConvergenceAddScaledWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -453,7 +453,7 @@ TYPED_TEST(BatchDense, ConvergenceAddScaledWithScalar) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_dense::convergence_add_scaled( + gko::kernels::reference::batch_vector::convergence_add_scaled( this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), converged); @@ -481,7 +481,7 @@ TYPED_TEST(BatchDense, ConvergenceAddScaledWithScalar) } -TYPED_TEST(BatchDense, AddScaledFailsOnWrongSizes) +TYPED_TEST(BatchVector, AddScaledFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = @@ -492,7 +492,7 @@ TYPED_TEST(BatchDense, AddScaledFailsOnWrongSizes) } -TYPED_TEST(BatchDense, AddScaleFailsOnWrongSizes) +TYPED_TEST(BatchVector, AddScaleFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); @@ -504,7 +504,7 @@ TYPED_TEST(BatchDense, AddScaleFailsOnWrongSizes) } -TYPED_TEST(BatchDense, AddScaleFailsOnWrongScalarSizes) +TYPED_TEST(BatchVector, AddScaleFailsOnWrongScalarSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = gko::batch_initialize( @@ -517,7 +517,7 @@ TYPED_TEST(BatchDense, AddScaleFailsOnWrongScalarSizes) } -TYPED_TEST(BatchDense, ComputesDot) +TYPED_TEST(BatchVector, ComputesDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -536,7 +536,7 @@ TYPED_TEST(BatchDense, ComputesDot) } -TYPED_TEST(BatchDense, ConvergenceComputeDot) +TYPED_TEST(BatchVector, ConvergenceComputeDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -555,7 +555,7 @@ TYPED_TEST(BatchDense, ConvergenceComputeDot) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_dense::convergence_compute_dot( + gko::kernels::reference::batch_vector::convergence_compute_dot( this->exec, this->mtx_0.get(), this->mtx_1.get(), 
result.get(), converged); @@ -577,12 +577,12 @@ TYPED_TEST(BatchDense, ConvergenceComputeDot) } -TYPED_TEST(BatchDense, ComputesNorm2) +TYPED_TEST(BatchVector, ComputesNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using T_nc = gko::remove_complex; - using NormVector = gko::matrix::BatchDense; + using NormVector = gko::matrix::BatchVector; auto mtx(gko::batch_initialize( {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, @@ -601,12 +601,12 @@ TYPED_TEST(BatchDense, ComputesNorm2) } -TYPED_TEST(BatchDense, ConvergenceComputeNorm2) +TYPED_TEST(BatchVector, ConvergenceComputeNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using T_nc = gko::remove_complex; - using NormVector = gko::matrix::BatchDense; + using NormVector = gko::matrix::BatchVector; auto mtx(gko::batch_initialize( {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, @@ -628,7 +628,7 @@ TYPED_TEST(BatchDense, ConvergenceComputeNorm2) const int num_rhs = 2; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_dense::convergence_compute_norm2( + gko::kernels::reference::batch_vector::convergence_compute_norm2( this->exec, mtx.get(), result.get(), converged); EXPECT_EQ(result->at(0, 0, 0), result_clone->at(0, 0, 0)); @@ -639,7 +639,7 @@ TYPED_TEST(BatchDense, ConvergenceComputeNorm2) } -TYPED_TEST(BatchDense, ComputDotFailsOnWrongInputSize) +TYPED_TEST(BatchVector, ComputDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -651,7 +651,7 @@ TYPED_TEST(BatchDense, ComputDotFailsOnWrongInputSize) } -TYPED_TEST(BatchDense, ComputDotFailsOnWrongResultSize) +TYPED_TEST(BatchVector, ComputDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -667,22 +667,22 @@ TYPED_TEST(BatchDense, ComputDotFailsOnWrongResultSize) } -TYPED_TEST(BatchDense, CopiesData) +TYPED_TEST(BatchVector, CopiesData) { - gko::kernels::reference::batch_dense::copy(this->exec, this->mtx_0.get(), - this->mtx_1.get()); + gko::kernels::reference::batch_vector::copy(this->exec, this->mtx_0.get(), + this->mtx_1.get()); GKO_ASSERT_BATCH_MTX_NEAR(this->mtx_1.get(), this->mtx_0.get(), 0.); } -TYPED_TEST(BatchDense, ConvergenceCopyData) +TYPED_TEST(BatchVector, ConvergenceCopyData) { auto umtx_0 = this->mtx_0->unbatch(); const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_dense::convergence_copy( + gko::kernels::reference::batch_vector::convergence_copy( this->exec, this->mtx_0.get(), this->mtx_1.get(), converged); auto mtx_10_clone = gko::clone(this->mtx_10); @@ -706,7 +706,7 @@ TYPED_TEST(BatchDense, ConvergenceCopyData) } -TYPED_TEST(BatchDense, BatchScale) +TYPED_TEST(BatchVector, BatchScale) { using T = typename TestFixture::value_type; using Mtx = typename TestFixture::Mtx; @@ -722,8 +722,8 @@ TYPED_TEST(BatchDense, BatchScale) auto rght(gko::batch_diagonal_initialize( I>{I{-0.5, -2.0}, I{2.0, 0.25}}, this->exec)); - gko::kernels::reference::batch_dense::batch_scale(this->exec, left.get(), - rght.get(), mtx.get()); + gko::kernels::reference::batch_vector::batch_scale(this->exec, left.get(), + rght.get(), mtx.get()); EXPECT_EQ(mtx->at(0, 0, 0), T{-0.5}); EXPECT_EQ(mtx->at(0, 1, 0), T{-2.0}); @@ -741,14 +741,14 @@ TYPED_TEST(BatchDense, BatchScale) } -TYPED_TEST(BatchDense, ConvertsToPrecision) +TYPED_TEST(BatchVector, ConvertsToPrecision) { - 
using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchDense = typename gko::matrix::BatchDense; - auto tmp = OtherBatchDense::create(this->exec); - auto res = BatchDense::create(this->exec); + using OtherBatchVector = typename gko::matrix::BatchVector; + auto tmp = OtherBatchVector::create(this->exec); + auto res = BatchVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} @@ -764,14 +764,14 @@ TYPED_TEST(BatchDense, ConvertsToPrecision) } -TYPED_TEST(BatchDense, MovesToPrecision) +TYPED_TEST(BatchVector, MovesToPrecision) { - using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchDense = typename gko::matrix::BatchDense; - auto tmp = OtherBatchDense::create(this->exec); - auto res = BatchDense::create(this->exec); + using OtherBatchVector = typename gko::matrix::BatchVector; + auto tmp = OtherBatchVector::create(this->exec); + auto res = BatchVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} @@ -787,7 +787,7 @@ TYPED_TEST(BatchDense, MovesToPrecision) } -TYPED_TEST(BatchDense, ConvertsToCsr32) +TYPED_TEST(BatchVector, ConvertsToCsr32) { using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; @@ -824,7 +824,7 @@ TYPED_TEST(BatchDense, ConvertsToCsr32) } -TYPED_TEST(BatchDense, MovesToCsr32) +TYPED_TEST(BatchVector, MovesToCsr32) { using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; @@ -861,14 +861,14 @@ TYPED_TEST(BatchDense, MovesToCsr32) } -TYPED_TEST(BatchDense, ConvertsEmptyToPrecision) +TYPED_TEST(BatchVector, ConvertsEmptyToPrecision) { - using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchDense = typename gko::matrix::BatchDense; - auto empty = OtherBatchDense::create(this->exec); - auto res = BatchDense::create(this->exec); + using OtherBatchVector = typename gko::matrix::BatchVector; + auto empty = OtherBatchVector::create(this->exec); + auto res = BatchVector::create(this->exec); empty->convert_to(res.get()); @@ -876,14 +876,14 @@ TYPED_TEST(BatchDense, ConvertsEmptyToPrecision) } -TYPED_TEST(BatchDense, MovesEmptyToPrecision) +TYPED_TEST(BatchVector, MovesEmptyToPrecision) { - using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchDense = typename gko::matrix::BatchDense; - auto empty = OtherBatchDense::create(this->exec); - auto res = BatchDense::create(this->exec); + using OtherBatchVector = typename gko::matrix::BatchVector; + auto empty = OtherBatchVector::create(this->exec); + auto res = BatchVector::create(this->exec); empty->move_to(res.get()); @@ -891,12 +891,12 @@ TYPED_TEST(BatchDense, MovesEmptyToPrecision) } -TYPED_TEST(BatchDense, ConvertsEmptyMatrixToCsr) +TYPED_TEST(BatchVector, ConvertsEmptyMatrixToCsr) { - using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; 
using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchDense::create(this->exec); + auto empty = BatchVector::create(this->exec); auto res = BatchCsr::create(this->exec); empty->convert_to(res.get()); @@ -907,12 +907,12 @@ TYPED_TEST(BatchDense, ConvertsEmptyMatrixToCsr) } -TYPED_TEST(BatchDense, MovesEmptyMatrixToCsr) +TYPED_TEST(BatchVector, MovesEmptyMatrixToCsr) { - using BatchDense = typename TestFixture::Mtx; + using BatchVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchDense::create(this->exec); + auto empty = BatchVector::create(this->exec); auto res = BatchCsr::create(this->exec); empty->move_to(res.get()); @@ -923,7 +923,7 @@ TYPED_TEST(BatchDense, MovesEmptyMatrixToCsr) } -TYPED_TEST(BatchDense, ConvertsToBatchDiagonal) +TYPED_TEST(BatchVector, ConvertsToBatchDiagonal) { using BDense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -946,7 +946,7 @@ TYPED_TEST(BatchDense, ConvertsToBatchDiagonal) } -TYPED_TEST(BatchDense, MovesToBatchDiagonal) +TYPED_TEST(BatchVector, MovesToBatchDiagonal) { using BDense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -967,13 +967,13 @@ TYPED_TEST(BatchDense, MovesToBatchDiagonal) } -TYPED_TEST(BatchDense, SquareMatrixIsTransposable) +TYPED_TEST(BatchVector, SquareMatrixIsTransposable) { using Mtx = typename TestFixture::Mtx; auto trans = this->mtx_4->transpose(); - auto trans_as_batch_dense = static_cast(trans.get()); + auto trans_as_batch_vector = static_cast(trans.get()); - auto utb = trans_as_batch_dense->unbatch(); + auto utb = trans_as_batch_vector->unbatch(); GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 6.0}, {1.5, 1.0, 1.0}, {3.0, 5.0, 5.5}}), r::value); @@ -983,13 +983,13 @@ TYPED_TEST(BatchDense, SquareMatrixIsTransposable) } -TYPED_TEST(BatchDense, NonSquareMatrixIsTransposable) +TYPED_TEST(BatchVector, NonSquareMatrixIsTransposable) { using Mtx = typename TestFixture::Mtx; auto trans = this->mtx_5->transpose(); - auto trans_as_batch_dense = static_cast(trans.get()); + auto trans_as_batch_vector = static_cast(trans.get()); - auto utb = trans_as_batch_dense->unbatch(); + auto utb = trans_as_batch_vector->unbatch(); GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 7.0}, {1.5, 1.0, -4.5}}), r::value); GKO_ASSERT_MTX_NEAR(utb[1].get(), l({{2.0, 1.0, 4.0}, {-2.0, 3.0, 3.0}}), @@ -997,7 +997,7 @@ TYPED_TEST(BatchDense, NonSquareMatrixIsTransposable) } -TYPED_TEST(BatchDense, SquareMatrixAddScaledIdentity) +TYPED_TEST(BatchVector, SquareMatrixAddScaledIdentity) { using T = typename TestFixture::value_type; using Mtx = typename TestFixture::Mtx; diff --git a/test/matrix/batch_vector_kernels.cpp b/test/matrix/batch_vector_kernels.cpp index 5d275dbea5b..150f02a3772 100644 --- a/test/matrix/batch_vector_kernels.cpp +++ b/test/matrix/batch_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_vector_kernels.hpp" #include @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include +#include #include @@ -53,14 +53,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
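// The SquareMatrixAddScaledIdentity test above exercises the
// add_scaled_identity kernel, which appears to update one batch entry as
// M <- a * I + b * M. A minimal sketch of that update on a row-major block;
// add_scaled_identity_entry is an illustrative name, not the library kernel:

#include <vector>

void add_scaled_identity_entry(const double a, const double b,
                               std::vector<double>& mat, const int num_rows,
                               const int num_cols)
{
    for (int i = 0; i < num_rows; ++i) {
        for (int j = 0; j < num_cols; ++j) {
            double& v = mat[i * num_cols + j];
            v = b * v + (i == j ? a : 0.0);
        }
    }
}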
#ifndef GKO_COMPILING_DPCPP -class BatchDense : public CommonTestFixture { +class BatchVector : public CommonTestFixture { protected: using vtype = double; - using Mtx = gko::matrix::BatchDense; - using NormVector = gko::matrix::BatchDense>; - using ComplexMtx = gko::matrix::BatchDense>; + using Mtx = gko::matrix::BatchVector; + using NormVector = gko::matrix::BatchVector>; + using ComplexMtx = gko::matrix::BatchVector>; - BatchDense() : rand_engine(15) {} + BatchVector() : rand_engine(15) {} template std::unique_ptr gen_mtx(const size_t batchsize, int num_rows, @@ -145,7 +145,7 @@ class BatchDense : public CommonTestFixture { }; -TEST_F(BatchDense, SingleVectorAppyIsEquivalentToRef) +TEST_F(BatchVector, SingleVectorAppyIsEquivalentToRef) { set_up_apply_data(1); @@ -156,7 +156,7 @@ TEST_F(BatchDense, SingleVectorAppyIsEquivalentToRef) } -TEST_F(BatchDense, SingleVectorAdvancedAppyIsEquivalentToRef) +TEST_F(BatchVector, SingleVectorAdvancedAppyIsEquivalentToRef) { set_up_apply_data(1); @@ -167,7 +167,7 @@ TEST_F(BatchDense, SingleVectorAdvancedAppyIsEquivalentToRef) } -TEST_F(BatchDense, SingleVectorAddScaledIsEquivalentToRef) +TEST_F(BatchVector, SingleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(1); @@ -178,7 +178,7 @@ TEST_F(BatchDense, SingleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchDense, SingleVectorAddScaleIsEquivalentToRef) +TEST_F(BatchVector, SingleVectorAddScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -189,7 +189,7 @@ TEST_F(BatchDense, SingleVectorAddScaleIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorAddScaledIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(20); @@ -200,7 +200,7 @@ TEST_F(BatchDense, MultipleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorAddScaleIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorAddScaleIsEquivalentToRef) { set_up_vector_data(20); @@ -211,7 +211,7 @@ TEST_F(BatchDense, MultipleVectorAddScaleIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -222,7 +222,7 @@ TEST_F(BatchDense, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) { set_up_vector_data(20, true); @@ -233,7 +233,7 @@ TEST_F(BatchDense, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) } -TEST_F(BatchDense, SingleVectorScaleIsEquivalentToRef) +TEST_F(BatchVector, SingleVectorScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -244,7 +244,7 @@ TEST_F(BatchDense, SingleVectorScaleIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorScaleIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorScaleIsEquivalentToRef) { set_up_vector_data(20); @@ -255,7 +255,7 @@ TEST_F(BatchDense, MultipleVectorScaleIsEquivalentToRef) } -TEST_F(BatchDense, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) +TEST_F(BatchVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -266,7 +266,7 @@ TEST_F(BatchDense, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) } -TEST_F(BatchDense, ComputeNorm2SingleIsEquivalentToRef) +TEST_F(BatchVector, ComputeNorm2SingleIsEquivalentToRef) { set_up_vector_data(1); auto norm_size = @@ -281,7 +281,7 @@ TEST_F(BatchDense, ComputeNorm2SingleIsEquivalentToRef) } 
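// The norm tests around this point compare the batched 2-norm kernel on the
// reference and device executors. Per batch entry the quantity being checked
// is the column-wise norm result[j] = sqrt(sum_i |x(i, j)|^2); a
// self-contained sketch (entry_norm2 is an illustrative name, and a stride
// equal to num_rhs is assumed for brevity):

#include <cmath>
#include <complex>
#include <vector>

std::vector<double> entry_norm2(const std::vector<std::complex<double>>& x,
                                const int num_rows, const int num_rhs)
{
    std::vector<double> result(num_rhs, 0.0);
    for (int j = 0; j < num_rhs; ++j) {
        for (int i = 0; i < num_rows; ++i) {
            result[j] += std::norm(x[i * num_rhs + j]);  // squared magnitude
        }
        result[j] = std::sqrt(result[j]);
    }
    return result;
}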
-TEST_F(BatchDense, ComputeNorm2IsEquivalentToRef) +TEST_F(BatchVector, ComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); auto norm_size = @@ -296,7 +296,7 @@ TEST_F(BatchDense, ComputeNorm2IsEquivalentToRef) } -TEST_F(BatchDense, ComputeDotIsEquivalentToRef) +TEST_F(BatchVector, ComputeDotIsEquivalentToRef) { set_up_vector_data(20); auto dot_size = @@ -311,7 +311,7 @@ TEST_F(BatchDense, ComputeDotIsEquivalentToRef) } -TEST_F(BatchDense, ComputeDotSingleIsEquivalentToRef) +TEST_F(BatchVector, ComputeDotSingleIsEquivalentToRef) { set_up_vector_data(1); auto dot_size = @@ -326,31 +326,31 @@ TEST_F(BatchDense, ComputeDotSingleIsEquivalentToRef) } -TEST_F(BatchDense, CopySingleIsEquivalentToRef) +TEST_F(BatchVector, CopySingleIsEquivalentToRef) { set_up_vector_data(1); - gko::kernels::reference::batch_dense::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_dense::copy(this->exec, dx.get(), + gko::kernels::reference::batch_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_vector::copy(this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } -TEST_F(BatchDense, CopyIsEquivalentToRef) +TEST_F(BatchVector, CopyIsEquivalentToRef) { set_up_vector_data(20); - gko::kernels::reference::batch_dense::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_dense::copy(this->exec, dx.get(), + gko::kernels::reference::batch_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_vector::copy(this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } -TEST_F(BatchDense, BatchScaleIsEquivalentToRef) +TEST_F(BatchVector, BatchScaleIsEquivalentToRef) { using BDiag = gko::matrix::BatchDiagonal; const int num_rhs = 20; @@ -365,16 +365,16 @@ TEST_F(BatchDense, BatchScaleIsEquivalentToRef) auto drght = BDiag::create(this->exec); drght->copy_from(rght.get()); - gko::kernels::reference::batch_dense::batch_scale(this->ref, left.get(), + gko::kernels::reference::batch_vector::batch_scale(this->ref, left.get(), rght.get(), x.get()); - gko::kernels::EXEC_NAMESPACE::batch_dense::batch_scale( + gko::kernels::EXEC_NAMESPACE::batch_vector::batch_scale( this->exec, dleft.get(), drght.get(), dx.get()); GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); } -TEST_F(BatchDense, TransposeIsEquivalentToRef) +TEST_F(BatchVector, TransposeIsEquivalentToRef) { const int nrows = 11; const int ncols = 6; @@ -392,7 +392,7 @@ TEST_F(BatchDense, TransposeIsEquivalentToRef) } -TEST_F(BatchDense, ConjugateTransposeIsEquivalentToRef) +TEST_F(BatchVector, ConjugateTransposeIsEquivalentToRef) { const int nrows = 11; const int ncols = 6; @@ -410,7 +410,7 @@ TEST_F(BatchDense, ConjugateTransposeIsEquivalentToRef) } -TEST_F(BatchDense, AddScaledIdentityNonSquareIsEquivalentToReference) +TEST_F(BatchVector, AddScaledIdentityNonSquareIsEquivalentToReference) { set_up_apply_data(); const gko::size_type batchsize = 10; From 69fd4b0d9e95185fde5f546d9eaba5dde8234d42 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 6 Jul 2023 12:07:36 +0200 Subject: [PATCH 107/583] Remove matrix namespace and use MutliVector --- .../batch_multi_vector_kernels.hpp.inc} | 0 core/CMakeLists.txt | 2 +- .../batch_multi_vector.cpp} | 74 +++---- .../batch_multi_vector_kernels.hpp} | 66 +++--- core/{matrix => base}/batch_struct.hpp | 24 +-- core/device_hooks/common_kernels.inc.cpp | 14 +- core/test/base/CMakeLists.txt | 1 + .../batch_multi_vector.cpp} | 105 +++++----- core/test/matrix/CMakeLists.txt | 1 - cuda/CMakeLists.txt | 2 +- 
.../batch_multi_vector_kernels.cu} | 52 ++--- cuda/{matrix => base}/batch_struct.hpp | 16 +- hip/CMakeLists.txt | 2 +- .../batch_multi_vector_kernels.hip.cpp} | 52 ++--- hip/{matrix => base}/batch_struct.hip.hpp | 16 +- .../batch_multi_vector.hpp} | 190 +++++++++--------- omp/CMakeLists.txt | 2 +- .../batch_multi_vector_kernels.cpp} | 50 ++--- reference/CMakeLists.txt | 2 +- .../batch_multi_vector_kernels.cpp} | 52 ++--- .../batch_multi_vector_kernels.hpp.inc} | 74 +++---- reference/{matrix => base}/batch_struct.hpp | 16 +- .../batch_multi_vector_kernels.cpp} | 170 ++++++++-------- test/base/CMakeLists.txt | 1 + .../batch_multi_vector_kernels.cpp} | 75 ++++--- 25 files changed, 535 insertions(+), 524 deletions(-) rename common/cuda_hip/{matrix/batch_vector_kernels.hpp.inc => base/batch_multi_vector_kernels.hpp.inc} (100%) rename core/{matrix/batch_vector.cpp => base/batch_multi_vector.cpp} (73%) rename core/{matrix/batch_vector_kernels.hpp => base/batch_multi_vector_kernels.hpp} (53%) rename core/{matrix => base}/batch_struct.hpp (85%) rename core/test/{matrix/batch_vector.cpp => base/batch_multi_vector.cpp} (83%) rename cuda/{matrix/batch_vector_kernels.cu => base/batch_multi_vector_kernels.cu} (78%) rename cuda/{matrix => base}/batch_struct.hpp (88%) rename hip/{matrix/batch_vector_kernels.hip.cpp => base/batch_multi_vector_kernels.hip.cpp} (80%) rename hip/{matrix => base}/batch_struct.hip.hpp (88%) rename include/ginkgo/core/{matrix/batch_vector.hpp => base/batch_multi_vector.hpp} (85%) rename omp/{matrix/batch_vector_kernels.cpp => base/batch_multi_vector_kernels.cpp} (77%) rename reference/{matrix/batch_vector_kernels.cpp => base/batch_multi_vector_kernels.cpp} (77%) rename reference/{matrix/batch_vector_kernels.hpp.inc => base/batch_multi_vector_kernels.hpp.inc} (80%) rename reference/{matrix => base}/batch_struct.hpp (88%) rename reference/test/{matrix/batch_vector_kernels.cpp => base/batch_multi_vector_kernels.cpp} (87%) rename test/{matrix/batch_vector_kernels.cpp => base/batch_multi_vector_kernels.cpp} (81%) diff --git a/common/cuda_hip/matrix/batch_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/matrix/batch_vector_kernels.hpp.inc rename to common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 03d558562dc..d224a7e0f90 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(ginkgo "") target_sources(ginkgo PRIVATE base/array.cpp + base/batch_multi_vector.cpp base/combination.cpp base/composition.cpp base/dense_cache.cpp @@ -38,7 +39,6 @@ target_sources(ginkgo log/vtune.cpp log/record.cpp log/stream.cpp - matrix/batch_vector.cpp matrix/coo.cpp matrix/csr.cpp matrix/dense.cpp diff --git a/core/matrix/batch_vector.cpp b/core/base/batch_multi_vector.cpp similarity index 73% rename from core/matrix/batch_vector.cpp rename to core/base/batch_multi_vector.cpp index abacd9b1cd8..76639494088 100644 --- a/core/matrix/batch_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -45,28 +45,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" namespace gko { namespace matrix { -namespace batch_vector { +namespace batch_multi_vector { -GKO_REGISTER_OPERATION(scale, batch_vector::scale); -GKO_REGISTER_OPERATION(add_scaled, batch_vector::add_scaled); -GKO_REGISTER_OPERATION(compute_dot, batch_vector::compute_dot); -GKO_REGISTER_OPERATION(compute_norm2, batch_vector::compute_norm2); -GKO_REGISTER_OPERATION(copy, batch_vector::copy); +GKO_REGISTER_OPERATION(scale, batch_multi_vector::scale); +GKO_REGISTER_OPERATION(add_scaled, batch_multi_vector::add_scaled); +GKO_REGISTER_OPERATION(compute_dot, batch_multi_vector::compute_dot); +GKO_REGISTER_OPERATION(compute_norm2, batch_multi_vector::compute_norm2); +GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); -} // namespace batch_vector +} // namespace batch_multi_vector template -void BatchVector::scale_impl(const BatchLinOp* alpha) +void BatchMultiVector::scale_impl(const BatchLinOp* alpha) { - auto batch_alpha = as>(alpha); + auto batch_alpha = as>(alpha); GKO_ASSERT_BATCH_EQUAL_ROWS( batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { @@ -76,16 +76,16 @@ void BatchVector::scale_impl(const BatchLinOp* alpha) } } auto exec = this->get_executor(); - exec->run(batch_vector::make_scale(batch_alpha, this)); + exec->run(batch_multi_vector::make_scale(batch_alpha, this)); } template -void BatchVector::add_scaled_impl(const BatchLinOp* alpha, - const BatchLinOp* b) +void BatchMultiVector::add_scaled_impl(const BatchLinOp* alpha, + const BatchLinOp* b) { - auto batch_alpha = as>(alpha); - auto batch_b = as>(b); + auto batch_alpha = as>(alpha); + auto batch_b = as>(b); GKO_ASSERT_BATCH_EQUAL_ROWS( batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { @@ -97,7 +97,7 @@ void BatchVector::add_scaled_impl(const BatchLinOp* alpha, GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); auto exec = this->get_executor(); - exec->run(batch_vector::make_add_scaled(batch_alpha, batch_b, this)); + exec->run(batch_multi_vector::make_add_scaled(batch_alpha, batch_b, this)); } @@ -112,35 +112,36 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template -void BatchVector::compute_dot_impl(const BatchLinOp* b, - BatchLinOp* result) const +void BatchMultiVector::compute_dot_impl(const BatchLinOp* b, + BatchLinOp* result) const { - auto batch_result = as>(result); - auto batch_b = as>(b); + auto batch_result = as>(result); + auto batch_b = as>(b); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, get_col_sizes(this->get_size())); auto exec = this->get_executor(); - exec->run(batch_vector::make_compute_dot(this, batch_b, batch_result)); + exec->run( + batch_multi_vector::make_compute_dot(this, batch_b, batch_result)); } template -void BatchVector::compute_norm2_impl(BatchLinOp* result) const +void BatchMultiVector::compute_norm2_impl(BatchLinOp* result) const { - using NormVector = BatchVector>; + using NormVector = BatchMultiVector>; auto batch_result = as(result); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, get_col_sizes(this->get_size())); auto exec = this->get_executor(); - exec->run(batch_vector::make_compute_norm2(as>(this), - batch_result)); + exec->run(batch_multi_vector::make_compute_norm2( + as>(this), batch_result)); } template -void 
BatchVector::convert_to( - BatchVector>* result) const +void BatchMultiVector::convert_to( + BatchMultiVector>* result) const { result->values_ = this->values_; result->stride_ = this->stride_; @@ -150,8 +151,8 @@ void BatchVector::convert_to( template -void BatchVector::move_to( - BatchVector>* result) +void BatchMultiVector::move_to( + BatchMultiVector>* result) { this->convert_to(result); } @@ -189,14 +190,14 @@ inline void read_impl(MatrixType* mtx, const std::vector& data) template -void BatchVector::read(const std::vector& data) +void BatchMultiVector::read(const std::vector& data) { read_impl(this, data); } template -void BatchVector::read(const std::vector& data) +void BatchMultiVector::read(const std::vector& data) { read_impl(this, data); } @@ -231,21 +232,22 @@ inline void write_impl(const MatrixType* mtx, std::vector& data) template -void BatchVector::write(std::vector& data) const +void BatchMultiVector::write(std::vector& data) const { write_impl(this, data); } template -void BatchVector::write(std::vector& data) const +void BatchMultiVector::write(std::vector& data) const { write_impl(this, data); } -#define GKO_DECLARE_BATCH_VECTOR_MATRIX(_type) class BatchVector<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_MATRIX); +#define GKO_DECLARE_BATCH_MULTI_VECTOR_MATRIX(_type) \ + class BatchMultiVector<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_MATRIX); } // namespace matrix diff --git a/core/matrix/batch_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp similarity index 53% rename from core/matrix/batch_vector_kernels.hpp rename to core/base/batch_multi_vector_kernels.hpp index 6ddfc9e2676..34da4ce4c2f 100644 --- a/core/matrix/batch_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -30,11 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ -#define GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ +#ifndef GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ -#include +#include #include @@ -46,48 +46,48 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(_type) \ void scale(std::shared_ptr exec, \ - const matrix::BatchVector<_type>* alpha, \ - matrix::BatchVector<_type>* x) + const BatchMultiVector<_type>* alpha, \ + BatchMultiVector<_type>* x) -#define GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(_type) \ void add_scaled(std::shared_ptr exec, \ - const matrix::BatchVector<_type>* alpha, \ - const matrix::BatchVector<_type>* x, \ - matrix::BatchVector<_type>* y) + const BatchMultiVector<_type>* alpha, \ + const BatchMultiVector<_type>* x, \ + BatchMultiVector<_type>* y) -#define GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(_type) \ void compute_dot(std::shared_ptr exec, \ - const matrix::BatchVector<_type>* x, \ - const matrix::BatchVector<_type>* y, \ - matrix::BatchVector<_type>* result) + const BatchMultiVector<_type>* x, \ + const BatchMultiVector<_type>* y, \ + BatchMultiVector<_type>* result) -#define GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ void compute_norm2(std::shared_ptr exec, \ - const matrix::BatchVector<_type>* x, \ - matrix::BatchVector>* result) + const BatchMultiVector<_type>* x, \ + BatchMultiVector>* result) -#define GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL(_type) \ +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(_type) \ void copy(std::shared_ptr exec, \ - const matrix::BatchVector<_type>* x, \ - matrix::BatchVector<_type>* result) + const BatchMultiVector<_type>* x, \ + BatchMultiVector<_type>* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL(ValueType) +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(ValueType) -GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_vector, +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_multi_vector, GKO_DECLARE_ALL_AS_TEMPLATES); @@ -98,4 +98,4 @@ GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_vector, } // namespace gko -#endif // GKO_CORE_MATRIX_BATCH_VECTOR_KERNELS_HPP_ +#endif // GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ diff --git a/core/matrix/batch_struct.hpp b/core/base/batch_struct.hpp similarity index 85% rename from core/matrix/batch_struct.hpp rename to core/base/batch_struct.hpp index 01092f0e4d0..68fcdd9c8a0 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -30,8 +30,8 @@ THEORY OF 
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ +#ifndef GKO_CORE_BASE_BATCH_STRUCT_HPP_ +#define GKO_CORE_BASE_BATCH_STRUCT_HPP_ #include @@ -40,7 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -namespace batch_vector { +namespace batch_multi_vector { /** @@ -77,23 +77,23 @@ struct UniformBatch { }; -} // namespace batch_vector +} // namespace batch_multi_vector namespace batch { template -GKO_ATTRIBUTES GKO_INLINE gko::batch_vector::BatchEntry -to_const(const gko::batch_vector::BatchEntry& b) +GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::BatchEntry +to_const(const gko::batch_multi_vector::BatchEntry& b) { return {b.values, b.stride, b.num_rows, b.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE gko::batch_vector::UniformBatch -to_const(const gko::batch_vector::UniformBatch& ub) +GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::UniformBatch +to_const(const gko::batch_multi_vector::UniformBatch& ub) { return {ub.values, ub.num_batch, ub.stride, ub.num_rows, ub.num_rhs}; } @@ -109,8 +109,8 @@ to_const(const gko::batch_vector::UniformBatch& ub) * @param batch_idx The position of the desired object in the batch */ template -GKO_ATTRIBUTES GKO_INLINE batch_vector::BatchEntry batch_entry( - const batch_vector::UniformBatch& batch, +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::BatchEntry batch_entry( + const batch_multi_vector::UniformBatch& batch, const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, @@ -118,7 +118,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_vector::BatchEntry batch_entry( } template -GKO_ATTRIBUTES GKO_INLINE batch_vector::BatchEntry batch_entry( +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::BatchEntry batch_entry( ValueType* const batch_values, const size_type stride, const int num_rows, const int num_rhs, const size_type batch_idx) { @@ -140,4 +140,4 @@ GKO_ATTRIBUTES GKO_INLINE ValueType* batch_entry_ptr( } // namespace gko -#endif // GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_CORE_BASE_BATCH_STRUCT_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index a5aa43100a3..3fe1372558b 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -272,17 +272,17 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL); } // namespace distributed_matrix -namespace batch_vector { +namespace batch_multi_vector { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); -} // namespace batch_vector +} // namespace batch_multi_vector namespace dense { diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index 
aa79ca3ed92..f51862e8244 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_test(abstract_factory) ginkgo_create_test(allocator) ginkgo_create_test(array) +ginkgo_create_test(batch_multi_vector) ginkgo_create_test(dense_cache) ginkgo_create_test(combination) ginkgo_create_test(composition) diff --git a/core/test/matrix/batch_vector.cpp b/core/test/base/batch_multi_vector.cpp similarity index 83% rename from core/test/matrix/batch_vector.cpp rename to core/test/base/batch_multi_vector.cpp index 4735d5eead2..e43be1e7b86 100644 --- a/core/test/matrix/batch_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -45,14 +45,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template -class BatchVector : public ::testing::Test { +class BatchMultiVector : public ::testing::Test { protected: using value_type = T; using DenseMtx = gko::matrix::Dense; using size_type = gko::size_type; - BatchVector() + BatchMultiVector() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch_initialize>( + mtx(gko::batch_initialize>( std::vector{4, 3}, {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, @@ -61,7 +61,7 @@ class BatchVector : public ::testing::Test { static void assert_equal_to_original_mtx( - gko::matrix::BatchVector* m) + gko::BatchMultiVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); @@ -85,37 +85,37 @@ class BatchVector : public ::testing::Test { ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } - static void assert_empty(gko::matrix::BatchVector* m) + static void assert_empty(gko::BatchMultiVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; -TYPED_TEST_SUITE(BatchVector, gko::test::ValueTypes); +TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); -TYPED_TEST(BatchVector, CanBeEmpty) +TYPED_TEST(BatchMultiVector, CanBeEmpty) { - auto empty = gko::matrix::BatchVector::create(this->exec); + auto empty = gko::BatchMultiVector::create(this->exec); this->assert_empty(empty.get()); } -TYPED_TEST(BatchVector, ReturnsNullValuesArrayWhenEmpty) +TYPED_TEST(BatchMultiVector, ReturnsNullValuesArrayWhenEmpty) { - auto empty = gko::matrix::BatchVector::create(this->exec); + auto empty = gko::BatchMultiVector::create(this->exec); ASSERT_EQ(empty->get_const_values(), nullptr); } -TYPED_TEST(BatchVector, CanBeConstructedWithSize) +TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) { using size_type = gko::size_type; - auto m = gko::matrix::BatchVector::create( + auto m = gko::BatchMultiVector::create( this->exec, std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); @@ -130,10 +130,10 @@ TYPED_TEST(BatchVector, CanBeConstructedWithSize) } -TYPED_TEST(BatchVector, CanBeConstructedWithSizeAndStride) +TYPED_TEST(BatchMultiVector, CanBeConstructedWithSizeAndStride) { using size_type = gko::size_type; - auto m = gko::matrix::BatchVector::create( + auto m = gko::BatchMultiVector::create( this->exec, std::vector>{gko::dim<2>{2, 3}}, std::vector{4}); @@ -143,7 +143,7 @@ TYPED_TEST(BatchVector, CanBeConstructedWithSizeAndStride) } -TYPED_TEST(BatchVector, CanBeConstructedFromExistingData) 
+TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -155,7 +155,7 @@ TYPED_TEST(BatchVector, CanBeConstructedFromExistingData) 5.0, 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchVector::create( + auto m = gko::BatchMultiVector::create( this->exec, std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, gko::array::view(this->exec, 12, data), @@ -169,7 +169,7 @@ TYPED_TEST(BatchVector, CanBeConstructedFromExistingData) } -TYPED_TEST(BatchVector, CanBeConstructedFromExistingConstData) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -181,7 +181,7 @@ TYPED_TEST(BatchVector, CanBeConstructedFromExistingConstData) 5.0, 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchVector::create_const( + auto m = gko::BatchMultiVector::create_const( this->exec, std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, gko::array::const_view(this->exec, 12, data), @@ -195,7 +195,7 @@ TYPED_TEST(BatchVector, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchVector, CanBeConstructedFromBatchVectorMatrices) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -205,19 +205,18 @@ TYPED_TEST(BatchVector, CanBeConstructedFromBatchVectorMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchVector::create( + auto m = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::matrix::BatchVector::create( + auto m_ref = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), mat1.get(), mat2.get()}); - auto m2 = - gko::matrix::BatchVector::create(this->exec, 3, m.get()); + auto m2 = gko::BatchMultiVector::create(this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } -TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatricesByDuplication) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatricesByDuplication) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -227,16 +226,16 @@ TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::matrix::BatchVector::create( + auto bat_m = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); auto m = - gko::matrix::BatchVector::create(this->exec, 3, mat1.get()); + gko::BatchMultiVector::create(this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatrices) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -246,14 +245,14 @@ TYPED_TEST(BatchVector, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchVector::create( + auto m = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); } -TYPED_TEST(BatchVector, CanBeUnbatchedIntoDenseMatrices) +TYPED_TEST(BatchMultiVector, 
CanBeUnbatchedIntoDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -271,16 +270,16 @@ TYPED_TEST(BatchVector, CanBeUnbatchedIntoDenseMatrices) } -TYPED_TEST(BatchVector, KnowsItsSizeAndValues) +TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); } -TYPED_TEST(BatchVector, CanBeListConstructed) +TYPED_TEST(BatchMultiVector, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); @@ -294,10 +293,10 @@ TYPED_TEST(BatchVector, CanBeListConstructed) } -TYPED_TEST(BatchVector, CanBeListConstructedWithstride) +TYPED_TEST(BatchMultiVector, CanBeListConstructedWithstride) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( std::vector{2}, {{1.0, 2.0}}, this->exec); ASSERT_EQ(m->get_num_batch_entries(), 1); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); @@ -307,10 +306,10 @@ TYPED_TEST(BatchVector, CanBeListConstructedWithstride) } -TYPED_TEST(BatchVector, CanBeListConstructedByCopies) +TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); @@ -323,11 +322,11 @@ TYPED_TEST(BatchVector, CanBeListConstructedByCopies) } -TYPED_TEST(BatchVector, CanBeDoubleListConstructed) +TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, this->exec); @@ -352,11 +351,11 @@ TYPED_TEST(BatchVector, CanBeDoubleListConstructed) } -TYPED_TEST(BatchVector, CanBeDoubleListConstructedWithstride) +TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructedWithstride) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch_initialize>( {4, 3}, {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, @@ -382,9 +381,9 @@ TYPED_TEST(BatchVector, CanBeDoubleListConstructedWithstride) } -TYPED_TEST(BatchVector, CanBeCopied) +TYPED_TEST(BatchMultiVector, CanBeCopied) { - auto mtx_copy = gko::matrix::BatchVector::create(this->exec); + auto mtx_copy = gko::BatchMultiVector::create(this->exec); mtx_copy->copy_from(this->mtx.get()); this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0, 0, 0) = 7; @@ -393,15 +392,15 @@ TYPED_TEST(BatchVector, CanBeCopied) } -TYPED_TEST(BatchVector, CanBeMoved) +TYPED_TEST(BatchMultiVector, CanBeMoved) { - auto mtx_copy = gko::matrix::BatchVector::create(this->exec); + auto mtx_copy = gko::BatchMultiVector::create(this->exec); mtx_copy->copy_from(std::move(this->mtx)); this->assert_equal_to_original_mtx(mtx_copy.get()); } -TYPED_TEST(BatchVector, CanBeCloned) +TYPED_TEST(BatchMultiVector, CanBeCloned) { auto mtx_clone = this->mtx->clone(); this->assert_equal_to_original_mtx( @@ -409,17 +408,17 @@ TYPED_TEST(BatchVector, CanBeCloned) } -TYPED_TEST(BatchVector, CanBeCleared) +TYPED_TEST(BatchMultiVector, 
CanBeCleared) { this->mtx->clear(); this->assert_empty(this->mtx.get()); } -TYPED_TEST(BatchVector, CanBeReadFromMatrixData) +TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchVector::create(this->exec); + auto m = gko::BatchMultiVector::create(this->exec); // clang-format off m->read({gko::matrix_data{{2, 3}, {{0, 0, 1.0}, @@ -453,7 +452,7 @@ TYPED_TEST(BatchVector, CanBeReadFromMatrixData) } -TYPED_TEST(BatchVector, GeneratesCorrectMatrixData) +TYPED_TEST(BatchMultiVector, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; using tpl = typename gko::matrix_data::nonzero_type; @@ -480,10 +479,10 @@ TYPED_TEST(BatchVector, GeneratesCorrectMatrixData) } -TYPED_TEST(BatchVector, CanBeReadFromMatrixAssemblyData) +TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixAssemblyData) { using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchVector::create(this->exec); + auto m = gko::BatchMultiVector::create(this->exec); gko::matrix_assembly_data data1(gko::dim<2>{2, 3}); data1.set_value(0, 0, 1.0); data1.set_value(0, 1, 3.0); diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index fbfe5f95e3f..433361a054f 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -1,4 +1,3 @@ -ginkgo_create_test(batch_vector) ginkgo_create_test(coo) ginkgo_create_test(coo_builder) ginkgo_create_test(csr) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index d630fb9a92a..dccc9e91401 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -6,6 +6,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE + base/batch_multi_vector_kernels.cu base/device.cpp base/device_matrix_data_kernels.cu base/exception.cpp @@ -35,7 +36,6 @@ target_sources(ginkgo_cuda factorization/par_ilut_select_kernel.cu factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu - matrix/batch_vector_kernels.cu matrix/coo_kernels.cu ${CSR_INSTANTIATE} matrix/dense_kernels.cu diff --git a/cuda/matrix/batch_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu similarity index 78% rename from cuda/matrix/batch_vector_kernels.cu rename to cuda/base/batch_multi_vector_kernels.cu index 9ceca9e2b3a..039ab94b767 100644 --- a/cuda/matrix/batch_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -30,14 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include #include -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" @@ -45,31 +46,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace cuda { /** - * @brief The BatchVector matrix format namespace. 
+ * @brief The BatchMultiVector matrix format namespace. * - * @ingroup batch_vector + * @ingroup batch_multi_vector */ -namespace batch_vector { +namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; -#include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" template void scale(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - matrix::BatchVector* const x) + const BatchMultiVector* const alpha, + BatchMultiVector* const x) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); @@ -77,14 +77,15 @@ void scale(std::shared_ptr exec, scale<<>>(alpha_ub, x_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - const matrix::BatchVector* const x, - matrix::BatchVector* const y) + const BatchMultiVector* const alpha, + const BatchMultiVector* const x, + BatchMultiVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_size().at(0)[1]; @@ -102,14 +103,15 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchVector* x, - const matrix::BatchVector* y, - matrix::BatchVector* result) + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -129,13 +131,13 @@ void compute_dot(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchVector* const x, - matrix::BatchVector>* const result) + const BatchMultiVector* const x, + BatchMultiVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -152,13 +154,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchVector* x, - matrix::BatchVector* result) + const BatchMultiVector* x, + BatchMultiVector* result) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); @@ -166,10 +168,10 @@ void copy(std::shared_ptr exec, copy<<>>(x_ub, result_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); -} // namespace batch_vector +} // namespace batch_multi_vector } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/matrix/batch_struct.hpp b/cuda/base/batch_struct.hpp similarity index 88% rename from cuda/matrix/batch_struct.hpp rename to cuda/base/batch_struct.hpp index 104286f66b9..0bd9bd6dc40 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -34,11 +34,11 @@ OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include #include -#include #include "cuda/base/config.hpp" @@ -64,8 +64,8 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch> -get_batch_struct(const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +get_batch_struct(const BatchMultiVector* const op) { return { as_cuda_type(op->get_const_values()), @@ -80,8 +80,8 @@ get_batch_struct(const matrix::BatchVector* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch> get_batch_struct( - matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +get_batch_struct(BatchMultiVector* const op) { return { as_cuda_type(op->get_values()), @@ -98,8 +98,8 @@ inline gko::batch_vector::UniformBatch> get_batch_struct( * that may be null. */ template -inline gko::batch_vector::UniformBatch> -maybe_null_batch_struct(const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {as_cuda_type(op->get_const_values()), diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index fea0dec5c8c..1573169527d 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -4,6 +4,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANT # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES + base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp base/device_matrix_data_kernels.hip.cpp base/exception.hip.cpp @@ -33,7 +34,6 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_select_kernel.hip.cpp factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp - matrix/batch_vector_kernels.hip.cpp matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp diff --git a/hip/matrix/batch_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp similarity index 80% rename from hip/matrix/batch_vector_kernels.hip.cpp rename to hip/base/batch_multi_vector_kernels.hip.cpp index 97bbaf50440..01a443558e9 100644 --- a/hip/matrix/batch_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include @@ -40,7 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" @@ -48,31 +49,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" namespace gko { namespace kernels { namespace hip { /** - * @brief The BatchVector matrix format namespace. + * @brief The BatchMultiVector matrix format namespace. * - * @ingroup batch_vector + * @ingroup batch_multi_vector */ -namespace batch_vector { +namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; -#include "common/cuda_hip/matrix/batch_vector_kernels.hpp.inc" +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" template void scale(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - matrix::BatchVector* const x) + const BatchMultiVector* const alpha, + BatchMultiVector* const x) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); @@ -81,14 +81,15 @@ void scale(std::shared_ptr exec, alpha_ub, x_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - const matrix::BatchVector* const x, - matrix::BatchVector* const y) + const BatchMultiVector* const alpha, + const BatchMultiVector* const x, + BatchMultiVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_size().at(0)[1]; @@ -109,14 +110,15 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchVector* x, - const matrix::BatchVector* y, - matrix::BatchVector* result) + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -138,13 +140,13 @@ void compute_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchVector* const x, - matrix::BatchVector>* const result) + const BatchMultiVector* const x, + BatchMultiVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_size().at()[1]; @@ -163,13 +165,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchVector* x, - matrix::BatchVector* result) + const BatchMultiVector* x, + BatchMultiVector* result) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); @@ -178,10 +180,10 @@ void copy(std::shared_ptr exec, x_ub, result_ub); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); -} // namespace batch_vector +} // namespace batch_multi_vector } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/matrix/batch_struct.hip.hpp 
b/hip/base/batch_struct.hip.hpp similarity index 88% rename from hip/matrix/batch_struct.hip.hpp rename to hip/base/batch_struct.hip.hpp index e2648ba4a25..214039f060b 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -34,11 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include #include -#include #include "hip/base/config.hip.hpp" @@ -64,8 +64,8 @@ namespace hip { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch> -get_batch_struct(const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +get_batch_struct(const BatchMultiVector* const op) { return { as_hip_type(op->get_const_values()), @@ -80,8 +80,8 @@ get_batch_struct(const matrix::BatchVector* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch> get_batch_struct( - matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +get_batch_struct(BatchMultiVector* const op) { return { as_hip_type(op->get_values()), @@ -98,8 +98,8 @@ inline gko::batch_vector::UniformBatch> get_batch_struct( * that may be null. */ template -inline gko::batch_vector::UniformBatch> -maybe_null_batch_struct(const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch> +maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {as_hip_type(op->get_const_values()), diff --git a/include/ginkgo/core/matrix/batch_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp similarity index 85% rename from include/ginkgo/core/matrix/batch_vector.hpp rename to include/ginkgo/core/base/batch_multi_vector.hpp index aee16bbc27b..a4dafd75faa 100644 --- a/include/ginkgo/core/matrix/batch_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ -#define GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ +#ifndef GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_ +#define GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_ #include @@ -48,12 +48,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -namespace matrix { - /** - * BatchVector is a batch matrix format which explicitly stores all values of - * the vector in each of the batches. + * BatchMultiVector is a batch matrix format which explicitly stores all values + * of the vector in each of the batches. * * The values in each of the batches are stored in row-major format (values * belonging to the same row appear consecutive in the memory). Optionally, rows @@ -63,21 +61,21 @@ namespace matrix { * * @note While this format is not very useful for storing sparse matrices, it * is often suitable to store vectors, and sets of vectors. 
- * @ingroup batch_vector + * @ingroup batch_multi_vector * @ingroup mat_formats * @ingroup BatchLinOp */ template -class BatchVector - : public EnableAbstractPolymorphicObject>, - public EnableCreateMethod>, - public ConvertibleTo>>, +class BatchMultiVector + : public EnableAbstractPolymorphicObject>, + public EnableCreateMethod>, + public ConvertibleTo>>, public BatchReadableFromMatrixData, public BatchReadableFromMatrixData, public BatchWritableToMatrixData, public BatchWritableToMatrixData { - friend class EnableCreateMethod; - friend class BatchVector>; + friend class EnableCreateMethod; + friend class BatchMultiVector>; public: using BatchReadableFromMatrixData::read; @@ -88,19 +86,19 @@ class BatchVector using unbatch_type = Dense; using mat_data = gko::matrix_data; using mat_data32 = gko::matrix_data; - using absolute_type = remove_complex; - using complex_type = to_complex; + using absolute_type = remove_complex; + using complex_type = to_complex; using row_major_range = gko::range>; /** - * Creates a BatchVector matrix with the configuration of another - * BatchVector matrix. + * Creates a BatchMultiVector matrix with the configuration of another + * BatchMultiVector matrix. * * @param other The other matrix whose configuration needs to copied. */ - static std::unique_ptr create_with_config_of( - const BatchVector* other) + static std::unique_ptr create_with_config_of( + const BatchMultiVector* other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with @@ -109,12 +107,12 @@ class BatchVector return (*other).create_with_same_config(); } - friend class BatchVector>; + friend class BatchMultiVector>; void convert_to( - BatchVector>* result) const override; + BatchMultiVector>* result) const override; - void move_to(BatchVector>* result) override; + void move_to(BatchMultiVector>* result) override; void read(const std::vector& data) override; @@ -235,7 +233,7 @@ class BatchVector } /** - * @copydoc BatchVector::at(size_type, size_type, size_type) + * @copydoc BatchMultiVector::at(size_type, size_type, size_type) */ value_type at(size_type batch, size_type row, size_type col) const noexcept { @@ -264,7 +262,7 @@ class BatchVector } /** - * @copydoc BatchVector::at(size_type, size_type, size_type) + * @copydoc BatchMultiVector::at(size_type, size_type, size_type) */ ValueType at(size_type batch, size_type idx) const noexcept { @@ -274,11 +272,11 @@ class BatchVector /** * Scales the vector with a scalar (aka: BLAS scal). * - * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix (all - * batches) is scaled by alpha. If it is a BatchVector row vector of values, - * then i-th column of the vector is scaled with the i-th element of alpha - * (the number of columns of alpha has to match the number of columns of the - * matrix). + * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix + * (all batches) is scaled by alpha. If it is a BatchMultiVector row vector + * of values, then i-th column of the vector is scaled with the i-th element + * of alpha (the number of columns of alpha has to match the number of + * columns of the matrix). */ void scale(const BatchLinOp* alpha) { @@ -289,10 +287,11 @@ class BatchVector /** * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy). * - * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix is - * scaled by alpha. 
If it is a BatchVector row vector of values, then i-th - * column of the vector is scaled with the i-th element of alpha (the number - * of columns of alpha has to match the number of columns of the vector). + * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix + * is scaled by alpha. If it is a BatchMultiVector row vector of values, + * then i-th column of the vector is scaled with the i-th element of alpha + * (the number of columns of alpha has to match the number of columns of the + * vector). * @param b a matrix of the same dimension as this */ void add_scaled(const BatchLinOp* alpha, const BatchLinOp* b) @@ -306,11 +305,10 @@ class BatchVector * Adds `a` scaled by `alpha` to the vector scaled by `beta`: * this <- alpha * a + beta * this. * - * @param alpha If alpha is 1x1 BatchVector matrix, the entire matrix a is - * scaled by alpha. If it is a BatchVector row vector of - * values, then i-th column of a is scaled with the i-th - * element of alpha (the number of columns of alpha has to - * match the number of columns of a). + * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix + * a is scaled by alpha. If it is a BatchMultiVector row vector of values, + * then i-th column of a is scaled with the i-th element of alpha (the + * number of columns of alpha has to match the number of columns of a). * @param a a matrix of the same dimension as this. * @param beta Scalar(s), of the same size as alpha, to multiply this * matrix. @@ -323,10 +321,10 @@ class BatchVector * corresponding entry in `b`. If the vector has complex value_type, then * the conjugate of this is taken. * - * @param b a BatchVector matrix of same dimension as this - * @param result a BatchVector row vector, used to store the dot product - * (the number of column in the vector must match the number - * of columns of this) + * @param b a BatchMultiVector matrix of same dimension as this + * @param result a BatchMultiVector row vector, used to store the dot + * product (the number of column in the vector must match the number of + * columns of this) */ void compute_dot(const BatchLinOp* b, BatchLinOp* result) const { @@ -338,7 +336,7 @@ class BatchVector /** * Computes the Euclidean (L^2) norm of each matrix in this batch. * - * @param result a BatchVector row vector, used to store the norm + * @param result a BatchMultiVector row vector, used to store the norm * (the number of columns in the vector must match the number * of columns of this) */ @@ -359,14 +357,14 @@ class BatchVector * (if it resides on the same executor as the vector) or a copy of * the array on the correct executor. */ - static std::unique_ptr create_const( + static std::unique_ptr create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values, const batch_stride& strides) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. - return std::unique_ptr(new BatchVector{ + return std::unique_ptr(new BatchMultiVector{ exec, sizes, gko::detail::array_const_cast(std::move(values)), strides}); } @@ -454,21 +452,21 @@ class BatchVector protected: /** - * Creates an uninitialized BatchVector matrix of the specified size. + * Creates an uninitialized BatchMultiVector matrix of the specified size. 
* * @param exec Executor associated to the vector * @param size size of the vector */ - BatchVector(std::shared_ptr exec, - const batch_dim<2>& size = batch_dim<2>{}) - : BatchVector(std::move(exec), size, - size.get_num_batch_entries() > 0 - ? extract_nth_dim(1, size) - : batch_stride{}) + BatchMultiVector(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}) + : BatchMultiVector(std::move(exec), size, + size.get_num_batch_entries() > 0 + ? extract_nth_dim(1, size) + : batch_stride{}) {} /** - * Creates an uninitialized BatchVector matrix of the specified size. + * Creates an uninitialized BatchMultiVector matrix of the specified size. * * @param exec Executor associated to the vector * @param size size of the batch matrices in a batch_dim object @@ -476,9 +474,9 @@ class BatchVector * elements of two consecutive rows, expressed as the * number of matrix elements) */ - BatchVector(std::shared_ptr exec, const batch_dim<2>& size, - const batch_stride& stride) - : EnableBatchLinOp(exec, size), + BatchMultiVector(std::shared_ptr exec, + const batch_dim<2>& size, const batch_stride& stride) + : EnableBatchLinOp(exec, size), values_(exec, compute_batch_mem(size, stride)), stride_(stride) { @@ -487,8 +485,8 @@ class BatchVector } /** - * Creates a BatchVector matrix from an already allocated (and initialized) - * array. + * Creates a BatchMultiVector matrix from an already allocated (and + * initialized) array. * * @tparam ValuesArray type of array of values * @@ -504,9 +502,10 @@ class BatchVector * original array data will not be used in the vector. */ template - BatchVector(std::shared_ptr exec, const batch_dim<2>& size, - ValuesArray&& values, const batch_stride& stride) - : EnableBatchLinOp(exec, size), + BatchMultiVector(std::shared_ptr exec, + const batch_dim<2>& size, ValuesArray&& values, + const batch_stride& stride) + : EnableBatchLinOp(exec, size), values_{exec, std::forward(values)}, stride_{stride}, num_elems_per_batch_cumul_( @@ -523,14 +522,15 @@ class BatchVector } /** - * Creates a BatchVector matrix from a vector of matrices + * Creates a BatchMultiVector matrix from a vector of matrices * * @param exec Executor associated to the vector * @param matrices The matrices that need to be batched. */ - BatchVector(std::shared_ptr exec, - const std::vector*>& matrices) - : EnableBatchLinOp(exec, get_sizes_from_mtxs(matrices)), + BatchMultiVector(std::shared_ptr exec, + const std::vector*>& matrices) + : EnableBatchLinOp(exec, + get_sizes_from_mtxs(matrices)), stride_{get_strides_from_mtxs(matrices)}, values_(exec, compute_batch_mem(this->get_size(), stride_)) { @@ -547,16 +547,16 @@ class BatchVector } /** - * Creates a BatchVector matrix by duplicating BatchVector matrix + * Creates a BatchMultiVector matrix by duplicating BatchMultiVector matrix * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate * @param input the vector to be duplicated. 
*/ - BatchVector(std::shared_ptr exec, - size_type num_duplications, - const BatchVector* input) - : EnableBatchLinOp( + BatchMultiVector(std::shared_ptr exec, + size_type num_duplications, + const BatchMultiVector* input) + : EnableBatchLinOp( exec, gko::batch_dim<2>( input->get_num_batch_entries() * num_duplications, input->get_size().at(0))), @@ -578,15 +578,15 @@ class BatchVector } /** - * Creates a BatchVector matrix by duplicating Dense matrix + * Creates a BatchMultiVector matrix by duplicating Dense matrix * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate * @param input the vector to be duplicated. */ - BatchVector(std::shared_ptr exec, - size_type num_duplications, const Dense* input) - : EnableBatchLinOp( + BatchMultiVector(std::shared_ptr exec, + size_type num_duplications, const Dense* input) + : EnableBatchLinOp( exec, gko::batch_dim<2>(num_duplications, input->get_size())), stride_{gko::batch_stride(num_duplications, input->get_stride())}, values_(exec, compute_batch_mem(this->get_size(), stride_)) @@ -604,21 +604,22 @@ class BatchVector } /** - * Creates a BatchVector matrix with the same configuration as the callers - * matrix. + * Creates a BatchMultiVector matrix with the same configuration as the + * callers matrix. * - * @returns a BatchVector matrix with the same configuration as the caller. + * @returns a BatchMultiVector matrix with the same configuration as the + * caller. */ - virtual std::unique_ptr create_with_same_config() const + virtual std::unique_ptr create_with_same_config() const { - return BatchVector::create(this->get_executor(), this->get_size(), - this->get_stride()); + return BatchMultiVector::create(this->get_executor(), this->get_size(), + this->get_stride()); } /** * @copydoc scale(const BatchLinOp *) * - * @note Other implementations of batch_vector should override this + * @note Other implementations of batch_multi_vector should override this * function instead of scale(const BatchLinOp *alpha). */ virtual void scale_impl(const BatchLinOp* alpha); @@ -626,7 +627,7 @@ class BatchVector /** * @copydoc add_scaled(const BatchLinOp *, const BatchLinOp *) * - * @note Other implementations of batch_vector should override this + * @note Other implementations of batch_multi_vector should override this * function instead of add_scale(const BatchLinOp *alpha, const BatchLinOp * *b). */ @@ -635,7 +636,7 @@ class BatchVector /** * @copydoc compute_dot(const BatchLinOp *, BatchLinOp *) const * - * @note Other implementations of batch_vector should override this + * @note Other implementations of batch_multi_vector should override this * function instead of compute_dot(const BatchLinOp *b, BatchLinOp *result). */ virtual void compute_dot_impl(const BatchLinOp* b, @@ -644,7 +645,7 @@ class BatchVector /** * @copydoc compute_norm2(BatchLinOp *) const * - * @note Other implementations of batch_vector should override this + * @note Other implementations of batch_multi_vector should override this * function instead of compute_norm2(BatchLinOp *result). */ virtual void compute_norm2_impl(BatchLinOp* result) const; @@ -669,9 +670,6 @@ class BatchVector }; -} // namespace matrix - - /** * Creates and initializes a batch of column-vectors. * @@ -700,7 +698,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... 
create_args) { - using batch_vector = matrix::BatchVector; + using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); std::vector num_rows(num_batch_entries); std::vector> sizes(num_batch_entries); @@ -712,7 +710,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); size_type batch = 0; for (const auto& b : vals) { size_type idx = 0; @@ -789,7 +787,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_vector = matrix::BatchVector; + using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); std::vector num_rows(num_batch_entries); std::vector num_cols(num_batch_entries); @@ -803,7 +801,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); size_type batch = 0; for (const auto& b : vals) { size_type ridx = 0; @@ -894,7 +892,7 @@ std::unique_ptr batch_initialize( std::initializer_list vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_vector = matrix::BatchVector; + using batch_multi_vector = BatchMultiVector; std::vector num_rows(num_vectors); std::vector> sizes(num_vectors); for (size_type b = 0; b < num_vectors; ++b) { @@ -903,7 +901,7 @@ std::unique_ptr batch_initialize( } auto b_size = batch_dim<2>(sizes); auto b_stride = batch_stride(stride); - auto tmp = batch_vector::create(exec->get_master(), b_size, b_stride); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; for (const auto& elem : vals) { @@ -982,7 +980,7 @@ std::unique_ptr batch_initialize( vals, std::shared_ptr exec, TArgs&&... 
create_args) { - using batch_vector = matrix::BatchVector; + using batch_multi_vector = BatchMultiVector; std::vector> sizes(num_matrices); const size_type num_rows = vals.size(); for (size_type b = 0; b < num_matrices; ++b) { @@ -992,7 +990,7 @@ std::unique_ptr batch_initialize( GKO_ASSERT(blockit->size() == num_cols); } } - auto tmp = batch_vector::create(exec->get_master(), sizes, stride); + auto tmp = batch_multi_vector::create(exec->get_master(), sizes, stride); for (size_type batch = 0; batch < num_matrices; batch++) { size_type ridx = 0; for (const auto& row : vals) { @@ -1049,4 +1047,4 @@ std::unique_ptr batch_initialize( } // namespace gko -#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_VECTOR_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_ diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index abb50ffc09f..02248983385 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(ginkgo_omp $ "") list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_omp PRIVATE + base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp base/scoped_device_id.cpp @@ -20,7 +21,6 @@ target_sources(ginkgo_omp factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp - matrix/batch_vector_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/omp/matrix/batch_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp similarity index 77% rename from omp/matrix/batch_vector_kernels.cpp rename to omp/base/batch_multi_vector_kernels.cpp index 7ade2fcca23..96b6716f0ba 100644 --- a/omp/matrix/batch_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include @@ -49,20 +49,20 @@ namespace gko { namespace kernels { namespace omp { /** - * @brief The BatchVector matrix format namespace. - * @ref BatchVector - * @ingroup batch_vector + * @brief The BatchMultiVector matrix format namespace. 
+ * @ref BatchMultiVector + * @ingroup batch_multi_vector */ -namespace batch_vector { +namespace batch_multi_vector { -#include "reference/matrix/batch_vector_kernels.hpp.inc" +#include "reference/matrix/batch_multi_vector_kernels.hpp.inc" template void scale(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - matrix::BatchVector* const x) + const BatchMultiVector* const alpha, + BatchMultiVector* const x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); @@ -74,14 +74,15 @@ void scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchVector* const alpha, - const matrix::BatchVector* const x, - matrix::BatchVector* const y) + const BatchMultiVector* const alpha, + const BatchMultiVector* const x, + BatchMultiVector* const y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -95,14 +96,15 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchVector* const x, - const matrix::BatchVector* const y, - matrix::BatchVector* const result) + const BatchMultiVector* const x, + const BatchMultiVector* const y, + BatchMultiVector* const result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -118,13 +120,13 @@ void compute_dot(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchVector* const x, - matrix::BatchVector>* const result) + const BatchMultiVector* const x, + BatchMultiVector>* const result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); @@ -138,13 +140,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); template void copy(std::shared_ptr exec, - const matrix::BatchVector* x, - matrix::BatchVector* result) + const BatchMultiVector* x, + BatchMultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); @@ -156,10 +158,10 @@ void copy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); -} // namespace batch_vector +} // namespace batch_multi_vector } // namespace omp } // namespace kernels } // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 224fb70dc0e..074d5efe818 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -1,6 +1,7 @@ add_library(ginkgo_reference $ "") target_sources(ginkgo_reference PRIVATE + base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp base/scoped_device_id.cpp @@ -23,7 +24,6 @@ target_sources(ginkgo_reference factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp - 
matrix/batch_vector_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/reference/matrix/batch_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp similarity index 77% rename from reference/matrix/batch_vector_kernels.cpp rename to reference/base/batch_multi_vector_kernels.cpp index 01748c6e524..27f6539b9eb 100644 --- a/reference/matrix/batch_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" @@ -49,20 +49,20 @@ namespace gko { namespace kernels { namespace reference { /** - * @brief The BatchVector matrix format namespace. - * @ref BatchVector - * @ingroup batch_vector + * @brief The BatchMultiVector matrix format namespace. + * @ref BatchMultiVector + * @ingroup batch_multi_vector */ -namespace batch_vector { +namespace batch_multi_vector { -#include "reference/matrix/batch_vector_kernels.hpp.inc" +#include "reference/matrix/batch_multi_vector_kernels.hpp.inc" template void scale(std::shared_ptr exec, - const matrix::BatchVector* alpha, - matrix::BatchVector* x) + const BatchMultiVector* alpha, + BatchMultiVector* x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); @@ -73,14 +73,15 @@ void scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); template void add_scaled(std::shared_ptr exec, - const matrix::BatchVector* alpha, - const matrix::BatchVector* x, - matrix::BatchVector* y) + const BatchMultiVector* alpha, + const BatchMultiVector* x, + BatchMultiVector* y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -93,14 +94,15 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); template void compute_dot(std::shared_ptr exec, - const matrix::BatchVector* x, - const matrix::BatchVector* y, - matrix::BatchVector* result) + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -115,13 +117,13 @@ void compute_dot(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_DOT_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); template void compute_norm2(std::shared_ptr exec, - const matrix::BatchVector* x, - matrix::BatchVector>* result) + const BatchMultiVector* x, + BatchMultiVector>* result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); @@ -134,13 +136,13 @@ void compute_norm2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_VECTOR_COMPUTE_NORM2_KERNEL); + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); template void 
copy(std::shared_ptr exec, - const matrix::BatchVector* x, - matrix::BatchVector* result) + const BatchMultiVector* x, + BatchMultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); @@ -151,10 +153,10 @@ void copy(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_VECTOR_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); -} // namespace batch_vector +} // namespace batch_multi_vector } // namespace reference } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc similarity index 80% rename from reference/matrix/batch_vector_kernels.hpp.inc rename to reference/base/batch_multi_vector_kernels.hpp.inc index eb4a8cfab2a..2f9c88e53f1 100644 --- a/reference/matrix/batch_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -32,9 +32,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template inline void matvec_kernel( - const gko::batch_vector::BatchEntry& a, - const gko::batch_vector::BatchEntry& b, - const gko::batch_vector::BatchEntry& c) + const gko::batch_multi_vector::BatchEntry& a, + const gko::batch_multi_vector::BatchEntry& b, + const gko::batch_multi_vector::BatchEntry& c) { for (int row = 0; row < c.num_rows; ++row) { for (int col = 0; col < c.num_rhs; ++col) { @@ -57,9 +57,10 @@ inline void matvec_kernel( template inline void advanced_matvec_kernel( const ValueType alpha, - const gko::batch_vector::BatchEntry& a, - const gko::batch_vector::BatchEntry& b, - const ValueType beta, const gko::batch_vector::BatchEntry& c) + const gko::batch_multi_vector::BatchEntry& a, + const gko::batch_multi_vector::BatchEntry& b, + const ValueType beta, + const gko::batch_multi_vector::BatchEntry& c) { if (beta != gko::zero()) { for (int row = 0; row < c.num_rows; ++row) { @@ -88,8 +89,9 @@ inline void advanced_matvec_kernel( template -inline void scale(const gko::batch_vector::BatchEntry& alpha, - const gko::batch_vector::BatchEntry& x) +inline void scale( + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -109,9 +111,9 @@ inline void scale(const gko::batch_vector::BatchEntry& alpha, template inline void add_scaled( - const gko::batch_vector::BatchEntry& alpha, - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry& y) + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -133,10 +135,10 @@ inline void add_scaled( template inline void add_scale( - const gko::batch_vector::BatchEntry& alpha, - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry& beta, - const gko::batch_vector::BatchEntry& y) + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& beta, + const gko::batch_multi_vector::BatchEntry& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -160,8 +162,9 @@ inline void add_scale( template inline void compute_norm2( - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry>& result) + const gko::batch_multi_vector::BatchEntry& x, + const 
gko::batch_multi_vector::BatchEntry>& + result) { for (int j = 0; j < x.num_rhs; ++j) { result.values[j] = gko::zero>(); @@ -185,8 +188,8 @@ inline void compute_norm2( */ template inline void batch_scale( - const gko::batch_vector::BatchEntry& diag_vec, - const gko::batch_vector::BatchEntry& a) + const gko::batch_multi_vector::BatchEntry& diag_vec, + const gko::batch_multi_vector::BatchEntry& a) { for (int i_row = 0; i_row < a.num_rows; i_row++) { const ValueType scale = diag_vec.values[i_row]; @@ -217,8 +220,8 @@ inline void batch_scale(const int nrows, const int ncols, * and stride set. */ template -inline void copy(const gko::batch_vector::BatchEntry& in, - const gko::batch_vector::BatchEntry& out) +inline void copy(const gko::batch_multi_vector::BatchEntry& in, + const gko::batch_multi_vector::BatchEntry& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; @@ -230,9 +233,9 @@ inline void copy(const gko::batch_vector::BatchEntry& in, template inline void compute_dot_product( - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry& y, - const gko::batch_vector::BatchEntry& result) + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -249,8 +252,8 @@ inline void compute_dot_product( template inline void copy( - const gko::batch_vector::BatchEntry& source_entry, - const gko::batch_vector::BatchEntry& destination_entry, + const gko::batch_multi_vector::BatchEntry& source_entry, + const gko::batch_multi_vector::BatchEntry& destination_entry, const gko::uint32& converged) { for (int r = 0; r < source_entry.num_rows; r++) { @@ -270,9 +273,9 @@ inline void copy( template inline void add_scaled( - const gko::batch_vector::BatchEntry& alpha, - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, const gko::uint32& converged) { if (alpha.num_rhs == 1) { @@ -308,8 +311,9 @@ inline void add_scaled( template inline void compute_norm2( - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry>& result, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry>& + result, const gko::uint32& converged) { for (int j = 0; j < x.num_rhs; ++j) { @@ -346,9 +350,9 @@ inline void compute_norm2( template inline void compute_dot_product( - const gko::batch_vector::BatchEntry& x, - const gko::batch_vector::BatchEntry& y, - const gko::batch_vector::BatchEntry& result, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& result, const gko::uint32& converged) { for (int c = 0; c < result.num_rhs; c++) { @@ -379,7 +383,7 @@ inline void compute_dot_product( template inline void add_scaled_identity( const ValueType& a, const ValueType& b, - const gko::batch_vector::BatchEntry& mat) + const gko::batch_multi_vector::BatchEntry& mat) { for (int i = 0; i < mat.num_rows; i++) { for (int j = 0; j < mat.num_rhs; j++) { diff --git a/reference/matrix/batch_struct.hpp b/reference/base/batch_struct.hpp similarity index 88% rename from reference/matrix/batch_struct.hpp rename to reference/base/batch_struct.hpp index 0c07956d9d6..32c90db9d7f 100644 --- 
a/reference/matrix/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -34,11 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ -#include "core/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include #include -#include namespace gko { @@ -63,8 +63,8 @@ namespace host { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch get_batch_struct( - const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch get_batch_struct( + const BatchMultiVector* const op) { return { op->get_const_values(), @@ -80,8 +80,8 @@ inline gko::batch_vector::UniformBatch get_batch_struct( * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_vector::UniformBatch get_batch_struct( - matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch get_batch_struct( + BatchMultiVector* const op) { return { op->get_values(), @@ -98,8 +98,8 @@ inline gko::batch_vector::UniformBatch get_batch_struct( * that may be null. */ template -inline gko::batch_vector::UniformBatch maybe_null_batch_struct( - const matrix::BatchVector* const op) +inline gko::batch_multi_vector::UniformBatch +maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {op->get_const_values(), op->get_num_batch_entries(), diff --git a/reference/test/matrix/batch_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp similarity index 87% rename from reference/test/matrix/batch_vector_kernels.cpp rename to reference/test/base/batch_multi_vector_kernels.cpp index e8aaad8d584..f2062a4e393 100644 --- a/reference/test/matrix/batch_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -41,16 +41,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include +#include +#include #include #include #include -#include -#include -#include #include -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include "core/test/utils.hpp" @@ -58,15 +58,15 @@ namespace { template -class BatchVector : public ::testing::Test { +class BatchMultiVector : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::matrix::BatchVector; + using Mtx = gko::BatchMultiVector; using DenseMtx = gko::matrix::Dense; using ComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; - BatchVector() + BatchMultiVector() : exec(gko::ReferenceExecutor::create()), mtx_0(gko::batch_initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, @@ -138,10 +138,10 @@ class BatchVector : public ::testing::Test { }; -TYPED_TEST_SUITE(BatchVector, gko::test::ValueTypes); +TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); -TYPED_TEST(BatchVector, AppliesToBatchVector) +TYPED_TEST(BatchMultiVector, AppliesToBatchMultiVector) { using T = typename TestFixture::value_type; this->mtx_1->apply(this->mtx_2.get(), this->mtx_3.get()); @@ -155,7 +155,7 @@ TYPED_TEST(BatchVector, AppliesToBatchVector) } -TYPED_TEST(BatchVector, AppliesLinearCombinationToBatchVector) +TYPED_TEST(BatchMultiVector, AppliesLinearCombinationToBatchMultiVector) { using Mtx = typename TestFixture::Mtx; using DenseMtx = typename TestFixture::DenseMtx; @@ -180,7 +180,7 @@ TYPED_TEST(BatchVector, AppliesLinearCombinationToBatchVector) } -TYPED_TEST(BatchVector, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongInnerDimension) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -191,7 +191,7 @@ TYPED_TEST(BatchVector, ApplyFailsOnWrongInnerDimension) } -TYPED_TEST(BatchVector, ApplyFailsForNonUniformBatches) +TYPED_TEST(BatchMultiVector, ApplyFailsForNonUniformBatches) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -212,7 +212,7 @@ TYPED_TEST(BatchVector, ApplyFailsForNonUniformBatches) } -TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfRows) +TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongNumberOfRows) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -223,7 +223,7 @@ TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfRows) } -TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfCols) +TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongNumberOfCols) { using Mtx = typename TestFixture::Mtx; auto res = Mtx::create( @@ -237,7 +237,7 @@ TYPED_TEST(BatchVector, ApplyFailsOnWrongNumberOfCols) } -TYPED_TEST(BatchVector, ScalesData) +TYPED_TEST(BatchMultiVector, ScalesData) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -257,7 +257,7 @@ TYPED_TEST(BatchVector, ScalesData) } -TYPED_TEST(BatchVector, ScalesDataWithScalar) +TYPED_TEST(BatchMultiVector, ScalesDataWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -275,7 +275,7 @@ TYPED_TEST(BatchVector, ScalesDataWithScalar) } -TYPED_TEST(BatchVector, ScalesDataWithStride) +TYPED_TEST(BatchMultiVector, ScalesDataWithStride) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -294,7 +294,7 @@ TYPED_TEST(BatchVector, ScalesDataWithStride) } -TYPED_TEST(BatchVector, AddsScaled) +TYPED_TEST(BatchMultiVector, AddsScaled) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -313,7 +313,7 @@ 
TYPED_TEST(BatchVector, AddsScaled) } -TYPED_TEST(BatchVector, AddsScale) +TYPED_TEST(BatchMultiVector, AddsScale) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -337,7 +337,7 @@ TYPED_TEST(BatchVector, AddsScale) } -TYPED_TEST(BatchVector, ConvergenceAddScaled) +TYPED_TEST(BatchMultiVector, ConvergenceAddScaled) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -350,7 +350,7 @@ TYPED_TEST(BatchVector, ConvergenceAddScaled) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_vector::convergence_add_scaled( + gko::kernels::reference::batch_multi_vector::convergence_add_scaled( this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), converged); @@ -378,7 +378,7 @@ TYPED_TEST(BatchVector, ConvergenceAddScaled) } -TYPED_TEST(BatchVector, AddsScaledWithScalar) +TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -396,7 +396,7 @@ TYPED_TEST(BatchVector, AddsScaledWithScalar) } -TYPED_TEST(BatchVector, AddsScaleWithScalar) +TYPED_TEST(BatchMultiVector, AddsScaleWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -418,7 +418,7 @@ TYPED_TEST(BatchVector, AddsScaleWithScalar) } -TYPED_TEST(BatchVector, AddScaleWithScalarViaApply) +TYPED_TEST(BatchMultiVector, AddScaleWithScalarViaApply) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -441,7 +441,7 @@ TYPED_TEST(BatchVector, AddScaleWithScalarViaApply) } -TYPED_TEST(BatchVector, ConvergenceAddScaledWithScalar) +TYPED_TEST(BatchMultiVector, ConvergenceAddScaledWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -453,7 +453,7 @@ TYPED_TEST(BatchVector, ConvergenceAddScaledWithScalar) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_vector::convergence_add_scaled( + gko::kernels::reference::batch_multi_vector::convergence_add_scaled( this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), converged); @@ -481,7 +481,7 @@ TYPED_TEST(BatchVector, ConvergenceAddScaledWithScalar) } -TYPED_TEST(BatchVector, AddScaledFailsOnWrongSizes) +TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = @@ -492,7 +492,7 @@ TYPED_TEST(BatchVector, AddScaledFailsOnWrongSizes) } -TYPED_TEST(BatchVector, AddScaleFailsOnWrongSizes) +TYPED_TEST(BatchMultiVector, AddScaleFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); @@ -504,7 +504,7 @@ TYPED_TEST(BatchVector, AddScaleFailsOnWrongSizes) } -TYPED_TEST(BatchVector, AddScaleFailsOnWrongScalarSizes) +TYPED_TEST(BatchMultiVector, AddScaleFailsOnWrongScalarSizes) { using Mtx = typename TestFixture::Mtx; auto alpha = gko::batch_initialize( @@ -517,7 +517,7 @@ TYPED_TEST(BatchVector, AddScaleFailsOnWrongScalarSizes) } -TYPED_TEST(BatchVector, ComputesDot) +TYPED_TEST(BatchMultiVector, ComputesDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -536,7 +536,7 @@ TYPED_TEST(BatchVector, ComputesDot) } -TYPED_TEST(BatchVector, ConvergenceComputeDot) +TYPED_TEST(BatchMultiVector, ConvergenceComputeDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -555,7 +555,7 @@ 
TYPED_TEST(BatchVector, ConvergenceComputeDot) const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_vector::convergence_compute_dot( + gko::kernels::reference::batch_multi_vector::convergence_compute_dot( this->exec, this->mtx_0.get(), this->mtx_1.get(), result.get(), converged); @@ -577,12 +577,12 @@ TYPED_TEST(BatchVector, ConvergenceComputeDot) } -TYPED_TEST(BatchVector, ComputesNorm2) +TYPED_TEST(BatchMultiVector, ComputesNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using T_nc = gko::remove_complex; - using NormVector = gko::matrix::BatchVector; + using NormVector = gko::BatchMultiVector; auto mtx(gko::batch_initialize( {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, @@ -601,12 +601,12 @@ TYPED_TEST(BatchVector, ComputesNorm2) } -TYPED_TEST(BatchVector, ConvergenceComputeNorm2) +TYPED_TEST(BatchMultiVector, ConvergenceComputeNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using T_nc = gko::remove_complex; - using NormVector = gko::matrix::BatchVector; + using NormVector = gko::BatchMultiVector; auto mtx(gko::batch_initialize( {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, @@ -628,7 +628,7 @@ TYPED_TEST(BatchVector, ConvergenceComputeNorm2) const int num_rhs = 2; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_vector::convergence_compute_norm2( + gko::kernels::reference::batch_multi_vector::convergence_compute_norm2( this->exec, mtx.get(), result.get(), converged); EXPECT_EQ(result->at(0, 0, 0), result_clone->at(0, 0, 0)); @@ -639,7 +639,7 @@ TYPED_TEST(BatchVector, ConvergenceComputeNorm2) } -TYPED_TEST(BatchVector, ComputDotFailsOnWrongInputSize) +TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -651,7 +651,7 @@ TYPED_TEST(BatchVector, ComputDotFailsOnWrongInputSize) } -TYPED_TEST(BatchVector, ComputDotFailsOnWrongResultSize) +TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -667,22 +667,22 @@ TYPED_TEST(BatchVector, ComputDotFailsOnWrongResultSize) } -TYPED_TEST(BatchVector, CopiesData) +TYPED_TEST(BatchMultiVector, CopiesData) { - gko::kernels::reference::batch_vector::copy(this->exec, this->mtx_0.get(), - this->mtx_1.get()); + gko::kernels::reference::batch_multi_vector::copy( + this->exec, this->mtx_0.get(), this->mtx_1.get()); GKO_ASSERT_BATCH_MTX_NEAR(this->mtx_1.get(), this->mtx_0.get(), 0.); } -TYPED_TEST(BatchVector, ConvergenceCopyData) +TYPED_TEST(BatchMultiVector, ConvergenceCopyData) { auto umtx_0 = this->mtx_0->unbatch(); const int num_rhs = 3; const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_vector::convergence_copy( + gko::kernels::reference::batch_multi_vector::convergence_copy( this->exec, this->mtx_0.get(), this->mtx_1.get(), converged); auto mtx_10_clone = gko::clone(this->mtx_10); @@ -706,7 +706,7 @@ TYPED_TEST(BatchVector, ConvergenceCopyData) } -TYPED_TEST(BatchVector, BatchScale) +TYPED_TEST(BatchMultiVector, BatchScale) { using T = typename TestFixture::value_type; using Mtx = typename TestFixture::Mtx; @@ -722,8 +722,8 @@ TYPED_TEST(BatchVector, BatchScale) auto rght(gko::batch_diagonal_initialize( I>{I{-0.5, -2.0}, I{2.0, 0.25}}, this->exec)); - 
gko::kernels::reference::batch_vector::batch_scale(this->exec, left.get(), - rght.get(), mtx.get()); + gko::kernels::reference::batch_multi_vector::batch_scale( + this->exec, left.get(), rght.get(), mtx.get()); EXPECT_EQ(mtx->at(0, 0, 0), T{-0.5}); EXPECT_EQ(mtx->at(0, 1, 0), T{-2.0}); @@ -741,14 +741,14 @@ TYPED_TEST(BatchVector, BatchScale) } -TYPED_TEST(BatchVector, ConvertsToPrecision) +TYPED_TEST(BatchMultiVector, ConvertsToPrecision) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchVector = typename gko::matrix::BatchVector; - auto tmp = OtherBatchVector::create(this->exec); - auto res = BatchVector::create(this->exec); + using OtherBatchMultiVector = typename gko::BatchMultiVector; + auto tmp = OtherBatchMultiVector::create(this->exec); + auto res = BatchMultiVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} @@ -764,14 +764,14 @@ TYPED_TEST(BatchVector, ConvertsToPrecision) } -TYPED_TEST(BatchVector, MovesToPrecision) +TYPED_TEST(BatchMultiVector, MovesToPrecision) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchVector = typename gko::matrix::BatchVector; - auto tmp = OtherBatchVector::create(this->exec); - auto res = BatchVector::create(this->exec); + using OtherBatchMultiVector = typename gko::BatchMultiVector; + auto tmp = OtherBatchMultiVector::create(this->exec); + auto res = BatchMultiVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} @@ -787,7 +787,7 @@ TYPED_TEST(BatchVector, MovesToPrecision) } -TYPED_TEST(BatchVector, ConvertsToCsr32) +TYPED_TEST(BatchMultiVector, ConvertsToCsr32) { using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; @@ -824,7 +824,7 @@ TYPED_TEST(BatchVector, ConvertsToCsr32) } -TYPED_TEST(BatchVector, MovesToCsr32) +TYPED_TEST(BatchMultiVector, MovesToCsr32) { using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; @@ -861,14 +861,14 @@ TYPED_TEST(BatchVector, MovesToCsr32) } -TYPED_TEST(BatchVector, ConvertsEmptyToPrecision) +TYPED_TEST(BatchMultiVector, ConvertsEmptyToPrecision) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchVector = typename gko::matrix::BatchVector; - auto empty = OtherBatchVector::create(this->exec); - auto res = BatchVector::create(this->exec); + using OtherBatchMultiVector = typename gko::BatchMultiVector; + auto empty = OtherBatchMultiVector::create(this->exec); + auto res = BatchMultiVector::create(this->exec); empty->convert_to(res.get()); @@ -876,14 +876,14 @@ TYPED_TEST(BatchVector, ConvertsEmptyToPrecision) } -TYPED_TEST(BatchVector, MovesEmptyToPrecision) +TYPED_TEST(BatchMultiVector, MovesEmptyToPrecision) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchVector = typename gko::matrix::BatchVector; - auto empty = OtherBatchVector::create(this->exec); - auto res = BatchVector::create(this->exec); + using OtherBatchMultiVector = typename gko::BatchMultiVector; + auto empty = OtherBatchMultiVector::create(this->exec); + auto res = BatchMultiVector::create(this->exec); empty->move_to(res.get()); @@ -891,12 +891,12 @@ TYPED_TEST(BatchVector, MovesEmptyToPrecision) } -TYPED_TEST(BatchVector, ConvertsEmptyMatrixToCsr) +TYPED_TEST(BatchMultiVector, ConvertsEmptyMatrixToCsr) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchVector::create(this->exec); + auto empty = BatchMultiVector::create(this->exec); auto res = BatchCsr::create(this->exec); empty->convert_to(res.get()); @@ -907,12 +907,12 @@ TYPED_TEST(BatchVector, ConvertsEmptyMatrixToCsr) } -TYPED_TEST(BatchVector, MovesEmptyMatrixToCsr) +TYPED_TEST(BatchMultiVector, MovesEmptyMatrixToCsr) { - using BatchVector = typename TestFixture::Mtx; + using BatchMultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchVector::create(this->exec); + auto empty = BatchMultiVector::create(this->exec); auto res = BatchCsr::create(this->exec); empty->move_to(res.get()); @@ -923,7 +923,7 @@ TYPED_TEST(BatchVector, MovesEmptyMatrixToCsr) } -TYPED_TEST(BatchVector, ConvertsToBatchDiagonal) +TYPED_TEST(BatchMultiVector, ConvertsToBatchDiagonal) { using BDense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -946,7 +946,7 @@ TYPED_TEST(BatchVector, ConvertsToBatchDiagonal) } -TYPED_TEST(BatchVector, MovesToBatchDiagonal) +TYPED_TEST(BatchMultiVector, MovesToBatchDiagonal) { using BDense = typename TestFixture::Mtx; using T = typename 
TestFixture::value_type; @@ -967,13 +967,13 @@ TYPED_TEST(BatchVector, MovesToBatchDiagonal) } -TYPED_TEST(BatchVector, SquareMatrixIsTransposable) +TYPED_TEST(BatchMultiVector, SquareMatrixIsTransposable) { using Mtx = typename TestFixture::Mtx; auto trans = this->mtx_4->transpose(); - auto trans_as_batch_vector = static_cast(trans.get()); + auto trans_as_batch_multi_vector = static_cast(trans.get()); - auto utb = trans_as_batch_vector->unbatch(); + auto utb = trans_as_batch_multi_vector->unbatch(); GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 6.0}, {1.5, 1.0, 1.0}, {3.0, 5.0, 5.5}}), r::value); @@ -983,13 +983,13 @@ TYPED_TEST(BatchVector, SquareMatrixIsTransposable) } -TYPED_TEST(BatchVector, NonSquareMatrixIsTransposable) +TYPED_TEST(BatchMultiVector, NonSquareMatrixIsTransposable) { using Mtx = typename TestFixture::Mtx; auto trans = this->mtx_5->transpose(); - auto trans_as_batch_vector = static_cast(trans.get()); + auto trans_as_batch_multi_vector = static_cast(trans.get()); - auto utb = trans_as_batch_vector->unbatch(); + auto utb = trans_as_batch_multi_vector->unbatch(); GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 7.0}, {1.5, 1.0, -4.5}}), r::value); GKO_ASSERT_MTX_NEAR(utb[1].get(), l({{2.0, 1.0, 4.0}, {-2.0, 3.0, 3.0}}), @@ -997,7 +997,7 @@ TYPED_TEST(BatchVector, NonSquareMatrixIsTransposable) } -TYPED_TEST(BatchVector, SquareMatrixAddScaledIdentity) +TYPED_TEST(BatchMultiVector, SquareMatrixAddScaledIdentity) { using T = typename TestFixture::value_type; using Mtx = typename TestFixture::Mtx; diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index 80026fdabe1..a80be354878 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_common_and_reference_test(batch_multi_vector_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) diff --git a/test/matrix/batch_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp similarity index 81% rename from test/matrix/batch_vector_kernels.cpp rename to test/base/batch_multi_vector_kernels.cpp index 150f02a3772..e16607db844 100644 --- a/test/matrix/batch_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_vector_kernels.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" #include @@ -40,9 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include -#include -#include #include "core/test/utils.hpp" @@ -53,14 +52,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef GKO_COMPILING_DPCPP -class BatchVector : public CommonTestFixture { +class BatchMultiVector : public CommonTestFixture { protected: using vtype = double; - using Mtx = gko::matrix::BatchVector; - using NormVector = gko::matrix::BatchVector>; - using ComplexMtx = gko::matrix::BatchVector>; + using Mtx = gko::BatchMultiVector; + using NormVector = gko::BatchMultiVector>; + using ComplexMtx = gko::BatchMultiVector>; - BatchVector() : rand_engine(15) {} + BatchMultiVector() : rand_engine(15) {} template std::unique_ptr gen_mtx(const size_t batchsize, int num_rows, @@ -145,7 +144,7 @@ class BatchVector : public CommonTestFixture { }; -TEST_F(BatchVector, SingleVectorAppyIsEquivalentToRef) +TEST_F(BatchMultiVector, SingleVectorAppyIsEquivalentToRef) { set_up_apply_data(1); @@ -156,7 +155,7 @@ TEST_F(BatchVector, SingleVectorAppyIsEquivalentToRef) } -TEST_F(BatchVector, SingleVectorAdvancedAppyIsEquivalentToRef) +TEST_F(BatchMultiVector, SingleVectorAdvancedAppyIsEquivalentToRef) { set_up_apply_data(1); @@ -167,7 +166,7 @@ TEST_F(BatchVector, SingleVectorAdvancedAppyIsEquivalentToRef) } -TEST_F(BatchVector, SingleVectorAddScaledIsEquivalentToRef) +TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(1); @@ -178,7 +177,7 @@ TEST_F(BatchVector, SingleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchVector, SingleVectorAddScaleIsEquivalentToRef) +TEST_F(BatchMultiVector, SingleVectorAddScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -189,7 +188,7 @@ TEST_F(BatchVector, SingleVectorAddScaleIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorAddScaledIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(20); @@ -200,7 +199,7 @@ TEST_F(BatchVector, MultipleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorAddScaleIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorAddScaleIsEquivalentToRef) { set_up_vector_data(20); @@ -211,7 +210,7 @@ TEST_F(BatchVector, MultipleVectorAddScaleIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -222,7 +221,7 @@ TEST_F(BatchVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) { set_up_vector_data(20, true); @@ -233,7 +232,7 @@ TEST_F(BatchVector, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) } -TEST_F(BatchVector, SingleVectorScaleIsEquivalentToRef) +TEST_F(BatchMultiVector, SingleVectorScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -244,7 +243,7 @@ TEST_F(BatchVector, SingleVectorScaleIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorScaleIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorScaleIsEquivalentToRef) { set_up_vector_data(20); @@ -255,7 +254,7 @@ TEST_F(BatchVector, MultipleVectorScaleIsEquivalentToRef) } -TEST_F(BatchVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) +TEST_F(BatchMultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -266,7 +265,7 @@ TEST_F(BatchVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) } -TEST_F(BatchVector, ComputeNorm2SingleIsEquivalentToRef) +TEST_F(BatchMultiVector, ComputeNorm2SingleIsEquivalentToRef) { set_up_vector_data(1); auto norm_size 
= @@ -281,7 +280,7 @@ TEST_F(BatchVector, ComputeNorm2SingleIsEquivalentToRef) } -TEST_F(BatchVector, ComputeNorm2IsEquivalentToRef) +TEST_F(BatchMultiVector, ComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); auto norm_size = @@ -296,7 +295,7 @@ TEST_F(BatchVector, ComputeNorm2IsEquivalentToRef) } -TEST_F(BatchVector, ComputeDotIsEquivalentToRef) +TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) { set_up_vector_data(20); auto dot_size = @@ -311,7 +310,7 @@ TEST_F(BatchVector, ComputeDotIsEquivalentToRef) } -TEST_F(BatchVector, ComputeDotSingleIsEquivalentToRef) +TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) { set_up_vector_data(1); auto dot_size = @@ -326,31 +325,31 @@ TEST_F(BatchVector, ComputeDotSingleIsEquivalentToRef) } -TEST_F(BatchVector, CopySingleIsEquivalentToRef) +TEST_F(BatchMultiVector, CopySingleIsEquivalentToRef) { set_up_vector_data(1); - gko::kernels::reference::batch_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), + dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } -TEST_F(BatchVector, CopyIsEquivalentToRef) +TEST_F(BatchMultiVector, CopyIsEquivalentToRef) { set_up_vector_data(20); - gko::kernels::reference::batch_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), + dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } -TEST_F(BatchVector, BatchScaleIsEquivalentToRef) +TEST_F(BatchMultiVector, BatchScaleIsEquivalentToRef) { using BDiag = gko::matrix::BatchDiagonal; const int num_rhs = 20; @@ -365,16 +364,16 @@ TEST_F(BatchVector, BatchScaleIsEquivalentToRef) auto drght = BDiag::create(this->exec); drght->copy_from(rght.get()); - gko::kernels::reference::batch_vector::batch_scale(this->ref, left.get(), - rght.get(), x.get()); - gko::kernels::EXEC_NAMESPACE::batch_vector::batch_scale( + gko::kernels::reference::batch_multi_vector::batch_scale(this->ref, left.get(), + rght.get(), x.get()); + gko::kernels::EXEC_NAMESPACE::batch_multi_vector::batch_scale( this->exec, dleft.get(), drght.get(), dx.get()); GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); } -TEST_F(BatchVector, TransposeIsEquivalentToRef) +TEST_F(BatchMultiVector, TransposeIsEquivalentToRef) { const int nrows = 11; const int ncols = 6; @@ -392,7 +391,7 @@ TEST_F(BatchVector, TransposeIsEquivalentToRef) } -TEST_F(BatchVector, ConjugateTransposeIsEquivalentToRef) +TEST_F(BatchMultiVector, ConjugateTransposeIsEquivalentToRef) { const int nrows = 11; const int ncols = 6; @@ -410,7 +409,7 @@ TEST_F(BatchVector, ConjugateTransposeIsEquivalentToRef) } -TEST_F(BatchVector, AddScaledIdentityNonSquareIsEquivalentToReference) +TEST_F(BatchMultiVector, AddScaledIdentityNonSquareIsEquivalentToReference) { set_up_apply_data(); const gko::size_type batchsize = 10; From 92c3a7289fbb60b8de06899622eadbee08d57ac8 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 7 Jul 2023 18:39:25 +0200 Subject: [PATCH 108/583] Updates to BatchMultiVector --- include/ginkgo/core/base/batch_dim.hpp | 12 + .../ginkgo/core/base/batch_multi_vector.hpp | 294 ++++++------------ 2 files changed, 108 insertions(+), 198 deletions(-) diff --git 
a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index 211225d7df2..3e650745a50 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -74,6 +74,18 @@ struct batch_dim { return common_size_; } + /** + * Get the cumulative storage size offset + * + * @param b the batch id + * + * @return the cumulative offset + */ + size_type get_cumulative_offset(size_type b) const + { + return b * common_size_[0] * common_size_[1]; + } + /** * Checks if two batch_dim objects are equal. * diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index a4dafd75faa..a4860e2c7b3 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -62,8 +62,6 @@ namespace gko { * @note While this format is not very useful for storing sparse matrices, it * is often suitable to store vectors, and sets of vectors. * @ingroup batch_multi_vector - * @ingroup mat_formats - * @ingroup BatchLinOp */ template class BatchMultiVector @@ -132,17 +130,41 @@ class BatchMultiVector auto exec = this->get_executor(); auto unbatch_mats = std::vector>{}; for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { - auto mat = unbatch_type::create(exec, this->get_size().at(b), - this->get_stride().at(b)); + auto mat = unbatch_type::create(exec, this->get_common_size(), + this->get_common_size()[1]); exec->copy_from(exec.get(), mat->get_num_stored_elements(), this->get_const_values() + - num_elems_per_batch_cumul_.get_const_data()[b], + this->get_size().get_cumulative_offset(b), mat->get_values()); unbatch_mats.emplace_back(std::move(mat)); } return unbatch_mats; } + /** + * Returns the batch size. + * + * @return the batch size + */ + batch_dim<2> get_size() { return batch_size_; } + + /** + * Returns the number of batch entries. + * + * @return the number of batch entries + */ + size_type get_num_batch_entries() + { + return batch_size_.get_num_batch_entries(); + } + + /** + * Returns the common size of the batch entries. + * + * @return the common size stored + */ + dim<2> get_common_size() { return batch_size_.get_common_size(); } + /** * Returns a pointer to the array of values of the vector. * @@ -158,8 +180,9 @@ class BatchMultiVector value_type* get_values(size_type batch) noexcept { GKO_ASSERT(batch < this->get_num_batch_entries()); + // TODO Verify return values_.get_data() + - num_elems_per_batch_cumul_.get_const_data()[batch]; + this->get_size().get_cumulative_offset(batch); } /** @@ -185,7 +208,7 @@ class BatchMultiVector { GKO_ASSERT(batch < this->get_num_batch_entries()); return values_.get_const_data() + - num_elems_per_batch_cumul_.get_const_data()[batch]; + this->get_size().get_cumulative_offset(batch); } /** @@ -200,21 +223,6 @@ class BatchMultiVector return values_.get_num_elems(); } - /** - * Returns the number of elements explicitly stored at a specific batch - * index. - * - * @param batch the batch index to be queried - * - * @return the number of elements explicitly stored in the vector - */ - size_type get_num_stored_elements(size_type batch) const noexcept - { - GKO_ASSERT(batch < this->get_num_batch_entries()); - return num_elems_per_batch_cumul_.get_const_data()[batch + 1] - - num_elems_per_batch_cumul_.get_const_data()[batch]; - } - /** * Returns a single element for a particular batch. * @@ -226,7 +234,7 @@ class BatchMultiVector * stored at (e.g. 
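As a rough, standalone illustration (not part of the patch): the new get_cumulative_offset assumes every batch entry shares the same dense, row-major layout, so entry b starts at offset b * rows * cols in the flat value array. A minimal sketch of that arithmetic, using hypothetical sizes:

// Sketch only: a uniform batch of 4 entries, each 3x2, stored contiguously
// in row-major order with no padding between entries.
#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t num_batch_entries = 4;
    const std::size_t rows = 3;
    const std::size_t cols = 2;

    // Mirrors the arithmetic of batch_dim<2>::get_cumulative_offset(b)
    // from the hunk above.
    auto cumulative_offset = [&](std::size_t b) { return b * rows * cols; };

    assert(cumulative_offset(0) == 0);   // first entry starts at the front
    assert(cumulative_offset(2) == 12);  // two full 3x2 entries precede it
    // The offset one past the last entry equals the total storage size.
    assert(cumulative_offset(num_batch_entries) == 24);
    return 0;
}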
trying to call this method on a GPU matrix from * the OMP results in a runtime error) */ - value_type& at(size_type batch, size_type row, size_type col) noexcept + value_type& at(size_type batch, size_type row, size_type col) { GKO_ASSERT(batch < this->get_num_batch_entries()); return values_.get_data()[linearize_index(batch, row, col)]; @@ -235,7 +243,7 @@ class BatchMultiVector /** * @copydoc BatchMultiVector::at(size_type, size_type, size_type) */ - value_type at(size_type batch, size_type row, size_type col) const noexcept + value_type at(size_type batch, size_type row, size_type col) const { GKO_ASSERT(batch < this->get_num_batch_entries()); return values_.get_const_data()[linearize_index(batch, row, col)]; @@ -278,7 +286,7 @@ class BatchMultiVector * of alpha (the number of columns of alpha has to match the number of * columns of the matrix). */ - void scale(const BatchLinOp* alpha) + void scale(const BatchMultiVector* alpha) { auto exec = this->get_executor(); this->scale_impl(make_temporary_clone(exec, alpha).get()); @@ -294,7 +302,7 @@ class BatchMultiVector * vector). * @param b a matrix of the same dimension as this */ - void add_scaled(const BatchLinOp* alpha, const BatchLinOp* b) + void add_scaled(const BatchMultiVector* alpha, const BatchMultiVector* b) { auto exec = this->get_executor(); this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), @@ -313,8 +321,8 @@ class BatchMultiVector * @param beta Scalar(s), of the same size as alpha, to multiply this * matrix. */ - void add_scale(const BatchLinOp* alpha, const BatchLinOp* a, - const BatchLinOp* beta); + void add_scale(const BatchMultiVector* alpha, const BatchMultiVector* a, + const BatchMultiVector* beta); /** * Computes the column-wise dot product of each matrix in this batch and its @@ -326,7 +334,7 @@ class BatchMultiVector * product (the number of column in the vector must match the number of * columns of this) */ - void compute_dot(const BatchLinOp* b, BatchLinOp* result) const + void compute_dot(const BatchMultiVector* b, BatchMultiVector* result) const { auto exec = this->get_executor(); this->compute_dot_impl(make_temporary_clone(exec, b).get(), @@ -340,7 +348,7 @@ class BatchMultiVector * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm2(BatchLinOp* result) const + void compute_norm2(BatchMultiVector* result) const { auto exec = this->get_executor(); this->compute_norm2_impl(make_temporary_clone(exec, result).get()); @@ -359,95 +367,28 @@ class BatchMultiVector */ static std::unique_ptr create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - gko::detail::const_array_view&& values, - const batch_stride& strides) + gko::detail::const_array_view&& values) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. return std::unique_ptr(new BatchMultiVector{ - exec, sizes, gko::detail::array_const_cast(std::move(values)), - strides}); + exec, sizes, gko::detail::array_const_cast(std::move(values))}); } private: - /** - * Compute the memory required for the values array from the sizes and the - * strides. 
- */ - inline size_type compute_batch_mem(const batch_dim<2>& sizes, - const batch_stride& strides) - { - GKO_ASSERT(sizes.get_num_batch_entries() == - strides.get_num_batch_entries()); - if (sizes.stores_equal_sizes() && strides.stores_equal_strides()) { - return (sizes.at(0))[0] * strides.at(0) * - sizes.get_num_batch_entries(); - } - size_type mem_req = 0; - for (auto i = 0; i < sizes.get_num_batch_entries(); ++i) { - mem_req += (sizes.at(i))[0] * strides.at(i); - } - return mem_req; - } - - /** - * Extract the nth dim of the batch sizes from the input batch_dim object. - */ - inline batch_stride extract_nth_dim(const int dim, const batch_dim<2>& size) - { - if (size.stores_equal_sizes()) { - return batch_stride(size.get_num_batch_entries(), size.at(0)[dim]); - } - std::vector stride(size.get_num_batch_entries()); - for (auto i = 0; i < size.get_num_batch_entries(); ++i) { - stride[i] = (size.at(i))[dim]; - } - return batch_stride(stride); - } - - /** - * Extract strides from the vector of the distinct Dense matrices. - */ - inline batch_stride get_strides_from_mtxs( - const std::vector*> mtxs) + inline batch_dim<2> compute_batch_size( + const std::vector*>& matrices) { - auto strides = std::vector(mtxs.size()); - for (auto i = 0; i < mtxs.size(); ++i) { - strides[i] = mtxs[i]->get_stride(); + auto common_size = matrices[0]->get_size(); + for (int i = 1; i < matrices.size(); ++i) { + GKO_ASSERT_EQ(common_size, matrices[i]->get_size()); } - return batch_stride(strides); + return batch_dim<2>{num_entries, common_size}; } - /** - * Extract sizes from the vector of the distinct Dense matrices. - */ - inline batch_dim<2> get_sizes_from_mtxs( - const std::vector*> mtxs) + inline size_type compute_num_elems(const batch_dim<2>& size) { - auto sizes = std::vector>(mtxs.size()); - for (auto i = 0; i < mtxs.size(); ++i) { - sizes[i] = mtxs[i]->get_size(); - } - return batch_dim<2>(sizes); - } - - /** - * Compute the number of elements stored in each batch and store it in a - * prefixed sum fashion - */ - inline array compute_num_elems_per_batch_cumul( - std::shared_ptr exec, const batch_dim<2>& sizes, - const batch_stride& strides) - { - auto num_elems = array(exec->get_master(), - sizes.get_num_batch_entries() + 1); - num_elems.get_data()[0] = 0; - for (auto i = 0; i < sizes.get_num_batch_entries(); ++i) { - num_elems.get_data()[i + 1] = - num_elems.get_data()[i] + (sizes.at(i))[0] * strides.at(i); - } - num_elems.set_executor(exec); - return num_elems; + return size.get_cumulative_offset(size.get_num_batch_entries()); } protected: @@ -459,31 +400,11 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size = batch_dim<2>{}) - : BatchMultiVector(std::move(exec), size, - size.get_num_batch_entries() > 0 - ? extract_nth_dim(1, size) - : batch_stride{}) + : batch_size_(size), + values_(exec, compute_num_elems(size)), + exec(std::move(exec)) {} - /** - * Creates an uninitialized BatchMultiVector matrix of the specified size. - * - * @param exec Executor associated to the vector - * @param size size of the batch matrices in a batch_dim object - * @param stride stride of the rows (i.e. 
offset between the first - * elements of two consecutive rows, expressed as the - * number of matrix elements) - */ - BatchMultiVector(std::shared_ptr exec, - const batch_dim<2>& size, const batch_stride& stride) - : EnableBatchLinOp(exec, size), - values_(exec, compute_batch_mem(size, stride)), - stride_(stride) - { - num_elems_per_batch_cumul_ = - compute_num_elems_per_batch_cumul(exec, this->get_size(), stride); - } - /** * Creates a BatchMultiVector matrix from an already allocated (and * initialized) array. @@ -503,21 +424,13 @@ class BatchMultiVector */ template BatchMultiVector(std::shared_ptr exec, - const batch_dim<2>& size, ValuesArray&& values, - const batch_stride& stride) - : EnableBatchLinOp(exec, size), + const batch_dim<2>& size, ValuesArray&& values) + : batch_size_(size), values_{exec, std::forward(values)}, - stride_{stride}, - num_elems_per_batch_cumul_( - exec->get_master(), - compute_num_elems_per_batch_cumul(exec->get_master(), - this->get_size(), stride)) + exec_(std::move(exec)) { - auto num_elems = - num_elems_per_batch_cumul_ - .get_const_data()[num_elems_per_batch_cumul_.get_num_elems() - - 1] - - 1; + // Ensure that the values array has the correct size + auto num_elems = compute_num_elems(size); GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems()); } @@ -529,20 +442,16 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const std::vector*>& matrices) - : EnableBatchLinOp(exec, - get_sizes_from_mtxs(matrices)), - stride_{get_strides_from_mtxs(matrices)}, - values_(exec, compute_batch_mem(this->get_size(), stride_)) + : batch_size_{compute_batch_size(matrices)}, + values(exec, compute_num_elems(batch_size_)), + exec(std::move(exec)) { - num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( - exec->get_master(), this->get_size(), stride_); for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { auto local_exec = matrices[i]->get_executor(); - exec->copy_from(local_exec.get(), - matrices[i]->get_num_stored_elements(), - matrices[i]->get_const_values(), - this->get_values() + - num_elems_per_batch_cumul_.get_const_data()[i]); + exec->copy_from( + local_exec.get(), matrices[i]->get_num_stored_elements(), + matrices[i]->get_const_values(), + this->get_values() + this->get_size().get_cumulative_offset(i)); } } @@ -556,18 +465,11 @@ class BatchMultiVector BatchMultiVector(std::shared_ptr exec, size_type num_duplications, const BatchMultiVector* input) - : EnableBatchLinOp( + : EnableBatchMultiVector( exec, gko::batch_dim<2>( input->get_num_batch_entries() * num_duplications, - input->get_size().at(0))), - stride_{gko::batch_stride( - input->get_num_batch_entries() * num_duplications, - input->get_stride().at(0))}, - values_(exec, compute_batch_mem(this->get_size(), stride_)) + input->get_common_size())) { - // Check if it works when stride neq num_cols - num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( - exec->get_master(), this->get_size(), stride_); size_type offset = 0; for (size_type i = 0; i < num_duplications; ++i) { exec->copy_from( @@ -586,14 +488,9 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, size_type num_duplications, const Dense* input) - : EnableBatchLinOp( - exec, gko::batch_dim<2>(num_duplications, input->get_size())), - stride_{gko::batch_stride(num_duplications, input->get_stride())}, - values_(exec, compute_batch_mem(this->get_size(), stride_)) + : EnableBatchMultiVector( + exec, gko::batch_dim<2>(num_duplications, input->get_size())) { - // Check if it works when stride neq 
num_cols - num_elems_per_batch_cumul_ = compute_num_elems_per_batch_cumul( - exec->get_master(), this->get_size(), stride_); size_type offset = 0; for (size_type i = 0; i < num_duplications; ++i) { exec->copy_from( @@ -612,61 +509,62 @@ class BatchMultiVector */ virtual std::unique_ptr create_with_same_config() const { - return BatchMultiVector::create(this->get_executor(), this->get_size(), - this->get_stride()); + return BatchMultiVector::create(this->get_executor(), this->get_size()); } /** - * @copydoc scale(const BatchLinOp *) + * @copydoc scale(const BatchMultiVector *) * * @note Other implementations of batch_multi_vector should override this - * function instead of scale(const BatchLinOp *alpha). + * function instead of scale(const BatchMultiVector *alpha). */ - virtual void scale_impl(const BatchLinOp* alpha); + virtual void scale_impl(const BatchMultiVector* alpha); /** - * @copydoc add_scaled(const BatchLinOp *, const BatchLinOp *) + * @copydoc add_scaled(const BatchMultiVector *, const BatchMultiVector *) * * @note Other implementations of batch_multi_vector should override this - * function instead of add_scale(const BatchLinOp *alpha, const BatchLinOp - * *b). + * function instead of add_scale(const BatchMultiVector *alpha, const + * BatchMultiVector *b). */ - virtual void add_scaled_impl(const BatchLinOp* alpha, const BatchLinOp* b); + virtual void add_scaled_impl(const BatchMultiVector* alpha, + const BatchMultiVector* b); /** - * @copydoc compute_dot(const BatchLinOp *, BatchLinOp *) const + * @copydoc compute_dot(const BatchMultiVector *, BatchMultiVector *) const * * @note Other implementations of batch_multi_vector should override this - * function instead of compute_dot(const BatchLinOp *b, BatchLinOp *result). + * function instead of compute_dot(const BatchMultiVector *b, + * BatchMultiVector *result). */ - virtual void compute_dot_impl(const BatchLinOp* b, - BatchLinOp* result) const; + virtual void compute_dot_impl(const BatchMultiVector* b, + BatchMultiVector* result) const; /** - * @copydoc compute_norm2(BatchLinOp *) const + * @copydoc compute_norm2(BatchMultiVector *) const * * @note Other implementations of batch_multi_vector should override this - * function instead of compute_norm2(BatchLinOp *result). + * function instead of compute_norm2(BatchMultiVector *result). 
*/ - virtual void compute_norm2_impl(BatchLinOp* result) const; + virtual void compute_norm2_impl(BatchMultiVector* result) const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept { - return num_elems_per_batch_cumul_.get_const_data()[batch] + - row * stride_.at(batch) + col; + return batch_size_.get_cumulative_offset(batch) + + row * batch_size_.get_common_size()[1] + col; } size_type linearize_index(size_type batch, size_type idx) const noexcept { - return linearize_index(batch, idx / this->get_size().at(batch)[1], - idx % this->get_size().at(batch)[1]); + return linearize_index(batch, idx / this->get_common_size()[1], + idx % this->get_common_size()[1]); } private: - batch_stride stride_; - array num_elems_per_batch_cumul_; + batch_dim<2> batch_size_; array values_; + std::shared_ptr exec; }; @@ -688,7 +586,7 @@ class BatchMultiVector * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template @@ -743,7 +641,7 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template @@ -776,7 +674,7 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template @@ -840,7 +738,7 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template @@ -883,7 +781,7 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template @@ -936,7 +834,7 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchLinOp + * @ingroup BatchMultiVector * @ingroup mat_formats */ template From 3979e1bdab4b8c0ca7c22f26025337e957a2c1eb Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 9 Jul 2023 00:38:11 +0200 Subject: [PATCH 109/583] Use PolymorphicObject, fix batch_initialize --- core/base/batch_multi_vector.cpp | 73 ++--- .../ginkgo/core/base/batch_multi_vector.hpp | 310 ++++++------------ 2 files changed, 127 insertions(+), 256 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 76639494088..cc83638ee92 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -64,78 +64,67 @@ GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); template -void BatchMultiVector::scale_impl(const BatchLinOp* alpha) +void BatchMultiVector::scale_impl( + const BatchMultiVector* alpha) { - auto batch_alpha = as>(alpha); GKO_ASSERT_BATCH_EQUAL_ROWS( - batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); - for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { - if (batch_alpha->get_size().at(b)[1] != 1) { + alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { + if (alpha->get_common_size()[1] != 1) { // different alpha for each column - GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); + GKO_ASSERT_BATCH_EQUAL_COLS(this, alpha); } } - auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_scale(batch_alpha, this)); + 
this->get_executor()->run(batch_multi_vector::make_scale(alpha, this)); } template -void BatchMultiVector::add_scaled_impl(const BatchLinOp* alpha, - const BatchLinOp* b) +void BatchMultiVector::add_scaled_impl( + const BatchMultiVector* alpha, + const BatchMultiVector* b) { - auto batch_alpha = as>(alpha); - auto batch_b = as>(b); GKO_ASSERT_BATCH_EQUAL_ROWS( - batch_alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); - for (size_type b = 0; b < batch_alpha->get_num_batch_entries(); ++b) { - if (batch_alpha->get_size().at(b)[1] != 1) { + alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { + if (alpha->get_common_size()[1] != 1) { // different alpha for each column - GKO_ASSERT_BATCH_EQUAL_COLS(this, batch_alpha); + GKO_ASSERT_BATCH_EQUAL_COLS(this, alpha); } } - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); - auto exec = this->get_executor(); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, b); - exec->run(batch_multi_vector::make_add_scaled(batch_alpha, batch_b, this)); + this->get_executor()->run( + batch_multi_vector::make_add_scaled(alpha, b, this)); } inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) { - auto col_sizes = std::vector>(sizes.get_num_batch_entries()); - for (size_type i = 0; i < col_sizes.size(); ++i) { - col_sizes[i] = dim<2>(1, sizes.at(i)[1]); - } - return batch_dim<2>(col_sizes); + return batch_dim<2>(sizes.get_num_batch_entries(), dim<2>(1, sizes[1])); } template -void BatchMultiVector::compute_dot_impl(const BatchLinOp* b, - BatchLinOp* result) const +void BatchMultiVector::compute_dot_impl( + const BatchMultiVector* b, + BatchMultiVector* result) const { - auto batch_result = as>(result); - auto batch_b = as>(b); - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, batch_b); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, b); GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, get_col_sizes(this->get_size())); - auto exec = this->get_executor(); - exec->run( - batch_multi_vector::make_compute_dot(this, batch_b, batch_result)); + this->get_executor()->run( + batch_multi_vector::make_compute_dot(this, b, result)); } template -void BatchMultiVector::compute_norm2_impl(BatchLinOp* result) const +void BatchMultiVector::compute_norm2_impl( + BatchMultiVector>* result) const { - using NormVector = BatchMultiVector>; - auto batch_result = as(result); - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, - get_col_sizes(this->get_size())); - auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_compute_norm2( - as>(this), batch_result)); + GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(result, get_col_sizes(this->get_size())); + this->get_executor()->run(batch_multi_vector::make_compute_norm2( + as>(this), result)); } @@ -144,8 +133,6 @@ void BatchMultiVector::convert_to( BatchMultiVector>* result) const { result->values_ = this->values_; - result->stride_ = this->stride_; - result->num_elems_per_batch_cumul_ = this->num_elems_per_batch_cumul_; result->set_size(this->get_size()); } @@ -206,7 +193,7 @@ void BatchMultiVector::read(const std::vector& data) template inline void write_impl(const MatrixType* mtx, std::vector& data) { - std::unique_ptr op{}; + std::unique_ptr> op{}; const MatrixType* tmp{}; if (mtx->get_executor()->get_master() != mtx->get_executor()) { op = mtx->clone(mtx->get_executor()->get_master()); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index a4860e2c7b3..9513272648d 100644 --- 
a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -84,8 +84,8 @@ class BatchMultiVector using unbatch_type = Dense; using mat_data = gko::matrix_data; using mat_data32 = gko::matrix_data; - using absolute_type = remove_complex; - using complex_type = to_complex; + using absolute_type = remove_complex>; + using complex_type = to_complex>; using row_major_range = gko::range>; @@ -286,7 +286,7 @@ class BatchMultiVector * of alpha (the number of columns of alpha has to match the number of * columns of the matrix). */ - void scale(const BatchMultiVector* alpha) + void scale(const BatchMultiVector* alpha) { auto exec = this->get_executor(); this->scale_impl(make_temporary_clone(exec, alpha).get()); @@ -302,7 +302,8 @@ class BatchMultiVector * vector). * @param b a matrix of the same dimension as this */ - void add_scaled(const BatchMultiVector* alpha, const BatchMultiVector* b) + void add_scaled(const BatchMultiVector* alpha, + const BatchMultiVector* b) { auto exec = this->get_executor(); this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), @@ -321,8 +322,9 @@ class BatchMultiVector * @param beta Scalar(s), of the same size as alpha, to multiply this * matrix. */ - void add_scale(const BatchMultiVector* alpha, const BatchMultiVector* a, - const BatchMultiVector* beta); + void add_scale(const BatchMultiVector* alpha, + const BatchMultiVector* a, + const BatchMultiVector* beta); /** * Computes the column-wise dot product of each matrix in this batch and its @@ -334,7 +336,8 @@ class BatchMultiVector * product (the number of column in the vector must match the number of * columns of this) */ - void compute_dot(const BatchMultiVector* b, BatchMultiVector* result) const + void compute_dot(const BatchMultiVector* b, + BatchMultiVector* result) const { auto exec = this->get_executor(); this->compute_dot_impl(make_temporary_clone(exec, b).get(), @@ -348,7 +351,7 @@ class BatchMultiVector * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm2(BatchMultiVector* result) const + void compute_norm2(BatchMultiVector* result) const { auto exec = this->get_executor(); this->compute_norm2_impl(make_temporary_clone(exec, result).get()); @@ -365,7 +368,7 @@ class BatchMultiVector * (if it resides on the same executor as the vector) or a copy of * the array on the correct executor. */ - static std::unique_ptr create_const( + static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values) { @@ -375,6 +378,43 @@ class BatchMultiVector exec, sizes, gko::detail::array_const_cast(std::move(values))}); } + /** + * Copy-assigns a BatchMultiVector. Preserves the executor and copies the + * size. + */ + BatchMultiVector& operator=(const BatchMultiVector&) = default; + + /** + * Move-assigns a BatchMultiVector. Preserves the executor and moves the + * size. The moved-from object has size 0x0 afterwards, but its executor is + * unchanged. + */ + BatchMultiVector& operator=(BatchMultiVector&& other) + { + if (this != &other) { + EnableAbstractPolymorphicObject::operator=( + std::move(other)); + this->set_size(other.get_size()); + other.set_size({}); + } + return *this; + } + + /** + * Copy-constructs a BatchMultiVector. Inherits executor and size from the + * input. + */ + BatchMultiVector(const BatchMultiVector&) = default; + + /** + * Move-constructs a BatchMultiVector. 
Inherits executor and size from the + * input, which will have size 0x0 and unchanged executor afterwards. + */ + BatchMultiVector(BatchMultiVector&& other) + : EnableAbstractPolymorphicObject(std::move(other)), + batch_size_{std::exchange(other.batch_size_, batch_dim<2>{})} + {} + private: inline batch_dim<2> compute_batch_size( const std::vector*>& matrices) @@ -392,6 +432,13 @@ class BatchMultiVector } protected: + /** + * Sets the size of the BatchMultiVector. + * + * @param value the new size of the operator + */ + void set_size(const batch_dim<2>& value) noexcept { batch_size_ = value; } + /** * Creates an uninitialized BatchMultiVector matrix of the specified size. * @@ -400,9 +447,9 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size = batch_dim<2>{}) - : batch_size_(size), - values_(exec, compute_num_elems(size)), - exec(std::move(exec)) + : EnableAbstractPolymorphicObject(exec), + batch_size_(size), + values_(exec, compute_num_elems(size)) {} /** @@ -425,9 +472,9 @@ class BatchMultiVector template BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size, ValuesArray&& values) - : batch_size_(size), - values_{exec, std::forward(values)}, - exec_(std::move(exec)) + : EnableAbstractPolymorphicObject(exec), + batch_size_(size), + values_{exec, std::forward(values)} { // Ensure that the values array has the correct size auto num_elems = compute_num_elems(size); @@ -442,9 +489,9 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const std::vector*>& matrices) - : batch_size_{compute_batch_size(matrices)}, - values(exec, compute_num_elems(batch_size_)), - exec(std::move(exec)) + : EnableAbstractPolymorphicObject(exec), + batch_size_{compute_batch_size(matrices)}, + values(exec, compute_num_elems(batch_size_)) { for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { auto local_exec = matrices[i]->get_executor(); @@ -518,7 +565,7 @@ class BatchMultiVector * @note Other implementations of batch_multi_vector should override this * function instead of scale(const BatchMultiVector *alpha). */ - virtual void scale_impl(const BatchMultiVector* alpha); + virtual void scale_impl(const BatchMultiVector* alpha); /** * @copydoc add_scaled(const BatchMultiVector *, const BatchMultiVector *) @@ -527,8 +574,8 @@ class BatchMultiVector * function instead of add_scale(const BatchMultiVector *alpha, const * BatchMultiVector *b). */ - virtual void add_scaled_impl(const BatchMultiVector* alpha, - const BatchMultiVector* b); + virtual void add_scaled_impl(const BatchMultiVector* alpha, + const BatchMultiVector* b); /** * @copydoc compute_dot(const BatchMultiVector *, BatchMultiVector *) const @@ -537,8 +584,8 @@ class BatchMultiVector * function instead of compute_dot(const BatchMultiVector *b, * BatchMultiVector *result). */ - virtual void compute_dot_impl(const BatchMultiVector* b, - BatchMultiVector* result) const; + virtual void compute_dot_impl(const BatchMultiVector* b, + BatchMultiVector* result) const; /** * @copydoc compute_norm2(BatchMultiVector *) const @@ -546,7 +593,7 @@ class BatchMultiVector * @note Other implementations of batch_multi_vector should override this * function instead of compute_norm2(BatchMultiVector *result). 
*/ - virtual void compute_norm2_impl(BatchMultiVector* result) const; + virtual void compute_norm2_impl(BatchMultiVector* result) const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept @@ -564,7 +611,6 @@ class BatchMultiVector private: batch_dim<2> batch_size_; array values_; - std::shared_ptr exec; }; @@ -579,7 +625,6 @@ class BatchMultiVector * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param stride row stride for the temporary Dense matrix * @param vals values used to initialize the batch vector * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not @@ -591,24 +636,21 @@ class BatchMultiVector */ template std::unique_ptr batch_initialize( - std::vector stride, std::initializer_list> vals, std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - size_type num_batch_entries = vals.size(); - std::vector num_rows(num_batch_entries); - std::vector> sizes(num_batch_entries); + size_type common_num_rows = vals_begin->size(); + size_type common_size = dim<2>(common_num_rows, 1); + dim<2> common_size; auto vals_begin = begin(vals); for (size_type b = 0; b < num_batch_entries; ++b) { - num_rows[b] = vals_begin->size(); - sizes[b] = dim<2>(num_rows[b], 1); + GKO_ASSERT_EQ(common_num_rows, vals_begin->size()); vals_begin++; } - auto b_size = batch_dim<2>(sizes); - auto b_stride = batch_stride(stride); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); + auto b_size = batch_dim<2>(num_batch_entries, common_size); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; for (const auto& b : vals) { size_type idx = 0; @@ -623,38 +665,6 @@ std::unique_ptr batch_initialize( return mtx; } -/** - * Creates and initializes a batch of column-vectors. - * - * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the vector to the requested type. The stride of - * the intermediate Dense matrix is set to 1. - * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup BatchMultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr batch_initialize( - std::initializer_list> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - return batch_initialize(std::vector(vals.size(), 1), - vals, std::move(exec), - std::forward(create_args)...); -} - /** * Creates and initializes a batch of matrices. 
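A minimal usage sketch for the reworked, stride-free column-vector overload above (illustration only; the executor, the value type double, and the variable names are assumptions, not part of the patch):

    auto exec = gko::ReferenceExecutor::create();
    // two batch entries, each a 3x1 column vector with a common size
    auto vec = gko::batch_initialize<gko::BatchMultiVector<double>>(
        {{1.0, 2.0, 3.0}, {-1.0, 0.5, 4.0}}, exec);

Since every entry must now share a common size, initializer rows of differing lengths are expected to trip the GKO_ASSERT_EQ check in the implementation.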
@@ -667,7 +677,6 @@ std::unique_ptr batch_initialize( * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param stride row stride for the temporary Dense matrix * @param vals values used to initialize the vector * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not @@ -679,7 +688,6 @@ std::unique_ptr batch_initialize( */ template std::unique_ptr batch_initialize( - std::vector stride, std::initializer_list>> vals, @@ -687,19 +695,20 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - std::vector num_rows(num_batch_entries); - std::vector num_cols(num_batch_entries); - std::vector> sizes(num_batch_entries); + + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin->size(); + size_type common_num_cols = begin(vals_begin)->size(); + auto common_size = dim<2>(common_num_rows, common_num_cols); size_type ind = 0; for (const auto& b : vals) { - num_rows[ind] = b.size(); - num_cols[ind] = num_rows[ind] > 0 ? begin(b)->size() : 1; - sizes[ind] = dim<2>(num_rows[ind], num_cols[ind]); - ++ind; - } - auto b_size = batch_dim<2>(sizes); - auto b_stride = batch_stride(stride); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); + auto num_rows = b.size(); + auto num_cols = begin(b)->size(); + auto b_size = dim<2>(num_rows, num_cols); + GKO_ASSERT_EQ(b_size, common_size); + } + auto b_size = batch_dim<2>(num_batch_entries, common_size); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; for (const auto& b : vals) { size_type ridx = 0; @@ -719,46 +728,6 @@ std::unique_ptr batch_initialize( } -/** - * Creates and initializes a batch of matrices. - * - * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the vector to the requested type. The stride of - * the intermediate Dense matrix is set to the number of columns of the - * initializer list. - * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup BatchMultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr batch_initialize( - std::initializer_list>> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - auto strides = std::vector(vals.size(), 0); - size_type ind = 0; - for (const auto& b : vals) { - strides[ind] = begin(b)->size(); - ++ind; - } - return batch_initialize(strides, vals, std::move(exec), - std::forward(create_args)...); -} - - /** * Creates and initializes a batch column-vector by making copies of the single * input column vector. @@ -772,7 +741,6 @@ std::unique_ptr batch_initialize( * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param stride row strides for the temporary batch dense matrix * @param num_vectors The number of times the input vector is copied into * the final output * @param vals values used to initialize each vector in the temp. 
batch @@ -786,20 +754,14 @@ std::unique_ptr batch_initialize( */ template std::unique_ptr batch_initialize( - std::vector stride, const size_type num_vectors, + const size_type num_vectors, std::initializer_list vals, std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - std::vector num_rows(num_vectors); - std::vector> sizes(num_vectors); - for (size_type b = 0; b < num_vectors; ++b) { - num_rows[b] = vals.size(); - sizes[b] = dim<2>(vals.size(), 1); - } - auto b_size = batch_dim<2>(sizes); - auto b_stride = batch_stride(stride); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size, b_stride); + size_type num_batch_entries = num_vectors; + auto b_size = batch_dim<2>(num_batch_entries, dim<2>(vals.size(), 1)); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; for (const auto& elem : vals) { @@ -813,41 +775,6 @@ std::unique_ptr batch_initialize( } -/** - * Creates and initializes a column-vector from copies of a given vector. - * - * This function first creates a temporary Dense matrix, fills it with passed - * in values, and then converts the vector to the requested type. The stride of - * the intermediate Dense matrix is set to 1. - * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo - * interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param num_vectors The number of times the input vector is copied into - * the final output - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup BatchMultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr batch_initialize( - const size_type num_vectors, - std::initializer_list vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - return batch_initialize(std::vector(num_vectors, 1), - num_vectors, vals, std::move(exec), - std::forward(create_args)...); -} - /** * Creates and initializes a matrix from copies of a given matrix. * @@ -873,22 +800,15 @@ std::unique_ptr batch_initialize( */ template std::unique_ptr batch_initialize( - std::vector stride, const size_type num_matrices, + const size_type num_matrices, std::initializer_list> vals, std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - std::vector> sizes(num_matrices); - const size_type num_rows = vals.size(); - for (size_type b = 0; b < num_matrices; ++b) { - const size_type num_cols = begin(vals)->size(); - sizes[b] = dim<2>(num_rows, num_cols); - for (auto blockit = begin(vals); blockit != end(vals); ++blockit) { - GKO_ASSERT(blockit->size() == num_cols); - } - } - auto tmp = batch_multi_vector::create(exec->get_master(), sizes, stride); + auto common_size = dim<2>(vals.size(), begin(vals)->size()); + batch_dim<2> b_size(num_matrices, common_size); + auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_matrices; batch++) { size_type ridx = 0; for (const auto& row : vals) { @@ -905,42 +825,6 @@ std::unique_ptr batch_initialize( return mtx; } -/** - * Creates and initializes a matrix from copies of a given matrix. 
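A corresponding sketch for the duplication overload above, which likewise drops its stride argument (illustration only; exec and the value type double are assumptions):

    // three identical 2x2 entries created from one initializer list
    auto dup = gko::batch_initialize<gko::BatchMultiVector<double>>(
        3, {{1.0, 2.0}, {3.0, 4.0}}, exec);
    // dup->get_common_size() is dim<2>(2, 2)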
- * - * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the vector to the requested type. The stride of - * the intermediate Dense matrix is set to 1. - * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param num_vectors The number of times the input vector is copied into - * the final output - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup LinOp - * @ingroup mat_formats - */ -template -std::unique_ptr batch_initialize( - const size_type num_matrices, - std::initializer_list> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - auto strides = std::vector(num_matrices, begin(vals)->size()); - return batch_initialize(strides, num_matrices, vals, - std::move(exec), - std::forward(create_args)...); -} - } // namespace gko From 7bfaf49339f334d034738976822cd0ba278c2939 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 9 Jul 2023 00:42:33 +0200 Subject: [PATCH 110/583] Add read and write impls --- core/base/batch_multi_vector.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index cc83638ee92..3a3f0aff757 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -148,14 +148,15 @@ void BatchMultiVector::move_to( template inline void read_impl(MatrixType* mtx, const std::vector& data) { - auto batch_sizes = std::vector>(data.size()); + auto common_size = data[0].size; + auto batch_size = batch_dim<2>(data.size(), common_size); size_type ind = 0; for (const auto& b : data) { - batch_sizes[ind] = b.size; - ++ind; + b_size = b.size; + GKO_ASSERT_EQ(common_size, b_size); } - auto tmp = MatrixType::create(mtx->get_executor()->get_master(), - batch_dim<2>(batch_sizes)); + auto tmp = + MatrixType::create(mtx->get_executor()->get_master(), batch_size); for (size_type b = 0; b < data.size(); ++b) { size_type ind = 0; for (size_type row = 0; row < data[b].size[0]; ++row) { @@ -204,7 +205,7 @@ inline void write_impl(const MatrixType* mtx, std::vector& data) data = std::vector(mtx->get_num_batch_entries()); for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) { - data[b] = {mtx->get_size().at(b), {}}; + data[b] = {mtx->get_common_size(), {}}; for (size_type row = 0; row < data[b].size[0]; ++row) { for (size_type col = 0; col < data[b].size[1]; ++col) { if (tmp->at(b, row, col) != From 63743596a0bcd622fdc1d0f7393d3d3df6767054 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 9 Jul 2023 23:14:27 +0200 Subject: [PATCH 111/583] Fix kernels and batch_struct. 
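For reference, filling a BatchMultiVector from matrix_data with the common-size layout used here might look as follows (a hedged sketch, not code from the patch; exec is assumed to be an existing Executor):

    std::vector<gko::matrix_data<double>> data(2);
    data[0] = gko::matrix_data<double>(gko::dim<2>{2, 2}, 1.0);
    data[1] = gko::matrix_data<double>(gko::dim<2>{2, 2}, 2.0);
    auto mv = gko::BatchMultiVector<double>::create(exec);
    mv->read(data);  // all entries must share the same size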
--- core/base/batch_struct.hpp | 27 +- cuda/base/batch_multi_vector_kernels.cu | 20 +- cuda/base/batch_struct.hpp | 24 +- dpcpp/CMakeLists.txt | 1 + dpcpp/base/batch_multi_vector_kernels.dp.cpp | 232 +++++++++++++++ dpcpp/base/batch_multi_vector_kernels.hpp.inc | 168 +++++++++++ hip/base/batch_multi_vector_kernels.hip.cpp | 20 +- hip/base/batch_struct.hip.hpp | 22 +- .../ginkgo/core/base/batch_lin_op_helpers.hpp | 126 +++++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 4 +- include/ginkgo/ginkgo.hpp | 2 + omp/base/batch_multi_vector_kernels.cpp | 8 +- reference/base/batch_multi_vector_kernels.cpp | 4 +- .../base/batch_multi_vector_kernels.hpp.inc | 263 ------------------ reference/base/batch_struct.hpp | 22 +- 15 files changed, 605 insertions(+), 338 deletions(-) create mode 100644 dpcpp/base/batch_multi_vector_kernels.dp.cpp create mode 100644 dpcpp/base/batch_multi_vector_kernels.hpp.inc create mode 100644 include/ginkgo/core/base/batch_lin_op_helpers.hpp diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index 68fcdd9c8a0..05ac4f0d105 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -65,15 +65,16 @@ struct UniformBatch { using value_type = ValueType; using entry_type = BatchEntry; - ValueType* values; ///< Concatenated values of all matrices in the batch - size_type num_batch; ///< Number of matrices in the batch - size_type stride; ///< Common stride of each dense matrix - int num_rows; ///< Common number of rows in each matrix - int num_rhs; ///< Common number of columns of each matrix - int num_nnz; ///< Common number of non-zeros of each matrix, ie., - ///< the number or rows times the number of columns - - size_type get_entry_storage() const { return num_nnz * sizeof(value_type); } + ValueType* values; + size_type num_batch_entries; + size_type stride; + int num_rows; + int num_rhs; + + size_type get_entry_storage() const + { + return num_rows * stride * sizeof(value_type); + } }; @@ -95,14 +96,15 @@ template GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::UniformBatch to_const(const gko::batch_multi_vector::UniformBatch& ub) { - return {ub.values, ub.num_batch, ub.stride, ub.num_rows, ub.num_rhs}; + return {ub.values, ub.num_batch_entries, ub.stride, ub.num_rows, + ub.num_rhs}; } /** * Extract one object (matrix, vector etc.) from a batch of objects * - * This overload is for batch dense matrices. + * This overload is for batch multi-vectors. * These overloads are intended to be called from within a kernel. 
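 * For example (illustration only; the names x_ub and batch_id are
 * assumptions), a kernel body might fetch its entry as
 *
 *   const auto x_b = batch::batch_entry(x_ub, batch_id);
 *
 * and then address x_b.values using x_b.stride, x_b.num_rows and
 * x_b.num_rhs.
 *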
* * @param batch The batch of objects to extract from @@ -136,8 +138,7 @@ GKO_ATTRIBUTES GKO_INLINE ValueType* batch_entry_ptr( } // namespace batch - - } // namespace gko + #endif // GKO_CORE_BASE_BATCH_STRUCT_HPP_ diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 039ab94b767..df5aa9149a5 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -67,7 +67,7 @@ constexpr int sm_multiplier = 4; template -void scale(std::shared_ptr exec, +void scale(std::shared_ptr exec, const BatchMultiVector* const alpha, BatchMultiVector* const x) { @@ -82,16 +82,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void add_scaled(std::shared_ptr exec, +void add_scaled(std::shared_ptr exec, const BatchMultiVector* const alpha, const BatchMultiVector* const x, BatchMultiVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const size_type nrhs = x->get_size().at(0)[1]; + const size_type nrhs = x->get_common_size()[1]; if (nrhs == 1) { const auto num_batch = x->get_num_batch_entries(); - const auto num_rows = x->get_size().at(0)[0]; + const auto num_rows = x->get_common_size()[0]; single_add_scaled<<>>( num_batch, num_rows, as_cuda_type(alpha->get_const_values()), as_cuda_type(x->get_const_values()), as_cuda_type(y->get_values())); @@ -108,15 +108,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_dot(std::shared_ptr exec, +void compute_dot(std::shared_ptr exec, const BatchMultiVector* x, const BatchMultiVector* y, BatchMultiVector* result) { const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_size().at()[1]; + const auto num_rhs = x->get_common_size()[1]; if (num_rhs == 1) { - const auto num_rows = x->get_size().at()[0]; + const auto num_rows = x->get_common_size()[0]; single_compute_dot_product<<>>( num_blocks, num_rows, as_cuda_type(x->get_const_values()), as_cuda_type(y->get_const_values()), @@ -135,14 +135,14 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_norm2(std::shared_ptr exec, +void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_size().at()[1]; + const auto num_rhs = x->get_common_size()[1]; if (num_rhs == 1) { - const auto num_rows = x->get_size().at()[0]; + const auto num_rows = x->get_common_size()[0]; single_compute_norm2<<>>( num_blocks, num_rows, as_cuda_type(x->get_const_values()), as_cuda_type(result->get_values())); diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 0bd9bd6dc40..5db50064e2f 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -70,10 +70,10 @@ get_batch_struct(const BatchMultiVector* const op) return { as_cuda_type(op->get_const_values()), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } /** @@ -86,10 +86,10 @@ get_batch_struct(BatchMultiVector* const op) return { as_cuda_type(op->get_values()), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - static_cast(op->get_size().at(0)[0] * 
op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } @@ -103,9 +103,9 @@ maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {as_cuda_type(op->get_const_values()), - op->get_num_batch_entries(), op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1])}; + op->get_num_batch_entries(), op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } else { return {nullptr, 0, 0, 0, 0}; } @@ -115,4 +115,6 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace cuda } // namespace kernels } // namespace gko + + #endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index b33b63d4af9..b70175c6b12 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -8,6 +8,7 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernel add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE + base/batch_multi_vector_kernels.dp.cpp base/device_matrix_data_kernels.dp.cpp base/executor.dp.cpp base/helper.dp.cpp diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp new file mode 100644 index 00000000000..6101ed3da4d --- /dev/null +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -0,0 +1,232 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/base/batch_multi_vector_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/components/prefix_sum_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The BatchMultiVector matrix format namespace. 
+ * @ref BatchMultiVector + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc" + + +template +void scale(std::shared_ptr exec, + const BatchMultiVector* const alpha, + BatchMultiVector* const x) +{ + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + + const auto num_batches = x_ub.num_batch_entries; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + + // Launch a kernel that has nbatches blocks, each block has max group size + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + single_scale_kernel(alpha_b, x_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const BatchMultiVector* const alpha, + const BatchMultiVector* const x, + BatchMultiVector* const y) +{ + const size_type num_rows = x->get_common_size()[0]; + const size_type num_cols = x->get_common_size()[1]; + + const auto num_batches = x->get_num_batch_entries(); + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + add_scaled_kernel(alpha_b, x_b, y_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const BatchMultiVector* const x, + const BatchMultiVector* const y, + BatchMultiVector* const result) +{ + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + + const auto num_batches = x_ub.num_batch_entries; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_dot_product_kernel(x_b, y_b, res_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const BatchMultiVector* const x, + BatchMultiVector>* const result) +{ + const auto x_ub = get_batch_struct(x); + const auto 
res_ub = get_batch_struct(result); + + const auto num_batches = x_ub.num_batch_entries; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); + + +template +void copy(std::shared_ptr exec, + const BatchMultiVector* x, + BatchMultiVector* result) +{ + const auto x_ub = get_batch_struct(x); + const auto result_ub = get_batch_struct(result); + + const auto num_batches = x_ub.num_batch_entries; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto result_b = batch::batch_entry(result_ub, group_id); + copy_kernel(x_b, result_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); + + +} // namespace batch_multi_vector +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc new file mode 100644 index 00000000000..7ea25fb4c22 --- /dev/null +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -0,0 +1,168 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * Copies the values of vector into another. + * + * @param num_rows Length of vector. + * @param in Vector to copy from. + * @param out Vector to copy into. + */ +template +__dpct_inline__ void copy_kernel(const int num_rows, + const ValueType* const __restrict__ in, + ValueType* const __restrict__ out, + sycl::nd_item<3> item_ct1) +{ + for (int iz = item_ct1.get_local_linear_id(); iz < num_rows; + iz += item_ct1.get_local_range().size()) { + out[iz] = in[iz]; + } +} + +/** + * Adds a scaled vector to another. + * + * @param num_rows Common length of both vectors. + * @param alpha Scaling factor. + * @param[in] x Vector to scale and add. + * @param[in,out] y Vector to add to. + */ +template +__dpct_inline__ void add_scaled_kernel(const int num_rows, + const ValueType alpha, + const ValueType* const __restrict__ x, + ValueType* const __restrict__ y, + sycl::nd_item<3> item_ct1) +{ + for (int li = item_ct1.get_local_linear_id(); li < num_rows; + li += item_ct1.get_local_range().size()) { + y[li] += alpha * x[li]; + } +} + +/** + * Computes the 2-norm of a vector in global or shared memory. + * + * @param x A row-major vector (only 1 column). + * @param result Norm value. + */ +template +__dpct_inline__ void compute_norm2_sg_kernel( + const int num_rows, const ValueType* const __restrict__ x, + gko::remove_complex& result, sycl::nd_item<3> item_ct1) +{ + const auto sg = item_ct1.get_sub_group(); + const auto sg_size = sg.get_local_range().size(); + const auto sg_tid = sg.get_local_id(); + + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = sg_tid; r < num_rows; r += sg_size) { + val += squared_norm(x[r]); + } + + val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + + if (sg_tid == 0) { + result = sqrt(val); + } +} + +template +__dpct_inline__ void compute_norm2_kernel( + const int num_rows, const ValueType* const __restrict__ x, + gko::remove_complex& result, sycl::nd_item<3> item_ct1) +{ + const auto group = item_ct1.get_group(); + const auto group_size = item_ct1.get_local_range().size(); + const auto tid = item_ct1.get_local_linear_id(); + + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = tid; r < num_rows; r += group_size) { + val += squared_norm(x[r]); + } + + val = sycl::reduce_over_group(group, val, sycl::plus<>()); + + result = sqrt(val); +} + + +/** + * Computes the dot product of some column vectors in global or shared memory. + * + * @param result Holds dot product value for vector in x and y. 
+ */ +template +__dpct_inline__ void compute_dot_product_sg_kernel( + const int num_rows, const ValueType* const __restrict__ x, + const ValueType* const __restrict__ y, ValueType& result, + sycl::nd_item<3> item_ct1) +{ + const auto sg = item_ct1.get_sub_group(); + const auto sg_size = sg.get_local_range().size(); + const auto sg_tid = sg.get_local_id(); + + ValueType val = zero(); + + for (int r = sg_tid; r < num_rows; r += sg_size) { + val += conj(x[r]) * y[r]; + } + + val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + + if (sg_tid == 0) { + result = val; + } +} + +template +__dpct_inline__ void compute_dot_product_kernel( + const int num_rows, const ValueType* const __restrict__ x, + const ValueType* const __restrict__ y, ValueType& result, + sycl::nd_item<3> item_ct1) +{ + const auto group = item_ct1.get_group(); + const auto group_size = item_ct1.get_local_range().size(); + const auto tid = item_ct1.get_local_linear_id(); + + ValueType val = zero(); + + for (int r = tid; r < num_rows; r += group_size) { + val += conj(x[r]) * y[r]; + } + result = sycl::reduce_over_group(group, val, sycl::plus<>()); +} diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 01a443558e9..2a6c3085772 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -70,7 +70,7 @@ constexpr int sm_multiplier = 4; template -void scale(std::shared_ptr exec, +void scale(std::shared_ptr exec, const BatchMultiVector* const alpha, BatchMultiVector* const x) { @@ -86,16 +86,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void add_scaled(std::shared_ptr exec, +void add_scaled(std::shared_ptr exec, const BatchMultiVector* const alpha, const BatchMultiVector* const x, BatchMultiVector* const y) { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const size_type nrhs = x->get_size().at(0)[1]; + const size_type nrhs = x->get_common_size()[1]; if (nrhs == 1) { const auto num_batch = x->get_num_batch_entries(); - const auto num_rows = x->get_size().at(0)[0]; + const auto num_rows = x->get_common_size()[0]; hipLaunchKernelGGL( single_add_scaled, dim3(num_blocks), dim3(default_block_size), 0, 0, num_batch, num_rows, as_hip_type(alpha->get_const_values()), @@ -115,15 +115,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_dot(std::shared_ptr exec, +void compute_dot(std::shared_ptr exec, const BatchMultiVector* x, const BatchMultiVector* y, BatchMultiVector* result) { const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_size().at()[1]; + const auto num_rhs = x->get_common_size()[1]; if (num_rhs == 1) { - const auto num_rows = x->get_size().at()[0]; + const auto num_rows = x->get_common_size()[0]; hipLaunchKernelGGL(single_compute_dot_product, dim3(num_blocks), dim3(default_block_size), 0, 0, num_blocks, num_rows, as_hip_type(x->get_const_values()), @@ -144,14 +144,14 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_norm2(std::shared_ptr exec, +void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector>* const result) { const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_size().at()[1]; + const auto num_rhs = x->get_common_size()[1]; if (num_rhs == 1) { - const auto num_rows = x->get_size().at()[0]; + const auto num_rows = x->get_common_size()[0]; hipLaunchKernelGGL(single_compute_norm2, dim3(num_blocks), dim3(default_block_size), 0, 0, num_blocks, num_rows, 
as_hip_type(x->get_const_values()), diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 214039f060b..c921e55d857 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -70,10 +70,10 @@ get_batch_struct(const BatchMultiVector* const op) return { as_hip_type(op->get_const_values()), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } /** @@ -86,10 +86,10 @@ get_batch_struct(BatchMultiVector* const op) return { as_hip_type(op->get_values()), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } @@ -103,9 +103,9 @@ maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {as_hip_type(op->get_const_values()), - op->get_num_batch_entries(), op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1])}; + op->get_num_batch_entries(), op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } else { return {nullptr, 0, 0, 0, 0}; } diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp new file mode 100644 index 00000000000..ecb8bcc4556 --- /dev/null +++ b/include/ginkgo/core/base/batch_lin_op_helpers.hpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ +#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ + + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { + + +/** + * A BatchLinOp implementing this interface can read its data from a matrix_data + * structure. + * + * @ingroup BatchLinOp + */ +template +class BatchReadableFromMatrixData { +public: + using value_type = ValueType; + using index_type = IndexType; + + virtual ~BatchReadableFromMatrixData() = default; + + /** + * Reads a batch matrix from a std::vector of matrix_data objects. + * + * @param data the std::vector of matrix_data objects + */ + virtual void read( + const std::vector>& data) = 0; + + /** + * Reads a matrix from a std::vector of matrix_assembly_data objects. + * + * @param data the std::vector of matrix_assembly_data objects + */ + void read(const std::vector>& + assembly_data) + { + auto mat_data = std::vector>( + assembly_data.size()); + size_type ind = 0; + for (const auto& i : assembly_data) { + mat_data[ind] = i.get_ordered_data(); + ++ind; + } + this->read(mat_data); + } +}; + + +/** + * A BatchLinOp implementing this interface can write its data to a std::vector + * of matrix_data objects. + * + * @ingroup BatchLinOp + */ +template +class BatchWritableToMatrixData { +public: + using value_type = ValueType; + using index_type = IndexType; + + virtual ~BatchWritableToMatrixData() = default; + + /** + * Writes a matrix to a matrix_data structure. + * + * @param data the matrix_data structure + */ + virtual void write( + std::vector>& data) const = 0; +}; + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 9513272648d..c6614df0d66 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -461,9 +462,6 @@ class BatchMultiVector * @param exec Executor associated to the vector * @param size sizes of the batch matrices in a batch_dim object * @param values array of matrix values - * @param strides stride of the rows (i.e. offset between the first - * elements of two consecutive rows, expressed as the - * number of matrix elements) * * @note If `values` is not an rvalue, not an array of ValueType, or is on * the wrong executor, an internal copy will be created, and the diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index d73bf669700..8a88bf003f8 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -39,6 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include +#include #include #include #include diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 96b6716f0ba..6dd8b38e6d8 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -60,7 +60,7 @@ namespace batch_multi_vector { template -void scale(std::shared_ptr exec, +void scale(std::shared_ptr exec, const BatchMultiVector* const alpha, BatchMultiVector* const x) { @@ -79,7 +79,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void add_scaled(std::shared_ptr exec, +void add_scaled(std::shared_ptr exec, const BatchMultiVector* const alpha, const BatchMultiVector* const x, BatchMultiVector* const y) @@ -101,7 +101,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_dot(std::shared_ptr exec, +void compute_dot(std::shared_ptr exec, const BatchMultiVector* const x, const BatchMultiVector* const y, BatchMultiVector* const result) @@ -124,7 +124,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template -void compute_norm2(std::shared_ptr exec, +void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector>* const result) { diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index 27f6539b9eb..31e10fbe22f 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_struct.hpp" -#include "reference/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" namespace gko { @@ -56,7 +56,7 @@ namespace reference { namespace batch_multi_vector { -#include "reference/matrix/batch_multi_vector_kernels.hpp.inc" +#include "reference/base/batch_multi_vector_kernels.hpp.inc" template diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index 2f9c88e53f1..3cda19cfc06 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -30,63 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -template -inline void matvec_kernel( - const gko::batch_multi_vector::BatchEntry& a, - const gko::batch_multi_vector::BatchEntry& b, - const gko::batch_multi_vector::BatchEntry& c) -{ - for (int row = 0; row < c.num_rows; ++row) { - for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] = gko::zero(); - } - } - - for (int row = 0; row < c.num_rows; ++row) { - for (int inner = 0; inner < a.num_rhs; ++inner) { - for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] += - a.values[row * a.stride + inner] * - b.values[inner * b.stride + col]; - } - } - } -} - - -template -inline void advanced_matvec_kernel( - const ValueType alpha, - const gko::batch_multi_vector::BatchEntry& a, - const gko::batch_multi_vector::BatchEntry& b, - const ValueType beta, - const gko::batch_multi_vector::BatchEntry& c) -{ - if (beta != gko::zero()) { - for (int row = 0; row < c.num_rows; ++row) { - for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] *= beta; - } - } - } else { - for (int row = 0; row < c.num_rows; ++row) { - for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] *= gko::zero(); - } - } - } - - for (int row = 0; row < c.num_rows; ++row) { - for (int inner = 0; inner < a.num_rhs; ++inner) { - for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] += - alpha * a.values[row * a.stride + inner] * - b.values[inner * b.stride + col]; - } - } - } -} - template inline void scale( @@ -133,33 +76,6 @@ inline void add_scaled( } -template -inline void add_scale( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& beta, - const gko::batch_multi_vector::BatchEntry& y) -{ - if (alpha.num_rhs == 1) { - for (int i = 0; i < x.num_rows; ++i) { - for (int j = 0; j < x.num_rhs; ++j) { - y.values[i * y.stride + j] = - alpha.values[0] * x.values[i * x.stride + j] + - beta.values[0] * y.values[i * y.stride + j]; - } - } - } else { - for (int i = 0; i < x.num_rows; ++i) { - for (int j = 0; j < x.num_rhs; ++j) { - y.values[i * y.stride + j] = - alpha.values[j] * x.values[i * x.stride + j] + - beta.values[j] * y.values[i * y.stride + j]; - } - } - } -} - - template inline void compute_norm2( const gko::batch_multi_vector::BatchEntry& x, @@ -180,39 +96,6 @@ inline void compute_norm2( } -/** - * Multiplies with a diagonal matrix represented as a dense vector. - * - * @param[in] diag_vec The entries of the diagonal matrix. - * @param[in,out] a The dense matrix or vectors to scale. - */ -template -inline void batch_scale( - const gko::batch_multi_vector::BatchEntry& diag_vec, - const gko::batch_multi_vector::BatchEntry& a) -{ - for (int i_row = 0; i_row < a.num_rows; i_row++) { - const ValueType scale = diag_vec.values[i_row]; - for (int j = 0; j < a.num_rhs; j++) { - a.values[i_row * a.stride + j] *= scale; - } - } -} - -template -inline void batch_scale(const int nrows, const int ncols, - const size_type a_stride, const ValueType* const left, - const ValueType* const right, ValueType* const a) -{ - for (int i_row = 0; i_row < nrows; i_row++) { - const ValueType scale = left[i_row]; - for (int j = 0; j < ncols; j++) { - a[i_row * a_stride + j] *= scale * right[j]; - } - } -} - - /** * Copies the values of one multi-vector into another. 
* @@ -248,149 +131,3 @@ inline void compute_dot_product( } } } - - -template -inline void copy( - const gko::batch_multi_vector::BatchEntry& source_entry, - const gko::batch_multi_vector::BatchEntry& destination_entry, - const gko::uint32& converged) -{ - for (int r = 0; r < source_entry.num_rows; r++) { - for (int c = 0; c < source_entry.num_rhs; c++) { - const gko::uint32 conv = converged & (1 << c); - - if (conv) { - continue; - } - - destination_entry.values[r * destination_entry.stride + c] = - source_entry.values[r * source_entry.stride + c]; - } - } -} - - -template -inline void add_scaled( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::uint32& converged) -{ - if (alpha.num_rhs == 1) { - for (int i = 0; i < x.num_rows; ++i) { - for (int j = 0; j < x.num_rhs; ++j) { - const gko::uint32 conv = converged & (1 << j); - - if (conv) { - continue; - } - - y.values[i * y.stride + j] += - alpha.values[0] * x.values[i * x.stride + j]; - } - } - } else { - for (int i = 0; i < x.num_rows; ++i) { - for (int j = 0; j < x.num_rhs; ++j) { - const gko::uint32 conv = converged & (1 << j); - - if (conv) { - continue; - } - - - y.values[i * y.stride + j] += - alpha.values[j] * x.values[i * x.stride + j]; - } - } - } -} - - -template -inline void compute_norm2( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry>& - result, - const gko::uint32& converged) -{ - for (int j = 0; j < x.num_rhs; ++j) { - const gko::uint32 conv = converged & (1 << j); - - if (conv) { - continue; - } - - result.values[j] = gko::zero>(); - } - for (int i = 0; i < x.num_rows; ++i) { - for (int j = 0; j < x.num_rhs; ++j) { - const gko::uint32 conv = converged & (1 << j); - - if (conv) { - continue; - } - - result.values[j] += squared_norm(x.values[i * x.stride + j]); - } - } - for (int j = 0; j < x.num_rhs; ++j) { - const gko::uint32 conv = converged & (1 << j); - - if (conv) { - continue; - } - - result.values[j] = sqrt(result.values[j]); - } -} - - -template -inline void compute_dot_product( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::batch_multi_vector::BatchEntry& result, - const gko::uint32& converged) -{ - for (int c = 0; c < result.num_rhs; c++) { - const gko::uint32 conv = converged & (1 << c); - - if (conv) { - continue; - } - - result.values[c] = gko::zero(); - } - - for (int r = 0; r < x.num_rows; r++) { - for (int c = 0; c < x.num_rhs; c++) { - const gko::uint32 conv = converged & (1 << c); - - if (conv) { - continue; - } - - result.values[c] += - conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c]; - } - } -} - - -template -inline void add_scaled_identity( - const ValueType& a, const ValueType& b, - const gko::batch_multi_vector::BatchEntry& mat) -{ - for (int i = 0; i < mat.num_rows; i++) { - for (int j = 0; j < mat.num_rhs; j++) { - mat.values[i * mat.stride + j] *= b; - if (i == j) { - mat.values[i * mat.stride + i] += a; - } - } - } -} diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index 32c90db9d7f..bb492488b28 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -69,10 +69,10 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( return { op->get_const_values(), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - 
static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } @@ -86,10 +86,10 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( return { op->get_values(), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1]), - static_cast(op->get_size().at(0)[0] * op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; } @@ -103,9 +103,9 @@ maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { return {op->get_const_values(), op->get_num_batch_entries(), - op->get_stride().at(0), - static_cast(op->get_size().at(0)[0]), - static_cast(op->get_size().at(0)[1])}; + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } else { return {nullptr, 0, 0, 0, 0}; } From c7bc6998d5cc8cdebc7e28e6202ea57f9a80bec0 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 11 Jul 2023 07:16:28 +0200 Subject: [PATCH 112/583] Minor typos and fixes --- core/base/batch_multi_vector_kernels.hpp | 3 ++ include/ginkgo/core/base/batch_dim.hpp | 4 +-- .../ginkgo/core/base/batch_multi_vector.hpp | 32 ++++++++++--------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 34da4ce4c2f..7e7f9c3bb37 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -42,6 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/kernel_declaration.hpp" + + namespace gko { namespace kernels { diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index 3e650745a50..bc17648be52 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_PUBLIC_CORE_BASE_DIM_HPP_ -#define GKO_PUBLIC_CORE_BASE_DIM_HPP_ +#ifndef GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_ +#define GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_ #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index c6614df0d66..1050ec28224 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -39,7 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include +#include #include #include #include @@ -82,7 +84,7 @@ class BatchMultiVector using value_type = ValueType; using index_type = int32; - using unbatch_type = Dense; + using unbatch_type = matrix::Dense; using mat_data = gko::matrix_data; using mat_data32 = gko::matrix_data; using absolute_type = remove_complex>; @@ -147,14 +149,14 @@ class BatchMultiVector * * @return the batch size */ - batch_dim<2> get_size() { return batch_size_; } + batch_dim<2> get_size() const { return batch_size_; } /** * Returns the number of batch entries. 
* * @return the number of batch entries */ - size_type get_num_batch_entries() + size_type get_num_batch_entries() const { return batch_size_.get_num_batch_entries(); } @@ -164,7 +166,7 @@ class BatchMultiVector * * @return the common size stored */ - dim<2> get_common_size() { return batch_size_.get_common_size(); } + dim<2> get_common_size() const { return batch_size_.get_common_size(); } /** * Returns a pointer to the array of values of the vector. @@ -418,13 +420,13 @@ class BatchMultiVector private: inline batch_dim<2> compute_batch_size( - const std::vector*>& matrices) + const std::vector*>& matrices) { auto common_size = matrices[0]->get_size(); for (int i = 1; i < matrices.size(); ++i) { GKO_ASSERT_EQ(common_size, matrices[i]->get_size()); } - return batch_dim<2>{num_entries, common_size}; + return batch_dim<2>{matrices.size(), common_size}; } inline size_type compute_num_elems(const batch_dim<2>& size) @@ -486,10 +488,10 @@ class BatchMultiVector * @param matrices The matrices that need to be batched. */ BatchMultiVector(std::shared_ptr exec, - const std::vector*>& matrices) + const std::vector*>& matrices) : EnableAbstractPolymorphicObject(exec), batch_size_{compute_batch_size(matrices)}, - values(exec, compute_num_elems(batch_size_)) + values_(exec, compute_num_elems(batch_size_)) { for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { auto local_exec = matrices[i]->get_executor(); @@ -510,7 +512,7 @@ class BatchMultiVector BatchMultiVector(std::shared_ptr exec, size_type num_duplications, const BatchMultiVector* input) - : EnableBatchMultiVector( + : BatchMultiVector( exec, gko::batch_dim<2>( input->get_num_batch_entries() * num_duplications, input->get_common_size())) @@ -532,8 +534,9 @@ class BatchMultiVector * @param input the vector to be duplicated. */ BatchMultiVector(std::shared_ptr exec, - size_type num_duplications, const Dense* input) - : EnableBatchMultiVector( + size_type num_duplications, + const matrix::Dense* input) + : BatchMultiVector( exec, gko::batch_dim<2>(num_duplications, input->get_size())) { size_type offset = 0; @@ -639,10 +642,10 @@ std::unique_ptr batch_initialize( std::shared_ptr exec, TArgs&&... 
create_args) { using batch_multi_vector = BatchMultiVector; - size_type common_num_rows = vals_begin->size(); - size_type common_size = dim<2>(common_num_rows, 1); - dim<2> common_size; + size_type num_batch_entries = vals.size(); auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin->size(); + auto common_size = dim<2>(common_num_rows, 1); for (size_type b = 0; b < num_batch_entries; ++b) { GKO_ASSERT_EQ(common_num_rows, vals_begin->size()); vals_begin++; @@ -693,7 +696,6 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - auto vals_begin = begin(vals); size_type common_num_rows = vals_begin->size(); size_type common_num_cols = begin(vals_begin)->size(); From 848461eed080285831eb570513e8ecacb57438aa Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 11 Jul 2023 17:09:12 +0200 Subject: [PATCH 113/583] Update cuda/hip kernels --- .../base/batch_multi_vector_kernels.hpp.inc | 234 ++++++++++-------- core/device_hooks/common_kernels.inc.cpp | 1 + cuda/base/batch_struct.hpp | 22 +- hip/base/batch_struct.hip.hpp | 22 +- reference/base/batch_struct.hpp | 22 +- 5 files changed, 156 insertions(+), 145 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 0eb86996c81..0ef0408674a 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -32,165 +32,193 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** - * Copies the values of vector into another. - * - * @param num_rows Length of vector. - * @param in Vector to copy from. - * @param out Vector to copy into. + * Scales the vectors in global or shared memory with a factor of alpha (alpha + * is in global memory or shared memory) */ template -__device__ __forceinline__ void single_copy(const int num_rows, - const ValueType* const in, - ValueType* const out) +__device__ __forceinline__ void scale(const BatchEntry& alpha, + const BatchEntry& x) { - for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { - out[iz] = in[iz]; + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + if (alpha.num_rhs == 1) { + x.values[row * x.stride + col] = + alpha.values[0] * x.values[row * x.stride + col]; + } else { + x.values[row * x.stride + col] = + alpha.values[col] * x.values[row * x.stride + col]; + } } } template -__global__ __launch_bounds__(default_block_size) void single_copy( - const size_type num_batch, const int num_rows, - const ValueType* const __restrict__ in, ValueType* const __restrict__ out) +__global__ __launch_bounds__(default_block_size, sm_multiplier) void scale( + const gko::batch_dense::UniformBatch alpha, + const gko::batch_dense::UniformBatch x) { - for (size_type ibatch = blockIdx.x; ibatch < num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; ibatch += gridDim.x) { - const auto in_b = gko::batch::batch_entry_ptr(in, 1, num_rows, ibatch); - const auto out_b = - gko::batch::batch_entry_ptr(out, 1, num_rows, ibatch); - single_copy(num_rows, in_b, out_b); + const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); + const auto x_b = gko::batch::batch_entry(x, ibatch); + scale(alpha_b, x_b); } } -/** - * Adds a scaled vector to another. - * - * @param num_rows Common length of both vectors. 
- * @param alpha Scaling factor. - * @param[in] x Vector to scale and add. - * @param[in,out] y Vector to add to. - */ template -__device__ __forceinline__ void single_add_scaled(const int num_rows, - const ValueType alpha, - const ValueType* const x, - ValueType* const y) +__device__ __forceinline__ void add_scaled( + const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry& y) { - for (int li = threadIdx.x; li < num_rows; li += blockDim.x) { - y[li] += alpha * x[li]; + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + if (alpha.num_rhs == 1) { + y.values[row * y.stride + col] += + alpha.values[0] * x.values[row * x.stride + col]; + } else { + y.values[row * y.stride + col] += + alpha.values[col] * x.values[row * x.stride + col]; + } } } template -__global__ __launch_bounds__(default_block_size) void single_add_scaled( - const size_type num_batch, const int num_rows, - const ValueType* const __restrict__ alpha, - const ValueType* const __restrict__ x, ValueType* const __restrict__ y) +__global__ __launch_bounds__(default_block_size, sm_multiplier) void add_scaled( + const gko::batch_dense::UniformBatch alpha, + const gko::batch_dense::UniformBatch x, + const gko::batch_dense::UniformBatch y) { - for (size_type ibatch = blockIdx.x; ibatch < num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); - const auto y_b = gko::batch::batch_entry_ptr(y, 1, num_rows, ibatch); - single_add_scaled(num_rows, alpha[0], x_b, y_b); + const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); + const auto x_b = gko::batch::batch_entry(x, ibatch); + const auto y_b = gko::batch::batch_entry(y, ibatch); + add_scaled(alpha_b, x_b, y_b); } } /** - * Computes the 2-norm of a vector in global or shared memory. + * Computes the dot product of some column vectors in global or shared memory. * - * @param x A row-major vector (only 1 column). - * @param result Norm value. + * @param result Holds dot product value for vector in x and y. 
*/ template -__device__ __forceinline__ void single_compute_norm2( - group::thread_block_tile& warp_grp, const int num_rows, - const ValueType* const x, remove_complex& result) +__device__ __forceinline__ void compute_dot_product( + const BatchEntry& x, const BatchEntry& y, + const BatchEntry& result) { - using real_type = typename gko::remove_complex; - real_type val = zero(); - - for (int r = warp_grp.thread_rank(); r < num_rows; r += warp_grp.size()) { - val += squared_norm(x[r]); - } - - // warp level reduction -#pragma unroll - for (int j = config::warp_size / 2; j > 0; j /= 2) { - val += warp_grp.shfl_down(val, j); - } - - if (warp_grp.thread_rank() == 0) { - result = sqrt(val); + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); + const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; + rhs_index += num_subwarp_grps_per_block) { + one_dot(x, y, rhs_index, result, subwarp_grp); } } template -__global__ __launch_bounds__(default_block_size) void single_compute_norm2( - const size_type num_batch, const int num_rows, - const ValueType* const __restrict__ x, - remove_complex* const __restrict__ result) +__global__ __launch_bounds__( + default_block_size, + sm_multiplier) void compute_dot_product(const gko::batch_dense:: + UniformBatch + x, + const gko::batch_dense:: + UniformBatch + y, + const gko::batch_dense:: + UniformBatch + result) { - auto warp_grp = - group::tiled_partition(group::this_thread_block()); - for (size_type ibatch = blockIdx.x; ibatch < num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); - const auto r_b = gko::batch::batch_entry_ptr(result, 1, 1, ibatch); - if (threadIdx.x / config::warp_size == 0) { - single_compute_norm2(warp_grp, num_rows, x_b, r_b[0]); - } + const auto x_b = gko::batch::batch_entry(x, ibatch); + const auto y_b = gko::batch::batch_entry(y, ibatch); + const auto r_b = gko::batch::batch_entry(result, ibatch); + compute_dot_product(x_b, y_b, r_b); } } /** - * Computes the dot product of some column vectors in global or shared memory. + * Computes the 2-norms of some column vectors in global or shared memory. * - * @param result Holds dot product value for vector in x and y. + * @param x A row-major multivector with nrhs columns. + * @param result Holds norm value for each vector in x. 
*/ template -__device__ __forceinline__ void single_compute_dot_product( - group::thread_block_tile& warp_grp, const int num_rows, - const ValueType* const x, const ValueType* const y, ValueType& result) +__device__ __forceinline__ void compute_norm2( + const gko::batch_dense::BatchEntry& x, + const gko::batch_dense::BatchEntry>& result) { - ValueType val = zero(); - - for (int r = warp_grp.thread_rank(); r < num_rows; r += warp_grp.size()) { - val += conj(x[r]) * y[r]; + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); + const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; + rhs_index += num_subwarp_grps_per_block) { + one_norm2(x, rhs_index, result, subwarp_grp); } +} + - // warp level reduction -#pragma unroll - for (int j = config::warp_size / 2; j > 0; j /= 2) { - val += warp_grp.shfl_down(val, j); +template +__global__ + __launch_bounds__(default_block_size, sm_multiplier) void compute_norm2( + const gko::batch_dense::UniformBatch x, + const gko::batch_dense::UniformBatch> result) +{ + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; + ibatch += gridDim.x) { + const auto x_b = gko::batch::batch_entry(x, ibatch); + const auto r_b = gko::batch::batch_entry(result, ibatch); + compute_norm2(x_b, r_b); } +} + - if (warp_grp.thread_rank() == 0) { - result = val; +/** + * Copies the values of one multi-vector into another. + * + * Note that the output multi-vector should already have memory allocated + * and stride set. + */ +template +__device__ __forceinline__ void copy( + const gko::batch_dense::BatchEntry& in, + const gko::batch_dense::BatchEntry& out) +{ + for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; + iz += blockDim.x) { + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; } } -// clang-format off template -__global__ __launch_bounds__(default_block_size) -void single_compute_dot_product(const size_type num_batch, - const int num_rows, - const ValueType *const __restrict__ x, - const ValueType *const __restrict__ y, - ValueType *const __restrict__ result) -// clang-format on +__global__ __launch_bounds__(default_block_size, sm_multiplier) void copy( + const gko::batch_dense::UniformBatch src, + const gko::batch_dense::UniformBatch dst) { - auto warp_grp = - group::tiled_partition(group::this_thread_block()); - for (size_type ibatch = blockIdx.x; ibatch < num_batch; + for (size_type ibatch = blockIdx.x; ibatch < src.num_batch; ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry_ptr(x, 1, num_rows, ibatch); - const auto y_b = gko::batch::batch_entry_ptr(y, 1, num_rows, ibatch); - const auto r_b = gko::batch::batch_entry_ptr(result, 1, 1, ibatch); - single_compute_dot_product(warp_grp, num_rows, x_b, y_b, r_b[0]); + const auto dst_b = gko::batch::batch_entry(dst, ibatch); + const auto src_b = gko::batch::batch_entry(src, ibatch); + copy(src_b, dst_b); } } diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 3fe1372558b..9ab79160394 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include
+#include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/index_set_kernels.hpp"
 #include "core/base/mixed_precision_types.hpp"
diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp
index 5db50064e2f..9d4eb436c16 100644
--- a/cuda/base/batch_struct.hpp
+++ b/cuda/base/batch_struct.hpp
@@ -67,13 +67,10 @@ template <typename ValueType>
 inline gko::batch_multi_vector::UniformBatch<const cuda_type<ValueType>>
 get_batch_struct(const BatchMultiVector<ValueType>* const op)
 {
-    return {
-        as_cuda_type(op->get_const_values()),
-        op->get_num_batch_entries(),
-        op->get_common_size()[1],
-        static_cast<int>(op->get_common_size()[0]),
-        static_cast<int>(op->get_common_size()[1]),
-        static_cast<int>(op->get_common_size()[0] * op->get_common_size()[1])};
+    return {as_cuda_type(op->get_const_values()), op->get_num_batch_entries(),
+            op->get_common_size()[1],
+            static_cast<int>(op->get_common_size()[0]),
+            static_cast<int>(op->get_common_size()[1])};
 }
 
 /**
@@ -83,13 +80,10 @@ template <typename ValueType>
 inline gko::batch_multi_vector::UniformBatch<cuda_type<ValueType>>
 get_batch_struct(BatchMultiVector<ValueType>* const op)
 {
-    return {
-        as_cuda_type(op->get_values()),
-        op->get_num_batch_entries(),
-        op->get_common_size()[1],
-        static_cast<int>(op->get_common_size()[0]),
-        static_cast<int>(op->get_common_size()[1]),
-        static_cast<int>(op->get_common_size()[0] * op->get_common_size()[1])};
+    return {as_cuda_type(op->get_values()), op->get_num_batch_entries(),
+            op->get_common_size()[1],
+            static_cast<int>(op->get_common_size()[0]),
+            static_cast<int>(op->get_common_size()[1])};
 }
 
 
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
index c921e55d857..d796cdcdb37 100644
--- a/hip/base/batch_struct.hip.hpp
+++ b/hip/base/batch_struct.hip.hpp
@@ -67,13 +67,10 @@ template <typename ValueType>
 inline gko::batch_multi_vector::UniformBatch<const hip_type<ValueType>>
 get_batch_struct(const BatchMultiVector<ValueType>* const op)
 {
-    return {
-        as_hip_type(op->get_const_values()),
-        op->get_num_batch_entries(),
-        op->get_common_size()[1],
-        static_cast<int>(op->get_common_size()[0]),
-        static_cast<int>(op->get_common_size()[1]),
-        static_cast<int>(op->get_common_size()[0] * op->get_common_size()[1])};
+    return {as_hip_type(op->get_const_values()), op->get_num_batch_entries(),
+            op->get_common_size()[1],
+            static_cast<int>(op->get_common_size()[0]),
+            static_cast<int>(op->get_common_size()[1])};
 }
 
 /**
@@ -83,13 +80,10 @@ template <typename ValueType>
 inline gko::batch_multi_vector::UniformBatch<hip_type<ValueType>>
 get_batch_struct(BatchMultiVector<ValueType>* const op)
 {
-    return {
-        as_hip_type(op->get_values()),
-        op->get_num_batch_entries(),
-        op->get_common_size()[1],
-        static_cast<int>(op->get_common_size()[0]),
-        static_cast<int>(op->get_common_size()[1]),
-        static_cast<int>(op->get_common_size()[0] * op->get_common_size()[1])};
+    return {as_hip_type(op->get_values()), op->get_num_batch_entries(),
+            op->get_common_size()[1],
+            static_cast<int>(op->get_common_size()[0]),
+            static_cast<int>(op->get_common_size()[1])};
 }
 
 
diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp
index bb492488b28..056bb575f8a 100644
--- a/reference/base/batch_struct.hpp
+++ b/reference/base/batch_struct.hpp
@@ -66,13 +66,10 @@ template <typename ValueType>
 inline gko::batch_multi_vector::UniformBatch<const ValueType> get_batch_struct(
     const BatchMultiVector<ValueType>* const op)
 {
-    return {
-        op->get_const_values(),
-        op->get_num_batch_entries(),
-        op->get_common_size()[1],
-        static_cast<int>(op->get_common_size()[0]),
-        static_cast<int>(op->get_common_size()[1]),
-        static_cast<int>(op->get_common_size()[0] * op->get_common_size()[1])};
+    return {op->get_const_values(), op->get_num_batch_entries(),
+            op->get_common_size()[1],
+            static_cast<int>(op->get_common_size()[0]),
+
static_cast(op->get_common_size()[1])}; } @@ -83,13 +80,10 @@ template inline gko::batch_multi_vector::UniformBatch get_batch_struct( BatchMultiVector* const op) { - return { - op->get_values(), - op->get_num_batch_entries(), - op->get_common_size()[1], - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0] * op->get_common_size()[1])}; + return {op->get_values(), op->get_num_batch_entries(), + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } From b4802c278c249e6b38741f99db779aa967c876c7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 14 Jul 2023 13:54:50 +0200 Subject: [PATCH 114/583] Rename cuda/hip kernels --- .../base/batch_multi_vector_kernels.hpp.inc | 151 +++++++++++++----- cuda/base/batch_multi_vector_kernels.cu | 51 ++---- hip/base/batch_multi_vector_kernels.hip.cpp | 64 +++----- 3 files changed, 144 insertions(+), 122 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 0ef0408674a..6e9dc57681a 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -36,8 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * is in global memory or shared memory) */ template -__device__ __forceinline__ void scale(const BatchEntry& alpha, - const BatchEntry& x) +__device__ __forceinline__ void scale( + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -55,11 +56,12 @@ __device__ __forceinline__ void scale(const BatchEntry& alpha, } template -__global__ __launch_bounds__(default_block_size, sm_multiplier) void scale( - const gko::batch_dense::UniformBatch alpha, - const gko::batch_dense::UniformBatch x) +__global__ + __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::UniformBatch alpha, + const gko::batch_multi_vector::UniformBatch x) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); const auto x_b = gko::batch::batch_entry(x, ibatch); @@ -70,9 +72,9 @@ __global__ __launch_bounds__(default_block_size, sm_multiplier) void scale( template __device__ __forceinline__ void add_scaled( - const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry& y) + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -90,12 +92,13 @@ __device__ __forceinline__ void add_scaled( } template -__global__ __launch_bounds__(default_block_size, sm_multiplier) void add_scaled( - const gko::batch_dense::UniformBatch alpha, - const gko::batch_dense::UniformBatch x, - const gko::batch_dense::UniformBatch y) +__global__ + __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const gko::batch_multi_vector::UniformBatch alpha, + const gko::batch_multi_vector::UniformBatch x, + const gko::batch_multi_vector::UniformBatch y) { - for (size_type ibatch = 
blockIdx.x; ibatch < x.num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); const auto x_b = gko::batch::batch_entry(x, ibatch); @@ -105,6 +108,34 @@ __global__ __launch_bounds__(default_block_size, sm_multiplier) void add_scaled( } +template +__device__ __forceinline__ void one_dot( + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const int rhs_index, + const gko::batch_multi_vector::BatchEntry& result, + group::thread_block_tile& subwarp_grp) +{ + ValueType val = zero(); + + for (int r = subwarp_grp.thread_rank(); r < x.num_rows; + r += subwarp_grp.size()) { + val += conj(x.values[r * x.stride + rhs_index]) * + y.values[r * y.stride + rhs_index]; + } + + // subwarp_grp level reduction +#pragma unroll + for (int j = config::warp_size / 2; j > 0; j /= 2) { + val += subwarp_grp.shfl_down(val, j); + } + + if (subwarp_grp.thread_rank() == 0) { + result.values[rhs_index] = val; + } +} + + /** * Computes the dot product of some column vectors in global or shared memory. * @@ -112,8 +143,9 @@ __global__ __launch_bounds__(default_block_size, sm_multiplier) void add_scaled( */ template __device__ __forceinline__ void compute_dot_product( - const BatchEntry& x, const BatchEntry& y, - const BatchEntry& result) + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& result) { constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); @@ -131,17 +163,23 @@ __device__ __forceinline__ void compute_dot_product( template __global__ __launch_bounds__( default_block_size, - sm_multiplier) void compute_dot_product(const gko::batch_dense:: - UniformBatch - x, - const gko::batch_dense:: - UniformBatch - y, - const gko::batch_dense:: - UniformBatch - result) + sm_multiplier) void compute_dot_product_kernel(const gko:: + batch_multi_vector:: + UniformBatch< + const ValueType> + x, + const gko:: + batch_multi_vector:: + UniformBatch< + const ValueType> + y, + const gko:: + batch_multi_vector:: + UniformBatch< + ValueType> + result) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto x_b = gko::batch::batch_entry(x, ibatch); const auto y_b = gko::batch::batch_entry(y, ibatch); @@ -151,6 +189,34 @@ __global__ __launch_bounds__( } +template +__device__ __forceinline__ void one_norm2( + const gko::batch_multi_vector::BatchEntry& x, + const int rhs_index, + const gko::batch_multi_vector::BatchEntry>& + result, + group::thread_block_tile& subwarp_grp) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subwarp_grp.thread_rank(); r < x.num_rows; + r += subwarp_grp.size()) { + val += squared_norm(x.values[r * x.stride + rhs_index]); + } + + // subwarp_grp level reduction +#pragma unroll + for (int j = config::warp_size / 2; j > 0; j /= 2) { + val += subwarp_grp.shfl_down(val, j); + } + + if (subwarp_grp.thread_rank() == 0) { + result.values[rhs_index] = sqrt(val); + } +} + + /** * Computes the 2-norms of some column vectors in global or shared memory. 
* @@ -159,8 +225,9 @@ __global__ __launch_bounds__( */ template __device__ __forceinline__ void compute_norm2( - const gko::batch_dense::BatchEntry& x, - const gko::batch_dense::BatchEntry>& result) + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry>& + result) { constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); @@ -176,12 +243,17 @@ __device__ __forceinline__ void compute_norm2( template -__global__ - __launch_bounds__(default_block_size, sm_multiplier) void compute_norm2( - const gko::batch_dense::UniformBatch x, - const gko::batch_dense::UniformBatch> result) +__global__ __launch_bounds__( + default_block_size, + sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: + UniformBatch + x, + const gko::batch_multi_vector:: + UniformBatch< + remove_complex> + result) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch; + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto x_b = gko::batch::batch_entry(x, ibatch); const auto r_b = gko::batch::batch_entry(result, ibatch); @@ -198,8 +270,8 @@ __global__ */ template __device__ __forceinline__ void copy( - const gko::batch_dense::BatchEntry& in, - const gko::batch_dense::BatchEntry& out) + const gko::batch_multi_vector::BatchEntry& in, + const gko::batch_multi_vector::BatchEntry& out) { for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; iz += blockDim.x) { @@ -211,11 +283,12 @@ __device__ __forceinline__ void copy( template -__global__ __launch_bounds__(default_block_size, sm_multiplier) void copy( - const gko::batch_dense::UniformBatch src, - const gko::batch_dense::UniformBatch dst) +__global__ + __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::UniformBatch src, + const gko::batch_multi_vector::UniformBatch dst) { - for (size_type ibatch = blockIdx.x; ibatch < src.num_batch; + for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries; ibatch += gridDim.x) { const auto dst_b = gko::batch::batch_entry(dst, ibatch); const auto src_b = gko::batch::batch_entry(src, ibatch); diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index df5aa9149a5..8bfb6fc0167 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -74,7 +74,7 @@ void scale(std::shared_ptr exec, const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); - scale<<>>(alpha_ub, x_ub); + scale_kernel<<>>(alpha_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -89,18 +89,10 @@ void add_scaled(std::shared_ptr exec, { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_common_size()[1]; - if (nrhs == 1) { - const auto num_batch = x->get_num_batch_entries(); - const auto num_rows = x->get_common_size()[0]; - single_add_scaled<<>>( - num_batch, num_rows, as_cuda_type(alpha->get_const_values()), - as_cuda_type(x->get_const_values()), as_cuda_type(y->get_values())); - } else { - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - add_scaled<<>>(alpha_ub, x_ub, y_ub); - } + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + add_scaled_kernel<<>>(alpha_ub, x_ub, y_ub); } 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -115,19 +107,11 @@ void compute_dot(std::shared_ptr exec, { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_common_size()[1]; - if (num_rhs == 1) { - const auto num_rows = x->get_common_size()[0]; - single_compute_dot_product<<>>( - num_blocks, num_rows, as_cuda_type(x->get_const_values()), - as_cuda_type(y->get_const_values()), - as_cuda_type(result->get_values())); - } else { - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - const auto res_ub = get_batch_struct(result); - compute_dot_product<<>>(x_ub, y_ub, - res_ub); - } + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_dot_product_kernel<<>>(x_ub, y_ub, + res_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -141,16 +125,9 @@ void compute_norm2(std::shared_ptr exec, { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_common_size()[1]; - if (num_rhs == 1) { - const auto num_rows = x->get_common_size()[0]; - single_compute_norm2<<>>( - num_blocks, num_rows, as_cuda_type(x->get_const_values()), - as_cuda_type(result->get_values())); - } else { - const auto x_ub = get_batch_struct(x); - const auto res_ub = get_batch_struct(result); - compute_norm2<<>>(x_ub, res_ub); - } + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + compute_norm2_kernel<<>>(x_ub, res_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -165,7 +142,7 @@ void copy(std::shared_ptr exec, const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); - copy<<>>(x_ub, result_ub); + copy_kernel<<>>(x_ub, result_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 2a6c3085772..50f8593ffec 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -77,8 +77,8 @@ void scale(std::shared_ptr exec, const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); - hipLaunchKernelGGL(scale, dim3(num_blocks), dim3(default_block_size), 0, 0, - alpha_ub, x_ub); + hipLaunchKernelGGL(scale_kernel, dim3(num_blocks), dim3(default_block_size), + 0, 0, alpha_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -93,21 +93,11 @@ void add_scaled(std::shared_ptr exec, { const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const size_type nrhs = x->get_common_size()[1]; - if (nrhs == 1) { - const auto num_batch = x->get_num_batch_entries(); - const auto num_rows = x->get_common_size()[0]; - hipLaunchKernelGGL( - single_add_scaled, dim3(num_blocks), dim3(default_block_size), 0, 0, - num_batch, num_rows, as_hip_type(alpha->get_const_values()), - as_hip_type(x->get_const_values()), as_hip_type(y->get_values())); - } else { - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - hipLaunchKernelGGL(add_scaled, dim3(num_blocks), - dim3(default_block_size), 0, 0, alpha_ub, x_ub, - y_ub); - } + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + hipLaunchKernelGGL(add_scaled_kernel, dim3(num_blocks), + 
dim3(default_block_size), 0, 0, alpha_ub, x_ub, y_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -122,23 +112,13 @@ void compute_dot(std::shared_ptr exec, { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_common_size()[1]; - if (num_rhs == 1) { - const auto num_rows = x->get_common_size()[0]; - hipLaunchKernelGGL(single_compute_dot_product, dim3(num_blocks), - dim3(default_block_size), 0, 0, num_blocks, num_rows, - as_hip_type(x->get_const_values()), - as_hip_type(y->get_const_values()), - as_hip_type(result->get_values())); - } else { - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - const auto res_ub = get_batch_struct(result); - hipLaunchKernelGGL(compute_dot_product, dim3(num_blocks), - dim3(default_block_size), 0, 0, x_ub, y_ub, res_ub); - } + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + hipLaunchKernelGGL(compute_dot_product_kernel, dim3(num_blocks), + dim3(default_block_size), 0, 0, x_ub, y_ub, res_ub); } - GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); @@ -150,18 +130,10 @@ void compute_norm2(std::shared_ptr exec, { const auto num_blocks = x->get_num_batch_entries(); const auto num_rhs = x->get_common_size()[1]; - if (num_rhs == 1) { - const auto num_rows = x->get_common_size()[0]; - hipLaunchKernelGGL(single_compute_norm2, dim3(num_blocks), - dim3(default_block_size), 0, 0, num_blocks, num_rows, - as_hip_type(x->get_const_values()), - as_hip_type(result->get_values())); - } else { - const auto x_ub = get_batch_struct(x); - const auto res_ub = get_batch_struct(result); - hipLaunchKernelGGL(compute_norm2, dim3(num_blocks), - dim3(default_block_size), 0, 0, x_ub, res_ub); - } + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + hipLaunchKernelGGL(compute_norm2_kernel, dim3(num_blocks), + dim3(default_block_size), 0, 0, x_ub, res_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -176,8 +148,8 @@ void copy(std::shared_ptr exec, const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); - hipLaunchKernelGGL(copy, dim3(num_blocks), dim3(default_block_size), 0, 0, - x_ub, result_ub); + hipLaunchKernelGGL(copy_kernel, dim3(num_blocks), dim3(default_block_size), + 0, 0, x_ub, result_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); From f40f79271c7edb9962c246e56b27749f52fa6f46 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 14 Jul 2023 13:55:06 +0200 Subject: [PATCH 115/583] Update and fix dpcpp kernels Co-authored-by: Phuong Nguyen --- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 2 +- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 115 +++++++----------- 2 files changed, 43 insertions(+), 74 deletions(-) diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 6101ed3da4d..88cdb1d6e6f 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -83,7 +83,7 @@ void scale(std::shared_ptr exec, auto group_id = group.get_group_linear_id(); const auto alpha_b = batch::batch_entry(alpha_ub, group_id); const auto x_b = batch::batch_entry(x_ub, group_id); - single_scale_kernel(alpha_b, x_b, item_ct1); + scale_kernel(alpha_b, x_b, item_ct1); }); }); } diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc 
b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 7ea25fb4c22..07d6d97ff0a 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -30,25 +30,30 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -/** - * Copies the values of vector into another. - * - * @param num_rows Length of vector. - * @param in Vector to copy from. - * @param out Vector to copy into. - */ + template -__dpct_inline__ void copy_kernel(const int num_rows, - const ValueType* const __restrict__ in, - ValueType* const __restrict__ out, - sycl::nd_item<3> item_ct1) +__dpct_inline__ void scale_kernel( + const gko::batch_dense::BatchEntry& alpha, + const gko::batch_dense::BatchEntry& x, + sycl::nd_item<3>& item_ct1) { - for (int iz = item_ct1.get_local_linear_id(); iz < num_rows; - iz += item_ct1.get_local_range().size()) { - out[iz] = in[iz]; + const int max_li = x.num_rows * x.num_rhs; + for (int li = item_ct1.get_local_linear_id(); li < max_li; + li += item_ct1.get_local_range().size()) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + if (alpha.num_rhs == 1) { + x.values[row * x.stride + col] = + alpha.values[0] * x.values[row * x.stride + col]; + } else { + x.values[row * x.stride + col] = + alpha.values[col] * x.values[row * x.stride + col]; + } } } + /** * Adds a scaled vector to another. * @@ -70,35 +75,26 @@ __dpct_inline__ void add_scaled_kernel(const int num_rows, } } -/** - * Computes the 2-norm of a vector in global or shared memory. - * - * @param x A row-major vector (only 1 column). - * @param result Norm value. - */ + template -__dpct_inline__ void compute_norm2_sg_kernel( +__dpct_inline__ void compute_dot_product_kernel( const int num_rows, const ValueType* const __restrict__ x, - gko::remove_complex& result, sycl::nd_item<3> item_ct1) + const ValueType* const __restrict__ y, ValueType& result, + sycl::nd_item<3> item_ct1) { - const auto sg = item_ct1.get_sub_group(); - const auto sg_size = sg.get_local_range().size(); - const auto sg_tid = sg.get_local_id(); - - using real_type = typename gko::remove_complex; - real_type val = zero(); - - for (int r = sg_tid; r < num_rows; r += sg_size) { - val += squared_norm(x[r]); - } + const auto group = item_ct1.get_group(); + const auto group_size = item_ct1.get_local_range().size(); + const auto tid = item_ct1.get_local_linear_id(); - val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + ValueType val = zero(); - if (sg_tid == 0) { - result = sqrt(val); + for (int r = tid; r < num_rows; r += group_size) { + val += conj(x[r]) * y[r]; } + result = sycl::reduce_over_group(group, val, sycl::plus<>()); } + template __dpct_inline__ void compute_norm2_kernel( const int num_rows, const ValueType* const __restrict__ x, @@ -122,47 +118,20 @@ __dpct_inline__ void compute_norm2_kernel( /** - * Computes the dot product of some column vectors in global or shared memory. + * Copies the values of vector into another. * - * @param result Holds dot product value for vector in x and y. + * @param num_rows Length of vector. + * @param in Vector to copy from. + * @param out Vector to copy into. 
*/ template -__dpct_inline__ void compute_dot_product_sg_kernel( - const int num_rows, const ValueType* const __restrict__ x, - const ValueType* const __restrict__ y, ValueType& result, - sycl::nd_item<3> item_ct1) -{ - const auto sg = item_ct1.get_sub_group(); - const auto sg_size = sg.get_local_range().size(); - const auto sg_tid = sg.get_local_id(); - - ValueType val = zero(); - - for (int r = sg_tid; r < num_rows; r += sg_size) { - val += conj(x[r]) * y[r]; - } - - val = sycl::reduce_over_group(sg, val, sycl::plus<>()); - - if (sg_tid == 0) { - result = val; - } -} - -template -__dpct_inline__ void compute_dot_product_kernel( - const int num_rows, const ValueType* const __restrict__ x, - const ValueType* const __restrict__ y, ValueType& result, - sycl::nd_item<3> item_ct1) +__dpct_inline__ void copy_kernel(const int num_rows, + const ValueType* const __restrict__ in, + ValueType* const __restrict__ out, + sycl::nd_item<3> item_ct1) { - const auto group = item_ct1.get_group(); - const auto group_size = item_ct1.get_local_range().size(); - const auto tid = item_ct1.get_local_linear_id(); - - ValueType val = zero(); - - for (int r = tid; r < num_rows; r += group_size) { - val += conj(x[r]) * y[r]; + for (int iz = item_ct1.get_local_linear_id(); iz < num_rows; + iz += item_ct1.get_local_range().size()) { + out[iz] = in[iz]; } - result = sycl::reduce_over_group(group, val, sycl::plus<>()); } From 0edae740a72fac469d4c57d8ab6a73d80adc79b2 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 14 Jul 2023 13:55:26 +0200 Subject: [PATCH 116/583] Fix omp and ref kernels --- omp/base/batch_multi_vector_kernels.cpp | 14 +- reference/base/batch_multi_vector_kernels.cpp | 10 +- .../base/batch_multi_vector_kernels.hpp.inc | 49 +- .../test/base/batch_multi_vector_kernels.cpp | 641 +----------------- 4 files changed, 55 insertions(+), 659 deletions(-) diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 6dd8b38e6d8..f46cbb12ead 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/components/prefix_sum_kernels.hpp" -#include "reference/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" namespace gko { @@ -56,7 +56,7 @@ namespace omp { namespace batch_multi_vector { -#include "reference/matrix/batch_multi_vector_kernels.hpp.inc" +#include "reference/base/batch_multi_vector_kernels.hpp.inc" template @@ -70,7 +70,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - scale(alpha_b, x_b); + scale_kernel(alpha_b, x_b); } } @@ -92,7 +92,7 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scaled(alpha_b, x_b, y_b); + add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -115,7 +115,7 @@ void compute_dot(std::shared_ptr exec, const auto res_b = gko::batch::batch_entry(res_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); const auto y_b = gko::batch::batch_entry(y_ub, batch); - compute_dot_product(x_b, y_b, res_b); + compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -135,7 +135,7 @@ void compute_norm2(std::shared_ptr exec, ++batch) { const auto res_b = gko::batch::batch_entry(res_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - compute_norm2(x_b, res_b); + compute_norm2_kernel(x_b, res_b); } } @@ -154,7 +154,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { const auto result_b = gko::batch::batch_entry(result_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - copy(x_b, result_b); + copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index 31e10fbe22f..f494a326773 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -69,7 +69,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - scale(alpha_b, x_b); + scale_kernel(alpha_b, x_b); } } @@ -90,7 +90,7 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); const auto y_b = gko::batch::batch_entry(y_ub, batch); - add_scaled(alpha_b, x_b, y_b); + add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -112,7 +112,7 @@ void compute_dot(std::shared_ptr exec, const auto res_b = gko::batch::batch_entry(res_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); const auto y_b = gko::batch::batch_entry(y_ub, batch); - compute_dot_product(x_b, y_b, res_b); + compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -131,7 +131,7 @@ void compute_norm2(std::shared_ptr exec, ++batch) { const auto res_b = gko::batch::batch_entry(res_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - compute_norm2(x_b, res_b); + compute_norm2_kernel(x_b, res_b); } } @@ -149,7 +149,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { const auto result_b = gko::batch::batch_entry(result_ub, batch); const auto x_b = gko::batch::batch_entry(x_ub, batch); - copy(x_b, result_b); + copy_kernel(x_b, 
result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index 3cda19cfc06..a793fe030f9 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template -inline void scale( +inline void scale_kernel( const gko::batch_multi_vector::BatchEntry& alpha, const gko::batch_multi_vector::BatchEntry& x) { @@ -53,7 +53,7 @@ inline void scale( template -inline void add_scaled( +inline void add_scaled_kernel( const gko::batch_multi_vector::BatchEntry& alpha, const gko::batch_multi_vector::BatchEntry& x, const gko::batch_multi_vector::BatchEntry& y) @@ -77,7 +77,26 @@ inline void add_scaled( template -inline void compute_norm2( +inline void compute_dot_product_kernel( + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& result) +{ + for (int c = 0; c < result.num_rhs; c++) { + result.values[c] = gko::zero(); + } + + for (int r = 0; r < x.num_rows; r++) { + for (int c = 0; c < x.num_rhs; c++) { + result.values[c] += + conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c]; + } + } +} + + +template +inline void compute_norm2_kernel( const gko::batch_multi_vector::BatchEntry& x, const gko::batch_multi_vector::BatchEntry>& result) @@ -103,8 +122,9 @@ inline void compute_norm2( * and stride set. */ template -inline void copy(const gko::batch_multi_vector::BatchEntry& in, - const gko::batch_multi_vector::BatchEntry& out) +inline void copy_kernel( + const gko::batch_multi_vector::BatchEntry& in, + const gko::batch_multi_vector::BatchEntry& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; @@ -112,22 +132,3 @@ inline void copy(const gko::batch_multi_vector::BatchEntry& in, out.values[i * out.stride + j] = in.values[i * in.stride + j]; } } - - -template -inline void compute_dot_product( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::batch_multi_vector::BatchEntry& result) -{ - for (int c = 0; c < result.num_rhs; c++) { - result.values[c] = gko::zero(); - } - - for (int r = 0; r < x.num_rows; r++) { - for (int c = 0; c < x.num_rhs; c++) { - result.values[c] += - conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c]; - } - } -} diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index f2062a4e393..8ed8f03dc25 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -141,102 +141,6 @@ class BatchMultiVector : public ::testing::Test { TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); -TYPED_TEST(BatchMultiVector, AppliesToBatchMultiVector) -{ - using T = typename TestFixture::value_type; - this->mtx_1->apply(this->mtx_2.get(), this->mtx_3.get()); - this->mtx_10->apply(this->mtx_20.get(), this->mtx_30.get()); - this->mtx_11->apply(this->mtx_21.get(), this->mtx_31.get()); - - - auto res = this->mtx_3->unbatch(); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_30.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_31.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, AppliesLinearCombinationToBatchMultiVector) -{ - using Mtx = typename TestFixture::Mtx; - using DenseMtx = typename TestFixture::DenseMtx; - using T = typename TestFixture::value_type; - 
auto alpha = gko::batch_initialize({{1.5}, {-1.0}}, this->exec); - auto beta = gko::batch_initialize({{2.5}, {-4.0}}, this->exec); - auto alpha0 = gko::initialize({1.5}, this->exec); - auto alpha1 = gko::initialize({-1.0}, this->exec); - auto beta0 = gko::initialize({2.5}, this->exec); - auto beta1 = gko::initialize({-4.0}, this->exec); - - this->mtx_1->apply(alpha.get(), this->mtx_2.get(), beta.get(), - this->mtx_3.get()); - this->mtx_10->apply(alpha0.get(), this->mtx_20.get(), beta0.get(), - this->mtx_30.get()); - this->mtx_11->apply(alpha1.get(), this->mtx_21.get(), beta1.get(), - this->mtx_31.get()); - - auto res = this->mtx_3->unbatch(); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_30.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_31.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongInnerDimension) -{ - using Mtx = typename TestFixture::Mtx; - auto res = Mtx::create( - this->exec, std::vector>{gko::dim<2>{2}, gko::dim<2>{2}}); - - ASSERT_THROW(this->mtx_2->apply(this->mtx_1.get(), res.get()), - gko::DimensionMismatch); -} - - -TYPED_TEST(BatchMultiVector, ApplyFailsForNonUniformBatches) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto mat1 = gko::batch_initialize( - std::vector{4, 4}, - {{I({1.0, -1.0}), I({1.0, -1.0}), I({2.0, -0.5})}, - {{1.0, 2.5, 3.0}, {1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - this->exec); - auto mat2 = gko::batch_initialize( - std::vector{4, 4}, - {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, - {{1.0, 2.5, -3.0}, {1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - this->exec); - auto res = Mtx::create( - this->exec, std::vector>{gko::dim<2>{2}, gko::dim<2>{3}}); - - ASSERT_THROW(mat2->apply(mat1.get(), res.get()), gko::NotImplemented); -} - - -TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongNumberOfRows) -{ - using Mtx = typename TestFixture::Mtx; - auto res = Mtx::create( - this->exec, std::vector>{gko::dim<2>{3}, gko::dim<2>{3}}); - - ASSERT_THROW(this->mtx_1->apply(this->mtx_2.get(), res.get()), - gko::DimensionMismatch); -} - - -TYPED_TEST(BatchMultiVector, ApplyFailsOnWrongNumberOfCols) -{ - using Mtx = typename TestFixture::Mtx; - auto res = Mtx::create( - this->exec, - std::vector>{gko::dim<2>{2, 1}, gko::dim<2>{2, 1}}, - std::vector{3, 3}); - - - ASSERT_THROW(this->mtx_1->apply(this->mtx_2.get(), res.get()), - gko::DimensionMismatch); -} - - TYPED_TEST(BatchMultiVector, ScalesData) { using Mtx = typename TestFixture::Mtx; @@ -313,71 +217,6 @@ TYPED_TEST(BatchMultiVector, AddsScaled) } -TYPED_TEST(BatchMultiVector, AddsScale) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize( - {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto beta = gko::batch_initialize( - {{{-1.0, 3.0, 0.5}}, {{1.5, 0.5, -4.0}}}, this->exec); - - auto ualpha = alpha->unbatch(); - auto ubeta = beta->unbatch(); - - this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()); - this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), - ubeta[0].get()); - this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), - ubeta[1].get()); - - auto res = this->mtx_1->unbatch(); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, ConvergenceAddScaled) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize( - {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - - auto 
ualpha = alpha->unbatch(); - - - const int num_rhs = 3; - const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - - gko::kernels::reference::batch_multi_vector::convergence_add_scaled( - this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), - converged); - - auto mtx_10_clone = gko::clone(this->mtx_10); - auto mtx_11_clone = gko::clone(this->mtx_11); - - this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); - this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - - auto res = this->mtx_1->unbatch(); - - EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); - EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); - EXPECT_EQ(res[0]->at(0, 1), this->mtx_10->at(0, 1)); - EXPECT_EQ(res[0]->at(1, 1), this->mtx_10->at(1, 1)); - EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); - EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); - - EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); - EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); - EXPECT_EQ(res[1]->at(0, 1), this->mtx_11->at(0, 1)); - EXPECT_EQ(res[1]->at(1, 1), this->mtx_11->at(1, 1)); - EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); - EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); -} - - TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) { using Mtx = typename TestFixture::Mtx; @@ -396,91 +235,6 @@ TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) } -TYPED_TEST(BatchMultiVector, AddsScaleWithScalar) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto beta = gko::batch_initialize({{-0.5}, {3.0}}, this->exec); - - auto ualpha = alpha->unbatch(); - auto ubeta = beta->unbatch(); - - this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()); - this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), - ubeta[0].get()); - this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), - ubeta[1].get()); - - auto res = this->mtx_1->unbatch(); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, AddScaleWithScalarViaApply) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto beta = gko::batch_initialize({{-0.5}, {3.0}}, this->exec); - auto id = gko::matrix::BatchIdentity::create( - this->exec, gko::batch_dim<2>(2, gko::dim<2>(3, 3))); - auto ualpha = alpha->unbatch(); - auto ubeta = beta->unbatch(); - - this->mtx_0->apply(alpha.get(), id.get(), beta.get(), this->mtx_1.get()); - this->mtx_10->add_scale(ualpha[0].get(), this->mtx_00.get(), - ubeta[0].get()); - this->mtx_11->add_scale(ualpha[1].get(), this->mtx_01.get(), - ubeta[1].get()); - - auto res = this->mtx_1->unbatch(); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, ConvergenceAddScaledWithScalar) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - - auto ualpha = alpha->unbatch(); - - - const int num_rhs = 3; - const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - - gko::kernels::reference::batch_multi_vector::convergence_add_scaled( - this->exec, alpha.get(), this->mtx_0.get(), this->mtx_1.get(), - converged); - - auto mtx_10_clone = gko::clone(this->mtx_10); - auto 
mtx_11_clone = gko::clone(this->mtx_11); - - this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); - this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - - auto res = this->mtx_1->unbatch(); - - EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); - EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); - EXPECT_EQ(res[0]->at(0, 1), this->mtx_10->at(0, 1)); - EXPECT_EQ(res[0]->at(1, 1), this->mtx_10->at(1, 1)); - EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); - EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); - - EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); - EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); - EXPECT_EQ(res[1]->at(0, 1), this->mtx_11->at(0, 1)); - EXPECT_EQ(res[1]->at(1, 1), this->mtx_11->at(1, 1)); - EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); - EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); -} - - TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; @@ -492,18 +246,6 @@ TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) } -TYPED_TEST(BatchMultiVector, AddScaleFailsOnWrongSizes) -{ - using Mtx = typename TestFixture::Mtx; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto beta = gko::batch_initialize({{2.0}, {3.0}}, this->exec); - - ASSERT_THROW( - this->mtx_1->add_scale(alpha.get(), this->mtx_2.get(), beta.get()), - gko::DimensionMismatch); -} - - TYPED_TEST(BatchMultiVector, AddScaleFailsOnWrongScalarSizes) { using Mtx = typename TestFixture::Mtx; @@ -536,72 +278,35 @@ TYPED_TEST(BatchMultiVector, ComputesDot) } -TYPED_TEST(BatchMultiVector, ConvergenceComputeDot) +TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; auto result = - Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - - for (int ibatch = 0; ibatch < result->get_size().get_batch_sizes().size(); - ibatch++) { - for (int icol = 0; icol < result->get_size().at()[1]; icol++) { - result->at(ibatch, 0, icol) = gko::zero(); - } - } - - auto ures = result->unbatch(); - - const int num_rhs = 3; - const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - - gko::kernels::reference::batch_multi_vector::convergence_compute_dot( - this->exec, this->mtx_0.get(), this->mtx_1.get(), result.get(), - converged); - - auto ures_00_clone = gko::clone(ures[0]); - auto ures_01_clone = gko::clone(ures[1]); - - this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); - this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); - - auto res = result->unbatch(); - - EXPECT_EQ(res[0]->at(0, 0), ures_00_clone->at(0, 0)); - EXPECT_EQ(res[0]->at(0, 1), ures[0]->at(0, 1)); - EXPECT_EQ(res[0]->at(0, 2), ures_00_clone->at(0, 2)); + Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ + gko::dim<2>{1, 2}, gko::dim<2>{1, 3}})); - EXPECT_EQ(res[1]->at(0, 0), ures_01_clone->at(0, 0)); - EXPECT_EQ(res[1]->at(0, 1), ures[1]->at(0, 1)); - EXPECT_EQ(res[1]->at(0, 2), ures_01_clone->at(0, 2)); + ASSERT_THROW(this->mtx_1->compute_dot(this->mtx_2.get(), result.get()), + gko::DimensionMismatch); } -TYPED_TEST(BatchMultiVector, ComputesNorm2) +TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using T_nc = gko::remove_complex; - using NormVector = gko::BatchMultiVector; - auto mtx(gko::batch_initialize( - {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, - {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, - 
this->exec)); - auto batch_size = gko::batch_dim<2>( - std::vector>{gko::dim<2>{1, 2}, gko::dim<2>{1, 2}}); auto result = - NormVector::create(this->exec, batch_size, gko::batch_stride(2, 2)); - - mtx->compute_norm2(result.get()); + Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ + gko::dim<2>{1, 2}, gko::dim<2>{1, 2}})); + auto result2 = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); - EXPECT_EQ(result->at(0, 0, 0), T_nc{3.0}); - EXPECT_EQ(result->at(0, 0, 1), T_nc{5.0}); - EXPECT_EQ(result->at(1, 0, 0), T_nc{5.0}); - EXPECT_EQ(result->at(1, 0, 1), T_nc{3.0}); + ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result.get()), + gko::DimensionMismatch); + ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result2.get()), + gko::DimensionMismatch); } -TYPED_TEST(BatchMultiVector, ConvergenceComputeNorm2) +TYPED_TEST(BatchMultiVector, ComputesNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -616,57 +321,15 @@ TYPED_TEST(BatchMultiVector, ConvergenceComputeNorm2) auto result = NormVector::create(this->exec, batch_size, gko::batch_stride(2, 2)); - for (int ibatch = 0; ibatch < result->get_size().get_batch_sizes().size(); - ibatch++) { - for (int icol = 0; icol < result->get_size().at()[1]; icol++) { - result->at(ibatch, 0, icol) = gko::zero(); - } - } - - auto result_clone = gko::clone(result); - - const int num_rhs = 2; - const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - - gko::kernels::reference::batch_multi_vector::convergence_compute_norm2( - this->exec, mtx.get(), result.get(), converged); + mtx->compute_norm2(result.get()); - EXPECT_EQ(result->at(0, 0, 0), result_clone->at(0, 0, 0)); + EXPECT_EQ(result->at(0, 0, 0), T_nc{3.0}); EXPECT_EQ(result->at(0, 0, 1), T_nc{5.0}); - - EXPECT_EQ(result->at(1, 0, 0), result_clone->at(1, 0, 0)); + EXPECT_EQ(result->at(1, 0, 0), T_nc{5.0}); EXPECT_EQ(result->at(1, 0, 1), T_nc{3.0}); } -TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) -{ - using Mtx = typename TestFixture::Mtx; - auto result = - Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ - gko::dim<2>{1, 2}, gko::dim<2>{1, 3}})); - - ASSERT_THROW(this->mtx_1->compute_dot(this->mtx_2.get(), result.get()), - gko::DimensionMismatch); -} - - -TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) -{ - using Mtx = typename TestFixture::Mtx; - auto result = - Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ - gko::dim<2>{1, 2}, gko::dim<2>{1, 2}})); - auto result2 = - Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); - - ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result.get()), - gko::DimensionMismatch); - ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result2.get()), - gko::DimensionMismatch); -} - - TYPED_TEST(BatchMultiVector, CopiesData) { gko::kernels::reference::batch_multi_vector::copy( @@ -676,71 +339,6 @@ TYPED_TEST(BatchMultiVector, CopiesData) } -TYPED_TEST(BatchMultiVector, ConvergenceCopyData) -{ - auto umtx_0 = this->mtx_0->unbatch(); - - const int num_rhs = 3; - const gko::uint32 converged = 0xfffffffd | (0 - (1 << num_rhs)); - gko::kernels::reference::batch_multi_vector::convergence_copy( - this->exec, this->mtx_0.get(), this->mtx_1.get(), converged); - - auto mtx_10_clone = gko::clone(this->mtx_10); - auto mtx_11_clone = gko::clone(this->mtx_11); - - auto res = this->mtx_1->unbatch(); - - EXPECT_EQ(res[0]->at(0, 0), mtx_10_clone->at(0, 0)); - EXPECT_EQ(res[0]->at(1, 0), mtx_10_clone->at(1, 0)); - 
EXPECT_EQ(res[0]->at(0, 1), this->mtx_0->at(0, 0, 1)); - EXPECT_EQ(res[0]->at(1, 1), this->mtx_0->at(0, 1, 1)); - EXPECT_EQ(res[0]->at(0, 2), mtx_10_clone->at(0, 2)); - EXPECT_EQ(res[0]->at(1, 2), mtx_10_clone->at(1, 2)); - - EXPECT_EQ(res[1]->at(0, 0), mtx_11_clone->at(0, 0)); - EXPECT_EQ(res[1]->at(1, 0), mtx_11_clone->at(1, 0)); - EXPECT_EQ(res[1]->at(0, 1), this->mtx_0->at(1, 0, 1)); - EXPECT_EQ(res[1]->at(1, 1), this->mtx_0->at(1, 1, 1)); - EXPECT_EQ(res[1]->at(0, 2), mtx_11_clone->at(0, 2)); - EXPECT_EQ(res[1]->at(1, 2), mtx_11_clone->at(1, 2)); -} - - -TYPED_TEST(BatchMultiVector, BatchScale) -{ - using T = typename TestFixture::value_type; - using Mtx = typename TestFixture::Mtx; - using BDiag = gko::matrix::BatchDiagonal; - - auto mtx(gko::batch_initialize( - {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, - {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, - this->exec)); - - auto left(gko::batch_diagonal_initialize( - I>{I{1.0, 2.0, 3.0}, I{-1.0, -2.0, -3.0}}, this->exec)); - auto rght(gko::batch_diagonal_initialize( - I>{I{-0.5, -2.0}, I{2.0, 0.25}}, this->exec)); - - gko::kernels::reference::batch_multi_vector::batch_scale( - this->exec, left.get(), rght.get(), mtx.get()); - - EXPECT_EQ(mtx->at(0, 0, 0), T{-0.5}); - EXPECT_EQ(mtx->at(0, 1, 0), T{-2.0}); - EXPECT_EQ(mtx->at(0, 2, 0), T{-3.0}); - EXPECT_EQ(mtx->at(0, 0, 1), T{0.0}); - EXPECT_EQ(mtx->at(0, 1, 1), T{-12.0}); - EXPECT_EQ(mtx->at(0, 2, 1), T{-24.0}); - - EXPECT_EQ(mtx->at(1, 0, 0), T{8.0}); - EXPECT_EQ(mtx->at(1, 1, 0), T{12.0}); - EXPECT_EQ(mtx->at(1, 2, 0), T{0.0}); - EXPECT_EQ(mtx->at(1, 0, 1), T{-0.5}); - EXPECT_EQ(mtx->at(1, 1, 1), T{1.0}); - EXPECT_EQ(mtx->at(1, 2, 1), T{-0.75}); -} - - TYPED_TEST(BatchMultiVector, ConvertsToPrecision) { using BatchMultiVector = typename TestFixture::Mtx; @@ -787,80 +385,6 @@ TYPED_TEST(BatchMultiVector, MovesToPrecision) } -TYPED_TEST(BatchMultiVector, ConvertsToCsr32) -{ - using T = typename TestFixture::value_type; - using BatchCsr = typename gko::matrix::BatchCsr; - auto batch_csr_mtx = BatchCsr::create(this->mtx_6->get_executor()); - - this->mtx_6->convert_to(batch_csr_mtx.get()); - - auto v = batch_csr_mtx->get_const_values(); - auto c = batch_csr_mtx->get_const_col_idxs(); - auto r = batch_csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(batch_csr_mtx->get_num_batch_entries(), 2); - ASSERT_EQ(batch_csr_mtx->get_size().at(0), gko::dim<2>(3, 3)); - ASSERT_EQ(batch_csr_mtx->get_size().at(1), gko::dim<2>(3, 3)); - ASSERT_EQ(batch_csr_mtx->get_num_stored_elements(), 10); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 2); - EXPECT_EQ(r[2], 3); - EXPECT_EQ(r[3], 5); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 2); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], 2); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{3.0}); - EXPECT_EQ(v[3], T{1.0}); - EXPECT_EQ(v[4], T{5.0}); - EXPECT_EQ(v[5], T{2.0}); - EXPECT_EQ(v[6], T{5.0}); - EXPECT_EQ(v[7], T{1.0}); - EXPECT_EQ(v[8], T{-1.0}); - EXPECT_EQ(v[9], T{8.0}); -} - - -TYPED_TEST(BatchMultiVector, MovesToCsr32) -{ - using T = typename TestFixture::value_type; - using BatchCsr = typename gko::matrix::BatchCsr; - auto batch_csr_mtx = BatchCsr::create(this->mtx_6->get_executor()); - - this->mtx_6->move_to(batch_csr_mtx.get()); - - auto v = batch_csr_mtx->get_const_values(); - auto c = batch_csr_mtx->get_const_col_idxs(); - auto r = batch_csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(batch_csr_mtx->get_num_batch_entries(), 2); - ASSERT_EQ(batch_csr_mtx->get_size().at(0), gko::dim<2>(3, 3)); - 
ASSERT_EQ(batch_csr_mtx->get_size().at(1), gko::dim<2>(3, 3)); - ASSERT_EQ(batch_csr_mtx->get_num_stored_elements(), 10); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 2); - EXPECT_EQ(r[2], 3); - EXPECT_EQ(r[3], 5); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 2); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], 2); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{3.0}); - EXPECT_EQ(v[3], T{1.0}); - EXPECT_EQ(v[4], T{5.0}); - EXPECT_EQ(v[5], T{2.0}); - EXPECT_EQ(v[6], T{5.0}); - EXPECT_EQ(v[7], T{1.0}); - EXPECT_EQ(v[8], T{-1.0}); - EXPECT_EQ(v[9], T{8.0}); -} - - TYPED_TEST(BatchMultiVector, ConvertsEmptyToPrecision) { using BatchMultiVector = typename TestFixture::Mtx; @@ -891,133 +415,4 @@ TYPED_TEST(BatchMultiVector, MovesEmptyToPrecision) } -TYPED_TEST(BatchMultiVector, ConvertsEmptyMatrixToCsr) -{ - using BatchMultiVector = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchMultiVector::create(this->exec); - auto res = BatchCsr::create(this->exec); - - empty->convert_to(res.get()); - - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - ASSERT_FALSE(res->get_num_batch_entries()); -} - - -TYPED_TEST(BatchMultiVector, MovesEmptyMatrixToCsr) -{ - using BatchMultiVector = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using BatchCsr = typename gko::matrix::BatchCsr; - auto empty = BatchMultiVector::create(this->exec); - auto res = BatchCsr::create(this->exec); - - empty->move_to(res.get()); - - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - ASSERT_FALSE(res->get_num_batch_entries()); -} - - -TYPED_TEST(BatchMultiVector, ConvertsToBatchDiagonal) -{ - using BDense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using BDiag = gko::matrix::BatchDiagonal; - auto vec = gko::batch_initialize( - {I({2.0, 3.0, -1.0}), I({1.0, -2.0, 8.0})}, this->exec); - auto diag = BDiag::create(this->exec); - - vec->convert_to(diag.get()); - - auto check_sz = gko::batch_dim<2>{2, gko::dim<2>{3}}; - ASSERT_EQ(diag->get_size(), check_sz); - auto diag_vals = diag->get_const_values(); - ASSERT_EQ(diag_vals[0], T{2.0}); - ASSERT_EQ(diag_vals[1], T{3.0}); - ASSERT_EQ(diag_vals[2], T{-1.0}); - ASSERT_EQ(diag_vals[3], T{1.0}); - ASSERT_EQ(diag_vals[4], T{-2.0}); - ASSERT_EQ(diag_vals[5], T{8.0}); -} - - -TYPED_TEST(BatchMultiVector, MovesToBatchDiagonal) -{ - using BDense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using BDiag = gko::matrix::BatchDiagonal; - auto vec = gko::batch_initialize( - {I({2.0, 3.0, -1.0}), I({1.0, -2.0, 8.0})}, this->exec); - auto vec_ptr = vec->get_const_values(); - auto diag = BDiag::create(this->exec); - - vec->move_to(diag.get()); - - auto check_sz = gko::batch_dim<2>{2, gko::dim<2>{3}}; - ASSERT_EQ(diag->get_size(), check_sz); - auto diag_vals = diag->get_const_values(); - ASSERT_EQ(diag_vals, vec_ptr); - ASSERT_NE(diag_vals, vec->get_const_values()); - ASSERT_EQ(vec->get_num_batch_entries(), 0); -} - - -TYPED_TEST(BatchMultiVector, SquareMatrixIsTransposable) -{ - using Mtx = typename TestFixture::Mtx; - auto trans = this->mtx_4->transpose(); - auto trans_as_batch_multi_vector = static_cast(trans.get()); - - auto utb = trans_as_batch_multi_vector->unbatch(); - GKO_ASSERT_MTX_NEAR(utb[0].get(), - l({{1.0, 6.0, 6.0}, {1.5, 1.0, 1.0}, {3.0, 5.0, 5.5}}), - r::value); - GKO_ASSERT_MTX_NEAR( - 
utb[1].get(), l({{2.0, 4.0, -1.25}, {-2.0, 3.0, 3.0}, {1.5, 2.2, 0.5}}), - r::value); -} - - -TYPED_TEST(BatchMultiVector, NonSquareMatrixIsTransposable) -{ - using Mtx = typename TestFixture::Mtx; - auto trans = this->mtx_5->transpose(); - auto trans_as_batch_multi_vector = static_cast(trans.get()); - - auto utb = trans_as_batch_multi_vector->unbatch(); - GKO_ASSERT_MTX_NEAR(utb[0].get(), l({{1.0, 6.0, 7.0}, {1.5, 1.0, -4.5}}), - r::value); - GKO_ASSERT_MTX_NEAR(utb[1].get(), l({{2.0, 1.0, 4.0}, {-2.0, 3.0, 3.0}}), - r::value); -} - - -TYPED_TEST(BatchMultiVector, SquareMatrixAddScaledIdentity) -{ - using T = typename TestFixture::value_type; - using Mtx = typename TestFixture::Mtx; - auto mtx = gko::batch_initialize( - {{I({1.0, -1.0, 1.5}), I({-2.0, 0.0, 3.0}), - I({1.2, -0.5, 1.0})}, - {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}, {3.0, 0.0, -1.5}}}, - this->exec); - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto beta = gko::batch_initialize({{3.0}, {-1.0}}, this->exec); - auto sol_mtx = gko::batch_initialize( - {{I({5.0, -3.0, 4.5}), I({-6.0, 2.0, 9.0}), - I({3.6, -1.5, 5.0})}, - {{-3.0, 2.0, 0.5}, {-1.0, 0.5, -4.0}, {-3.0, 0.0, -0.5}}}, - this->exec); - - mtx->add_scaled_identity(alpha.get(), beta.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(mtx, sol_mtx, r::value); -} - - } // namespace From 80cbcbd4ab207475d475783f8e5f2e8734af2bfa Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 14 Jul 2023 13:55:42 +0200 Subject: [PATCH 117/583] Core and interface updates --- core/base/batch_multi_vector.cpp | 50 ++++++++++++------- .../ginkgo/core/base/batch_multi_vector.hpp | 27 ++++++---- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 3a3f0aff757..0a3612ab205 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -49,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { -namespace matrix { namespace batch_multi_vector { @@ -67,12 +67,14 @@ template void BatchMultiVector::scale_impl( const BatchMultiVector* alpha) { - GKO_ASSERT_BATCH_EQUAL_ROWS( - alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + GKO_ASSERT_EQ(alpha->get_num_batch_entries(), + this->get_num_batch_entries()); + GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { if (alpha->get_common_size()[1] != 1) { // different alpha for each column - GKO_ASSERT_BATCH_EQUAL_COLS(this, alpha); + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); } } this->get_executor()->run(batch_multi_vector::make_scale(alpha, this)); @@ -84,15 +86,18 @@ void BatchMultiVector::add_scaled_impl( const BatchMultiVector* alpha, const BatchMultiVector* b) { - GKO_ASSERT_BATCH_EQUAL_ROWS( - alpha, batch_dim<2>(this->get_num_batch_entries(), dim<2>(1, 1))); + GKO_ASSERT_EQ(alpha->get_num_batch_entries(), + this->get_num_batch_entries()); + GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { if (alpha->get_common_size()[1] != 1) { // different alpha for each column - GKO_ASSERT_BATCH_EQUAL_COLS(this, alpha); + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); } } - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, b); + GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); this->get_executor()->run( batch_multi_vector::make_add_scaled(alpha, b, this)); @@ -101,7 +106,8 @@ void BatchMultiVector::add_scaled_impl( inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) { - return batch_dim<2>(sizes.get_num_batch_entries(), dim<2>(1, sizes[1])); + return batch_dim<2>(sizes.get_num_batch_entries(), + dim<2>(1, sizes.get_common_size()[1])); } @@ -110,9 +116,13 @@ void BatchMultiVector::compute_dot_impl( const BatchMultiVector* b, BatchMultiVector* result) const { - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(this, b); - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(batch_result, - get_col_sizes(this->get_size())); + GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_entries(), + result->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); this->get_executor()->run( batch_multi_vector::make_compute_dot(this, b, result)); } @@ -122,7 +132,11 @@ template void BatchMultiVector::compute_norm2_impl( BatchMultiVector>* result) const { - GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(result, get_col_sizes(this->get_size())); + GKO_ASSERT_EQ(this->get_num_batch_entries(), + result->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); this->get_executor()->run(batch_multi_vector::make_compute_norm2( as>(this), result)); } @@ -152,8 +166,8 @@ inline void read_impl(MatrixType* mtx, const std::vector& data) auto batch_size = batch_dim<2>(data.size(), common_size); size_type ind = 0; for (const auto& b : data) { - b_size = b.size; - GKO_ASSERT_EQ(common_size, b_size); + auto b_size = b.size; + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, b_size); } auto tmp = MatrixType::create(mtx->get_executor()->get_master(), batch_size); @@ -194,7 +208,8 @@ void 
BatchMultiVector::read(const std::vector& data) template inline void write_impl(const MatrixType* mtx, std::vector& data) { - std::unique_ptr> op{}; + std::unique_ptr> + op{}; const MatrixType* tmp{}; if (mtx->get_executor()->get_master() != mtx->get_executor()) { op = mtx->clone(mtx->get_executor()->get_master()); @@ -238,7 +253,4 @@ void BatchMultiVector::write(std::vector& data) const GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_MATRIX); -} // namespace matrix - - } // namespace gko diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 1050ec28224..d1a0c01ddb9 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -68,7 +68,8 @@ namespace gko { */ template class BatchMultiVector - : public EnableAbstractPolymorphicObject>, + : public EnablePolymorphicObject>, + public EnablePolymorphicAssignment>, public EnableCreateMethod>, public ConvertibleTo>>, public BatchReadableFromMatrixData, @@ -76,11 +77,15 @@ class BatchMultiVector public BatchWritableToMatrixData, public BatchWritableToMatrixData { friend class EnableCreateMethod; + friend class EnablePolymorphicObject; friend class BatchMultiVector>; public: using BatchReadableFromMatrixData::read; using BatchReadableFromMatrixData::read; + using EnablePolymorphicObject::EnablePolymorphicObject; + using EnablePolymorphicAssignment::convert_to; + using EnablePolymorphicAssignment::move_to; using value_type = ValueType; using index_type = int32; @@ -354,7 +359,8 @@ class BatchMultiVector * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm2(BatchMultiVector* result) const + void compute_norm2( + BatchMultiVector>* result) const { auto exec = this->get_executor(); this->compute_norm2_impl(make_temporary_clone(exec, result).get()); @@ -395,10 +401,10 @@ class BatchMultiVector BatchMultiVector& operator=(BatchMultiVector&& other) { if (this != &other) { - EnableAbstractPolymorphicObject::operator=( + EnablePolymorphicObject::operator=( std::move(other)); this->set_size(other.get_size()); - other.set_size({}); + other.set_size(batch_dim<2>{}); } return *this; } @@ -414,7 +420,7 @@ class BatchMultiVector * input, which will have size 0x0 and unchanged executor afterwards. 
*/ BatchMultiVector(BatchMultiVector&& other) - : EnableAbstractPolymorphicObject(std::move(other)), + : EnablePolymorphicObject(std::move(other)), batch_size_{std::exchange(other.batch_size_, batch_dim<2>{})} {} @@ -424,7 +430,7 @@ class BatchMultiVector { auto common_size = matrices[0]->get_size(); for (int i = 1; i < matrices.size(); ++i) { - GKO_ASSERT_EQ(common_size, matrices[i]->get_size()); + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); } return batch_dim<2>{matrices.size(), common_size}; } @@ -450,7 +456,7 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size = batch_dim<2>{}) - : EnableAbstractPolymorphicObject(exec), + : EnablePolymorphicObject(exec), batch_size_(size), values_(exec, compute_num_elems(size)) {} @@ -472,7 +478,7 @@ class BatchMultiVector template BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size, ValuesArray&& values) - : EnableAbstractPolymorphicObject(exec), + : EnablePolymorphicObject(exec), batch_size_(size), values_{exec, std::forward(values)} { @@ -489,7 +495,7 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const std::vector*>& matrices) - : EnableAbstractPolymorphicObject(exec), + : EnablePolymorphicObject(exec), batch_size_{compute_batch_size(matrices)}, values_(exec, compute_num_elems(batch_size_)) { @@ -594,7 +600,8 @@ class BatchMultiVector * @note Other implementations of batch_multi_vector should override this * function instead of compute_norm2(BatchMultiVector *result). */ - virtual void compute_norm2_impl(BatchMultiVector* result) const; + virtual void compute_norm2_impl( + BatchMultiVector>* result) const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept From 3fdf20b094645c02ce987560129ad6869c9a2a2d Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 14 Jul 2023 16:18:42 +0200 Subject: [PATCH 118/583] Core test fixes --- core/test/base/batch_multi_vector.cpp | 173 ++++-------------- .../ginkgo/core/base/batch_multi_vector.hpp | 4 +- test/base/batch_multi_vector_kernels.cpp | 154 +--------------- 3 files changed, 49 insertions(+), 282 deletions(-) diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index e43be1e7b86..60f5fc071ec 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -53,7 +53,6 @@ class BatchMultiVector : public ::testing::Test { BatchMultiVector() : exec(gko::ReferenceExecutor::create()), mtx(gko::batch_initialize>( - std::vector{4, 3}, {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec)) @@ -64,13 +63,7 @@ class BatchMultiVector : public ::testing::Test { gko::BatchMultiVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_stride().at(1), 3); - ASSERT_EQ(m->get_num_stored_elements(), (2 * 4) + (2 * 3)); - ASSERT_EQ(m->get_num_stored_elements(0), 2 * 4); - ASSERT_EQ(m->get_num_stored_elements(1), 2 * 3); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); @@ -88,7 +81,7 @@ class BatchMultiVector : public ::testing::Test { static void assert_empty(gko::BatchMultiVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 0); - ASSERT_EQ(m->get_num_stored_elements(), 0); + 
ASSERT_EQ(m->get_common_size(), {}); } std::shared_ptr exec; @@ -116,30 +109,10 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) { using size_type = gko::size_type; auto m = gko::BatchMultiVector::create( - this->exec, - std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4))); ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 4)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 3)); - EXPECT_EQ(m->get_stride().at(0), 4); - EXPECT_EQ(m->get_stride().at(1), 3); - ASSERT_EQ(m->get_num_stored_elements(), 14); - ASSERT_EQ(m->get_num_stored_elements(0), 8); - ASSERT_EQ(m->get_num_stored_elements(1), 6); -} - - -TYPED_TEST(BatchMultiVector, CanBeConstructedWithSizeAndStride) -{ - using size_type = gko::size_type; - auto m = gko::BatchMultiVector::create( - this->exec, std::vector>{gko::dim<2>{2, 3}}, - std::vector{4}); - - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - EXPECT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_num_stored_elements(), 8); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 4)); } @@ -156,16 +129,14 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) // clang-format on auto m = gko::BatchMultiVector::create( - this->exec, - std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, - gko::array::view(this->exec, 12, data), - std::vector{3, 3}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::view(this->exec, 4, data)); ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{1.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{2.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } @@ -184,14 +155,13 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) auto m = gko::BatchMultiVector::create_const( this->exec, std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, - gko::array::const_view(this->exec, 12, data), - std::vector{3, 3}); + gko::array::const_view(this->exec, 4, data)); ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{1.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{2.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } @@ -200,8 +170,8 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 3, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); @@ -221,8 +191,8 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatricesByDuplication) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); auto mat2 = gko::initialize({{1.0, 
2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); @@ -240,8 +210,8 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); @@ -257,8 +227,8 @@ TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); @@ -283,9 +253,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructed) {{1.0, 2.0}, {1.0, 3.0}}, this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0), value_type{1}); EXPECT_EQ(m->at(0, 1), value_type{2}); EXPECT_EQ(m->at(1, 0), value_type{1}); @@ -293,28 +261,13 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructed) } -TYPED_TEST(BatchMultiVector, CanBeListConstructedWithstride) -{ - using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( - std::vector{2}, {{1.0, 2.0}}, this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 1); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); - EXPECT_EQ(m->at(0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1), value_type{2.0}); -} - - TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); @@ -328,46 +281,10 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) using T = value_type; auto m = gko::batch_initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, - {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, - this->exec); - - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); - ASSERT_EQ(m->get_stride().at(0), 3); - ASSERT_EQ(m->get_stride().at(1), 2); - EXPECT_EQ(m->get_num_stored_elements(), 15); - ASSERT_EQ(m->get_num_stored_elements(0), 9); - ASSERT_EQ(m->get_num_stored_elements(1), 6); - EXPECT_EQ(m->at(0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1), value_type{1.0}); - EXPECT_EQ(m->at(0, 2), value_type{0.0}); - ASSERT_EQ(m->at(0, 3), value_type{2.0}); - EXPECT_EQ(m->at(0, 4), value_type{4.0}); - EXPECT_EQ(m->at(1, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 2), value_type{3.0}); - ASSERT_EQ(m->at(1, 3), value_type{4.0}); - EXPECT_EQ(m->at(1, 4), value_type{5.0}); -} - - 
-TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructedWithstride) -{ - using value_type = typename TestFixture::value_type; - using T = value_type; - auto m = gko::batch_initialize>( - {4, 3}, - {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, - {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, this->exec); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); - ASSERT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_stride().at(1), 3); - EXPECT_EQ(m->get_num_stored_elements(), 21); - ASSERT_EQ(m->get_num_stored_elements(0), 12); - ASSERT_EQ(m->get_num_stored_elements(1), 9); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3)); EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{1.0}); EXPECT_EQ(m->at(0, 2), value_type{0.0}); @@ -375,9 +292,9 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructedWithstride) EXPECT_EQ(m->at(0, 4), value_type{4.0}); EXPECT_EQ(m->at(1, 0), value_type{1.0}); EXPECT_EQ(m->at(1, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 2), value_type{3.0}); - ASSERT_EQ(m->at(1, 3), value_type{4.0}); - EXPECT_EQ(m->at(1, 4), value_type{5.0}); + EXPECT_EQ(m->at(1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 3), value_type{3.0}); + EXPECT_EQ(m->at(1, 4), value_type{4.0}); } @@ -420,13 +337,11 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) using value_type = typename TestFixture::value_type; auto m = gko::BatchMultiVector::create(this->exec); // clang-format off - m->read({gko::matrix_data{{2, 3}, + m->read({gko::matrix_data{{2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, - {0, 2, 2.0}, {1, 0, 0.0}, - {1, 1, 5.0}, - {1, 2, 0.0}}}, + {1, 1, 5.0}}}, gko::matrix_data{{2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, @@ -434,17 +349,11 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) {1, 1, 9.0}}}}); // clang-format on - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 2)); - ASSERT_EQ(m->get_num_stored_elements(), 10); - ASSERT_EQ(m->get_num_stored_elements(0), 6); - ASSERT_EQ(m->get_num_stored_elements(1), 4); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); - EXPECT_EQ(m->at(0, 1, 2), value_type{0.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); @@ -483,31 +392,27 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixAssemblyData) { using value_type = typename TestFixture::value_type; auto m = gko::BatchMultiVector::create(this->exec); - gko::matrix_assembly_data data1(gko::dim<2>{2, 3}); + gko::matrix_assembly_data data1(gko::dim<2>{2, 2}); data1.set_value(0, 0, 1.0); data1.set_value(0, 1, 3.0); - data1.set_value(0, 2, 2.0); data1.set_value(1, 0, 0.0); data1.set_value(1, 1, 5.0); - data1.set_value(1, 2, 0.0); gko::matrix_assembly_data data2(gko::dim<2>{2, 1}); data2.set_value(0, 0, 2.0); + data2.set_value(0, 1, 1.0); data2.set_value(1, 0, 5.0); + data2.set_value(1, 1, 4.0); auto data = std::vector>{data1, data2}; m->read(data); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 8); - ASSERT_EQ(m->get_num_stored_elements(0), 6); - ASSERT_EQ(m->get_num_stored_elements(1), 2); + 
ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{0.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{1.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{4.0}); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index d1a0c01ddb9..0b8cb8b375e 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -705,14 +705,14 @@ std::unique_ptr batch_initialize( size_type num_batch_entries = vals.size(); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin->size(); - size_type common_num_cols = begin(vals_begin)->size(); + size_type common_num_cols = vals_begin->begin()->size(); auto common_size = dim<2>(common_num_rows, common_num_cols); size_type ind = 0; for (const auto& b : vals) { auto num_rows = b.size(); auto num_cols = begin(b)->size(); auto b_size = dim<2>(num_rows, num_cols); - GKO_ASSERT_EQ(b_size, common_size); + GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); } auto b_size = batch_dim<2>(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index e16607db844..8cff141e0a0 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -49,9 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "test/utils/executor.hpp" -#ifndef GKO_COMPILING_DPCPP - - class BatchMultiVector : public CommonTestFixture { protected: using vtype = double; @@ -144,28 +141,6 @@ class BatchMultiVector : public CommonTestFixture { }; -TEST_F(BatchMultiVector, SingleVectorAppyIsEquivalentToRef) -{ - set_up_apply_data(1); - - x->apply(y.get(), expected.get()); - dx->apply(dy.get(), dresult.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(BatchMultiVector, SingleVectorAdvancedAppyIsEquivalentToRef) -{ - set_up_apply_data(1); - - x->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); -} - - TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(1); @@ -177,17 +152,6 @@ TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchMultiVector, SingleVectorAddScaleIsEquivalentToRef) -{ - set_up_vector_data(1); - - x->add_scale(alpha.get(), y.get(), beta.get()); - dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); -} - - TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(20); @@ -199,18 +163,8 @@ TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchMultiVector, MultipleVectorAddScaleIsEquivalentToRef) -{ - set_up_vector_data(20); - - x->add_scale(alpha.get(), y.get(), beta.get()); - dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); -} - - -TEST_F(BatchMultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) +TEST_F(BatchMultiVector, + MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -221,17 +175,6 @@ TEST_F(BatchMultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentTo } -TEST_F(BatchMultiVector, MultipleVectorAddScaleWithDifferentScalarsIsEquivalentToRef) -{ - set_up_vector_data(20, true); - - x->add_scale(alpha.get(), y.get(), beta.get()); - dx->add_scale(dalpha.get(), dy.get(), dbeta.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); -} - - TEST_F(BatchMultiVector, SingleVectorScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -329,9 +272,10 @@ TEST_F(BatchMultiVector, CopySingleIsEquivalentToRef) { set_up_vector_data(1); - gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), + y.get()); gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } @@ -341,92 +285,10 @@ TEST_F(BatchMultiVector, CopyIsEquivalentToRef) { set_up_vector_data(20); - gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); + gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), + y.get()); gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } - - -TEST_F(BatchMultiVector, BatchScaleIsEquivalentToRef) -{ - using BDiag = gko::matrix::BatchDiagonal; - const int num_rhs = 20; - set_up_vector_data(num_rhs); - - const int num_rows_in_mat = x->get_size().at(0)[0]; - const auto left = - gen_mtx(batch_size, num_rows_in_mat, num_rows_in_mat); - const auto rght = gen_mtx(batch_size, num_rhs, num_rhs); - auto dleft = BDiag::create(this->exec); - dleft->copy_from(left.get()); - auto drght = 
BDiag::create(this->exec); - drght->copy_from(rght.get()); - - gko::kernels::reference::batch_multi_vector::batch_scale(this->ref, left.get(), - rght.get(), x.get()); - gko::kernels::EXEC_NAMESPACE::batch_multi_vector::batch_scale( - this->exec, dleft.get(), drght.get(), dx.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); -} - - -TEST_F(BatchMultiVector, TransposeIsEquivalentToRef) -{ - const int nrows = 11; - const int ncols = 6; - const size_t nbatch = 5; - const auto orig = gen_mtx(nbatch, nrows, ncols); - auto corig = Mtx::create(exec); - corig->copy_from(orig.get()); - - auto trans = orig->transpose(); - auto ctrans = corig->transpose(); - - auto dtrans = static_cast(trans.get()); - auto dctrans = static_cast(ctrans.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dtrans, dctrans, 0.0); -} - - -TEST_F(BatchMultiVector, ConjugateTransposeIsEquivalentToRef) -{ - const int nrows = 11; - const int ncols = 6; - const size_t nbatch = 5; - const auto orig = gen_mtx(nbatch, nrows, ncols); - auto corig = Mtx::create(exec); - corig->copy_from(orig.get()); - - auto trans = orig->conj_transpose(); - auto ctrans = corig->conj_transpose(); - - auto dtrans = static_cast(trans.get()); - auto dctrans = static_cast(ctrans.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dtrans, dctrans, 0.0); -} - - -TEST_F(BatchMultiVector, AddScaledIdentityNonSquareIsEquivalentToReference) -{ - set_up_apply_data(); - const gko::size_type batchsize = 10; - const gko::size_type num_rows = 62; - const gko::size_type num_cols = 51; - auto rmtx = gko::test::generate_uniform_batch_random_matrix( - batchsize, num_rows, num_cols, - std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(-1.0, 1.0), rand_engine, true, ref); - auto dmtx = Mtx::create(exec); - dmtx->copy_from(rmtx.get()); - - rmtx->add_scaled_identity(alpha.get(), beta.get()); - dmtx->add_scaled_identity(dalpha.get(), dbeta.get()); - - GKO_ASSERT_BATCH_MTX_NEAR(rmtx, dmtx, 1e-15) -} - - -#endif From f99b1f3b92654b8f1e46d3a5ac08cc889832b144 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 19 Jul 2023 17:22:25 +0200 Subject: [PATCH 119/583] Test fixes and dpcpp additions --- core/test/base/CMakeLists.txt | 1 + core/test/base/batch_dim.cpp | 10 + core/test/base/batch_multi_vector.cpp | 134 ++++++------ core/test/utils/assertions.hpp | 201 ++++++++++++++++++ core/test/utils/batch_helpers.hpp | 144 +++++++++++++ dpcpp/base/batch_struct.hpp | 114 ++++++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 33 +-- test/base/batch_multi_vector_kernels.cpp | 2 +- 8 files changed, 562 insertions(+), 77 deletions(-) create mode 100644 core/test/utils/batch_helpers.hpp create mode 100644 dpcpp/base/batch_struct.hpp diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index f51862e8244..36bad656b07 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_test(abstract_factory) ginkgo_create_test(allocator) ginkgo_create_test(array) +ginkgo_create_test(batch_dim) ginkgo_create_test(batch_multi_vector) ginkgo_create_test(dense_cache) ginkgo_create_test(combination) diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp index f4361195d7c..40743656ca3 100644 --- a/core/test/base/batch_dim.cpp +++ b/core/test/base/batch_dim.cpp @@ -85,6 +85,16 @@ TEST(BatchDim, NotEqualWorks) } +TEST(BatchDim, CanGetCumulativeOffsets) +{ + auto d = gko::batch_dim<2>(3, gko::dim<2>(4, 2)); + + ASSERT_EQ(d.get_cumulative_offset(0), 0); + ASSERT_EQ(d.get_cumulative_offset(1), 8); + 
ASSERT_EQ(d.get_cumulative_offset(2), 16); +} + + TEST(BatchDim, TransposesBatchDimensions) { ASSERT_EQ(gko::transpose(gko::batch_dim<2>(2, gko::dim<2>{4, 2})), diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 60f5fc071ec..225c6b799ac 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -62,6 +62,7 @@ class BatchMultiVector : public ::testing::Test { static void assert_equal_to_original_mtx( gko::BatchMultiVector* m) { + EXPECT_EQ(m->get_values()[0], value_type{-1.0}); ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); @@ -81,7 +82,7 @@ class BatchMultiVector : public ::testing::Test { static void assert_empty(gko::BatchMultiVector* m) { ASSERT_EQ(m->get_num_batch_entries(), 0); - ASSERT_EQ(m->get_common_size(), {}); + ASSERT_EQ(m->get_common_size(), gko::dim<2>{}); } std::shared_ptr exec; @@ -105,6 +106,46 @@ TYPED_TEST(BatchMultiVector, ReturnsNullValuesArrayWhenEmpty) } +TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} + + +TYPED_TEST(BatchMultiVector, CanBeCopied) +{ + auto mtx_copy = gko::BatchMultiVector::create(this->exec); + mtx_copy->copy_from(this->mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->at(0, 0, 0) = 7; + this->mtx->at(0, 1) = 7; + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchMultiVector, CanBeMoved) +{ + auto mtx_copy = gko::BatchMultiVector::create(this->exec); + this->mtx->move_to(mtx_copy.get()); + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchMultiVector, CanBeCloned) +{ + auto mtx_clone = this->mtx->clone(); + this->assert_equal_to_original_mtx( + dynamic_castmtx.get())>(mtx_clone.get())); +} + + +TYPED_TEST(BatchMultiVector, CanBeCleared) +{ + this->mtx->clear(); + this->assert_empty(this->mtx.get()); +} + + TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) { using size_type = gko::size_type; @@ -130,13 +171,17 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) auto m = gko::BatchMultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), - gko::array::view(this->exec, 4, data)); + gko::array::view(this->exec, 8, data)); ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 1), value_type{1.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{2.0}); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); } @@ -153,15 +198,18 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) // clang-format on auto m = gko::BatchMultiVector::create_const( - this->exec, - std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, - gko::array::const_view(this->exec, 4, data)); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::const_view(this->exec, 8, data)); ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 1), value_type{1.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{2.0}); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + 
ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); } @@ -222,30 +270,6 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) } -TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) -{ - using value_type = typename TestFixture::value_type; - using DenseMtx = typename TestFixture::DenseMtx; - using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, - this->exec); - auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, - this->exec); - - auto dense_mats = this->mtx->unbatch(); - - - GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); - GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); -} - - -TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) -{ - this->assert_equal_to_original_mtx(this->mtx.get()); -} - - TYPED_TEST(BatchMultiVector, CanBeListConstructed) { using value_type = typename TestFixture::value_type; @@ -266,6 +290,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) using value_type = typename TestFixture::value_type; auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); + ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); @@ -298,37 +323,20 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) } -TYPED_TEST(BatchMultiVector, CanBeCopied) -{ - auto mtx_copy = gko::BatchMultiVector::create(this->exec); - mtx_copy->copy_from(this->mtx.get()); - this->assert_equal_to_original_mtx(this->mtx.get()); - this->mtx->at(0, 0, 0) = 7; - this->mtx->at(0, 1) = 7; - this->assert_equal_to_original_mtx(mtx_copy.get()); -} - - -TYPED_TEST(BatchMultiVector, CanBeMoved) -{ - auto mtx_copy = gko::BatchMultiVector::create(this->exec); - mtx_copy->copy_from(std::move(this->mtx)); - this->assert_equal_to_original_mtx(mtx_copy.get()); -} - - -TYPED_TEST(BatchMultiVector, CanBeCloned) +TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) { - auto mtx_clone = this->mtx->clone(); - this->assert_equal_to_original_mtx( - dynamic_castmtx.get())>(mtx_clone.get())); -} + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + auto dense_mats = this->mtx->unbatch(); -TYPED_TEST(BatchMultiVector, CanBeCleared) -{ - this->mtx->clear(); - this->assert_empty(this->mtx.get()); + GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); + GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); } diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index e0ec27b8624..8e825a32d4f 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include @@ -315,6 +316,89 @@ double get_relative_error(const MatrixData1& first, const MatrixData2& second) } +template +::testing::AssertionResult batch_matrices_near_impl( + const std::string& first_expression, const std::string& second_expression, + const std::string& tolerance_expression, const MatrixData1& first, + const MatrixData2& second, double tolerance) +{ + std::vector err; + std::vector err_flag; + for (size_type b = 0; b < first.size(); ++b) { + auto num_rows = first[b].size[0]; + auto num_cols = first[b].size[1]; + if (num_rows != second[b].size[0] || num_cols != second[b].size[1]) { + return ::testing::AssertionFailure() + << "Expected matrices of equal size\n\t" << first_expression + << " is of size [" << num_rows << " x " << num_cols + << "]\n\t" << second_expression << " is of size [" + << second[b].size[0] << " x " << second[b].size[1] << "]" + << " for batch " << b; + } + + err.push_back(detail::get_relative_error(first[b], second[b])); + err_flag.push_back(err.back() <= tolerance); + } + + auto bat = std::find_if(err.begin(), err.end(), + [&](double& e) { return !(e <= tolerance); }); + if (bat == err.end()) { + return ::testing::AssertionSuccess(); + } else { + const auto b_pos = static_cast(bat - err.begin()); + auto num_rows = first[b_pos].size[0]; + auto num_cols = first[b_pos].size[1]; + auto fail = ::testing::AssertionFailure(); + fail << "Error for batch: " << b_pos << "\n Relative error between " + << first_expression << " and " << second_expression << " is " + << err[b_pos] << "\n" + << "\twhich is larger than " << tolerance_expression + << " (which is " << tolerance << ")\n"; + if (num_rows * num_cols <= 1000) { + fail << first_expression << " is:\n"; + detail::print_matrix(fail, first[b_pos]); + fail << second_expression << " is:\n"; + detail::print_matrix(fail, second[b_pos]); + fail << "component-wise relative error is:\n"; + detail::print_componentwise_error(fail, first[b_pos], + second[b_pos]); + } else { + // build output filenames + auto test_case_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + auto testname = + test_case_info ? std::string{test_case_info->test_case_name()} + + "." + test_case_info->name() + : std::string{"null"}; + auto firstfile = testname + "." + first_expression + ".mtx"; + auto secondfile = testname + "." + second_expression + ".mtx"; + auto to_remove = [](char c) { + return !std::isalnum(c) && c != '_' && c != '.' && c != '-' && + c != '<' && c != '>'; + }; + // remove all but alphanumerical and _.-<> characters from + // expressions + firstfile.erase( + std::remove_if(firstfile.begin(), firstfile.end(), to_remove), + firstfile.end()); + secondfile.erase( + std::remove_if(secondfile.begin(), secondfile.end(), to_remove), + secondfile.end()); + // save matrices + std::ofstream first_stream{firstfile}; + gko::write_raw(first_stream, first[b_pos], + gko::layout_type::coordinate); + std::ofstream second_stream{secondfile}; + gko::write_raw(second_stream, second[b_pos], + gko::layout_type::coordinate); + fail << first_expression << " saved as " << firstfile << "\n"; + fail << second_expression << " saved as " << secondfile << "\n"; + } + return fail; + } +} + + template ::testing::AssertionResult matrices_near_impl( const std::string& first_expression, const std::string& second_expression, @@ -600,6 +684,85 @@ ::testing::AssertionResult values_near, std::complex>( } +/** + * This is a gtest predicate which checks if two batch matrices are relatively + * near. 
+ * + * More formally, it checks whether the following equation holds for each of the + * matrices in the batch: + * + * ``` + * ||first - second|| <= tolerance * max(||first||, ||second||) + * ``` + * + * This function should not be called directly, but used in conjunction with + * `ASSERT_PRED_FORMAT3` as follows: + * + * ``` + * // Check if first and second are near + * ASSERT_PRED_FORMAT3(gko::test::assertions::batch_matrices_near, + * first, second, tolerance); + * // Check if first and second are far + * ASSERT_PRED_FORMAT3(!gko::test::assertions::batch_matrices_near, + * first, second, tolerance); + * ``` + * + * @see GKO_ASSERT_BATCH_MTX_NEAR + * @see GKO_EXPECT_BATCH_MTX_NEAR + */ +template +::testing::AssertionResult batch_matrices_near( + const std::string& first_expression, const std::string& second_expression, + const std::string& tolerance_expression, const Mat1* first, + const Mat2* second, double tolerance) +{ + auto exec = first->get_executor()->get_master(); + std::vector< + matrix_data> + first_data; + std::vector< + matrix_data> + second_data; + + first->write(first_data); + second->write(second_data); + + if (first_data.size() != second_data.size()) { + return ::testing::AssertionFailure() + << "Expected same batch sizes for " << first_expression + << " and " << second_expression << ", but got batch size " + << first_data.size() << " for " << first_expression + << " and batch size " << second_data.size() << " for " + << second_expression; + } + + for (size_type b = 0; b < first_data.size(); ++b) { + first_data[b].ensure_row_major_order(); + second_data[b].ensure_row_major_order(); + } + + return detail::batch_matrices_near_impl( + detail::remove_pointer_wrapper(first_expression), + detail::remove_pointer_wrapper(second_expression), tolerance_expression, + first_data, second_data, tolerance); +} + + +template +::testing::AssertionResult batch_matrices_near( + const std::string& first_expression, const std::string& second_expression, + const std::string& tolerance_expression, const Mat1* first, + std::initializer_list> second, double tolerance) +{ + auto second_mtx = + batch_initialize>>( + second, first->get_executor()->get_master()); + return batch_matrices_near( + first_expression, detail::remove_list_wrapper(second_expression), + tolerance_expression, first, second_mtx.get(), tolerance); +} + + /** * This is a gtest predicate which checks if two matrices are relatively near. * @@ -940,6 +1103,44 @@ T* plain_ptr(T* ptr) } +/** + * Checks if two batched matrices are near each other. + * + * More formally, it checks whether the following equation holds: + * + * ``` + * ||_mtx1 - _mtx2|| <= _tol * max(||_mtx1||, ||_mtx2||) + * ``` + * for all batches + * + * Has to be called from within a google test unit test. + * Internally calls gko::test::assertions::batch_matrices_near(). 
+ * + * @param _mtx1 first matrix + * @param _mtx2 second matrix + * @param _tol tolerance level + */ +#define GKO_ASSERT_BATCH_MTX_NEAR(_mtx1, _mtx2, _tol) \ + { \ + using ::gko::test::assertions::detail::l; \ + using ::gko::test::assertions::detail::plain_ptr; \ + ASSERT_PRED_FORMAT3(::gko::test::assertions::batch_matrices_near, \ + plain_ptr(_mtx1), plain_ptr(_mtx2), _tol); \ + } + + +/** + * @copydoc GKO_ASSERT_MTX_NEAR + */ +#define GKO_EXPECT_BATCH_MTX_NEAR(_mtx1, _mtx2, _tol) \ + { \ + using ::gko::test::assertions::detail::l; \ + using ::gko::test::assertions::detail::plain_ptr; \ + EXPECT_PRED_FORMAT3(::gko::test::assertions::batch_matrices_near, \ + plain_ptr(_mtx1), plain_ptr(_mtx2), _tol); \ + } + + /** * Checks if two matrices are near each other. * diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp new file mode 100644 index 00000000000..c00cfbeee50 --- /dev/null +++ b/core/test/utils/batch_helpers.hpp @@ -0,0 +1,144 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_ +#define GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_ + + +#include + + +#include +#include + + +#include "core/test/utils/assertions.hpp" + + +namespace gko { +namespace test { + + +/** + * Converts a vector of unique pointers to a vector of shared pointers. + */ +template +std::vector> share(std::vector>&& objs) +{ + std::vector> out; + out.reserve(objs.size()); + for (auto& obj : objs) { + out.push_back(std::move(obj)); + } + return out; +} + + +/** + * Generates a batch of random matrices of the specified type. + */ +template +std::unique_ptr generate_uniform_batch_random_matrix( + const size_type batch_size, const size_type num_rows, + const size_type num_cols, NonzeroDistribution&& nonzero_dist, + ValueDistribution&& value_dist, Engine&& engine, + const bool with_all_diagonals, std::shared_ptr exec, + MatrixArgs&&... 
args) +{ + using value_type = typename MatrixType::value_type; + using index_type = typename MatrixType::index_type; + + // generate sparsity pattern + matrix_data sdata{gko::dim<2>{num_rows, num_cols}, + {}}; + + for (size_type row = 0; row < num_rows; ++row) { + // randomly generate number of nonzeros in this row + std::vector col_idx(num_cols); + std::iota(begin(col_idx), end(col_idx), size_type(0)); + const auto nnz_row = static_cast(nonzero_dist(engine)); + size_type nnz_in_row = + std::max(size_type(0), std::min(nnz_row, num_cols)); + std::shuffle(std::begin(col_idx), std::end(col_idx), engine); + + if (with_all_diagonals) { + if (nnz_in_row == 0) { + nnz_in_row = 1; + } + bool has_diagonal = false; + for (size_type icol = 0; icol < nnz_in_row; icol++) { + if (col_idx[icol] == row) { + has_diagonal = true; + } + } + if (!has_diagonal) { + col_idx[0] = row; + } + } + + std::for_each( + std::begin(col_idx), std::begin(col_idx) + nnz_in_row, + [&](size_type col) { sdata.nonzeros.emplace_back(row, col, 1.0); }); + } + + std::vector> batchmtx; + batchmtx.reserve(batch_size); + + for (size_t ibatch = 0; ibatch < batch_size; ibatch++) { + matrix_data data = sdata; + for (size_type iz = 0; iz < data.nonzeros.size(); ++iz) { + value_type valnz = + gko::detail::get_rand_value(value_dist, engine); + if (data.nonzeros[iz].column == data.nonzeros[iz].row && + valnz == zero()) { + valnz = 1.0; + } + data.nonzeros[iz].value = valnz; + } + + data.ensure_row_major_order(); + batchmtx.emplace_back(std::move(data)); + } + + // convert to the correct matrix type + auto result = MatrixType::create(exec, std::forward(args)...); + result->read(batchmtx); + return result; +} + + +} // namespace test +} // namespace gko + + +#endif // GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_ diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp new file mode 100644 index 00000000000..86534768c2b --- /dev/null +++ b/dpcpp/base/batch_struct.hpp @@ -0,0 +1,114 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_BATCH_STRUCT_HPP_ +#define GKO_DPCPP_BASE_BATCH_STRUCT_HPP_ + + +#include "core/base/batch_struct.hpp" + + +#include +#include + + +#include "dpcpp/base/config.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the requried DPCPP scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_multi_vector::UniformBatch get_batch_struct( + const BatchMultiVector* const op) +{ + return {op->get_const_values(), op->get_num_batch_entries(), + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of dense matrices. + */ +template +inline gko::batch_multi_vector::UniformBatch get_batch_struct( + BatchMultiVector* const op) +{ + return {op->get_values(), op->get_num_batch_entries(), + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates an immutable uniform batch struct from a batch of dense matrices + * that may be null. 
+ */ +template +inline gko::batch_multi_vector::UniformBatch +maybe_null_batch_struct(const BatchMultiVector* const op) +{ + if (op) { + return {op->get_const_values(), op->get_num_batch_entries(), + op->get_common_size()[1], + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; + } else { + return {nullptr, 0, 0, 0, 0}; + } +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 0b8cb8b375e..3e2c90653dc 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -138,8 +138,7 @@ class BatchMultiVector auto exec = this->get_executor(); auto unbatch_mats = std::vector>{}; for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { - auto mat = unbatch_type::create(exec, this->get_common_size(), - this->get_common_size()[1]); + auto mat = unbatch_type::create(exec, this->get_common_size()); exec->copy_from(exec.get(), mat->get_num_stored_elements(), this->get_const_values() + this->get_size().get_cumulative_offset(b), @@ -484,7 +483,7 @@ class BatchMultiVector { // Ensure that the values array has the correct size auto num_elems = compute_num_elems(size); - GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems()); + GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); } /** @@ -669,7 +668,7 @@ std::unique_ptr batch_initialize( ++batch; } auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx.get()); + tmp->move_to(mtx); return mtx; } @@ -703,17 +702,25 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - auto vals_begin = begin(vals); - size_type common_num_rows = vals_begin->size(); - size_type common_num_cols = vals_begin->begin()->size(); - auto common_size = dim<2>(common_num_rows, common_num_cols); size_type ind = 0; + size_type num_rows = 0; + size_type num_cols = 0; + gko::dim<2> common_size{}; + size_type idx = 0; for (const auto& b : vals) { - auto num_rows = b.size(); - auto num_cols = begin(b)->size(); + num_rows = b.size(); + num_cols = begin(b)->size(); + if (idx == 0) { + common_size = dim<2>(num_rows, num_cols); + } auto b_size = dim<2>(num_rows, num_cols); GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); + ++idx; } + + size_type common_num_rows = num_rows; + size_type common_num_cols = num_cols; + common_size = dim<2>(common_num_rows, common_num_cols); auto b_size = batch_dim<2>(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; @@ -730,7 +737,7 @@ std::unique_ptr batch_initialize( ++batch; } auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx.get()); + tmp->move_to(mtx); return mtx; } @@ -777,7 +784,7 @@ std::unique_ptr batch_initialize( } } auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx.get()); + tmp->move_to(mtx); return mtx; } @@ -828,7 +835,7 @@ std::unique_ptr batch_initialize( } } auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx.get()); + tmp->move_to(mtx); return mtx; } diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 8cff141e0a0..b2f86fa0383 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -45,7 +45,7 @@ OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" -#include "core/test/utils/batch.hpp" +#include "core/test/utils/batch_helpers.hpp" #include "test/utils/executor.hpp" From 1a4d7ab6d2efb4c12d4d95189302df78d2d7bfb4 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 20 Jul 2023 11:01:20 +0200 Subject: [PATCH 120/583] Core move_to and copy fixes --- core/test/base/batch_multi_vector.cpp | 6 ++- .../ginkgo/core/base/batch_multi_vector.hpp | 50 +++---------------- test/base/CMakeLists.txt | 2 +- 3 files changed, 11 insertions(+), 47 deletions(-) diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 225c6b799ac..9d0c15a2b0d 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -60,9 +60,10 @@ class BatchMultiVector : public ::testing::Test { static void assert_equal_to_original_mtx( - gko::BatchMultiVector* m) + const gko::BatchMultiVector* m) { - EXPECT_EQ(m->get_values()[0], value_type{-1.0}); + ASSERT_NE(m->get_const_values(), nullptr); + EXPECT_EQ(m->get_const_values()[0], value_type{-1.0}); ASSERT_EQ(m->get_num_batch_entries(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); @@ -108,6 +109,7 @@ TYPED_TEST(BatchMultiVector, ReturnsNullValuesArrayWhenEmpty) TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) { + ASSERT_NE(this->mtx->get_const_values(), nullptr); this->assert_equal_to_original_mtx(this->mtx.get()); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 3e2c90653dc..567fcb25662 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -57,13 +58,10 @@ namespace gko { * of the vector in each of the batches. * * The values in each of the batches are stored in row-major format (values - * belonging to the same row appear consecutive in the memory). Optionally, rows - * can be padded for better memory access. + * belonging to the same row appear consecutive in the memory). * * @tparam ValueType precision of matrix elements * - * @note While this format is not very useful for storing sparse matrices, it - * is often suitable to store vectors, and sets of vectors. * @ingroup batch_multi_vector */ template @@ -83,9 +81,11 @@ class BatchMultiVector public: using BatchReadableFromMatrixData::read; using BatchReadableFromMatrixData::read; - using EnablePolymorphicObject::EnablePolymorphicObject; using EnablePolymorphicAssignment::convert_to; using EnablePolymorphicAssignment::move_to; + using ConvertibleTo< + BatchMultiVector>>::convert_to; + using ConvertibleTo>>::move_to; using value_type = ValueType; using index_type = int32; @@ -187,7 +187,6 @@ class BatchMultiVector value_type* get_values(size_type batch) noexcept { GKO_ASSERT(batch < this->get_num_batch_entries()); - // TODO Verify return values_.get_data() + this->get_size().get_cumulative_offset(batch); } @@ -386,43 +385,6 @@ class BatchMultiVector exec, sizes, gko::detail::array_const_cast(std::move(values))}); } - /** - * Copy-assigns a BatchMultiVector. Preserves the executor and copies the - * size. - */ - BatchMultiVector& operator=(const BatchMultiVector&) = default; - - /** - * Move-assigns a BatchMultiVector. Preserves the executor and moves the - * size. 
The moved-from object has size 0x0 afterwards, but its executor is - * unchanged. - */ - BatchMultiVector& operator=(BatchMultiVector&& other) - { - if (this != &other) { - EnablePolymorphicObject::operator=( - std::move(other)); - this->set_size(other.get_size()); - other.set_size(batch_dim<2>{}); - } - return *this; - } - - /** - * Copy-constructs a BatchMultiVector. Inherits executor and size from the - * input. - */ - BatchMultiVector(const BatchMultiVector&) = default; - - /** - * Move-constructs a BatchMultiVector. Inherits executor and size from the - * input, which will have size 0x0 and unchanged executor afterwards. - */ - BatchMultiVector(BatchMultiVector&& other) - : EnablePolymorphicObject(std::move(other)), - batch_size_{std::exchange(other.batch_size_, batch_dim<2>{})} - {} - private: inline batch_dim<2> compute_batch_size( const std::vector*>& matrices) @@ -737,7 +699,7 @@ std::unique_ptr batch_initialize( ++batch; } auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx); + mtx->copy_from(tmp.get()); return mtx; } diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index a80be354878..3c34a9068d4 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_and_reference_test(batch_multi_vector_kernels) +# ginkgo_create_common_and_reference_test(batch_multi_vector_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) From 241babacbcc9456572e617db73ddd281ad31f5ef Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 20 Jul 2023 11:23:17 +0200 Subject: [PATCH 121/583] Fix reference kernel tests --- core/test/base/batch_multi_vector.cpp | 2 +- .../ginkgo/core/base/batch_multi_vector.hpp | 22 +++----- reference/test/base/CMakeLists.txt | 1 + .../test/base/batch_multi_vector_kernels.cpp | 51 ++++--------------- 4 files changed, 19 insertions(+), 57 deletions(-) diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 9d0c15a2b0d..410ea70b4dd 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -407,7 +407,7 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixAssemblyData) data1.set_value(0, 1, 3.0); data1.set_value(1, 0, 0.0); data1.set_value(1, 1, 5.0); - gko::matrix_assembly_data data2(gko::dim<2>{2, 1}); + gko::matrix_assembly_data data2(gko::dim<2>{2, 2}); data2.set_value(0, 0, 2.0); data2.set_value(0, 1, 1.0); data2.set_value(1, 0, 5.0); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 567fcb25662..143de27335b 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -664,25 +664,17 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - size_type ind = 0; - size_type num_rows = 0; - size_type num_cols = 0; - gko::dim<2> common_size{}; - size_type idx = 0; + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin->size(); + size_type common_num_cols = vals_begin->begin()->size(); + auto common_size = dim<2>(common_num_rows, common_num_cols); for (const auto& b : vals) { - num_rows = b.size(); - num_cols = begin(b)->size(); - if (idx == 0) { - common_size = dim<2>(num_rows, num_cols); - } + auto num_rows = b.size(); + auto num_cols = begin(b)->size(); auto b_size = 
dim<2>(num_rows, num_cols); GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); - ++idx; } - size_type common_num_rows = num_rows; - size_type common_num_cols = num_cols; - common_size = dim<2>(common_num_rows, common_num_cols); auto b_size = batch_dim<2>(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; @@ -699,7 +691,7 @@ std::unique_ptr batch_initialize( ++batch; } auto mtx = Matrix::create(exec, std::forward(create_args)...); - mtx->copy_from(tmp.get()); + tmp->move_to(mtx); return mtx; } diff --git a/reference/test/base/CMakeLists.txt b/reference/test/base/CMakeLists.txt index b4d922ec187..7230b329918 100644 --- a/reference/test/base/CMakeLists.txt +++ b/reference/test/base/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(array) +ginkgo_create_test(batch_multi_vector_kernels) ginkgo_create_test(combination) ginkgo_create_test(composition) ginkgo_create_test(index_set) diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 8ed8f03dc25..c7ba4a0bcf2 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -41,9 +41,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include -#include #include #include #include @@ -54,9 +51,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" -namespace { - - template class BatchMultiVector : public ::testing::Test { protected: @@ -77,26 +71,22 @@ class BatchMultiVector : public ::testing::Test { mtx_01(gko::initialize( {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), mtx_1( - gko::batch_initialize(std::vector{4, 4}, - {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, + gko::batch_initialize({{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec)), mtx_10(gko::initialize( {I({1.0, -1.0, 2.2}), I({-2.0, 2.0, -0.5})}, exec)), - mtx_11(gko::initialize( - 4, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)), + mtx_11(gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + exec)), mtx_2(gko::batch_initialize( - std::vector{2, 2}, {{{1.0, 1.5}, {6.0, 1.0}, {-0.25, 1.0}}, {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}}, exec)), mtx_20(gko::initialize( - 4, {I({1.0, 1.5}), I({6.0, 1.0}), I({-0.25, 1.0})}, - exec)), + {I({1.0, 1.5}), I({6.0, 1.0}), I({-0.25, 1.0})}, exec)), mtx_21(gko::initialize( {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}, exec)), mtx_3(gko::batch_initialize( - std::vector{4, 4}, {{I({1.0, 1.5}), I({6.0, 1.0})}, {{2.0, -2.0}, {1.0, 3.0}}}, exec)), mtx_30(gko::initialize({I({1.0, 1.5}), I({6.0, 1.0})}, @@ -146,7 +136,6 @@ TYPED_TEST(BatchMultiVector, ScalesData) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize( - std::vector{3, 3}, {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); auto ualpha = alpha->unbatch(); @@ -238,27 +227,14 @@ TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; - auto alpha = - gko::batch_initialize({{2.0, 3.0, 4.0, 5.0}, {-2.0}}, this->exec); + auto alpha = gko::batch_initialize( + {{2.0, 3.0, 4.0, 5.0}, {-2.0, 2.0, 4.0, 5.0}}, this->exec); ASSERT_THROW(this->mtx_1->add_scaled(alpha.get(), this->mtx_2.get()), gko::DimensionMismatch); } -TYPED_TEST(BatchMultiVector, AddScaleFailsOnWrongScalarSizes) -{ - using Mtx = typename TestFixture::Mtx; - 
auto alpha = gko::batch_initialize( - {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto beta = gko::batch_initialize({{3.0}, {1.5}}, this->exec); - - ASSERT_THROW( - this->mtx_1->add_scale(alpha.get(), this->mtx_0.get(), beta.get()), - gko::DimensionMismatch); -} - - TYPED_TEST(BatchMultiVector, ComputesDot) { using Mtx = typename TestFixture::Mtx; @@ -282,8 +258,7 @@ TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = - Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ - gko::dim<2>{1, 2}, gko::dim<2>{1, 3}})); + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); ASSERT_THROW(this->mtx_1->compute_dot(this->mtx_2.get(), result.get()), gko::DimensionMismatch); @@ -294,8 +269,7 @@ TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; auto result = - Mtx::create(this->exec, gko::batch_dim<2>(std::vector>{ - gko::dim<2>{1, 2}, gko::dim<2>{1, 2}})); + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); auto result2 = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); @@ -316,10 +290,8 @@ TYPED_TEST(BatchMultiVector, ComputesNorm2) {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, this->exec)); - auto batch_size = gko::batch_dim<2>( - std::vector>{gko::dim<2>{1, 2}, gko::dim<2>{1, 2}}); - auto result = - NormVector::create(this->exec, batch_size, gko::batch_stride(2, 2)); + auto batch_size = gko::batch_dim<2>(2, gko::dim<2>{1, 2}); + auto result = NormVector::create(this->exec, batch_size); mtx->compute_norm2(result.get()); @@ -413,6 +385,3 @@ TYPED_TEST(BatchMultiVector, MovesEmptyToPrecision) ASSERT_FALSE(res->get_num_batch_entries()); } - - -} // namespace From bc7a5edb5e65cafd6cb608f7828fc13276a70ed1 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 20 Jul 2023 11:48:15 +0200 Subject: [PATCH 122/583] Fix CUDA/HIP/DPCPP tests. 
Co-authored-by: Aditya Kashi --- core/test/utils/batch_helpers.hpp | 35 ++++++++++++------------ test/base/CMakeLists.txt | 2 +- test/base/batch_multi_vector_kernels.cpp | 20 +++++++------- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index c00cfbeee50..3b9e673922e 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -79,8 +79,8 @@ std::unique_ptr generate_uniform_batch_random_matrix( using index_type = typename MatrixType::index_type; // generate sparsity pattern - matrix_data sdata{gko::dim<2>{num_rows, num_cols}, - {}}; + matrix_data in_data{gko::dim<2>{num_rows, num_cols}, + {}}; for (size_type row = 0; row < num_rows; ++row) { // randomly generate number of nonzeros in this row @@ -106,33 +106,34 @@ std::unique_ptr generate_uniform_batch_random_matrix( } } - std::for_each( - std::begin(col_idx), std::begin(col_idx) + nnz_in_row, - [&](size_type col) { sdata.nonzeros.emplace_back(row, col, 1.0); }); + std::for_each(std::begin(col_idx), std::begin(col_idx) + nnz_in_row, + [&](size_type col) { + in_data.nonzeros.emplace_back(row, col, 1.0); + }); } - std::vector> batchmtx; - batchmtx.reserve(batch_size); + std::vector> batch_mtx; + batch_mtx.reserve(batch_size); - for (size_t ibatch = 0; ibatch < batch_size; ibatch++) { - matrix_data data = sdata; - for (size_type iz = 0; iz < data.nonzeros.size(); ++iz) { - value_type valnz = + for (int batch = 0; batch < batch_size; batch++) { + matrix_data data = in_data; + for (size_type nnz = 0; nnz < data.nonzeros.size(); ++nnz) { + value_type val = gko::detail::get_rand_value(value_dist, engine); - if (data.nonzeros[iz].column == data.nonzeros[iz].row && - valnz == zero()) { - valnz = 1.0; + if (data.nonzeros[nnz].column == data.nonzeros[nnz].row && + val == zero()) { + val = 1.0; } - data.nonzeros[iz].value = valnz; + data.nonzeros[nnz].value = val; } data.ensure_row_major_order(); - batchmtx.emplace_back(std::move(data)); + batch_mtx.emplace_back(std::move(data)); } // convert to the correct matrix type auto result = MatrixType::create(exec, std::forward(args)...); - result->read(batchmtx); + result->read(batch_mtx); return result; } diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index 3c34a9068d4..d0567f45403 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -1,4 +1,4 @@ -# ginkgo_create_common_and_reference_test(batch_multi_vector_kernels) +ginkgo_create_common_test(batch_multi_vector_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index b2f86fa0383..07bdf5899e9 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -59,11 +59,11 @@ class BatchMultiVector : public CommonTestFixture { BatchMultiVector() : rand_engine(15) {} template - std::unique_ptr gen_mtx(const size_t batchsize, int num_rows, + std::unique_ptr gen_mtx(const size_t batch_size, int num_rows, int num_cols) { return gko::test::generate_uniform_batch_random_matrix( - batchsize, num_rows, num_cols, + batch_size, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, false, ref); } @@ -89,9 +89,9 @@ class BatchMultiVector : public CommonTestFixture { 
dalpha->copy_from(alpha.get()); dbeta = gko::clone(exec, beta.get()); expected = Mtx::create( - ref, gko::batch_dim<>(batch_size, gko::dim<2>{1, num_vecs})); + ref, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs})); dresult = Mtx::create( - exec, gko::batch_dim<>(batch_size, gko::dim<2>{1, num_vecs})); + exec, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs})); } void set_up_apply_data(const int p = 1) @@ -103,8 +103,8 @@ class BatchMultiVector : public CommonTestFixture { expected = gen_mtx(batch_size, m, p); alpha = gko::batch_initialize(batch_size, {2.0}, ref); beta = gko::batch_initialize(batch_size, {-1.0}, ref); - square = gen_mtx(batch_size, x->get_size().at()[0], - x->get_size().at()[0]); + square = gen_mtx(batch_size, x->get_common_size()[0], + x->get_common_size()[0]); dx = Mtx::create(exec); dx->copy_from(x.get()); dc_x = ComplexMtx::create(exec); @@ -212,7 +212,7 @@ TEST_F(BatchMultiVector, ComputeNorm2SingleIsEquivalentToRef) { set_up_vector_data(1); auto norm_size = - gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto norm_expected = NormVector::create(this->ref, norm_size); auto dnorm = NormVector::create(this->exec, norm_size); @@ -227,7 +227,7 @@ TEST_F(BatchMultiVector, ComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); auto norm_size = - gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto norm_expected = NormVector::create(this->ref, norm_size); auto dnorm = NormVector::create(this->exec, norm_size); @@ -242,7 +242,7 @@ TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) { set_up_vector_data(20); auto dot_size = - gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto dot_expected = Mtx::create(this->ref, dot_size); auto ddot = Mtx::create(this->exec, dot_size); @@ -257,7 +257,7 @@ TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) { set_up_vector_data(1); auto dot_size = - gko::batch_dim<>(batch_size, gko::dim<2>{1, x->get_size().at()[1]}); + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto dot_expected = Mtx::create(this->ref, dot_size); auto ddot = Mtx::create(this->exec, dot_size); From b28df4f23e12b613593ed1ec9ee4722c58ac2ba7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 20 Jul 2023 15:07:18 +0200 Subject: [PATCH 123/583] Use ptr_param<> --- .../ginkgo/core/base/batch_multi_vector.hpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 143de27335b..47dbe6078f5 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -104,7 +104,7 @@ class BatchMultiVector * @param other The other matrix whose configuration needs to copied. */ static std::unique_ptr create_with_config_of( - const BatchMultiVector* other) + ptr_param other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with @@ -292,7 +292,7 @@ class BatchMultiVector * of alpha (the number of columns of alpha has to match the number of * columns of the matrix). 
*/ - void scale(const BatchMultiVector* alpha) + void scale(ptr_param> alpha) { auto exec = this->get_executor(); this->scale_impl(make_temporary_clone(exec, alpha).get()); @@ -308,8 +308,8 @@ class BatchMultiVector * vector). * @param b a matrix of the same dimension as this */ - void add_scaled(const BatchMultiVector* alpha, - const BatchMultiVector* b) + void add_scaled(ptr_param> alpha, + ptr_param> b) { auto exec = this->get_executor(); this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), @@ -328,9 +328,9 @@ class BatchMultiVector * @param beta Scalar(s), of the same size as alpha, to multiply this * matrix. */ - void add_scale(const BatchMultiVector* alpha, - const BatchMultiVector* a, - const BatchMultiVector* beta); + void add_scale(ptr_param> alpha, + ptr_param> a, + ptr_param> beta); /** * Computes the column-wise dot product of each matrix in this batch and its @@ -342,8 +342,8 @@ class BatchMultiVector * product (the number of column in the vector must match the number of * columns of this) */ - void compute_dot(const BatchMultiVector* b, - BatchMultiVector* result) const + void compute_dot(ptr_param> b, + ptr_param> result) const { auto exec = this->get_executor(); this->compute_dot_impl(make_temporary_clone(exec, b).get(), @@ -358,7 +358,7 @@ class BatchMultiVector * of columns of this) */ void compute_norm2( - BatchMultiVector>* result) const + ptr_param>> result) const { auto exec = this->get_executor(); this->compute_norm2_impl(make_temporary_clone(exec, result).get()); From 3384a6d10422682f17c23254852d6a13f99616d7 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 20 Jul 2023 13:53:06 +0000 Subject: [PATCH 124/583] Format files Co-authored-by: Pratik Nayak --- .../base/batch_multi_vector_kernels.hpp.inc | 34 ++++++++----------- core/base/batch_multi_vector_kernels.hpp | 6 ++-- core/test/base/batch_dim.cpp | 6 ++-- cuda/base/batch_struct.hpp | 10 +++--- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 2 +- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 1 - dpcpp/base/batch_struct.hpp | 6 ++-- hip/base/batch_struct.hip.hpp | 10 +++--- include/ginkgo/ginkgo.hpp | 1 + .../base/batch_multi_vector_kernels.hpp.inc | 1 - reference/base/batch_struct.hpp | 12 +++---- test/base/batch_multi_vector_kernels.cpp | 4 +-- 12 files changed, 41 insertions(+), 52 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 6e9dc57681a..fa6270a0b60 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -57,9 +57,9 @@ __device__ __forceinline__ void scale( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::UniformBatch alpha, - const gko::batch_multi_vector::UniformBatch x) +__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::UniformBatch alpha, + const gko::batch_multi_vector::UniformBatch x) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { @@ -93,10 +93,10 @@ __device__ __forceinline__ void add_scaled( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::UniformBatch alpha, - const gko::batch_multi_vector::UniformBatch x, - const gko::batch_multi_vector::UniformBatch y) +__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const 
gko::batch_multi_vector::UniformBatch alpha, + const gko::batch_multi_vector::UniformBatch x, + const gko::batch_multi_vector::UniformBatch y) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { @@ -243,15 +243,11 @@ __device__ __forceinline__ void compute_norm2( template -__global__ __launch_bounds__( - default_block_size, - sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: - UniformBatch - x, - const gko::batch_multi_vector:: - UniformBatch< - remove_complex> - result) +__global__ +__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( + const gko::batch_multi_vector::UniformBatch x, + const gko::batch_multi_vector::UniformBatch> + result) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { @@ -284,9 +280,9 @@ __device__ __forceinline__ void copy( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::UniformBatch src, - const gko::batch_multi_vector::UniformBatch dst) +__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::UniformBatch src, + const gko::batch_multi_vector::UniformBatch dst) { for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries; ibatch += gridDim.x) { diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 7e7f9c3bb37..28c7b87de10 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ -#define GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#ifndef GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ #include @@ -101,4 +101,4 @@ GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_multi_vector, } // namespace gko -#endif // GKO_CORE_MATRIX_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#endif // GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp index 40743656ca3..71b954264c3 100644 --- a/core/test/base/batch_dim.cpp +++ b/core/test/base/batch_dim.cpp @@ -30,13 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include -#include +#include -#include +#include TEST(BatchDim, ConstructsCorrectUniformObject) diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 9d4eb436c16..9084cddfdfa 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -30,17 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ - - -#include "core/base/batch_struct.hpp" +#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_ +#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_ #include #include +#include "core/base/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" @@ -111,4 +109,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_CUDA_BASE_BATCH_STRUCT_HPP_ diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 88cdb1d6e6f..64343e02fad 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -42,7 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" -#include "dpcpp/matrix/batch_struct.hpp" namespace gko { diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 07d6d97ff0a..c5e2848e1d6 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - template __dpct_inline__ void scale_kernel( const gko::batch_dense::BatchEntry& alpha, diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 86534768c2b..bae0d43f6c0 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -34,13 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_BASE_BATCH_STRUCT_HPP_ -#include "core/base/batch_struct.hpp" - - #include #include +#include "core/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" @@ -111,4 +109,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_DPCPP_BASE_BATCH_STRUCT_HPP_ diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index d796cdcdb37..f76e4fa8a79 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -30,17 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ -#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ - - -#include "core/base/batch_struct.hpp" +#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ +#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ #include #include +#include "core/base/batch_struct.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/types.hip.hpp" @@ -111,4 +109,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ +#endif // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 8a88bf003f8..eebb31772ea 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index a793fe030f9..a80415572c2 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - template inline void scale_kernel( const gko::batch_multi_vector::BatchEntry& alpha, diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index 056bb575f8a..fec5b4f8803 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -30,17 +30,17 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ - - -#include "core/base/batch_struct.hpp" +#ifndef GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ +#define GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ #include #include +#include "core/base/batch_struct.hpp" + + namespace gko { namespace kernels { /** @@ -111,4 +111,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 07bdf5899e9..fe5fa0ed85f 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/base/batch_multi_vector_kernels.hpp" +#include #include @@ -40,10 +40,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include +#include "core/base/batch_multi_vector_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" #include "test/utils/executor.hpp" From 8f5e9c31d88dfd1669cb5eea0a5dd5622ddeac91 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 21 Jul 2023 09:46:46 +0200 Subject: [PATCH 125/583] dpcpp kernel updates Co-authored-by: Phuong Nguyen --- core/base/batch_multi_vector.cpp | 7 +- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 9 +- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 115 ++++++++++-------- dpcpp/base/batch_struct.hpp | 2 +- test/base/batch_multi_vector_kernels.cpp | 29 ++--- 5 files changed, 90 insertions(+), 72 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 0a3612ab205..0c2f1e0c1ba 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { namespace batch_multi_vector { +namespace { GKO_REGISTER_OPERATION(scale, batch_multi_vector::scale); @@ -60,6 +61,7 @@ GKO_REGISTER_OPERATION(compute_norm2, batch_multi_vector::compute_norm2); GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); +} // namespace } // namespace batch_multi_vector @@ -248,9 +250,8 @@ void BatchMultiVector::write(std::vector& data) const } -#define GKO_DECLARE_BATCH_MULTI_VECTOR_MATRIX(_type) \ - class BatchMultiVector<_type> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_MATRIX); +#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class BatchMultiVector<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); } // namespace gko diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 64343e02fad..74c3b842297 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -33,16 +33,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_multi_vector_kernels.hpp" -#include -#include +#include #include +#include #include #include #include "core/components/prefix_sum_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" namespace gko { diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index c5e2848e1d6..d881586e362 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __dpct_inline__ void scale_kernel( - const gko::batch_dense::BatchEntry& alpha, - const gko::batch_dense::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, sycl::nd_item<3>& item_ct1) { const int max_li = x.num_rows * x.num_rhs; @@ -53,84 +53,95 @@ __dpct_inline__ void scale_kernel( } -/** - * Adds a scaled vector to another. - * - * @param num_rows Common length of both vectors. - * @param alpha Scaling factor. - * @param[in] x Vector to scale and add. - * @param[in,out] y Vector to add to. 
- */ template -__dpct_inline__ void add_scaled_kernel(const int num_rows, - const ValueType alpha, - const ValueType* const __restrict__ x, - ValueType* const __restrict__ y, - sycl::nd_item<3> item_ct1) +__dpct_inline__ void add_scaled_kernel( + const gko::batch_multi_vector::BatchEntry& alpha, + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + sycl::nd_item<3>& item_ct1) { - for (int li = item_ct1.get_local_linear_id(); li < num_rows; - li += item_ct1.get_local_range().size()) { - y[li] += alpha * x[li]; + const int max_li = x.num_rows * x.num_rhs; + for (int li = item_ct1.get_local_id(2); li < max_li; + li += item_ct1.get_local_range(2)) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + if (alpha.num_rhs == 1) { + y.values[row * y.stride + col] += + alpha.values[0] * x.values[row * x.stride + col]; + } else { + y.values[row * y.stride + col] += + alpha.values[col] * x.values[row * x.stride + col]; + } } } template __dpct_inline__ void compute_dot_product_kernel( - const int num_rows, const ValueType* const __restrict__ x, - const ValueType* const __restrict__ y, ValueType& result, - sycl::nd_item<3> item_ct1) + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::BatchEntry& result, + sycl::nd_item<3>& item_ct1) { - const auto group = item_ct1.get_group(); - const auto group_size = item_ct1.get_local_range().size(); - const auto tid = item_ct1.get_local_linear_id(); + const auto sg = item_ct1.get_sub_group(); + const int sg_id = sg.get_group_id(); + const int sg_size = sg.get_local_range().size(); + const int num_sg = sg.get_group_range().size(); + + for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { + ValueType val = zero(); + + for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) { + val += conj(x.values[r * x.stride + rhs_index]) * + y.values[r * y.stride + rhs_index]; + } - ValueType val = zero(); + val = sycl::reduce_over_group(sg, val, sycl::plus<>()); - for (int r = tid; r < num_rows; r += group_size) { - val += conj(x[r]) * y[r]; + if (sg.get_local_id() == 0) { + result.values[rhs_index] = val; + } } - result = sycl::reduce_over_group(group, val, sycl::plus<>()); } template __dpct_inline__ void compute_norm2_kernel( - const int num_rows, const ValueType* const __restrict__ x, - gko::remove_complex& result, sycl::nd_item<3> item_ct1) + const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::BatchEntry>& + result, + sycl::nd_item<3>& item_ct1) { - const auto group = item_ct1.get_group(); - const auto group_size = item_ct1.get_local_range().size(); - const auto tid = item_ct1.get_local_linear_id(); + const auto sg = item_ct1.get_sub_group(); + const int sg_id = sg.get_group_id(); + const int sg_size = sg.get_local_range().size(); + const int num_sg = sg.get_group_range().size(); using real_type = typename gko::remove_complex; - real_type val = zero(); + for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { + real_type val = zero(); - for (int r = tid; r < num_rows; r += group_size) { - val += squared_norm(x[r]); - } + for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) + val += squared_norm(x.values[r * x.stride + rhs_index]); - val = sycl::reduce_over_group(group, val, sycl::plus<>()); + val = sycl::reduce_over_group(sg, val, sycl::plus<>()); - result = sqrt(val); + if (sg.get_local_id() == 0) result.values[rhs_index] = sqrt(val); + } } -/** - * Copies 
the values of vector into another. - * - * @param num_rows Length of vector. - * @param in Vector to copy from. - * @param out Vector to copy into. - */ template -__dpct_inline__ void copy_kernel(const int num_rows, - const ValueType* const __restrict__ in, - ValueType* const __restrict__ out, - sycl::nd_item<3> item_ct1) +__dpct_inline__ void copy_kernel( + const gko::batch_multi_vector::BatchEntry& in, + const gko::batch_multi_vector::BatchEntry& out, + sycl::nd_item<3>& item_ct1) { - for (int iz = item_ct1.get_local_linear_id(); iz < num_rows; + for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs; iz += item_ct1.get_local_range().size()) { - out[iz] = in[iz]; + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; } } diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index bae0d43f6c0..16f0b528dda 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -34,8 +34,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_BASE_BATCH_STRUCT_HPP_ +#include #include -#include #include "core/base/batch_struct.hpp" diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index fe5fa0ed85f..05ea67bee1d 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -45,16 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_multi_vector_kernels.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" #include "test/utils/executor.hpp" class BatchMultiVector : public CommonTestFixture { protected: - using vtype = double; - using Mtx = gko::BatchMultiVector; - using NormVector = gko::BatchMultiVector>; - using ComplexMtx = gko::BatchMultiVector>; + using Mtx = gko::BatchMultiVector; + using NormVector = gko::BatchMultiVector>; + using ComplexMtx = gko::BatchMultiVector>; BatchMultiVector() : rand_engine(15) {} @@ -148,7 +149,7 @@ TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, r::value); } @@ -159,7 +160,7 @@ TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r::value); } @@ -171,7 +172,7 @@ TEST_F(BatchMultiVector, x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r::value); } @@ -182,7 +183,7 @@ TEST_F(BatchMultiVector, SingleVectorScaleIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r::value); } @@ -193,7 +194,7 @@ TEST_F(BatchMultiVector, MultipleVectorScaleIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r::value); } @@ -204,7 +205,7 @@ TEST_F(BatchMultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - 
GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r::value); } @@ -219,7 +220,7 @@ TEST_F(BatchMultiVector, ComputeNorm2SingleIsEquivalentToRef) x->compute_norm2(norm_expected.get()); dx->compute_norm2(dnorm.get()); - GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 5 * r::value); } @@ -234,7 +235,7 @@ TEST_F(BatchMultiVector, ComputeNorm2IsEquivalentToRef) x->compute_norm2(norm_expected.get()); dx->compute_norm2(dnorm.get()); - GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 5 * r::value); } @@ -249,7 +250,7 @@ TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) x->compute_dot(y.get(), dot_expected.get()); dx->compute_dot(dy.get(), ddot.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); } @@ -264,7 +265,7 @@ TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) x->compute_dot(y.get(), dot_expected.get()); dx->compute_dot(dy.get(), ddot.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); } From 33b726dc77f6a3d77a8b3e963387d21d119175e5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 21 Jul 2023 10:39:35 +0200 Subject: [PATCH 126/583] Unify CUDA/HIP and enable streams --- ...batch_multi_vector_kernel_launcher.hpp.inc | 117 ++++++++++++++++++ cuda/base/batch_multi_vector_kernels.cu | 84 +------------ hip/base/batch_multi_vector_kernels.hip.cpp | 87 +------------ 3 files changed, 121 insertions(+), 167 deletions(-) create mode 100644 common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..24cd24d1bf7 --- /dev/null +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -0,0 +1,117 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +void scale(std::shared_ptr exec, + const BatchMultiVector* const alpha, + BatchMultiVector* const x) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + scale_kernel<<get_stream()>>>( + alpha_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const BatchMultiVector* const alpha, + const BatchMultiVector* const x, + BatchMultiVector* const y) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const size_type nrhs = x->get_common_size()[1]; + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + add_scaled_kernel<<get_stream()>>>(alpha_ub, x_ub, y_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_dot_product_kernel<<get_stream()>>>(x_ub, y_ub, res_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const BatchMultiVector* const x, + BatchMultiVector>* const result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + compute_norm2_kernel<<get_stream()>>>(x_ub, res_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); + + +template +void copy(std::shared_ptr exec, + const BatchMultiVector* x, + BatchMultiVector* result) +{ + const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto result_ub = get_batch_struct(result); + const auto x_ub = get_batch_struct(x); + copy_kernel<<get_stream()>>>( + x_ub, result_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 8bfb6fc0167..e7c57111463 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -62,90 +62,10 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; - +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - -template -void scale(std::shared_ptr exec, - const BatchMultiVector* 
const alpha, - BatchMultiVector* const x) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - scale_kernel<<>>(alpha_ub, x_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); - - -template -void add_scaled(std::shared_ptr exec, - const BatchMultiVector* const alpha, - const BatchMultiVector* const x, - BatchMultiVector* const y) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const size_type nrhs = x->get_common_size()[1]; - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - add_scaled_kernel<<>>(alpha_ub, x_ub, y_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) -{ - const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_common_size()[1]; - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - const auto res_ub = get_batch_struct(result); - compute_dot_product_kernel<<>>(x_ub, y_ub, - res_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const BatchMultiVector* const x, - BatchMultiVector>* const result) -{ - const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_common_size()[1]; - const auto x_ub = get_batch_struct(x); - const auto res_ub = get_batch_struct(result); - compute_norm2_kernel<<>>(x_ub, res_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); - - -template -void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto result_ub = get_batch_struct(result); - const auto x_ub = get_batch_struct(x); - copy_kernel<<>>(x_ub, result_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" } // namespace batch_multi_vector diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 50f8593ffec..a8f0f8a7cd6 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -66,93 +66,10 @@ constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - -template -void scale(std::shared_ptr exec, - const BatchMultiVector* const alpha, - BatchMultiVector* const x) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - hipLaunchKernelGGL(scale_kernel, dim3(num_blocks), dim3(default_block_size), - 0, 0, alpha_ub, x_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); - - -template -void add_scaled(std::shared_ptr exec, - const BatchMultiVector* const alpha, - const BatchMultiVector* const x, - BatchMultiVector* const y) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - 
const size_type nrhs = x->get_common_size()[1]; - const auto alpha_ub = get_batch_struct(alpha); - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - hipLaunchKernelGGL(add_scaled_kernel, dim3(num_blocks), - dim3(default_block_size), 0, 0, alpha_ub, x_ub, y_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) -{ - const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_common_size()[1]; - const auto x_ub = get_batch_struct(x); - const auto y_ub = get_batch_struct(y); - const auto res_ub = get_batch_struct(result); - hipLaunchKernelGGL(compute_dot_product_kernel, dim3(num_blocks), - dim3(default_block_size), 0, 0, x_ub, y_ub, res_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const BatchMultiVector* const x, - BatchMultiVector>* const result) -{ - const auto num_blocks = x->get_num_batch_entries(); - const auto num_rhs = x->get_common_size()[1]; - const auto x_ub = get_batch_struct(x); - const auto res_ub = get_batch_struct(result); - hipLaunchKernelGGL(compute_norm2_kernel, dim3(num_blocks), - dim3(default_block_size), 0, 0, x_ub, res_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); - - -template -void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) -{ - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; - const auto result_ub = get_batch_struct(result); - const auto x_ub = get_batch_struct(x); - hipLaunchKernelGGL(copy_kernel, dim3(num_blocks), dim3(default_block_size), - 0, 0, x_ub, result_ub); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" } // namespace batch_multi_vector From 591f95f6cbe46c29bc248d39d04cd3c830850a07 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 21 Jul 2023 10:54:28 +0200 Subject: [PATCH 127/583] Force the correct include ordering. 
--- cuda/base/batch_multi_vector_kernels.cu | 3 +++ hip/base/batch_multi_vector_kernels.hip.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index e7c57111463..c1246df7374 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -63,7 +63,10 @@ constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES +// force-top: on #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +// force-top: off + #include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index a8f0f8a7cd6..f3acaf9ec36 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -67,7 +67,10 @@ constexpr int sm_multiplier = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES +// force-top: on #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +// force-top: off + #include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" From 74037d98edb07e971d0f023527369cbd0294fb7e Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 21 Jul 2023 15:57:14 +0200 Subject: [PATCH 128/583] Review updates Co-authored-by: Marcel Koch Co-authored-by: Tobias Ribizel --- ...batch_multi_vector_kernel_launcher.hpp.inc | 26 +++-- .../base/batch_multi_vector_kernels.hpp.inc | 104 +++++++++--------- core/base/batch_multi_vector.cpp | 36 ++---- core/base/batch_struct.hpp | 35 +++--- cuda/base/batch_multi_vector_kernels.cu | 5 + cuda/base/batch_struct.hpp | 6 +- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 24 ++-- dpcpp/base/batch_struct.hpp | 6 +- hip/base/batch_multi_vector_kernels.hip.cpp | 5 + hip/base/batch_struct.hip.hpp | 6 +- include/ginkgo/core/base/batch_dim.hpp | 44 ++++---- .../ginkgo/core/base/batch_multi_vector.hpp | 29 ++--- .../base/batch_multi_vector_kernels.hpp.inc | 24 ++-- reference/base/batch_struct.hpp | 18 +-- 14 files changed, 184 insertions(+), 184 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc index 24cd24d1bf7..43b0c6d8281 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -36,11 +36,16 @@ void scale(std::shared_ptr exec, const BatchMultiVector* const alpha, BatchMultiVector* const x) { - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto num_blocks = x->get_num_batch_entries(); const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); - scale_kernel<<get_stream()>>>( - alpha_ub, x_ub); + if (alpha->get_common_size()[1] == 1) { + scale_kernel<<get_stream()>>>( + alpha_ub, x_ub, [] __device__(int col) { return 0; }); + } else { + scale_kernel<<get_stream()>>>( + alpha_ub, x_ub, [] __device__(int col) { return col; }); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -53,13 +58,20 @@ void add_scaled(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector* const y) { - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto num_blocks = x->get_num_batch_entries(); const size_type nrhs = x->get_common_size()[1]; const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); const auto y_ub = 
get_batch_struct(y); - add_scaled_kernel<<get_stream()>>>(alpha_ub, x_ub, y_ub); + if (alpha->get_common_size()[1] == 1) { + add_scaled_kernel<<get_stream()>>>( + alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; }); + } else { + add_scaled_kernel<<get_stream()>>>( + alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; }); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -107,7 +119,7 @@ void copy(std::shared_ptr exec, const BatchMultiVector* x, BatchMultiVector* result) { - const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier; + const auto num_blocks = x->get_num_batch_entries(); const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); copy_kernel<<get_stream()>>>( diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index fa6270a0b60..6d1161aeaa6 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -35,85 +35,75 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Scales the vectors in global or shared memory with a factor of alpha (alpha * is in global memory or shared memory) */ -template +template __device__ __forceinline__ void scale( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x) + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { const int row = li / x.num_rhs; const int col = li % x.num_rhs; - if (alpha.num_rhs == 1) { - x.values[row * x.stride + col] = - alpha.values[0] * x.values[row * x.stride + col]; - } else { - x.values[row * x.stride + col] = - alpha.values[col] * x.values[row * x.stride + col]; - } + x.values[row * x.stride + col] = + alpha.values[map(col)] * x.values[row * x.stride + col]; } } -template +template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::UniformBatch alpha, - const gko::batch_multi_vector::UniformBatch x) + __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, Mapping map) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); const auto x_b = gko::batch::batch_entry(x, ibatch); - scale(alpha_b, x_b); + scale(alpha_b, x_b, map); } } -template +template __device__ __forceinline__ void add_scaled( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y) + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { const int row = li / x.num_rhs; const int col = li % x.num_rhs; - if (alpha.num_rhs == 1) { - y.values[row * y.stride + col] += - alpha.values[0] * x.values[row * x.stride + col]; - } else { - y.values[row * y.stride + col] += - alpha.values[col] * x.values[row * x.stride + col]; - } + y.values[row * y.stride + col] += + alpha.values[map(col)] * x.values[row * x.stride + col]; } } -template +template __global__ 
-__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::UniformBatch alpha, - const gko::batch_multi_vector::UniformBatch x, - const gko::batch_multi_vector::UniformBatch y) + __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch y, Mapping map) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); const auto x_b = gko::batch::batch_entry(x, ibatch); const auto y_b = gko::batch::batch_entry(y, ibatch); - add_scaled(alpha_b, x_b, y_b); + add_scaled(alpha_b, x_b, y_b, map); } } template __device__ __forceinline__ void one_dot( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, const int rhs_index, - const gko::batch_multi_vector::BatchEntry& result, + const gko::batch_multi_vector::batch_entry& result, group::thread_block_tile& subwarp_grp) { ValueType val = zero(); @@ -143,9 +133,9 @@ __device__ __forceinline__ void one_dot( */ template __device__ __forceinline__ void compute_dot_product( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::batch_multi_vector::BatchEntry& result) + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result) { constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); @@ -165,17 +155,17 @@ __global__ __launch_bounds__( default_block_size, sm_multiplier) void compute_dot_product_kernel(const gko:: batch_multi_vector:: - UniformBatch< + uniform_batch< const ValueType> x, const gko:: batch_multi_vector:: - UniformBatch< + uniform_batch< const ValueType> y, const gko:: batch_multi_vector:: - UniformBatch< + uniform_batch< ValueType> result) { @@ -191,9 +181,9 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void one_norm2( - const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::batch_entry& x, const int rhs_index, - const gko::batch_multi_vector::BatchEntry>& + const gko::batch_multi_vector::batch_entry>& result, group::thread_block_tile& subwarp_grp) { @@ -225,8 +215,8 @@ __device__ __forceinline__ void one_norm2( */ template __device__ __forceinline__ void compute_norm2( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry>& + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry>& result) { constexpr auto tile_size = config::warp_size; @@ -243,11 +233,15 @@ __device__ __forceinline__ void compute_norm2( template -__global__ -__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( - const gko::batch_multi_vector::UniformBatch x, - const gko::batch_multi_vector::UniformBatch> - result) +__global__ __launch_bounds__( + default_block_size, + sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: + uniform_batch + x, + const gko::batch_multi_vector:: + uniform_batch< + remove_complex> + result) { for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; ibatch += gridDim.x) { @@ -266,8 +260,8 @@ __launch_bounds__(default_block_size, sm_multiplier) void 
compute_norm2_kernel( */ template __device__ __forceinline__ void copy( - const gko::batch_multi_vector::BatchEntry& in, - const gko::batch_multi_vector::BatchEntry& out) + const gko::batch_multi_vector::batch_entry& in, + const gko::batch_multi_vector::batch_entry& out) { for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; iz += blockDim.x) { @@ -280,9 +274,9 @@ __device__ __forceinline__ void copy( template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::UniformBatch src, - const gko::batch_multi_vector::UniformBatch dst) + __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::uniform_batch src, + const gko::batch_multi_vector::uniform_batch dst) { for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries; ibatch += gridDim.x) { diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 0c2f1e0c1ba..d0d76ba5ec6 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -72,12 +72,10 @@ void BatchMultiVector::scale_impl( GKO_ASSERT_EQ(alpha->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); - for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { - if (alpha->get_common_size()[1] != 1) { - // different alpha for each column - GKO_ASSERT_EQUAL_COLS(this->get_common_size(), - alpha->get_common_size()); - } + if (alpha->get_common_size()[1] != 1) { + // different alpha for each column + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); } this->get_executor()->run(batch_multi_vector::make_scale(alpha, this)); } @@ -91,12 +89,10 @@ void BatchMultiVector::add_scaled_impl( GKO_ASSERT_EQ(alpha->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); - for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) { - if (alpha->get_common_size()[1] != 1) { - // different alpha for each column - GKO_ASSERT_EQUAL_COLS(this->get_common_size(), - alpha->get_common_size()); - } + if (alpha->get_common_size()[1] != 1) { + // different alpha for each column + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); } GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); @@ -162,11 +158,11 @@ void BatchMultiVector::move_to( template -inline void read_impl(MatrixType* mtx, const std::vector& data) +void read_impl(MatrixType* mtx, const std::vector& data) { + GKO_ASSERT(data.size() > 0); auto common_size = data[0].size; auto batch_size = batch_dim<2>(data.size(), common_size); - size_type ind = 0; for (const auto& b : data) { auto b_size = b.size; GKO_ASSERT_EQUAL_DIMENSIONS(common_size, b_size); @@ -208,17 +204,9 @@ void BatchMultiVector::read(const std::vector& data) template -inline void write_impl(const MatrixType* mtx, std::vector& data) +void write_impl(const MatrixType* mtx, std::vector& data) { - std::unique_ptr> - op{}; - const MatrixType* tmp{}; - if (mtx->get_executor()->get_master() != mtx->get_executor()) { - op = mtx->clone(mtx->get_executor()->get_master()); - tmp = static_cast(op.get()); - } else { - tmp = mtx; - } + auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx); data = std::vector(mtx->get_num_batch_entries()); for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) { diff --git 
a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index 05ac4f0d105..d85c413e691 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -47,7 +47,7 @@ namespace batch_multi_vector { * Encapsulates one matrix from a batch of dense matrices (vectors). */ template -struct BatchEntry { +struct batch_entry { using value_type = ValueType; ValueType* values; size_type stride; @@ -61,9 +61,9 @@ struct BatchEntry { * It is uniform in the sense that all matrices in the batch have common sizes. */ template -struct UniformBatch { +struct uniform_batch { using value_type = ValueType; - using entry_type = BatchEntry; + using entry_type = batch_entry; ValueType* values; size_type num_batch_entries; @@ -85,16 +85,17 @@ namespace batch { template -GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::BatchEntry -to_const(const gko::batch_multi_vector::BatchEntry& b) +GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::batch_entry +to_const(const gko::batch_multi_vector::batch_entry& b) { return {b.values, b.stride, b.num_rows, b.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::UniformBatch -to_const(const gko::batch_multi_vector::UniformBatch& ub) +GKO_ATTRIBUTES GKO_INLINE + gko::batch_multi_vector::uniform_batch + to_const(const gko::batch_multi_vector::uniform_batch& ub) { return {ub.values, ub.num_batch_entries, ub.stride, ub.num_rows, ub.num_rhs}; @@ -111,31 +112,23 @@ to_const(const gko::batch_multi_vector::UniformBatch& ub) * @param batch_idx The position of the desired object in the batch */ template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::BatchEntry batch_entry( - const batch_multi_vector::UniformBatch& batch, - const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_entry +batch_entry(const batch_multi_vector::uniform_batch& batch, + const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, batch.stride, batch.num_rows, batch.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::BatchEntry batch_entry( - ValueType* const batch_values, const size_type stride, const int num_rows, - const int num_rhs, const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_entry +batch_entry(ValueType* const batch_values, const size_type stride, + const int num_rows, const int num_rhs, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, num_rhs}; } -template -GKO_ATTRIBUTES GKO_INLINE ValueType* batch_entry_ptr( - ValueType* const batch_start, const size_type stride, const int num_rows, - const size_type batch_idx) -{ - return batch_start + batch_idx * stride * num_rows; -} - } // namespace batch } // namespace gko diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index c1246df7374..05e08be0adb 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_multi_vector_kernels.hpp" +#include +#include + + #include #include @@ -42,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/thrust.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 9084cddfdfa..4358d688f07 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -62,7 +62,7 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_entries(), @@ -75,7 +75,7 @@ get_batch_struct(const BatchMultiVector* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_entries(), @@ -90,7 +90,7 @@ get_batch_struct(BatchMultiVector* const op) * that may be null. */ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index d881586e362..75f70cc2781 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __dpct_inline__ void scale_kernel( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x, sycl::nd_item<3>& item_ct1) { const int max_li = x.num_rows * x.num_rhs; @@ -55,9 +55,9 @@ __dpct_inline__ void scale_kernel( template __dpct_inline__ void add_scaled_kernel( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, sycl::nd_item<3>& item_ct1) { const int max_li = x.num_rows * x.num_rhs; @@ -79,9 +79,9 @@ __dpct_inline__ void add_scaled_kernel( template __dpct_inline__ void compute_dot_product_kernel( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::batch_multi_vector::BatchEntry& result, + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result, sycl::nd_item<3>& item_ct1) { const auto sg = item_ct1.get_sub_group(); @@ -108,8 +108,8 @@ __dpct_inline__ void compute_dot_product_kernel( template __dpct_inline__ void compute_norm2_kernel( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry>& + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry>& result, sycl::nd_item<3>& item_ct1) { @@ -134,8 +134,8 @@ __dpct_inline__ void compute_norm2_kernel( template __dpct_inline__ void copy_kernel( - const gko::batch_multi_vector::BatchEntry& in, - const 
gko::batch_multi_vector::BatchEntry& out, + const gko::batch_multi_vector::batch_entry& in, + const gko::batch_multi_vector::batch_entry& out, sycl::nd_item<3>& item_ct1) { for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs; diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 16f0b528dda..5b88e992665 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -61,7 +61,7 @@ namespace dpcpp { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch get_batch_struct( +inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { return {op->get_const_values(), op->get_num_batch_entries(), @@ -75,7 +75,7 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch get_batch_struct( +inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { return {op->get_values(), op->get_num_batch_entries(), @@ -90,7 +90,7 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( * that may be null. */ template -inline gko::batch_multi_vector::UniformBatch +inline gko::batch_multi_vector::uniform_batch maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index f3acaf9ec36..c1e7469ef9e 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -36,6 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include + + #include #include @@ -45,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/thrust.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index f76e4fa8a79..f8788b9e6a8 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -62,7 +62,7 @@ namespace hip { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_entries(), @@ -75,7 +75,7 @@ get_batch_struct(const BatchMultiVector* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_entries(), @@ -90,7 +90,7 @@ get_batch_struct(BatchMultiVector* const op) * that may be null. 
*/ template -inline gko::batch_multi_vector::UniformBatch> +inline gko::batch_multi_vector::uniform_batch> maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index bc17648be52..37ce5993220 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -77,13 +77,13 @@ struct batch_dim { /** * Get the cumulative storage size offset * - * @param b the batch id + * @param batch_id the batch id * * @return the cumulative offset */ - size_type get_cumulative_offset(size_type b) const + size_type get_cumulative_offset(size_type batch_id) const { - return b * common_size_[0] * common_size_[1]; + return batch_id * common_size_[0] * common_size_[1]; } /** @@ -100,6 +100,25 @@ struct batch_dim { x.common_size_ == y.common_size_; } + + /** + * Checks if two batch dim objects are different. + * + * @tparam Dimensionality number of dimensions of the dim objects + * @tparam DimensionType datatype used to represent each dimension + * + * @param x first object + * @param y second object + * + * @return `!(x == y)` + */ + friend bool operator!=(const batch_dim& x, + const batch_dim& y) + { + return !(x == y); + } + + /** * Creates a batch_dim object which stores a uniform size for all batch * entries. @@ -121,25 +140,6 @@ struct batch_dim { }; -/** - * Checks if two batch dim objects are different. - * - * @tparam Dimensionality number of dimensions of the dim objects - * @tparam DimensionType datatype used to represent each dimension - * - * @param x first object - * @param y second object - * - * @return `!(x == y)` - */ -template -inline bool operator!=(const batch_dim& x, - const batch_dim& y) -{ - return !(x == y); -} - - /** * Returns a batch_dim object with its dimensions swapped for batched operators * diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 47dbe6078f5..4ce88acc621 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -522,7 +522,7 @@ class BatchMultiVector * @returns a BatchMultiVector matrix with the same configuration as the * caller. */ - virtual std::unique_ptr create_with_same_config() const + std::unique_ptr create_with_same_config() const { return BatchMultiVector::create(this->get_executor(), this->get_size()); } @@ -533,7 +533,7 @@ class BatchMultiVector * @note Other implementations of batch_multi_vector should override this * function instead of scale(const BatchMultiVector *alpha). */ - virtual void scale_impl(const BatchMultiVector* alpha); + void scale_impl(const BatchMultiVector* alpha); /** * @copydoc add_scaled(const BatchMultiVector *, const BatchMultiVector *) @@ -542,8 +542,8 @@ class BatchMultiVector * function instead of add_scale(const BatchMultiVector *alpha, const * BatchMultiVector *b). */ - virtual void add_scaled_impl(const BatchMultiVector* alpha, - const BatchMultiVector* b); + void add_scaled_impl(const BatchMultiVector* alpha, + const BatchMultiVector* b); /** * @copydoc compute_dot(const BatchMultiVector *, BatchMultiVector *) const @@ -552,8 +552,8 @@ class BatchMultiVector * function instead of compute_dot(const BatchMultiVector *b, * BatchMultiVector *result). 
*/ - virtual void compute_dot_impl(const BatchMultiVector* b, - BatchMultiVector* result) const; + void compute_dot_impl(const BatchMultiVector* b, + BatchMultiVector* result) const; /** * @copydoc compute_norm2(BatchMultiVector *) const @@ -561,7 +561,7 @@ class BatchMultiVector * @note Other implementations of batch_multi_vector should override this * function instead of compute_norm2(BatchMultiVector *result). */ - virtual void compute_norm2_impl( + void compute_norm2_impl( BatchMultiVector>* result) const; size_type linearize_index(size_type batch, size_type row, @@ -611,12 +611,12 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); + GKO_ASSERT(num_batch_entries > 0); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin->size(); auto common_size = dim<2>(common_num_rows, 1); - for (size_type b = 0; b < num_batch_entries; ++b) { - GKO_ASSERT_EQ(common_num_rows, vals_begin->size()); - vals_begin++; + for (auto& val : vals) { + GKO_ASSERT_EQ(common_num_rows, val.size()); } auto b_size = batch_dim<2>(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); @@ -664,6 +664,7 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); + GKO_ASSERT(num_batch_entries > 0); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin->size(); size_type common_num_cols = vals_begin->begin()->size(); @@ -728,6 +729,7 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = num_vectors; + GKO_ASSERT(num_batch_entries > 0); auto b_size = batch_dim<2>(num_batch_entries, dim<2>(vals.size(), 1)); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_vectors; batch++) { @@ -768,16 +770,17 @@ std::unique_ptr batch_initialize( */ template std::unique_ptr batch_initialize( - const size_type num_matrices, + const size_type num_batch_entries, std::initializer_list> vals, std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; + GKO_ASSERT(num_batch_entries > 0); auto common_size = dim<2>(vals.size(), begin(vals)->size()); - batch_dim<2> b_size(num_matrices, common_size); + batch_dim<2> b_size(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - for (size_type batch = 0; batch < num_matrices; batch++) { + for (size_type batch = 0; batch < num_batch_entries; batch++) { size_type ridx = 0; for (const auto& row : vals) { size_type cidx = 0; diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index a80415572c2..599013179ce 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template inline void scale_kernel( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x) + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -53,9 +53,9 @@ inline void scale_kernel( template inline void add_scaled_kernel( - const gko::batch_multi_vector::BatchEntry& alpha, - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y) + const gko::batch_multi_vector::batch_entry& alpha, + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -77,9 +77,9 @@ inline void add_scaled_kernel( template inline void compute_dot_product_kernel( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry& y, - const gko::batch_multi_vector::BatchEntry& result) + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -96,8 +96,8 @@ inline void compute_dot_product_kernel( template inline void compute_norm2_kernel( - const gko::batch_multi_vector::BatchEntry& x, - const gko::batch_multi_vector::BatchEntry>& + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry>& result) { for (int j = 0; j < x.num_rhs; ++j) { @@ -122,8 +122,8 @@ inline void compute_norm2_kernel( */ template inline void copy_kernel( - const gko::batch_multi_vector::BatchEntry& in, - const gko::batch_multi_vector::BatchEntry& out) + const gko::batch_multi_vector::batch_entry& in, + const gko::batch_multi_vector::batch_entry& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index fec5b4f8803..cec3a4ed813 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -30,15 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ -#define GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ +#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ -#include -#include +#include "core/base/batch_struct.hpp" -#include "core/base/batch_struct.hpp" +#include +#include namespace gko { @@ -63,7 +63,7 @@ namespace host { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch get_batch_struct( +inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { return {op->get_const_values(), op->get_num_batch_entries(), @@ -77,7 +77,7 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( * Generates a uniform batch struct from a batch of dense matrices. */ template -inline gko::batch_multi_vector::UniformBatch get_batch_struct( +inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { return {op->get_values(), op->get_num_batch_entries(), @@ -92,7 +92,7 @@ inline gko::batch_multi_vector::UniformBatch get_batch_struct( * that may be null. 
*/ template -inline gko::batch_multi_vector::UniformBatch +inline gko::batch_multi_vector::uniform_batch maybe_null_batch_struct(const BatchMultiVector* const op) { if (op) { @@ -111,4 +111,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ +#endif // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ From d55865ca822002c05900307796b538b1f95820e9 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 22 Jul 2023 23:58:37 +0200 Subject: [PATCH 129/583] Add compute_conj_dot and kernels --- ...batch_multi_vector_kernel_launcher.hpp.inc | 19 +++++ .../base/batch_multi_vector_kernels.hpp.inc | 81 +++++++++++++++++-- core/base/batch_multi_vector.cpp | 18 +++++ core/base/batch_multi_vector_kernels.hpp | 28 ++++--- core/device_hooks/common_kernels.inc.cpp | 1 + dpcpp/base/batch_multi_vector_kernels.dp.cpp | 35 ++++++++ dpcpp/base/batch_multi_vector_kernels.hpp.inc | 29 +++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 42 ++++++---- omp/base/batch_multi_vector_kernels.cpp | 23 ++++++ reference/base/batch_multi_vector_kernels.cpp | 22 +++++ .../base/batch_multi_vector_kernels.hpp.inc | 19 +++++ .../test/base/batch_multi_vector_kernels.cpp | 50 +++++++++++- test/base/batch_multi_vector_kernels.cpp | 30 +++++++ 13 files changed, 364 insertions(+), 33 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc index 43b0c6d8281..b797850059b 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -97,6 +97,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +template +void compute_conj_dot(std::shared_ptr exec, + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) +{ + const auto num_blocks = x->get_num_batch_entries(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_conj_dot_product_kernel<<get_stream()>>>(x_ub, y_ub, res_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 6d1161aeaa6..28ea60c7df4 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -108,6 +108,34 @@ __device__ __forceinline__ void one_dot( { ValueType val = zero(); + for (int r = subwarp_grp.thread_rank(); r < x.num_rows; + r += subwarp_grp.size()) { + val += x.values[r * x.stride + rhs_index] * + y.values[r * y.stride + rhs_index]; + } + + // subwarp_grp level reduction +#pragma unroll + for (int j = config::warp_size / 2; j > 0; j /= 2) { + val += subwarp_grp.shfl_down(val, j); + } + + if (subwarp_grp.thread_rank() == 0) { + result.values[rhs_index] = val; + } +} + + +template +__device__ __forceinline__ void one_conj_dot( + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const int rhs_index, + const gko::batch_multi_vector::batch_entry& result, + group::thread_block_tile& subwarp_grp) +{ + ValueType val = zero(); + for (int r = subwarp_grp.thread_rank(); r < 
x.num_rows; r += subwarp_grp.size()) { val += conj(x.values[r * x.stride + rhs_index]) * @@ -126,11 +154,6 @@ __device__ __forceinline__ void one_dot( } -/** - * Computes the dot product of some column vectors in global or shared memory. - * - * @param result Holds dot product value for vector in x and y. - */ template __device__ __forceinline__ void compute_dot_product( const gko::batch_multi_vector::batch_entry& x, @@ -150,6 +173,25 @@ __device__ __forceinline__ void compute_dot_product( } +template +__device__ __forceinline__ void compute_conj_dot_product( + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); + const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; + rhs_index += num_subwarp_grps_per_block) { + one_conj_dot(x, y, rhs_index, result, subwarp_grp); + } +} + + template __global__ __launch_bounds__( default_block_size, @@ -179,6 +221,35 @@ __global__ __launch_bounds__( } +template +__global__ __launch_bounds__( + default_block_size, + sm_multiplier) void compute_conj_dot_product_kernel(const gko:: + batch_multi_vector:: + uniform_batch< + const ValueType> + x, + const gko:: + batch_multi_vector:: + uniform_batch< + const ValueType> + y, + const gko:: + batch_multi_vector:: + uniform_batch< + ValueType> + result) +{ + for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; + ibatch += gridDim.x) { + const auto x_b = gko::batch::batch_entry(x, ibatch); + const auto y_b = gko::batch::batch_entry(y, ibatch); + const auto r_b = gko::batch::batch_entry(result, ibatch); + compute_conj_dot_product(x_b, y_b, r_b); + } +} + + template __device__ __forceinline__ void one_norm2( const gko::batch_multi_vector::batch_entry& x, diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index d0d76ba5ec6..3578d0678de 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -57,6 +57,7 @@ namespace { GKO_REGISTER_OPERATION(scale, batch_multi_vector::scale); GKO_REGISTER_OPERATION(add_scaled, batch_multi_vector::add_scaled); GKO_REGISTER_OPERATION(compute_dot, batch_multi_vector::compute_dot); +GKO_REGISTER_OPERATION(compute_conj_dot, batch_multi_vector::compute_conj_dot); GKO_REGISTER_OPERATION(compute_norm2, batch_multi_vector::compute_norm2); GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); @@ -109,6 +110,23 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) } +template +void BatchMultiVector::compute_conj_dot_impl( + const BatchMultiVector* b, + BatchMultiVector* result) const +{ + GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_entries(), + result->get_num_batch_entries()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); + this->get_executor()->run( + batch_multi_vector::make_compute_conj_dot(this, b, result)); +} + + template void BatchMultiVector::compute_dot_impl( const BatchMultiVector* b, diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 
28c7b87de10..6eba9eac829 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -66,6 +66,12 @@ namespace kernels { const BatchMultiVector<_type>* y, \ BatchMultiVector<_type>* result) +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(_type) \ + void compute_conj_dot(std::shared_ptr exec, \ + const BatchMultiVector<_type>* x, \ + const BatchMultiVector<_type>* y, \ + BatchMultiVector<_type>* result) + #define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ void compute_norm2(std::shared_ptr exec, \ const BatchMultiVector<_type>* x, \ @@ -77,16 +83,18 @@ namespace kernels { BatchMultiVector<_type>* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(ValueType) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 9ab79160394..0f898b3ae73 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -279,6 +279,7 @@ namespace batch_multi_vector { GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 74c3b842297..97f7469a6f6 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -168,6 +168,41 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +template +void compute_conj_dot(std::shared_ptr exec, + const BatchMultiVector* const x, + const BatchMultiVector* const y, + BatchMultiVector* const result) +{ + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + + const auto num_batches = x_ub.num_batch_entries; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batches); + + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_conj_dot_product_kernel(x_b, y_b, 
res_b, item_ct1); + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 75f70cc2781..cb2ccd4ae50 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -89,6 +89,35 @@ __dpct_inline__ void compute_dot_product_kernel( const int sg_size = sg.get_local_range().size(); const int num_sg = sg.get_group_range().size(); + for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { + ValueType val = zero(); + + for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) { + val += x.values[r * x.stride + rhs_index] * + y.values[r * y.stride + rhs_index]; + } + + val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + + if (sg.get_local_id() == 0) { + result.values[rhs_index] = val; + } + } +} + + +template +__dpct_inline__ void compute_conj_dot_product_kernel( + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result, + sycl::nd_item<3>& item_ct1) +{ + const auto sg = item_ct1.get_sub_group(); + const int sg_id = sg.get_group_id(); + const int sg_size = sg.get_local_range().size(); + const int num_sg = sg.get_group_range().size(); + for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { ValueType val = zero(); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 4ce88acc621..34ca15db8c0 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -334,8 +334,7 @@ class BatchMultiVector /** * Computes the column-wise dot product of each matrix in this batch and its - * corresponding entry in `b`. If the vector has complex value_type, then - * the conjugate of this is taken. + * corresponding entry in `b`. * * @param b a BatchMultiVector matrix of same dimension as this * @param result a BatchMultiVector row vector, used to store the dot @@ -350,6 +349,24 @@ class BatchMultiVector make_temporary_clone(exec, result).get()); } + /** + * Computes the column-wise conjugate dot product of each matrix in this + * batch and its corresponding entry in `b`. If the vector has complex + * value_type, then the conjugate of this is taken. + * + * @param b a BatchMultiVector matrix of same dimension as this + * @param result a BatchMultiVector row vector, used to store the dot + * product (the number of column in the vector must match the number of + * columns of this) + */ + void compute_conj_dot(ptr_param> b, + ptr_param> result) const + { + auto exec = this->get_executor(); + this->compute_conj_dot_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, result).get()); + } + /** * Computes the Euclidean (L^2) norm of each matrix in this batch. * @@ -529,37 +546,30 @@ class BatchMultiVector /** * @copydoc scale(const BatchMultiVector *) - * - * @note Other implementations of batch_multi_vector should override this - * function instead of scale(const BatchMultiVector *alpha). 
*/ void scale_impl(const BatchMultiVector* alpha); /** * @copydoc add_scaled(const BatchMultiVector *, const BatchMultiVector *) - * - * @note Other implementations of batch_multi_vector should override this - * function instead of add_scale(const BatchMultiVector *alpha, const - * BatchMultiVector *b). */ void add_scaled_impl(const BatchMultiVector* alpha, const BatchMultiVector* b); /** * @copydoc compute_dot(const BatchMultiVector *, BatchMultiVector *) const - * - * @note Other implementations of batch_multi_vector should override this - * function instead of compute_dot(const BatchMultiVector *b, - * BatchMultiVector *result). */ void compute_dot_impl(const BatchMultiVector* b, BatchMultiVector* result) const; + /** + * @copydoc compute_conj_dot(const BatchMultiVector *, BatchMultiVector *) + * const + */ + void compute_conj_dot_impl(const BatchMultiVector* b, + BatchMultiVector* result) const; + /** * @copydoc compute_norm2(BatchMultiVector *) const - * - * @note Other implementations of batch_multi_vector should override this - * function instead of compute_norm2(BatchMultiVector *result). */ void compute_norm2_impl( BatchMultiVector>* result) const; diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index f46cbb12ead..a88443f60b9 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -123,6 +123,29 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +template +void compute_conj_dot(std::shared_ptr exec, + const BatchMultiVector* const x, + const BatchMultiVector* const y, + BatchMultiVector* const result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = host::get_batch_struct(result); +#pragma omp parallel for + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_conj_dot_product_kernel(x_b, y_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index f494a326773..967dddb108a 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -120,6 +120,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); +template +void compute_conj_dot(std::shared_ptr exec, + const BatchMultiVector* x, + const BatchMultiVector* y, + BatchMultiVector* result) +{ + const auto x_ub = host::get_batch_struct(x); + const auto y_ub = host::get_batch_struct(y); + const auto res_ub = host::get_batch_struct(result); + for (size_type batch = 0; batch < result->get_num_batch_entries(); + ++batch) { + const auto res_b = gko::batch::batch_entry(res_ub, batch); + const auto x_b = gko::batch::batch_entry(x_ub, batch); + const auto y_b = gko::batch::batch_entry(y_ub, batch); + compute_conj_dot_product_kernel(x_b, y_b, res_b); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const BatchMultiVector* x, diff --git 
a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index 599013179ce..6e3b195e175 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -85,6 +85,25 @@ inline void compute_dot_product_kernel( result.values[c] = gko::zero(); } + for (int r = 0; r < x.num_rows; r++) { + for (int c = 0; c < x.num_rhs; c++) { + result.values[c] += + x.values[r * x.stride + c] * y.values[r * y.stride + c]; + } + } +} + + +template +inline void compute_conj_dot_product_kernel( + const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_entry& result) +{ + for (int c = 0; c < result.num_rhs; c++) { + result.values[c] = gko::zero(); + } + for (int r = 0; r < x.num_rows; r++) { for (int c = 0; c < x.num_rhs; c++) { result.values[c] += diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index c7ba4a0bcf2..445cdedb73f 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -254,7 +254,7 @@ TYPED_TEST(BatchMultiVector, ComputesDot) } -TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) +TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -265,7 +265,7 @@ TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongInputSize) } -TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) +TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -280,6 +280,52 @@ TYPED_TEST(BatchMultiVector, ComputDotFailsOnWrongResultSize) } +TYPED_TEST(BatchMultiVector, ComputesConjDot) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); + + auto ures = result->unbatch(); + + this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()); + this->mtx_00->compute_conj_dot(this->mtx_10.get(), ures[0].get()); + this->mtx_01->compute_conj_dot(this->mtx_11.get(), ures[1].get()); + + auto res = result->unbatch(); + GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); +} + + +TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongInputSize) +{ + using Mtx = typename TestFixture::Mtx; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); + + ASSERT_THROW(this->mtx_1->compute_conj_dot(this->mtx_2.get(), result.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongResultSize) +{ + using Mtx = typename TestFixture::Mtx; + auto result = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); + auto result2 = + Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); + + ASSERT_THROW(this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()), + gko::DimensionMismatch); + ASSERT_THROW( + this->mtx_0->compute_conj_dot(this->mtx_1.get(), result2.get()), + gko::DimensionMismatch); +} + + TYPED_TEST(BatchMultiVector, ComputesNorm2) { using Mtx = typename TestFixture::Mtx; diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 05ea67bee1d..631b9a10c24 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -269,6 +269,36 @@ 
TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) } +TEST_F(BatchMultiVector, ComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(20); + auto dot_size = + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); + auto dot_expected = Mtx::create(this->ref, dot_size); + auto ddot = Mtx::create(this->exec, dot_size); + + x->compute_conj_dot(y.get(), dot_expected.get()); + dx->compute_conj_dot(dy.get(), ddot.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); +} + + +TEST_F(BatchMultiVector, ComputeConjDotSingleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto dot_size = + gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); + auto dot_expected = Mtx::create(this->ref, dot_size); + auto ddot = Mtx::create(this->exec, dot_size); + + x->compute_conj_dot(y.get(), dot_expected.get()); + dx->compute_conj_dot(dy.get(), ddot.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); +} + + TEST_F(BatchMultiVector, CopySingleIsEquivalentToRef) { set_up_vector_data(1); From 4aa2d8b19839fc97772e6004d478ebd1d79be928 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 24 Jul 2023 09:35:57 +0200 Subject: [PATCH 130/583] Generalize CUDA/HIP kernels and use reduce prim --- ...batch_multi_vector_kernel_launcher.hpp.inc | 10 +- .../base/batch_multi_vector_kernels.hpp.inc | 220 ++++++------------ 2 files changed, 75 insertions(+), 155 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc index b797850059b..60af1de45af 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -89,8 +89,9 @@ void compute_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - compute_dot_product_kernel<<get_stream()>>>(x_ub, y_ub, res_ub); + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -108,8 +109,9 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - compute_conj_dot_product_kernel<<get_stream()>>>(x_ub, y_ub, res_ub); + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 28ea60c7df4..18c4f48811b 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -56,10 +56,10 @@ __global__ const gko::batch_multi_vector::uniform_batch alpha, const gko::batch_multi_vector::uniform_batch x, Mapping map) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; - ibatch += gridDim.x) { - const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); - const auto x_b = gko::batch::batch_entry(x, ibatch); + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::batch_entry(alpha, batch_id); + const auto x_b = gko::batch::batch_entry(x, batch_id); scale(alpha_b, x_b, map); } } @@ -88,191 +88,109 @@ __global__ const 
gko::batch_multi_vector::uniform_batch x, const gko::batch_multi_vector::uniform_batch y, Mapping map) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; - ibatch += gridDim.x) { - const auto alpha_b = gko::batch::batch_entry(alpha, ibatch); - const auto x_b = gko::batch::batch_entry(x, ibatch); - const auto y_b = gko::batch::batch_entry(y, ibatch); + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::batch_entry(alpha, batch_id); + const auto x_b = gko::batch::batch_entry(x, batch_id); + const auto y_b = gko::batch::batch_entry(y, batch_id); add_scaled(alpha_b, x_b, y_b, map); } } -template -__device__ __forceinline__ void one_dot( +template +__device__ __forceinline__ void gen_one_dot( const gko::batch_multi_vector::batch_entry& x, const gko::batch_multi_vector::batch_entry& y, const int rhs_index, const gko::batch_multi_vector::batch_entry& result, - group::thread_block_tile& subwarp_grp) + Group subgroup, Mapping conj_map) { ValueType val = zero(); - for (int r = subwarp_grp.thread_rank(); r < x.num_rows; - r += subwarp_grp.size()) { - val += x.values[r * x.stride + rhs_index] * + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += conj_map(x.values[r * x.stride + rhs_index]) * y.values[r * y.stride + rhs_index]; } - // subwarp_grp level reduction -#pragma unroll - for (int j = config::warp_size / 2; j > 0; j /= 2) { - val += subwarp_grp.shfl_down(val, j); - } + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); - if (subwarp_grp.thread_rank() == 0) { + if (subgroup.thread_rank() == 0) { result.values[rhs_index] = val; } } -template -__device__ __forceinline__ void one_conj_dot( +template +__device__ __forceinline__ void compute_gen_dot_product( const gko::batch_multi_vector::batch_entry& x, const gko::batch_multi_vector::batch_entry& y, - const int rhs_index, const gko::batch_multi_vector::batch_entry& result, - group::thread_block_tile& subwarp_grp) -{ - ValueType val = zero(); - - for (int r = subwarp_grp.thread_rank(); r < x.num_rows; - r += subwarp_grp.size()) { - val += conj(x.values[r * x.stride + rhs_index]) * - y.values[r * y.stride + rhs_index]; - } - - // subwarp_grp level reduction -#pragma unroll - for (int j = config::warp_size / 2; j > 0; j /= 2) { - val += subwarp_grp.shfl_down(val, j); - } - - if (subwarp_grp.thread_rank() == 0) { - result.values[rhs_index] = val; - } -} - - -template -__device__ __forceinline__ void compute_dot_product( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result) -{ - constexpr auto tile_size = config::warp_size; - auto thread_block = group::this_thread_block(); - auto subwarp_grp = group::tiled_partition(thread_block); - const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); - const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); - - for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; - rhs_index += num_subwarp_grps_per_block) { - one_dot(x, y, rhs_index, result, subwarp_grp); - } -} - - -template -__device__ __forceinline__ void compute_conj_dot_product( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result) + Mapping conj_map) { constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); - auto subwarp_grp = 
group::tiled_partition(thread_block); - const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); - const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); - - for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; - rhs_index += num_subwarp_grps_per_block) { - one_conj_dot(x, y, rhs_index, result, subwarp_grp); - } -} - + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); -template -__global__ __launch_bounds__( - default_block_size, - sm_multiplier) void compute_dot_product_kernel(const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - x, - const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - y, - const gko:: - batch_multi_vector:: - uniform_batch< - ValueType> - result) -{ - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; - ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry(x, ibatch); - const auto y_b = gko::batch::batch_entry(y, ibatch); - const auto r_b = gko::batch::batch_entry(result, ibatch); - compute_dot_product(x_b, y_b, r_b); + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + gen_one_dot(x, y, rhs_index, result, subgroup, conj_map); } } -template +template __global__ __launch_bounds__( default_block_size, - sm_multiplier) void compute_conj_dot_product_kernel(const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - x, - const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - y, - const gko:: - batch_multi_vector:: - uniform_batch< - ValueType> - result) + sm_multiplier) void compute_gen_dot_product_kernel(const gko:: + batch_multi_vector:: + uniform_batch< + const ValueType> + x, + const gko:: + batch_multi_vector:: + uniform_batch< + const ValueType> + y, + const gko:: + batch_multi_vector:: + uniform_batch< + ValueType> + result, + Mapping map) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; - ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry(x, ibatch); - const auto y_b = gko::batch::batch_entry(y, ibatch); - const auto r_b = gko::batch::batch_entry(result, ibatch); - compute_conj_dot_product(x_b, y_b, r_b); + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + batch_id += gridDim.x) { + const auto x_b = gko::batch::batch_entry(x, batch_id); + const auto y_b = gko::batch::batch_entry(y, batch_id); + const auto r_b = gko::batch::batch_entry(result, batch_id); + compute_gen_dot_product(x_b, y_b, r_b, map); } } -template +template __device__ __forceinline__ void one_norm2( const gko::batch_multi_vector::batch_entry& x, const int rhs_index, const gko::batch_multi_vector::batch_entry>& result, - group::thread_block_tile& subwarp_grp) + Group subgroup) { using real_type = typename gko::remove_complex; real_type val = zero(); - for (int r = subwarp_grp.thread_rank(); r < x.num_rows; - r += subwarp_grp.size()) { + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { val += squared_norm(x.values[r * x.stride + rhs_index]); } - // subwarp_grp level reduction -#pragma unroll - for (int j = config::warp_size / 2; j > 0; j /= 2) { - val += subwarp_grp.shfl_down(val, j); - } + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); - if (subwarp_grp.thread_rank() == 0) { + if (subgroup.thread_rank() == 0) { result.values[rhs_index] = sqrt(val); } } @@ -292,13 +210,13 @@ 
__device__ __forceinline__ void compute_norm2( { constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); - auto subwarp_grp = group::tiled_partition(thread_block); - const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); - const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); - for (int rhs_index = subwarp_grp_id; rhs_index < x.num_rhs; - rhs_index += num_subwarp_grps_per_block) { - one_norm2(x, rhs_index, result, subwarp_grp); + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + one_norm2(x, rhs_index, result, subgroup); } } @@ -314,10 +232,10 @@ __global__ __launch_bounds__( remove_complex> result) { - for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries; - ibatch += gridDim.x) { - const auto x_b = gko::batch::batch_entry(x, ibatch); - const auto r_b = gko::batch::batch_entry(result, ibatch); + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + batch_id += gridDim.x) { + const auto x_b = gko::batch::batch_entry(x, batch_id); + const auto r_b = gko::batch::batch_entry(result, batch_id); compute_norm2(x_b, r_b); } } @@ -349,10 +267,10 @@ __global__ const gko::batch_multi_vector::uniform_batch src, const gko::batch_multi_vector::uniform_batch dst) { - for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries; - ibatch += gridDim.x) { - const auto dst_b = gko::batch::batch_entry(dst, ibatch); - const auto src_b = gko::batch::batch_entry(src, ibatch); + for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; + batch_id += gridDim.x) { + const auto dst_b = gko::batch::batch_entry(dst, batch_id); + const auto src_b = gko::batch::batch_entry(src, batch_id); copy(src_b, dst_b); } } From c72ffadb8164c72b19376a8f9cced1fd813bbdb3 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 24 Jul 2023 08:42:37 +0000 Subject: [PATCH 131/583] Format files Co-authored-by: Pratik Nayak --- .../base/batch_multi_vector_kernels.hpp.inc | 34 ++++++++----------- hip/base/batch_multi_vector_kernels.hip.cpp | 2 -- reference/base/batch_struct.hpp | 12 +++---- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 18c4f48811b..efbbd323ef6 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -52,9 +52,9 @@ __device__ __forceinline__ void scale( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, Mapping map) +__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -83,10 +83,10 @@ __device__ __forceinline__ void add_scaled( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch y, Mapping map) 
+__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch y, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -222,15 +222,11 @@ __device__ __forceinline__ void compute_norm2( template -__global__ __launch_bounds__( - default_block_size, - sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: - uniform_batch - x, - const gko::batch_multi_vector:: - uniform_batch< - remove_complex> - result) +__global__ +__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch> + result) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -263,9 +259,9 @@ __device__ __forceinline__ void copy( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::uniform_batch src, - const gko::batch_multi_vector::uniform_batch dst) +__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::uniform_batch src, + const gko::batch_multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; batch_id += gridDim.x) { diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index c1e7469ef9e..096c5e8a5d3 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -34,8 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include - - #include #include diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index cec3a4ed813..f3512968d9e 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -30,17 +30,17 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
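A minimal sketch of the mapping-functor idea behind compute_gen_dot_product_kernel above, shown as plain sequential C++ rather than a CUDA/HIP kernel; the helper name and setup are hypothetical and only illustrate how one template serves both the dot and the conjugate dot product:

#include <complex>
#include <vector>

// Accumulates the sum over i of conj_map(x[i]) * y[i]; the functor decides
// whether the left operand is conjugated.
template <typename ValueType, typename Mapping>
ValueType generic_dot(const std::vector<ValueType>& x,
                      const std::vector<ValueType>& y, Mapping conj_map)
{
    ValueType result{};
    for (std::size_t i = 0; i < x.size(); ++i) {
        result += conj_map(x[i]) * y[i];
    }
    return result;
}

// usage with ValueType = std::complex<double>:
//   auto dot      = generic_dot(x, y, [](auto v) { return v; });
//   auto conj_dot = generic_dot(x, y, [](auto v) { return std::conj(v); });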
*************************************************************/ -#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ - - -#include "core/base/batch_struct.hpp" +#ifndef GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ +#define GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ #include #include +#include "core/base/batch_struct.hpp" + + namespace gko { namespace kernels { /** @@ -111,4 +111,4 @@ maybe_null_batch_struct(const BatchMultiVector* const op) } // namespace gko -#endif // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_ From b5feec142d8234f5191d36c996b1bfee1adabaed Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 24 Jul 2023 10:18:29 +0200 Subject: [PATCH 132/583] Add a fill method and test --- core/base/batch_multi_vector.cpp | 12 +++--------- core/test/base/batch_multi_vector.cpp | 19 +++++++++++++++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 11 +++++++++++ 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 3578d0678de..7f6473fc5fc 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -187,19 +187,13 @@ void read_impl(MatrixType* mtx, const std::vector& data) } auto tmp = MatrixType::create(mtx->get_executor()->get_master(), batch_size); + tmp->fill(zero()); for (size_type b = 0; b < data.size(); ++b) { size_type ind = 0; for (size_type row = 0; row < data[b].size[0]; ++row) { for (size_type col = 0; col < data[b].size[1]; ++col) { - if (ind < data[b].nonzeros.size() && - data[b].nonzeros[ind].row == row && - data[b].nonzeros[ind].column == col) { - tmp->at(b, row, col) = data[b].nonzeros[ind].value; - ++ind; - } else { - tmp->at(b, row, col) = - zero(); - } + tmp->at(b, row, col) = data[b].nonzeros[ind].value; + ++ind; } } } diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 410ea70b4dd..5fbc4d5aa32 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -325,6 +325,25 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) } +TYPED_TEST(BatchMultiVector, CanBeFilledWithValue) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::BatchMultiVector::create( + this->exec, gko::batch_dim<2>(2, gko::dim<2>(3, 1))); + + m->fill(value_type(2.0)); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 1)); + EXPECT_EQ(m->at(0, 0, 0), value_type{2.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 2), value_type{2.0}); +} + + TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) { using value_type = typename TestFixture::value_type; diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 34ca15db8c0..e8e3d72ef09 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -402,6 +402,17 @@ class BatchMultiVector exec, sizes, gko::detail::array_const_cast(std::move(values))}); } + /** + * Fills the input BatchMultiVector with a given value + * + * @param value the value to be filled + */ + void fill(ValueType value) + { + GKO_ASSERT(this->values_.get_num_elems() > 0); + this->values_.fill(value); + } + private: inline batch_dim<2> compute_batch_size( const std::vector*>& 
matrices) From 9f0282050beec93013db612ae6f1cd7cf924e8dd Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 24 Jul 2023 14:14:45 +0200 Subject: [PATCH 133/583] Update dpcpp kernels and fix for 2022-1 Cannot use sycl::reduce_over_group for older DPCPP versions. --- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 84 +++++++++---- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 111 +++++++----------- 2 files changed, 104 insertions(+), 91 deletions(-) diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 97f7469a6f6..1cd7061c161 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -48,6 +48,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" namespace gko { @@ -81,16 +85,31 @@ void scale(std::shared_ptr exec, const dim3 grid(num_batches); // Launch a kernel that has nbatches blocks, each block has max group size - (exec->get_queue())->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); - scale_kernel(alpha_b, x_b, item_ct1); - }); - }); + if (alpha->get_common_size()[1] == 1) { + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + scale_kernel(alpha_b, x_b, item_ct1, + [](int col) { return 0; }); + }); + }); + } else { + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + scale_kernel(alpha_b, x_b, item_ct1, + [](int col) { return col; }); + }); + }); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -116,17 +135,33 @@ void add_scaled(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); - (exec->get_queue())->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); - add_scaled_kernel(alpha_b, x_b, y_b, item_ct1); - }); - }); + if (alpha->get_common_size()[1] == 1) { + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, 
group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, + [](auto col) { return 0; }); + }); + }); + } else { + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = batch::batch_entry(alpha_ub, group_id); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, + [](auto col) { return col; }); + }); + }); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -159,7 +194,8 @@ void compute_dot(std::shared_ptr exec, const auto x_b = batch::batch_entry(x_ub, group_id); const auto y_b = batch::batch_entry(y_ub, group_id); const auto res_b = batch::batch_entry(res_ub, group_id); - compute_dot_product_kernel(x_b, y_b, res_b, item_ct1); + compute_gen_dot_product_kernel(x_b, y_b, res_b, item_ct1, + [](auto val) { return val; }); }); }); } @@ -194,7 +230,9 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_b = batch::batch_entry(x_ub, group_id); const auto y_b = batch::batch_entry(y_ub, group_id); const auto res_b = batch::batch_entry(res_ub, group_id); - compute_conj_dot_product_kernel(x_b, y_b, res_b, item_ct1); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); }); }); } diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index cb2ccd4ae50..6e22c5c078f 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -30,11 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
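A minimal sketch of the column-mapping lambdas used in the scale and add_scaled launchers above, written as plain host-side C++ with hypothetical names; the map selects either the single scaling factor or one factor per column, so the inner loop itself stays branch-free:

#include <vector>

template <typename ValueType, typename Mapping>
void scale_entry(std::vector<ValueType>& values, int num_rows, int num_cols,
                 const std::vector<ValueType>& alpha, Mapping map)
{
    for (int row = 0; row < num_rows; ++row) {
        for (int col = 0; col < num_cols; ++col) {
            values[row * num_cols + col] *= alpha[map(col)];
        }
    }
}

// single alpha:         scale_entry(v, m, n, alpha, [](int) { return 0; });
// one alpha per column: scale_entry(v, m, n, alpha, [](int col) { return col; });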
*************************************************************/ -template +template __dpct_inline__ void scale_kernel( const gko::batch_multi_vector::batch_entry& alpha, const gko::batch_multi_vector::batch_entry& x, - sycl::nd_item<3>& item_ct1) + sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = item_ct1.get_local_linear_id(); li < max_li; @@ -42,23 +42,18 @@ __dpct_inline__ void scale_kernel( const int row = li / x.num_rhs; const int col = li % x.num_rhs; - if (alpha.num_rhs == 1) { - x.values[row * x.stride + col] = - alpha.values[0] * x.values[row * x.stride + col]; - } else { - x.values[row * x.stride + col] = - alpha.values[col] * x.values[row * x.stride + col]; - } + x.values[row * x.stride + col] = + alpha.values[map(col)] * x.values[row * x.stride + col]; } } -template +template __dpct_inline__ void add_scaled_kernel( const gko::batch_multi_vector::batch_entry& alpha, const gko::batch_multi_vector::batch_entry& x, const gko::batch_multi_vector::batch_entry& y, - sycl::nd_item<3>& item_ct1) + sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = item_ct1.get_local_id(2); li < max_li; @@ -66,69 +61,41 @@ __dpct_inline__ void add_scaled_kernel( const int row = li / x.num_rhs; const int col = li % x.num_rhs; - if (alpha.num_rhs == 1) { - y.values[row * y.stride + col] += - alpha.values[0] * x.values[row * x.stride + col]; - } else { - y.values[row * y.stride + col] += - alpha.values[col] * x.values[row * x.stride + col]; - } + y.values[row * y.stride + col] += + alpha.values[map(col)] * x.values[row * x.stride + col]; } } -template -__dpct_inline__ void compute_dot_product_kernel( +template +__dpct_inline__ void compute_gen_dot_product_kernel( const gko::batch_multi_vector::batch_entry& x, const gko::batch_multi_vector::batch_entry& y, const gko::batch_multi_vector::batch_entry& result, - sycl::nd_item<3>& item_ct1) + sycl::nd_item<3>& item_ct1, Mapping conj_map) { - const auto sg = item_ct1.get_sub_group(); - const int sg_id = sg.get_group_id(); - const int sg_size = sg.get_local_range().size(); - const int num_sg = sg.get_group_range().size(); - - for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { + constexpr auto tile_size = config::warp_size; + const auto subgroup = item_ct1.get_sub_group(); + const int subgroup_id = subgroup.get_group_id(); + const int subgroup_size = subgroup.get_local_range().size(); + const int num_subgroups = subgroup.get_group_range().size(); + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups) { ValueType val = zero(); - for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) { - val += x.values[r * x.stride + rhs_index] * + for (int r = subgroup.get_local_id(); r < x.num_rows; + r += subgroup_size) { + val += conj_map(x.values[r * x.stride + rhs_index]) * y.values[r * y.stride + rhs_index]; } - val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + val = ::gko::kernels::dpcpp::reduce( + subg, val, [](ValueType a, ValueType b) { return a + b; }); - if (sg.get_local_id() == 0) { - result.values[rhs_index] = val; - } - } -} - - -template -__dpct_inline__ void compute_conj_dot_product_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result, - sycl::nd_item<3>& item_ct1) -{ - const auto sg = item_ct1.get_sub_group(); - 
const int sg_id = sg.get_group_id(); - const int sg_size = sg.get_local_range().size(); - const int num_sg = sg.get_group_range().size(); - - for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { - ValueType val = zero(); - - for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) { - val += conj(x.values[r * x.stride + rhs_index]) * - y.values[r * y.stride + rhs_index]; - } - - val = sycl::reduce_over_group(sg, val, sycl::plus<>()); - - if (sg.get_local_id() == 0) { + if (subgroup.get_local_id() == 0) { result.values[rhs_index] = val; } } @@ -142,21 +109,29 @@ __dpct_inline__ void compute_norm2_kernel( result, sycl::nd_item<3>& item_ct1) { - const auto sg = item_ct1.get_sub_group(); - const int sg_id = sg.get_group_id(); - const int sg_size = sg.get_local_range().size(); - const int num_sg = sg.get_group_range().size(); + constexpr auto tile_size = config::warp_size; + const auto subgroup = item_ct1.get_sub_group(); + const int subgroup_id = subgroup.get_group_id(); + const int subgroup_size = subgroup.get_local_range().size(); + const int num_subgroups = subgroup.get_group_range().size(); + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); using real_type = typename gko::remove_complex; - for (int rhs_index = sg_id; rhs_index < x.num_rhs; rhs_index += num_sg) { + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups) { real_type val = zero(); - for (int r = sg.get_local_id(); r < x.num_rows; r += sg_size) + for (int r = subgroup.get_local_id(); r < x.num_rows; + r += subgroup_size) val += squared_norm(x.values[r * x.stride + rhs_index]); - val = sycl::reduce_over_group(sg, val, sycl::plus<>()); + val = ::gko::kernels::dpcpp::reduce( + subg, val, [](real_type a, real_type b) { return a + b; }); - if (sg.get_local_id() == 0) result.values[rhs_index] = sqrt(val); + if (subgroup.get_local_id() == 0) { + result.values[rhs_index] = sqrt(val); + } } } From 02d85ae66bc3ba651971329a825dc7a79ef39379 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 25 Jul 2023 11:39:02 +0200 Subject: [PATCH 134/583] Fix dpcpp CPU subgroup_size issue Co-authored-by: Yu-Hsiang Mike Tsai --- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 13 ++++++++++--- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 12 ++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 1cd7061c161..2c48970d13d 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -186,9 +186,12 @@ void compute_dot(std::shared_ptr exec, const dim3 block(group_size); const dim3 grid(num_batches); + // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::batch_entry(x_ub, group_id); @@ -224,7 +227,9 @@ void compute_conj_dot(std::shared_ptr exec, (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = 
group.get_group_linear_id(); const auto x_b = batch::batch_entry(x_ub, group_id); @@ -259,7 +264,9 @@ void compute_norm2(std::shared_ptr exec, (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::batch_entry(x_ub, group_id); diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 6e22c5c078f..7dfe13d0fda 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -75,12 +75,12 @@ __dpct_inline__ void compute_gen_dot_product_kernel( sycl::nd_item<3>& item_ct1, Mapping conj_map) { constexpr auto tile_size = config::warp_size; - const auto subgroup = item_ct1.get_sub_group(); + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto subgroup = static_cast(subg); const int subgroup_id = subgroup.get_group_id(); const int subgroup_size = subgroup.get_local_range().size(); const int num_subgroups = subgroup.get_group_range().size(); - auto subg = - group::tiled_partition(group::this_thread_block(item_ct1)); for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; rhs_index += num_subgroups) { @@ -110,12 +110,12 @@ __dpct_inline__ void compute_norm2_kernel( sycl::nd_item<3>& item_ct1) { constexpr auto tile_size = config::warp_size; - const auto subgroup = item_ct1.get_sub_group(); + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto subgroup = static_cast(subg); const int subgroup_id = subgroup.get_group_id(); const int subgroup_size = subgroup.get_local_range().size(); const int num_subgroups = subgroup.get_group_range().size(); - auto subg = - group::tiled_partition(group::this_thread_block(item_ct1)); using real_type = typename gko::remove_complex; for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; From 4c2fafd78cf2bcae25d867f9e03d89876813a62a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 25 Jul 2023 14:45:28 +0200 Subject: [PATCH 135/583] Move impls to source from header --- core/base/batch_multi_vector.cpp | 123 +++++++++++++--- .../ginkgo/core/base/batch_multi_vector.hpp | 135 +++--------------- 2 files changed, 118 insertions(+), 140 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 7f6473fc5fc..b73a92467f6 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -67,8 +67,77 @@ GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); template -void BatchMultiVector::scale_impl( - const BatchMultiVector* alpha) +std::unique_ptr> +BatchMultiVector::create_with_config_of( + ptr_param other) +{ + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. 
+ return (*other).create_with_same_config(); +} + + +template +std::vector>> +BatchMultiVector::unbatch() const +{ + using unbatch_type = matrix::Dense; + auto exec = this->get_executor(); + auto unbatch_mats = std::vector>{}; + for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { + auto mat = unbatch_type::create(exec, this->get_common_size()); + exec->copy_from(exec.get(), mat->get_num_stored_elements(), + this->get_const_values() + + this->get_size().get_cumulative_offset(b), + mat->get_values()); + unbatch_mats.emplace_back(std::move(mat)); + } + return unbatch_mats; +} + + +template +std::unique_ptr> +BatchMultiVector::create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + gko::detail::const_array_view&& values) +{ + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr(new BatchMultiVector{ + exec, sizes, gko::detail::array_const_cast(std::move(values))}); +} + + +template +void BatchMultiVector::fill(ValueType value) +{ + GKO_ASSERT(this->values_.get_num_elems() > 0); + this->values_.fill(value); +} + + +template +void BatchMultiVector::set_size(const batch_dim<2>& value) noexcept +{ + batch_size_ = value; +} + + +template +std::unique_ptr> +BatchMultiVector::create_with_same_config() const +{ + return BatchMultiVector::create(this->get_executor(), + this->get_size()); +} + + +template +void BatchMultiVector::scale( + ptr_param> alpha) { GKO_ASSERT_EQ(alpha->get_num_batch_entries(), this->get_num_batch_entries()); @@ -78,14 +147,16 @@ void BatchMultiVector::scale_impl( GKO_ASSERT_EQUAL_COLS(this->get_common_size(), alpha->get_common_size()); } - this->get_executor()->run(batch_multi_vector::make_scale(alpha, this)); + auto exec = this->get_executor(); + exec->run(batch_multi_vector::make_scale( + make_temporary_clone(exec, alpha).get(), this)); } template -void BatchMultiVector::add_scaled_impl( - const BatchMultiVector* alpha, - const BatchMultiVector* b) +void BatchMultiVector::add_scaled( + ptr_param> alpha, + ptr_param> b) { GKO_ASSERT_EQ(alpha->get_num_batch_entries(), this->get_num_batch_entries()); @@ -98,8 +169,10 @@ void BatchMultiVector::add_scaled_impl( GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); - this->get_executor()->run( - batch_multi_vector::make_add_scaled(alpha, b, this)); + auto exec = this->get_executor(); + exec->run(batch_multi_vector::make_add_scaled( + make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), this)); } @@ -111,9 +184,9 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template -void BatchMultiVector::compute_conj_dot_impl( - const BatchMultiVector* b, - BatchMultiVector* result) const +void BatchMultiVector::compute_conj_dot( + ptr_param> b, + ptr_param> result) const { GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); @@ -122,15 +195,17 @@ void BatchMultiVector::compute_conj_dot_impl( GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); - this->get_executor()->run( - batch_multi_vector::make_compute_conj_dot(this, b, result)); + auto exec = this->get_executor(); + exec->run(batch_multi_vector::make_compute_conj_dot( + this, make_temporary_clone(exec, b).get(), + make_temporary_output_clone(exec, result).get())); } template -void 
BatchMultiVector::compute_dot_impl( - const BatchMultiVector* b, - BatchMultiVector* result) const +void BatchMultiVector::compute_dot( + ptr_param> b, + ptr_param> result) const { GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); @@ -139,22 +214,26 @@ void BatchMultiVector::compute_dot_impl( GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); - this->get_executor()->run( - batch_multi_vector::make_compute_dot(this, b, result)); + auto exec = this->get_executor(); + exec->run(batch_multi_vector::make_compute_dot( + this, make_temporary_clone(exec, b).get(), + make_temporary_output_clone(exec, result).get())); } template -void BatchMultiVector::compute_norm2_impl( - BatchMultiVector>* result) const +void BatchMultiVector::compute_norm2( + ptr_param>> result) const { GKO_ASSERT_EQ(this->get_num_batch_entries(), result->get_num_batch_entries()); GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); - this->get_executor()->run(batch_multi_vector::make_compute_norm2( - as>(this), result)); + + auto exec = this->get_executor(); + exec->run(batch_multi_vector::make_compute_norm2( + this, make_temporary_output_clone(exec, result).get())); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index e8e3d72ef09..4ee3def0af7 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -104,14 +104,7 @@ class BatchMultiVector * @param other The other matrix whose configuration needs to copied. */ static std::unique_ptr create_with_config_of( - ptr_param other) - { - // De-referencing `other` before calling the functions (instead of - // using operator `->`) is currently required to be compatible with - // CUDA 10.1. - // Otherwise, it results in a compile error. - return (*other).create_with_same_config(); - } + ptr_param other); friend class BatchMultiVector>; @@ -133,20 +126,7 @@ class BatchMultiVector * * @return a std::vector containing the Dense matrices. */ - std::vector> unbatch() const - { - auto exec = this->get_executor(); - auto unbatch_mats = std::vector>{}; - for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { - auto mat = unbatch_type::create(exec, this->get_common_size()); - exec->copy_from(exec.get(), mat->get_num_stored_elements(), - this->get_const_values() + - this->get_size().get_cumulative_offset(b), - mat->get_values()); - unbatch_mats.emplace_back(std::move(mat)); - } - return unbatch_mats; - } + std::vector> unbatch() const; /** * Returns the batch size. @@ -292,11 +272,7 @@ class BatchMultiVector * of alpha (the number of columns of alpha has to match the number of * columns of the matrix). */ - void scale(ptr_param> alpha) - { - auto exec = this->get_executor(); - this->scale_impl(make_temporary_clone(exec, alpha).get()); - } + void scale(ptr_param> alpha); /** * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy). @@ -309,28 +285,7 @@ class BatchMultiVector * @param b a matrix of the same dimension as this */ void add_scaled(ptr_param> alpha, - ptr_param> b) - { - auto exec = this->get_executor(); - this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get()); - } - - /** - * Adds `a` scaled by `alpha` to the vector scaled by `beta`: - * this <- alpha * a + beta * this. 
- * - * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix - * a is scaled by alpha. If it is a BatchMultiVector row vector of values, - * then i-th column of a is scaled with the i-th element of alpha (the - * number of columns of alpha has to match the number of columns of a). - * @param a a matrix of the same dimension as this. - * @param beta Scalar(s), of the same size as alpha, to multiply this - * matrix. - */ - void add_scale(ptr_param> alpha, - ptr_param> a, - ptr_param> beta); + ptr_param> b); /** * Computes the column-wise dot product of each matrix in this batch and its @@ -342,12 +297,7 @@ class BatchMultiVector * columns of this) */ void compute_dot(ptr_param> b, - ptr_param> result) const - { - auto exec = this->get_executor(); - this->compute_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, result).get()); - } + ptr_param> result) const; /** * Computes the column-wise conjugate dot product of each matrix in this @@ -360,12 +310,7 @@ class BatchMultiVector * columns of this) */ void compute_conj_dot(ptr_param> b, - ptr_param> result) const - { - auto exec = this->get_executor(); - this->compute_conj_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, result).get()); - } + ptr_param> result) const; /** * Computes the Euclidean (L^2) norm of each matrix in this batch. @@ -375,11 +320,7 @@ class BatchMultiVector * of columns of this) */ void compute_norm2( - ptr_param>> result) const - { - auto exec = this->get_executor(); - this->compute_norm2_impl(make_temporary_clone(exec, result).get()); - } + ptr_param>> result) const; /** * Creates a constant (immutable) batch dense matrix from a constant array. @@ -394,24 +335,14 @@ class BatchMultiVector */ static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - gko::detail::const_array_view&& values) - { - // cast const-ness away, but return a const object afterwards, - // so we can ensure that no modifications take place. - return std::unique_ptr(new BatchMultiVector{ - exec, sizes, gko::detail::array_const_cast(std::move(values))}); - } + gko::detail::const_array_view&& values); /** * Fills the input BatchMultiVector with a given value * * @param value the value to be filled */ - void fill(ValueType value) - { - GKO_ASSERT(this->values_.get_num_elems() > 0); - this->values_.fill(value); - } + void fill(ValueType value); private: inline batch_dim<2> compute_batch_size( @@ -429,13 +360,14 @@ class BatchMultiVector return size.get_cumulative_offset(size.get_num_batch_entries()); } + protected: /** * Sets the size of the BatchMultiVector. * * @param value the new size of the operator */ - void set_size(const batch_dim<2>& value) noexcept { batch_size_ = value; } + void set_size(const batch_dim<2>& value) noexcept; /** * Creates an uninitialized BatchMultiVector matrix of the specified size. 
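A minimal usage sketch of the interface declared above, assuming a reference executor and double values; sizes and the resulting numbers are illustrative only and this snippet is not part of the patch:

auto exec = gko::ReferenceExecutor::create();
// two batch entries, each a 3 x 1 multi-vector
auto x = gko::BatchMultiVector<double>::create(
    exec, gko::batch_dim<2>(2, gko::dim<2>{3, 1}));
auto y = gko::BatchMultiVector<double>::create_with_config_of(x);
x->fill(1.0);
y->fill(2.0);
// one dot-product value per batch entry and right-hand side
auto res = gko::BatchMultiVector<double>::create(
    exec, gko::batch_dim<2>(2, gko::dim<2>{1, 1}));
x->compute_conj_dot(y, res);  // every entry of res becomes 6.0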
@@ -445,7 +377,7 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size = batch_dim<2>{}) - : EnablePolymorphicObject(exec), + : EnablePolymorphicObject>(exec), batch_size_(size), values_(exec, compute_num_elems(size)) {} @@ -467,7 +399,7 @@ class BatchMultiVector template BatchMultiVector(std::shared_ptr exec, const batch_dim<2>& size, ValuesArray&& values) - : EnablePolymorphicObject(exec), + : EnablePolymorphicObject>(exec), batch_size_(size), values_{exec, std::forward(values)} { @@ -484,7 +416,7 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, const std::vector*>& matrices) - : EnablePolymorphicObject(exec), + : EnablePolymorphicObject>(exec), batch_size_{compute_batch_size(matrices)}, values_(exec, compute_num_elems(batch_size_)) { @@ -507,7 +439,7 @@ class BatchMultiVector BatchMultiVector(std::shared_ptr exec, size_type num_duplications, const BatchMultiVector* input) - : BatchMultiVector( + : BatchMultiVector( exec, gko::batch_dim<2>( input->get_num_batch_entries() * num_duplications, input->get_common_size())) @@ -531,7 +463,7 @@ class BatchMultiVector BatchMultiVector(std::shared_ptr exec, size_type num_duplications, const matrix::Dense* input) - : BatchMultiVector( + : BatchMultiVector( exec, gko::batch_dim<2>(num_duplications, input->get_size())) { size_type offset = 0; @@ -550,40 +482,7 @@ class BatchMultiVector * @returns a BatchMultiVector matrix with the same configuration as the * caller. */ - std::unique_ptr create_with_same_config() const - { - return BatchMultiVector::create(this->get_executor(), this->get_size()); - } - - /** - * @copydoc scale(const BatchMultiVector *) - */ - void scale_impl(const BatchMultiVector* alpha); - - /** - * @copydoc add_scaled(const BatchMultiVector *, const BatchMultiVector *) - */ - void add_scaled_impl(const BatchMultiVector* alpha, - const BatchMultiVector* b); - - /** - * @copydoc compute_dot(const BatchMultiVector *, BatchMultiVector *) const - */ - void compute_dot_impl(const BatchMultiVector* b, - BatchMultiVector* result) const; - - /** - * @copydoc compute_conj_dot(const BatchMultiVector *, BatchMultiVector *) - * const - */ - void compute_conj_dot_impl(const BatchMultiVector* b, - BatchMultiVector* result) const; - - /** - * @copydoc compute_norm2(BatchMultiVector *) const - */ - void compute_norm2_impl( - BatchMultiVector>* result) const; + std::unique_ptr create_with_same_config() const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept From 5556cc0a6e7d7af4b1e80161f0fd253e8aa1e372 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 26 Jul 2023 13:58:20 +0200 Subject: [PATCH 136/583] Update docs and zero-size issues --- .../ginkgo/core/base/batch_multi_vector.hpp | 223 +++++++++++------- 1 file changed, 132 insertions(+), 91 deletions(-) diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 4ee3def0af7..2096f30b85b 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -54,15 +54,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { /** - * BatchMultiVector is a batch matrix format which explicitly stores all values - * of the vector in each of the batches. + * BatchMultiVector stores multiple vectors in a batched fashion and is useful + * for batched operations. 
For example, if you want to store two batch entries + * with multi-vectors of size (3 x 2) given below: * - * The values in each of the batches are stored in row-major format (values - * belonging to the same row appear consecutive in the memory). + * [1 2 ; 3 4 + * 1 2 ; 3 4 + * 1 2 ; 3 4] * - * @tparam ValueType precision of matrix elements + * In memory, they would be stored as a single array: + * [1 2 1 2 1 2 3 4 3 4 3 4]. + * + * Access functions @at can help access individual + * entries if necessary. + * + * The values of the batches are stored consecutively and in each batch, the + * vectors are stored in a row-major fashion. + * + * @tparam ValueType precision of multi-vector elements * * @ingroup batch_multi_vector + * @ingroup batched */ template class BatchMultiVector @@ -98,10 +110,10 @@ class BatchMultiVector using row_major_range = gko::range>; /** - * Creates a BatchMultiVector matrix with the configuration of another - * BatchMultiVector matrix. + * Creates a BatchMultiVector with the configuration of another + * BatchMultiVector. * - * @param other The other matrix whose configuration needs to copied. + * @param other The other multi-vector whose configuration needs to copied. */ static std::unique_ptr create_with_config_of( ptr_param other); @@ -122,9 +134,15 @@ class BatchMultiVector void write(std::vector& data) const override; /** - * Unbatches the batched dense and creates a std::vector of Dense matrices + * Unbatches the batched multi-vector and creates a std::vector of Dense + * matrices * - * @return a std::vector containing the Dense matrices. + * @note This is an expensive operation as new memory needs to be allocated + * and the data from the batched multi-vector needs to copied to the + * individual matrices. This is mainly intended as a utility function + * for debugging and testing purposes. + * + * @return a std::vector containing the matrix::Dense objects. */ std::vector> unbatch() const; @@ -153,24 +171,13 @@ class BatchMultiVector dim<2> get_common_size() const { return batch_size_.get_common_size(); } /** - * Returns a pointer to the array of values of the vector. + * Returns a pointer to the array of values of the beginning of the batched + * multi-vector. * * @return the pointer to the array of values */ value_type* get_values() noexcept { return values_.get_data(); } - /** - * Returns a pointer to the array of values of the vector. - * - * @return the pointer to the array of values - */ - value_type* get_values(size_type batch) noexcept - { - GKO_ASSERT(batch < this->get_num_batch_entries()); - return values_.get_data() + - this->get_size().get_cumulative_offset(batch); - } - /** * @copydoc get_values() * @@ -183,6 +190,21 @@ class BatchMultiVector return values_.get_const_data(); } + /** + * Returns a pointer to the array of values of the multi-vector for a + * specific batch entry. + * + * @param batch_id the id of the batch entry. + * + * @return the pointer to the array of values + */ + value_type* get_values(size_type batch_id) noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_entries()); + return values_.get_data() + + this->get_size().get_cumulative_offset(batch_id); + } + /** * @copydoc get_values(size_type) * @@ -190,11 +212,11 @@ class BatchMultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. 
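// --- Editor's illustrative sketch (not part of the patch) ---
// A minimal sketch of the storage layout documented above: two (3 x 2) batch
// entries stored contiguously and row-major, read back through at() and the
// per-entry value pointers. Executor and values are placeholders.
#include <ginkgo/ginkgo.hpp>

void layout_sketch()
{
    using vec = gko::BatchMultiVector<double>;
    auto exec = gko::ReferenceExecutor::create();
    auto x = gko::batch_initialize<vec>(
        {{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
         {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}},
        exec);
    // x->get_num_batch_entries() == 2, x->get_common_size() == gko::dim<2>{3, 2}
    auto v = x->at(1, 0, 1);  // 4.0 (batch entry 1, row 0, column 1)
    // first value of batch entry 1, i.e. offset 6 in the flat value array
    const auto* entry1 = x->get_const_values(1);
}
// --- end of editor's sketch ---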
*/ - const value_type* get_const_values(size_type batch) const noexcept + const value_type* get_const_values(size_type batch_id) const noexcept { - GKO_ASSERT(batch < this->get_num_batch_entries()); + GKO_ASSERT(batch_id < this->get_num_batch_entries()); return values_.get_const_data() + - this->get_size().get_cumulative_offset(batch); + this->get_size().get_cumulative_offset(batch_id); } /** @@ -217,8 +239,8 @@ class BatchMultiVector * @param col the column of the requested element * * @note the method has to be called on the same Executor the vector is - * stored at (e.g. trying to call this method on a GPU matrix from - * the OMP results in a runtime error) + * stored at (e.g. trying to call this method on a GPU multi-vector + * from the OMP results in a runtime error) */ value_type& at(size_type batch, size_type row, size_type col) { @@ -244,11 +266,10 @@ class BatchMultiVector * * @param batch the batch index to be queried * @param idx a linear index of the requested element - * (ignoring the stride) * * @note the method has to be called on the same Executor the vector is - * stored at (e.g. trying to call this method on a GPU matrix from - * the OMP results in a runtime error) + * stored at (e.g. trying to call this method on a GPU multi-vector + * from the OMP results in a runtime error) */ ValueType& at(size_type batch, size_type idx) noexcept { @@ -266,56 +287,59 @@ class BatchMultiVector /** * Scales the vector with a scalar (aka: BLAS scal). * - * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix - * (all batches) is scaled by alpha. If it is a BatchMultiVector row vector - * of values, then i-th column of the vector is scaled with the i-th element - * of alpha (the number of columns of alpha has to match the number of - * columns of the matrix). + * @param alpha the scalar + * + * @note If alpha is 1x1 BatchMultiVector matrix, the entire multi-vector + * (all batches) is scaled by alpha. If it is a BatchMultiVector row + * vector of values, then i-th column of the vector is scaled with the + * i-th element of alpha (the number of columns of alpha has to match + * the number of columns of the multi-vector). */ void scale(ptr_param> alpha); /** * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy). * - * @param alpha If alpha is 1x1 BatchMultiVector matrix, the entire matrix - * is scaled by alpha. If it is a BatchMultiVector row vector of values, - * then i-th column of the vector is scaled with the i-th element of alpha - * (the number of columns of alpha has to match the number of columns of the - * vector). - * @param b a matrix of the same dimension as this + * @param alpha the scalar + * @param b a multi-vector of the same dimension as this + * + * @note If alpha is 1x1 BatchMultiVector matrix, the entire multi-vector + * (all batches) is scaled by alpha. If it is a BatchMultiVector row + * vector of values, then i-th column of the vector is scaled with the + * i-th element of alpha (the number of columns of alpha has to match + * the number of columns of the multi-vector). */ void add_scaled(ptr_param> alpha, ptr_param> b); /** - * Computes the column-wise dot product of each matrix in this batch and its - * corresponding entry in `b`. + * Computes the column-wise dot product of each multi-vector in this batch + * and its corresponding entry in `b`. 
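// --- Editor's illustrative sketch (not part of the patch) ---
// The two accepted shapes for alpha described above: a 1x1 scalar per batch
// entry scales the whole entry, while a 1 x num_cols row vector scales
// column-wise. Values and executor choice are placeholders.
#include <ginkgo/ginkgo.hpp>

void scaling_sketch()
{
    using vec = gko::BatchMultiVector<double>;
    auto exec = gko::ReferenceExecutor::create();
    auto x = gko::batch_initialize<vec>(
        {{{1.0, 2.0, 3.0}}, {{4.0, 5.0, 6.0}}}, exec);  // 2 entries, 1 x 3
    // one scalar per batch entry
    auto alpha_scalar = gko::batch_initialize<vec>({{2.0}, {-2.0}}, exec);
    x->scale(alpha_scalar.get());
    // one value per column of each batch entry
    auto alpha_cols = gko::batch_initialize<vec>(
        {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, exec);
    x->scale(alpha_cols.get());
    // axpy: x <- x + alpha * b
    auto b = gko::batch_initialize<vec>(
        {{{1.0, 1.0, 1.0}}, {{1.0, 1.0, 1.0}}}, exec);
    x->add_scaled(alpha_scalar.get(), b.get());
}
// --- end of editor's sketch ---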
* - * @param b a BatchMultiVector matrix of same dimension as this + * @param b a BatchMultiVector of same dimension as this * @param result a BatchMultiVector row vector, used to store the dot - * product (the number of column in the vector must match the number of - * columns of this) + * product */ void compute_dot(ptr_param> b, ptr_param> result) const; /** - * Computes the column-wise conjugate dot product of each matrix in this - * batch and its corresponding entry in `b`. If the vector has complex + * Computes the column-wise conjugate dot product of each multi-vector in + * this batch and its corresponding entry in `b`. If the vector has complex * value_type, then the conjugate of this is taken. * - * @param b a BatchMultiVector matrix of same dimension as this + * @param b a BatchMultiVector of same dimension as this * @param result a BatchMultiVector row vector, used to store the dot - * product (the number of column in the vector must match the number of - * columns of this) + * product (the number of column in the vector must match the + * number of columns of this) */ void compute_conj_dot(ptr_param> b, ptr_param> result) const; /** - * Computes the Euclidean (L^2) norm of each matrix in this batch. + * Computes the Euclidean (L^2) norm of each multi-vector in this batch. * - * @param result a BatchMultiVector row vector, used to store the norm + * @param result a BatchMultiVector, used to store the norm * (the number of columns in the vector must match the number * of columns of this) */ @@ -323,15 +347,17 @@ class BatchMultiVector ptr_param>> result) const; /** - * Creates a constant (immutable) batch dense matrix from a constant array. + * Creates a constant (immutable) batch multi-vector from a constant + * array. * * @param exec the executor to create the vector on * @param size the dimensions of the vector * @param values the value array of the vector * @param stride the row-stride of the vector - * @returns A smart pointer to the constant matrix wrapping the input array - * (if it resides on the same executor as the vector) or a copy of - * the array on the correct executor. + * + * @return A smart pointer to the constant multi-vector wrapping the input + * array (if it resides on the same executor as the vector) or a copy of the + * array on the correct executor. */ static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, @@ -370,7 +396,8 @@ class BatchMultiVector void set_size(const batch_dim<2>& value) noexcept; /** - * Creates an uninitialized BatchMultiVector matrix of the specified size. + * Creates an uninitialized multi-vector of the specified + * size. * * @param exec Executor associated to the vector * @param size size of the vector @@ -383,14 +410,14 @@ class BatchMultiVector {} /** - * Creates a BatchMultiVector matrix from an already allocated (and + * Creates a BatchMultiVector from an already allocated (and * initialized) array. 
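// --- Editor's illustrative sketch (not part of the patch) ---
// Wrapping an already allocated buffer, as described above. The array view
// helpers (gko::array<T>::view / const_view) follow the pattern used in the
// accompanying unit tests and are an assumption here; buffer contents are
// placeholders.
#include <ginkgo/ginkgo.hpp>

void view_sketch()
{
    auto exec = gko::ReferenceExecutor::create();
    // 3 batch entries, each 2 x 2, stored contiguously and row-major
    double data[] = {1.0, 2.0, -1.0, 3.0, 4.0, -1.0,
                     3.0, 5.0, 1.0,  5.0, 6.0, -3.0};
    auto size = gko::batch_dim<2>(3, gko::dim<2>{2, 2});
    // wraps the buffer without copying, since the view matches the executor
    auto m = gko::BatchMultiVector<double>::create(
        exec, size, gko::array<double>::view(exec, 12, data));
    // immutable variant for read-only buffers
    auto cm = gko::BatchMultiVector<double>::create_const(
        exec, size, gko::array<double>::const_view(exec, 12, data));
}
// --- end of editor's sketch ---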
* * @tparam ValuesArray type of array of values * * @param exec Executor associated to the vector * @param size sizes of the batch matrices in a batch_dim object - * @param values array of matrix values + * @param values array of values * * @note If `values` is not an rvalue, not an array of ValueType, or is on * the wrong executor, an internal copy will be created, and the @@ -409,10 +436,16 @@ class BatchMultiVector } /** - * Creates a BatchMultiVector matrix from a vector of matrices + * Creates a BatchMultiVector from a vector of matrices * * @param exec Executor associated to the vector - * @param matrices The matrices that need to be batched. + * @param matrices The matrix::Dense objects that need to be batched. + * + * @note This is a utility function that can serve as a first step to port + * to batched data-structures and solvers. Even if the matrices are in + * device memory, this method can have siginificant overhead, as new + * allocations and deep copies are necessary and hence this constructor must + * not be used in performance sensitive applications */ BatchMultiVector(std::shared_ptr exec, const std::vector*>& matrices) @@ -430,11 +463,17 @@ class BatchMultiVector } /** - * Creates a BatchMultiVector matrix by duplicating BatchMultiVector matrix + * Creates a BatchMultiVector matrix by duplicating BatchMultiVector object * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate * @param input the vector to be duplicated. + * + * @note This is a utility function that can serve as a first step to port + * to batched data-structures and solvers. Even if the matrices are in + * device memory, this method can have siginificant overhead, as new + * allocations and deep copies are necessary and hence this constructor must + * not be used in performance sensitive applications. */ BatchMultiVector(std::shared_ptr exec, size_type num_duplications, @@ -454,11 +493,11 @@ class BatchMultiVector } /** - * Creates a BatchMultiVector matrix by duplicating Dense matrix + * Creates a BatchMultiVector matrix by a duplicating a matrix::Dense object * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate - * @param input the vector to be duplicated. + * @param input the matrix to be duplicated. */ BatchMultiVector(std::shared_ptr exec, size_type num_duplications, @@ -476,10 +515,10 @@ class BatchMultiVector } /** - * Creates a BatchMultiVector matrix with the same configuration as the - * callers matrix. + * Creates a BatchMultiVector with the same configuration as the + * callers object. * - * @returns a BatchMultiVector matrix with the same configuration as the + * @returns a BatchMultiVector with the same configuration as the * caller. */ std::unique_ptr create_with_same_config() const; @@ -504,13 +543,14 @@ class BatchMultiVector /** - * Creates and initializes a batch of column-vectors. + * Creates and initializes a batch of single column-vectors. * - * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the vector to the requested type. + * This function first creates a temporary BatchMultiVector, fills it with + * passed in values, and then converts the vector to the requested type. 
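// --- Editor's illustrative sketch (not part of the patch) ---
// Assembling a BatchMultiVector from existing matrix::Dense objects and
// duplicating it, mirroring the unit tests touched by this patch. As the notes
// above point out, these constructors deep-copy and are intended for setup and
// testing rather than performance-critical code. Values are placeholders.
#include <ginkgo/ginkgo.hpp>

#include <vector>

void assembly_sketch()
{
    using dense = gko::matrix::Dense<double>;
    auto exec = gko::ReferenceExecutor::create();
    auto mat1 = gko::initialize<dense>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, exec);
    auto mat2 = gko::initialize<dense>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec);
    // batch the two dense matrices (deep copy)
    auto batch = gko::BatchMultiVector<double>::create(
        exec, std::vector<dense*>{mat1.get(), mat2.get()});
    // duplicate the whole batch three times -> 6 batch entries
    auto dup = gko::BatchMultiVector<double>::create(exec, 3, batch.get());
    // split back into individual Dense matrices (copies; mainly for testing)
    auto unbatched = dup->unbatch();
}
// --- end of editor's sketch ---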
* * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) + * (BatchMultiVector has to implement the ConvertibleTo + * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * @@ -556,10 +596,10 @@ std::unique_ptr batch_initialize( /** - * Creates and initializes a batch of matrices. + * Creates and initializes a batch of multi-vectors. * - * This function first creates a temporary Dense matrix, fills it with passed in - * values, and then converts the vector to the requested type. + * This function first creates a temporary BatchMultiVector, fills it with + * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize * (Dense has to implement the ConvertibleTo interface) @@ -586,8 +626,9 @@ std::unique_ptr batch_initialize( size_type num_batch_entries = vals.size(); GKO_ASSERT(num_batch_entries > 0); auto vals_begin = begin(vals); - size_type common_num_rows = vals_begin->size(); - size_type common_num_cols = vals_begin->begin()->size(); + size_type common_num_rows = vals_begin ? vals_begin->size() : 0; + size_type common_num_cols = + vals_begin->begin() ? vals_begin->begin()->size() : 0; auto common_size = dim<2>(common_num_rows, common_num_cols); for (const auto& b : vals) { auto num_rows = b.size(); @@ -618,20 +659,19 @@ std::unique_ptr batch_initialize( /** - * Creates and initializes a batch column-vector by making copies of the single - * input column vector. + * Creates and initializes a batch single column-vector by making copies of the + * single input column vector. * - * This function first creates a temporary batch dense matrix, fills it with + * This function first creates a temporary batch multi-vector, fills it with * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo + * (BatchMultiVector has to implement the ConvertibleTo * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param num_vectors The number of times the input vector is copied into - * the final output + * @param num_vectors The number of times the input vector is to be duplicated * @param vals values used to initialize each vector in the temp. batch * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not @@ -650,7 +690,8 @@ std::unique_ptr batch_initialize( using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = num_vectors; GKO_ASSERT(num_batch_entries > 0); - auto b_size = batch_dim<2>(num_batch_entries, dim<2>(vals.size(), 1)); + auto b_size = + batch_dim<2>(num_batch_entries, dim<2>(vals ? vals.size() : 0, 1)); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; @@ -668,17 +709,16 @@ std::unique_ptr batch_initialize( /** * Creates and initializes a matrix from copies of a given matrix. * - * This function first creates a temporary batch dense matrix, fills it with + * This function first creates a temporary batch multi-vector, fills it with * passed in values, and then converts the vector to the requested type. 
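// --- Editor's illustrative sketch (not part of the patch) ---
// The duplication overloads of batch_initialize (a single column vector above,
// a single matrix in the overload that follows) replicate one input across all
// batch entries; the zero-size guards keep empty inputs from being
// dereferenced. Counts and values are placeholders.
#include <ginkgo/ginkgo.hpp>

void duplicated_init_sketch()
{
    using vec = gko::BatchMultiVector<double>;
    auto exec = gko::ReferenceExecutor::create();
    // 4 batch entries, each the column vector [1; 2; 3]
    auto x = gko::batch_initialize<vec>(4, {1.0, 2.0, 3.0}, exec);
    // 4 batch entries, each the 2 x 2 matrix [1 2; 3 4]
    auto y = gko::batch_initialize<vec>(4, {{1.0, 2.0}, {3.0, 4.0}}, exec);
}
// --- end of editor's sketch ---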
* * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) + * (BatchMultiVector has to implement the ConvertibleTo + * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param stride row strides for the temporary batch dense matrix - * @param num_matrices The number of times the input matrix is copied into - * the final output + * @param num_batch_entries The number of times the input matrix is duplicated * @param vals values used to initialize each vector in the temp. batch * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not @@ -697,7 +737,8 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; GKO_ASSERT(num_batch_entries > 0); - auto common_size = dim<2>(vals.size(), begin(vals)->size()); + auto common_size = + dim<2>(vals ? vals.size() : 0, vals ? begin(vals)->size() : 0); batch_dim<2> b_size(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_batch_entries; batch++) { From 5f55ccdf71bd900c40f797f449701d1dea4bccf6 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 26 Jul 2023 14:50:12 +0200 Subject: [PATCH 137/583] Review and doc updates --- core/base/batch_multi_vector.cpp | 3 ++- cuda/base/batch_struct.hpp | 23 ++-------------- dpcpp/base/batch_struct.hpp | 23 ++-------------- hip/base/batch_struct.hip.hpp | 23 ++-------------- .../ginkgo/core/base/batch_multi_vector.hpp | 20 +++++++------- .../ginkgo/core/base/exception_helpers.hpp | 27 +++++++++++++++++++ reference/base/batch_struct.hpp | 23 ++-------------- 7 files changed, 48 insertions(+), 94 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index b73a92467f6..3784c6645d7 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -257,7 +257,8 @@ void BatchMultiVector::move_to( template void read_impl(MatrixType* mtx, const std::vector& data) { - GKO_ASSERT(data.size() > 0); + GKO_THROW_IF_INVALID(data.size() > 0, "Input data is empty"); + auto common_size = data[0].size; auto batch_size = batch_dim<2>(data.size(), common_size); for (const auto& b : data) { diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 4358d688f07..f9a50376362 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -59,7 +59,7 @@ namespace cuda { /** - * Generates an immutable uniform batch struct from a batch of dense matrices. + * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch> @@ -72,7 +72,7 @@ get_batch_struct(const BatchMultiVector* const op) } /** - * Generates a uniform batch struct from a batch of dense matrices. + * Generates a uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch> @@ -85,25 +85,6 @@ get_batch_struct(BatchMultiVector* const op) } -/** - * Generates an immutable uniform batch struct from a batch of dense matrices - * that may be null. 
- */ -template -inline gko::batch_multi_vector::uniform_batch> -maybe_null_batch_struct(const BatchMultiVector* const op) -{ - if (op) { - return {as_cuda_type(op->get_const_values()), - op->get_num_batch_entries(), op->get_common_size()[1], - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; - } else { - return {nullptr, 0, 0, 0, 0}; - } -} - - } // namespace cuda } // namespace kernels } // namespace gko diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 5b88e992665..1a83fad020c 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -58,7 +58,7 @@ namespace dpcpp { /** - * Generates an immutable uniform batch struct from a batch of dense matrices. + * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( @@ -72,7 +72,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( /** - * Generates a uniform batch struct from a batch of dense matrices. + * Generates a uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( @@ -85,25 +85,6 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( } -/** - * Generates an immutable uniform batch struct from a batch of dense matrices - * that may be null. - */ -template -inline gko::batch_multi_vector::uniform_batch -maybe_null_batch_struct(const BatchMultiVector* const op) -{ - if (op) { - return {op->get_const_values(), op->get_num_batch_entries(), - op->get_common_size()[1], - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; - } else { - return {nullptr, 0, 0, 0, 0}; - } -} - - } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index f8788b9e6a8..bff659838bd 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -59,7 +59,7 @@ namespace hip { /** - * Generates an immutable uniform batch struct from a batch of dense matrices. + * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch> @@ -72,7 +72,7 @@ get_batch_struct(const BatchMultiVector* const op) } /** - * Generates a uniform batch struct from a batch of dense matrices. + * Generates a uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch> @@ -85,25 +85,6 @@ get_batch_struct(BatchMultiVector* const op) } -/** - * Generates an immutable uniform batch struct from a batch of dense matrices - * that may be null. 
- */ -template -inline gko::batch_multi_vector::uniform_batch> -maybe_null_batch_struct(const BatchMultiVector* const op) -{ - if (op) { - return {as_hip_type(op->get_const_values()), - op->get_num_batch_entries(), op->get_common_size()[1], - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; - } else { - return {nullptr, 0, 0, 0, 0}; - } -} - - } // namespace hip } // namespace kernels } // namespace gko diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 2096f30b85b..ac4a2feb419 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -571,9 +571,9 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - GKO_ASSERT(num_batch_entries > 0); + GKO_THROW_IF_INVALID(num_batch_entries > 0, "Input data is empty"); auto vals_begin = begin(vals); - size_type common_num_rows = vals_begin->size(); + size_type common_num_rows = vals_begin ? vals_begin->size() : 0; auto common_size = dim<2>(common_num_rows, 1); for (auto& val : vals) { GKO_ASSERT_EQ(common_num_rows, val.size()); @@ -624,7 +624,7 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = vals.size(); - GKO_ASSERT(num_batch_entries > 0); + GKO_THROW_IF_INVALID(num_batch_entries > 0, "Input data is empty"); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin ? vals_begin->size() : 0; size_type common_num_cols = @@ -689,9 +689,10 @@ std::unique_ptr batch_initialize( { using batch_multi_vector = BatchMultiVector; size_type num_batch_entries = num_vectors; - GKO_ASSERT(num_batch_entries > 0); - auto b_size = - batch_dim<2>(num_batch_entries, dim<2>(vals ? vals.size() : 0, 1)); + GKO_THROW_IF_INVALID(num_batch_entries > 0 && vals.size() > 0, + "Input data is empty"); + auto b_size = batch_dim<2>(num_batch_entries, + dim<2>(begin(vals) ? vals.size() : 0, 1)); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; @@ -736,9 +737,10 @@ std::unique_ptr batch_initialize( std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - GKO_ASSERT(num_batch_entries > 0); - auto common_size = - dim<2>(vals ? vals.size() : 0, vals ? begin(vals)->size() : 0); + GKO_THROW_IF_INVALID(num_batch_entries > 0 && vals.size() > 0, + "Input data is empty"); + auto common_size = dim<2>(begin(vals) ? vals.size() : 0, + begin(vals) ? begin(vals)->size() : 0); batch_dim<2> b_size(num_batch_entries, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_batch_entries; batch++) { diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index 50ff0354105..a9a93f15fe8 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -706,6 +706,13 @@ inline T ensure_allocated_impl(T ptr, const std::string& file, int line, "semi-colon warnings") +/** + * Throws an InvalidStateError with a user-specified message + * + * @param _message message to be displayed. + * + * @throw InvalidStateError. 
+ */ #define GKO_INVALID_STATE(_message) \ { \ throw ::gko::InvalidStateError(__FILE__, __LINE__, __func__, \ @@ -716,6 +723,26 @@ inline T ensure_allocated_impl(T ptr, const std::string& file, int line, "semi-colon warnings") +/** + * Throws an InvalidStateError if condition is not satisfied + * + * @param _condition the condition to check. + * @param _message message to be displayed. + * + * @throw InvalidStateError. + */ +#define GKO_THROW_IF_INVALID(_condition, _message) \ + { \ + if (!_condition) { \ + throw ::gko::InvalidStateError(__FILE__, __LINE__, __func__, \ + _message); \ + } \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + } // namespace gko diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index f3512968d9e..ed1350dc366 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -60,7 +60,7 @@ namespace host { /** - * Generates an immutable uniform batch struct from a batch of dense matrices. + * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( @@ -74,7 +74,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( /** - * Generates a uniform batch struct from a batch of dense matrices. + * Generates a uniform batch struct from a batch of multi-vectors. */ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( @@ -87,25 +87,6 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( } -/** - * Generates an immutable uniform batch struct from a batch of dense matrices - * that may be null. - */ -template -inline gko::batch_multi_vector::uniform_batch -maybe_null_batch_struct(const BatchMultiVector* const op) -{ - if (op) { - return {op->get_const_values(), op->get_num_batch_entries(), - op->get_common_size()[1], - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; - } else { - return {nullptr, 0, 0, 0, 0}; - } -} - - } // namespace host } // namespace kernels } // namespace gko From 903ff5c9d62c29fe072b6bbe97a2b82b6e6e82c9 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 26 Jul 2023 17:16:07 +0200 Subject: [PATCH 138/583] Review updates Co-authored-by: Yu-Hsiang Tsai --- .../base/batch_multi_vector_kernels.hpp.inc | 38 +++++++------- core/base/batch_struct.hpp | 15 +++--- core/test/base/batch_multi_vector.cpp | 51 +++++++++---------- core/test/utils/assertions.hpp | 13 ++--- cuda/base/batch_multi_vector_kernels.cu | 6 ++- cuda/base/batch_struct.hpp | 4 +- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 16 +++--- dpcpp/base/batch_struct.hpp | 4 +- hip/base/batch_multi_vector_kernels.hip.cpp | 7 ++- hip/base/batch_struct.hip.hpp | 4 +- include/ginkgo/core/base/batch_dim.hpp | 4 +- .../ginkgo/core/base/batch_multi_vector.hpp | 24 +-------- reference/base/batch_struct.hpp | 4 +- .../test/base/batch_multi_vector_kernels.cpp | 8 --- test/base/batch_multi_vector_kernels.cpp | 38 ++------------ test/test_install/test_install.cpp | 14 +++++ 16 files changed, 103 insertions(+), 147 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index efbbd323ef6..17a7e125332 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -31,10 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -/** - * Scales the vectors in global or shared memory with a factor of alpha (alpha - * is in global memory or shared memory) - */ template __device__ __forceinline__ void scale( const gko::batch_multi_vector::batch_entry& alpha, @@ -52,9 +48,9 @@ __device__ __forceinline__ void scale( template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, Mapping map) + __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -83,10 +79,10 @@ __device__ __forceinline__ void add_scaled( template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch y, Mapping map) + __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch y, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -222,11 +218,15 @@ __device__ __forceinline__ void compute_norm2( template -__global__ -__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch> - result) +__global__ __launch_bounds__( + default_block_size, + sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: + uniform_batch + x, + const gko::batch_multi_vector:: + uniform_batch< + remove_complex> + result) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -259,9 +259,9 @@ __device__ __forceinline__ void copy( template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::uniform_batch src, - const gko::batch_multi_vector::uniform_batch dst) + __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::uniform_batch src, + const gko::batch_multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; batch_id += gridDim.x) { diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index d85c413e691..ea1b3ef3f3f 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -44,21 +44,20 @@ namespace batch_multi_vector { /** - * Encapsulates one matrix from a batch of dense matrices (vectors). + * Encapsulates one matrix from a batch of multi-vectors. */ template struct batch_entry { using value_type = ValueType; ValueType* values; - size_type stride; + int stride; int num_rows; int num_rhs; }; + /** - * A 'simple' structure to store a global uniform batch of dense matrices. - * - * It is uniform in the sense that all matrices in the batch have common sizes. + * A 'simple' structure to store a global uniform batch of multi-vectors. 
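// --- Editor's illustrative sketch (not part of the patch) ---
// The batch_entry helpers below locate one entry inside the flat value array
// by offsetting the pointer with batch_idx * stride * num_rows. For the
// (3 x 2) layout example from the header, entry 1 therefore starts at offset
// 6. Plain standalone arithmetic; no internal Ginkgo headers are assumed.
#include <cassert>

int main()
{
    const double values[] = {1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4};
    const int stride = 2;  // row stride equals the number of right-hand sides
    const int num_rows = 3;
    const int batch_idx = 1;
    const double* entry = values + batch_idx * stride * num_rows;
    assert(entry[0] == 3.0 && entry[1] == 4.0);
    return 0;
}
// --- end of editor's sketch ---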
*/ template struct uniform_batch { @@ -67,7 +66,7 @@ struct uniform_batch { ValueType* values; size_type num_batch_entries; - size_type stride; + int stride; int num_rows; int num_rhs; @@ -122,8 +121,8 @@ batch_entry(const batch_multi_vector::uniform_batch& batch, template GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_entry -batch_entry(ValueType* const batch_values, const size_type stride, - const int num_rows, const int num_rhs, const size_type batch_idx) +batch_entry(ValueType* const batch_values, const int stride, const int num_rows, + const int num_rhs, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, num_rhs}; diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 5fbc4d5aa32..e63ed883517 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -84,6 +84,7 @@ class BatchMultiVector : public ::testing::Test { { ASSERT_EQ(m->get_num_batch_entries(), 0); ASSERT_EQ(m->get_common_size(), gko::dim<2>{}); + ASSERT_EQ(m->get_const_values(), nullptr); } std::shared_ptr exec; @@ -100,13 +101,6 @@ TYPED_TEST(BatchMultiVector, CanBeEmpty) } -TYPED_TEST(BatchMultiVector, ReturnsNullValuesArrayWhenEmpty) -{ - auto empty = gko::BatchMultiVector::create(this->exec); - ASSERT_EQ(empty->get_const_values(), nullptr); -} - - TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) { ASSERT_NE(this->mtx->get_const_values(), nullptr); @@ -165,10 +159,12 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) using size_type = gko::size_type; // clang-format off value_type data[] = { - 1.0, 2.0, -1.0, - 3.0, 4.0, -1.0, - 3.0, 5.0, 1.0, - 5.0, 6.0, -3.0}; + 1.0, 2.0, + -1.0,3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; // clang-format on auto m = gko::BatchMultiVector::create( @@ -192,11 +188,13 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) using value_type = typename TestFixture::value_type; using size_type = gko::size_type; // clang-format off - const value_type data[] = { - 1.0, 2.0, -1.0, - 3.0, 4.0, -1.0, - 3.0, 5.0, 1.0, - 5.0, 6.0, -3.0}; + value_type data[] = { + 1.0, 2.0, + -1.0,3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; // clang-format on auto m = gko::BatchMultiVector::create_const( @@ -215,7 +213,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -227,12 +225,8 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) auto m = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::BatchMultiVector::create( - this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), - mat2.get(), mat1.get(), mat2.get()}); - auto m2 = gko::BatchMultiVector::create(this->exec, 3, m.get()); - GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); + this->assert_equal_to_original_mtx(m.get()); } @@ -255,7 +249,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatricesByDuplication) } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) +TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -264,11 +258,15 @@ TYPED_TEST(BatchMultiVector, 
CanBeConstructedFromDenseMatrices) this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::BatchMultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = gko::BatchMultiVector::create( + this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); - this->assert_equal_to_original_mtx(m.get()); + auto m2 = gko::BatchMultiVector::create(this->exec, 3, m.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } @@ -356,6 +354,7 @@ TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) auto dense_mats = this->mtx->unbatch(); + ASSERT_EQ(dense_mats.size(), 2); GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); } @@ -380,8 +379,8 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 8e825a32d4f..44da77244f7 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -323,21 +323,18 @@ ::testing::AssertionResult batch_matrices_near_impl( const MatrixData2& second, double tolerance) { std::vector err; - std::vector err_flag; for (size_type b = 0; b < first.size(); ++b) { - auto num_rows = first[b].size[0]; - auto num_cols = first[b].size[1]; - if (num_rows != second[b].size[0] || num_cols != second[b].size[1]) { + if (first.size() != second.size()) { return ::testing::AssertionFailure() << "Expected matrices of equal size\n\t" << first_expression - << " is of size [" << num_rows << " x " << num_cols - << "]\n\t" << second_expression << " is of size [" - << second[b].size[0] << " x " << second[b].size[1] << "]" + << " is of size [" << first[b].size[0] << " x " + << first[b].size[1] << "]\n\t" << second_expression + << " is of size [" << second[b].size[0] << " x " + << second[b].size[1] << "]" << " for batch " << b; } err.push_back(detail::get_relative_error(first[b], second[b])); - err_flag.push_back(err.back() <= tolerance); } auto bat = std::find_if(err.begin(), err.end(), diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 05e08be0adb..3fd80a2aa41 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -67,14 +67,16 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; +// clang-format off + // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -// force-top: on + #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -// force-top: off #include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" +// clang-format on } // namespace batch_multi_vector } // namespace cuda diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index f9a50376362..d9907b41531 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -66,7 +66,7 @@ inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_entries(), - op->get_common_size()[1], + 
static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } @@ -79,7 +79,7 @@ inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 2c48970d13d..e27b3fc810f 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -86,7 +86,7 @@ void scale(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size if (alpha->get_common_size()[1] == 1) { - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); @@ -98,7 +98,7 @@ void scale(std::shared_ptr exec, }); }); } else { - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); @@ -136,7 +136,7 @@ void add_scaled(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); if (alpha->get_common_size()[1] == 1) { - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); @@ -149,7 +149,7 @@ void add_scaled(std::shared_ptr exec, }); }); } else { - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); @@ -187,7 +187,7 @@ void compute_dot(std::shared_ptr exec, const dim3 grid(num_batches); // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( @@ -225,7 +225,7 @@ void compute_conj_dot(std::shared_ptr exec, const dim3 block(group_size); const dim3 grid(num_batches); - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( @@ -262,7 +262,7 @@ void compute_norm2(std::shared_ptr exec, const dim3 block(group_size); const dim3 grid(num_batches); - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( @@ -296,7 +296,7 @@ void copy(std::shared_ptr exec, const dim3 block(group_size); const dim3 grid(num_batches); - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 
1a83fad020c..c9ee5800b3e 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -65,7 +65,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { return {op->get_const_values(), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } @@ -79,7 +79,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { return {op->get_values(), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 096c5e8a5d3..40e828b5d45 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -69,14 +69,17 @@ constexpr auto default_block_size = 256; constexpr int sm_multiplier = 4; +// clang-format off + // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -// force-top: on + #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -// force-top: off #include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" +// clang-format on + } // namespace batch_multi_vector } // namespace hip diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index bff659838bd..3171e7e1df8 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -66,7 +66,7 @@ inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } @@ -79,7 +79,7 @@ inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index 37ce5993220..c52b732f610 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -102,7 +102,7 @@ struct batch_dim { /** - * Checks if two batch dim objects are different. + * Checks if two batch_dim objects are different. * * @tparam Dimensionality number of dimensions of the dim objects * @tparam DimensionType datatype used to represent each dimension @@ -123,7 +123,7 @@ struct batch_dim { * Creates a batch_dim object which stores a uniform size for all batch * entries. * - * @param num_batch_entries number of batch entries to be stored + * @param num_batch_entries the number of batch entries to be stored * @param common_size the common size of all the batch entries stored * * @note Use this constructor when uniform batches need to be stored. 
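// --- Editor's illustrative sketch (not part of the patch) ---
// A uniform batch_dim stores one common size plus the number of batch entries;
// two batch_dim objects compare equal exactly when both of those match (using
// the comparison operators declared in this header). Sizes are placeholders.
#include <ginkgo/ginkgo.hpp>

void batch_dim_sketch()
{
    auto a = gko::batch_dim<2>(3, gko::dim<2>{4, 2});
    auto b = gko::batch_dim<2>(3, gko::dim<2>{4, 2});
    auto c = gko::batch_dim<2>(2, gko::dim<2>{4, 2});
    bool same = (a == b);       // true: same count and same common size
    bool different = (a != c);  // true: different number of batch entries
    // a.get_num_batch_entries() == 3, a.get_common_size() == gko::dim<2>{4, 2}
}
// --- end of editor's sketch ---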
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index ac4a2feb419..b91c50966a1 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -170,26 +170,6 @@ class BatchMultiVector */ dim<2> get_common_size() const { return batch_size_.get_common_size(); } - /** - * Returns a pointer to the array of values of the beginning of the batched - * multi-vector. - * - * @return the pointer to the array of values - */ - value_type* get_values() noexcept { return values_.get_data(); } - - /** - * @copydoc get_values() - * - * @note This is the constant version of the function, which can be - * significantly more memory efficient than the non-constant version, - * so always prefer this version. - */ - const value_type* get_const_values() const noexcept - { - return values_.get_const_data(); - } - /** * Returns a pointer to the array of values of the multi-vector for a * specific batch entry. @@ -198,7 +178,7 @@ class BatchMultiVector * * @return the pointer to the array of values */ - value_type* get_values(size_type batch_id) noexcept + value_type* get_values(size_type batch_id = 0) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_entries()); return values_.get_data() + @@ -212,7 +192,7 @@ class BatchMultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values(size_type batch_id) const noexcept + const value_type* get_const_values(size_type batch_id = 0) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_entries()); return values_.get_const_data() + diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index ed1350dc366..41262be1d48 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -67,7 +67,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { return {op->get_const_values(), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } @@ -81,7 +81,7 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { return {op->get_values(), op->get_num_batch_entries(), - op->get_common_size()[1], + static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 445cdedb73f..f6ae66d8249 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -127,7 +127,6 @@ class BatchMultiVector : public ::testing::Test { std::ranlux48 rand_engine; }; - TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); @@ -137,7 +136,6 @@ TYPED_TEST(BatchMultiVector, ScalesData) using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize( {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); - auto ualpha = alpha->unbatch(); this->mtx_0->scale(alpha.get()); @@ -155,7 +153,6 @@ TYPED_TEST(BatchMultiVector, ScalesDataWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = alpha->unbatch(); this->mtx_1->scale(alpha.get()); @@ -174,7 +171,6 @@ 
TYPED_TEST(BatchMultiVector, ScalesDataWithStride) using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize( {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = alpha->unbatch(); this->mtx_1->scale(alpha.get()); @@ -193,7 +189,6 @@ TYPED_TEST(BatchMultiVector, AddsScaled) using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize( {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = alpha->unbatch(); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); @@ -211,7 +206,6 @@ TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = alpha->unbatch(); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); @@ -241,7 +235,6 @@ TYPED_TEST(BatchMultiVector, ComputesDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = result->unbatch(); this->mtx_0->compute_dot(this->mtx_1.get(), result.get()); @@ -286,7 +279,6 @@ TYPED_TEST(BatchMultiVector, ComputesConjDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = result->unbatch(); this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()); diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 631b9a10c24..015adbce798 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -82,46 +82,16 @@ class BatchMultiVector : public CommonTestFixture { alpha = gko::batch_initialize(batch_size, {2.0}, ref); beta = gko::batch_initialize(batch_size, {-0.5}, ref); } - dx = Mtx::create(exec); - dx->copy_from(x.get()); - dy = Mtx::create(exec); - dy->copy_from(y.get()); - dalpha = Mtx::create(exec); - dalpha->copy_from(alpha.get()); - dbeta = gko::clone(exec, beta.get()); + dx = gko::clone(exec, x); + dy = gko::clone(exec, y); + dalpha = gko::clone(exec, alpha); + dbeta = gko::clone(exec, beta); expected = Mtx::create( ref, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs})); dresult = Mtx::create( exec, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs})); } - void set_up_apply_data(const int p = 1) - { - const int m = 35, n = 15; - x = gen_mtx(batch_size, m, n); - c_x = gen_mtx(batch_size, m, n); - y = gen_mtx(batch_size, n, p); - expected = gen_mtx(batch_size, m, p); - alpha = gko::batch_initialize(batch_size, {2.0}, ref); - beta = gko::batch_initialize(batch_size, {-1.0}, ref); - square = gen_mtx(batch_size, x->get_common_size()[0], - x->get_common_size()[0]); - dx = Mtx::create(exec); - dx->copy_from(x.get()); - dc_x = ComplexMtx::create(exec); - dc_x->copy_from(c_x.get()); - dy = Mtx::create(exec); - dy->copy_from(y.get()); - dresult = Mtx::create(exec); - dresult->copy_from(expected.get()); - dalpha = Mtx::create(exec); - dalpha->copy_from(alpha.get()); - dbeta = Mtx::create(exec); - dbeta->copy_from(beta.get()); - dsquare = Mtx::create(exec); - dsquare->copy_from(square.get()); - } - std::ranlux48 rand_engine; const size_t batch_size = 11; diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 2016f00dade..2467e99f62b 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -210,6 +210,20 @@ int main() array_type test; } + // core/base/batch_dim.hpp + { + using type1 = int; 
+ auto common_size = gko::dim<2>{4, 2}; + auto test = gko::batch_dim<2, type1>{2, common_size}; + } + + // core/base/batch_multi_vector.hpp + { + using type1 = float; + using batch_multi_vector_type = gko::BatchMultiVector; + auto test = batch_multi_vector_type::create(exec); + } + // core/base/combination.hpp { using type1 = int; From 97f3eaafd9f2f80e19270235981ac5f0e76cbe7b Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 27 Jul 2023 07:37:51 +0000 Subject: [PATCH 139/583] Format files Co-authored-by: Pratik Nayak --- .../base/batch_multi_vector_kernels.hpp.inc | 34 +++++----- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 65 ++++++++++--------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 17a7e125332..3df2bc14c84 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -48,9 +48,9 @@ __device__ __forceinline__ void scale( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, Mapping map) +__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -79,10 +79,10 @@ __device__ __forceinline__ void add_scaled( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch y, Mapping map) +__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( + const gko::batch_multi_vector::uniform_batch alpha, + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch y, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -218,15 +218,11 @@ __device__ __forceinline__ void compute_norm2( template -__global__ __launch_bounds__( - default_block_size, - sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector:: - uniform_batch - x, - const gko::batch_multi_vector:: - uniform_batch< - remove_complex> - result) +__global__ +__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch> + result) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -259,9 +255,9 @@ __device__ __forceinline__ void copy( template __global__ - __launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::uniform_batch src, - const gko::batch_multi_vector::uniform_batch dst) +__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( + const gko::batch_multi_vector::uniform_batch src, + const gko::batch_multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; batch_id += gridDim.x) { diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index e27b3fc810f..85870a91df7 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp 
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -189,17 +189,18 @@ void compute_dot(std::shared_ptr exec, // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_gen_dot_product_kernel(x_b, y_b, res_b, item_ct1, - [](auto val) { return val; }); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return val; }); + }); }); } @@ -227,18 +228,18 @@ void compute_conj_dot(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return conj(val); }); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto y_b = batch::batch_entry(y_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); + }); }); } @@ -264,15 +265,15 @@ void compute_norm2(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_entry(x_ub, group_id); + const auto res_b = batch::batch_entry(res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); }); } From c808852972f27fbe29bf4ce6c03b15979a1795c5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 27 Jul 2023 10:31:56 +0200 Subject: [PATCH 140/583] Update get_values and add test --- core/test/base/batch_multi_vector.cpp | 8 +++++ .../ginkgo/core/base/batch_multi_vector.hpp | 29 +++++++++++++++++-- test/test_install/test_install.cpp | 3 +- 3 files changed, 35 insertions(+), 5 
deletions(-) diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index e63ed883517..43d3a1ddac6 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -108,6 +108,14 @@ TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) } +TYPED_TEST(BatchMultiVector, CanGetValuesForEntry) +{ + using value_type = typename TestFixture::value_type; + + ASSERT_EQ(this->mtx->get_values_for_entry(1)[0], value_type{1.0}); +} + + TYPED_TEST(BatchMultiVector, CanBeCopied) { auto mtx_copy = gko::BatchMultiVector::create(this->exec); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index b91c50966a1..f7c8258121f 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -170,6 +170,28 @@ class BatchMultiVector */ dim<2> get_common_size() const { return batch_size_.get_common_size(); } + /** + * Returns a pointer to the array of values of the multi-vector + * + * @return the pointer to the array of values + */ + value_type* get_values(size_type batch_id = 0) noexcept + { + return values_.get_data(); + } + + /** + * @copydoc get_values(size_type) + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values() const noexcept + { + return values_.get_const_data(); + } + /** * Returns a pointer to the array of values of the multi-vector for a * specific batch entry. @@ -178,7 +200,7 @@ class BatchMultiVector * * @return the pointer to the array of values */ - value_type* get_values(size_type batch_id = 0) noexcept + value_type* get_values_for_entry(size_type batch_id) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_entries()); return values_.get_data() + @@ -186,13 +208,14 @@ class BatchMultiVector } /** - * @copydoc get_values(size_type) + * @copydoc get_values_at_entry(size_type) * * @note This is the constant version of the function, which can be * significantly more memory efficient than the non-constant version, * so always prefer this version. 
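 * For example (a minimal sketch, assuming a multi-vector `x` that holds at
 * least two batch entries):
 *   auto* entry0_vals = x->get_values_for_entry(0);
 *   const auto* entry1_vals = x->get_const_values_for_entry(1);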
*/ - const value_type* get_const_values(size_type batch_id = 0) const noexcept + const value_type* get_const_values_for_entry( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_entries()); return values_.get_const_data() + diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 2467e99f62b..ed62e3ca3d3 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -213,8 +213,7 @@ int main() // core/base/batch_dim.hpp { using type1 = int; - auto common_size = gko::dim<2>{4, 2}; - auto test = gko::batch_dim<2, type1>{2, common_size}; + auto test = gko::batch_dim<2, type1>{}; } // core/base/batch_multi_vector.hpp From 17d54c38cc0b2ea629c16007e3427d03e225f35f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 27 Jul 2023 14:06:37 +0200 Subject: [PATCH 141/583] Fix read bug and add test --- core/base/batch_multi_vector.cpp | 8 +-- core/test/base/batch_multi_vector.cpp | 57 +++++++++---------- .../ginkgo/core/base/batch_lin_op_helpers.hpp | 18 ------ 3 files changed, 29 insertions(+), 54 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 3784c6645d7..9b5b908f5d1 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -269,12 +269,8 @@ void read_impl(MatrixType* mtx, const std::vector& data) MatrixType::create(mtx->get_executor()->get_master(), batch_size); tmp->fill(zero()); for (size_type b = 0; b < data.size(); ++b) { - size_type ind = 0; - for (size_type row = 0; row < data[b].size[0]; ++row) { - for (size_type col = 0; col < data[b].size[1]; ++col) { - tmp->at(b, row, col) = data[b].nonzeros[ind].value; - ++ind; - } + for (const auto& elem : data[b].nonzeros) { + tmp->at(b, elem.row, elem.column) = elem.value; } } tmp->move_to(mtx); diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 43d3a1ddac6..a201a80f741 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -397,6 +397,33 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) } +TYPED_TEST(BatchMultiVector, CanBeReadFromSparseMatrixData) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::BatchMultiVector::create(this->exec); + // clang-format off + m->read({gko::matrix_data{{2, 2}, + {{0, 0, 1.0}, + {0, 1, 3.0}, + {1, 1, 5.0}}}, + gko::matrix_data{{2, 2}, + {{0, 0, -1.0}, + {0, 1, 0.5}, + {1, 1, 9.0}}}}); + // clang-format on + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); + EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); +} + + TYPED_TEST(BatchMultiVector, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; @@ -422,33 +449,3 @@ TYPED_TEST(BatchMultiVector, GeneratesCorrectMatrixData) EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0})); EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0})); } - - -TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixAssemblyData) -{ - using value_type = typename TestFixture::value_type; - auto m = gko::BatchMultiVector::create(this->exec); - gko::matrix_assembly_data data1(gko::dim<2>{2, 2}); - data1.set_value(0, 0, 1.0); - data1.set_value(0, 1, 3.0); - 
data1.set_value(1, 0, 0.0); - data1.set_value(1, 1, 5.0); - gko::matrix_assembly_data data2(gko::dim<2>{2, 2}); - data2.set_value(0, 0, 2.0); - data2.set_value(0, 1, 1.0); - data2.set_value(1, 0, 5.0); - data2.set_value(1, 1, 4.0); - auto data = std::vector>{data1, data2}; - - m->read(data); - - ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); - EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); - EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); - EXPECT_EQ(m->at(1, 0, 1), value_type{1.0}); - EXPECT_EQ(m->at(1, 1, 1), value_type{4.0}); -} diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp index ecb8bcc4556..6dd9297614a 100644 --- a/include/ginkgo/core/base/batch_lin_op_helpers.hpp +++ b/include/ginkgo/core/base/batch_lin_op_helpers.hpp @@ -75,24 +75,6 @@ class BatchReadableFromMatrixData { */ virtual void read( const std::vector>& data) = 0; - - /** - * Reads a matrix from a std::vector of matrix_assembly_data objects. - * - * @param data the std::vector of matrix_assembly_data objects - */ - void read(const std::vector>& - assembly_data) - { - auto mat_data = std::vector>( - assembly_data.size()); - size_type ind = 0; - for (const auto& i : assembly_data) { - mat_data[ind] = i.get_ordered_data(); - ++ind; - } - this->read(mat_data); - } }; From 367d46939607f7ee230e2ffa4e404b9fb1165686 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 28 Jul 2023 11:44:33 +0200 Subject: [PATCH 142/583] Review updates. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher Co-authored-by: Yu-Hsiang Tsai Co-authored-by: Marcel Koch --- .../base/batch_multi_vector_kernels.hpp.inc | 80 ++++++++------ core/base/batch_multi_vector.cpp | 87 ++++++++++++++- core/test/base/batch_multi_vector.cpp | 13 +++ cuda/base/batch_multi_vector_kernels.cu | 2 +- cuda/base/batch_struct.hpp | 2 +- dpcpp/base/batch_struct.hpp | 2 +- hip/base/batch_multi_vector_kernels.hip.cpp | 2 +- hip/base/batch_struct.hip.hpp | 2 +- include/ginkgo/core/base/batch_dim.hpp | 15 ++- .../ginkgo/core/base/batch_lin_op_helpers.hpp | 1 + .../ginkgo/core/base/batch_multi_vector.hpp | 101 ++++-------------- .../test/base/batch_multi_vector_kernels.cpp | 16 +-- test/base/batch_multi_vector_kernels.cpp | 25 ++++- 13 files changed, 204 insertions(+), 144 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 3df2bc14c84..5e63f451d19 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -47,10 +47,15 @@ __device__ __forceinline__ void scale( } template -__global__ -__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, Mapping map) +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void scale_kernel(const gko::batch_multi_vector:: + uniform_batch + alpha, + const gko::batch_multi_vector:: + uniform_batch + x, + Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -78,11 +83,20 @@ __device__ __forceinline__ void add_scaled( } template -__global__ 
-__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel( - const gko::batch_multi_vector::uniform_batch alpha, - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch y, Mapping map) +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void add_scaled_kernel(const gko::batch_multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch_multi_vector:: + uniform_batch< + const ValueType> + x, + const gko::batch_multi_vector:: + uniform_batch + y, + Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -139,24 +153,12 @@ __device__ __forceinline__ void compute_gen_dot_product( template -__global__ __launch_bounds__( - default_block_size, - sm_multiplier) void compute_gen_dot_product_kernel(const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - x, - const gko:: - batch_multi_vector:: - uniform_batch< - const ValueType> - y, - const gko:: - batch_multi_vector:: - uniform_batch< - ValueType> - result, - Mapping map) +__global__ + __launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( + const gko::batch_multi_vector::uniform_batch x, + const gko::batch_multi_vector::uniform_batch y, + const gko::batch_multi_vector::uniform_batch result, + Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -218,11 +220,19 @@ __device__ __forceinline__ void compute_norm2( template -__global__ -__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel( - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch> - result) +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void compute_norm2_kernel(const gko:: + batch_multi_vector:: + uniform_batch< + const ValueType> + x, + const gko:: + batch_multi_vector:: + uniform_batch< + remove_complex< + ValueType>> + result) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; batch_id += gridDim.x) { @@ -255,9 +265,9 @@ __device__ __forceinline__ void copy( template __global__ -__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel( - const gko::batch_multi_vector::uniform_batch src, - const gko::batch_multi_vector::uniform_batch dst) + __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( + const gko::batch_multi_vector::uniform_batch src, + const gko::batch_multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; batch_id += gridDim.x) { diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 9b5b908f5d1..ac47260d82d 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -65,6 +65,85 @@ GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); } // namespace } // namespace batch_multi_vector +namespace detail { + + +template +batch_dim<2> compute_batch_size( + const std::vector*>& matrices) +{ + auto common_size = matrices[0]->get_size(); + for (size_type i = 1; i < matrices.size(); ++i) { + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); + } + return batch_dim<2>{matrices.size(), common_size}; +} + + +} // namespace detail + + +template +BatchMultiVector::BatchMultiVector( + std::shared_ptr exec, const batch_dim<2>& size) + : EnablePolymorphicObject>(exec), + batch_size_(size), + values_(exec, compute_num_elems(size)) +{} + + 
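// A minimal usage sketch for this constructor (reached through the
// corresponding create method, assuming an executor `exec` has already been
// created). All batch entries share one contiguous value array, e.g.
//   auto size = gko::batch_dim<2>(2, gko::dim<2>{3, 1});
//   auto x = gko::BatchMultiVector<float>::create(exec, size);
//   // x->get_num_stored_elements() is then 2 * 3 * 1 = 6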
+template +BatchMultiVector::BatchMultiVector( + std::shared_ptr exec, + const std::vector*>& matrices) + : EnablePolymorphicObject>(exec), + batch_size_{detail::compute_batch_size(matrices)}, + values_(exec, compute_num_elems(batch_size_)) +{ + for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { + auto local_exec = matrices[i]->get_executor(); + exec->copy_from( + local_exec.get(), matrices[i]->get_num_stored_elements(), + matrices[i]->get_const_values(), + this->get_values() + this->get_size().get_cumulative_offset(i)); + } +} + + +template +BatchMultiVector::BatchMultiVector( + std::shared_ptr exec, size_type num_duplications, + const matrix::Dense* input) + : BatchMultiVector( + exec, batch_dim<2>(num_duplications, input->get_size())) +{ + size_type offset = 0; + for (size_type i = 0; i < num_duplications; ++i) { + exec->copy_from(input->get_executor().get(), + input->get_num_stored_elements(), + input->get_const_values(), this->get_values() + offset); + offset += input->get_num_stored_elements(); + } +} + + +template +BatchMultiVector::BatchMultiVector( + std::shared_ptr exec, size_type num_duplications, + const BatchMultiVector* input) + : BatchMultiVector( + exec, batch_dim<2>(input->get_num_batch_entries() * num_duplications, + input->get_common_size())) +{ + size_type offset = 0; + for (size_type i = 0; i < num_duplications; ++i) { + exec->copy_from(input->get_executor().get(), + input->get_num_stored_elements(), + input->get_const_values(), this->get_values() + offset); + offset += input->get_num_stored_elements(); + } +} + template std::unique_ptr> @@ -102,12 +181,12 @@ template std::unique_ptr> BatchMultiVector::create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - gko::detail::const_array_view&& values) + detail::const_array_view&& values) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. 
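    // A minimal usage sketch (assuming `view` is a detail::const_array_view
    // over existing values whose length matches `sizes`):
    //   auto x = BatchMultiVector<float>::create_const(exec, sizes,
    //                                                  std::move(view));
    // The returned object only grants read access to the wrapped data.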
return std::unique_ptr(new BatchMultiVector{ - exec, sizes, gko::detail::array_const_cast(std::move(values))}); + exec, sizes, detail::array_const_cast(std::move(values))}); } @@ -285,7 +364,7 @@ void BatchMultiVector::read(const std::vector& data) template -void BatchMultiVector::read(const std::vector& data) +void BatchMultiVector::read(const std::vector& data) { read_impl(this, data); } @@ -320,7 +399,7 @@ void BatchMultiVector::write(std::vector& data) const template -void BatchMultiVector::write(std::vector& data) const +void BatchMultiVector::write(std::vector& data) const { write_impl(this, data); } diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index a201a80f741..486a8301cf6 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -97,6 +97,7 @@ TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); TYPED_TEST(BatchMultiVector, CanBeEmpty) { auto empty = gko::BatchMultiVector::create(this->exec); + this->assert_empty(empty.get()); } @@ -104,6 +105,7 @@ TYPED_TEST(BatchMultiVector, CanBeEmpty) TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) { ASSERT_NE(this->mtx->get_const_values(), nullptr); + this->assert_equal_to_original_mtx(this->mtx.get()); } @@ -119,7 +121,9 @@ TYPED_TEST(BatchMultiVector, CanGetValuesForEntry) TYPED_TEST(BatchMultiVector, CanBeCopied) { auto mtx_copy = gko::BatchMultiVector::create(this->exec); + mtx_copy->copy_from(this->mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0, 0, 0) = 7; this->mtx->at(0, 1) = 7; @@ -130,7 +134,9 @@ TYPED_TEST(BatchMultiVector, CanBeCopied) TYPED_TEST(BatchMultiVector, CanBeMoved) { auto mtx_copy = gko::BatchMultiVector::create(this->exec); + this->mtx->move_to(mtx_copy.get()); + this->assert_equal_to_original_mtx(mtx_copy.get()); } @@ -138,6 +144,7 @@ TYPED_TEST(BatchMultiVector, CanBeMoved) TYPED_TEST(BatchMultiVector, CanBeCloned) { auto mtx_clone = this->mtx->clone(); + this->assert_equal_to_original_mtx( dynamic_castmtx.get())>(mtx_clone.get())); } @@ -146,6 +153,7 @@ TYPED_TEST(BatchMultiVector, CanBeCloned) TYPED_TEST(BatchMultiVector, CanBeCleared) { this->mtx->clear(); + this->assert_empty(this->mtx.get()); } @@ -153,6 +161,7 @@ TYPED_TEST(BatchMultiVector, CanBeCleared) TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) { using size_type = gko::size_type; + auto m = gko::BatchMultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4))); @@ -281,6 +290,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) TYPED_TEST(BatchMultiVector, CanBeListConstructed) { using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); @@ -296,6 +306,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructed) TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); @@ -312,6 +323,7 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; + auto m = gko::batch_initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, @@ -401,6 +413,7 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromSparseMatrixData) { using value_type = typename TestFixture::value_type; auto m = gko::BatchMultiVector::create(this->exec); + // clang-format off 
m->read({gko::matrix_data{{2, 2}, {{0, 0, 1.0}, diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 3fd80a2aa41..3e44b006552 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -65,7 +65,7 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; -constexpr int sm_multiplier = 4; +constexpr int sm_oversubscription = 4; // clang-format off diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index d9907b41531..70bc42aecac 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -51,7 +51,7 @@ namespace cuda { /** @file batch_struct.hpp * * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the requried CUDA scalar type. + * while also shallow-casting to the required CUDA scalar type. * * A specialization is needed for every format of every kind of linear algebra * object. These are intended to be called on the host. diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index c9ee5800b3e..4f8d8aa0350 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -50,7 +50,7 @@ namespace dpcpp { /** @file batch_struct.hpp * * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the requried DPCPP scalar type. + * while also shallow-casting to the required DPCPP scalar type. * * A specialization is needed for every format of every kind of linear algebra * object. These are intended to be called on the host. diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 40e828b5d45..bb465ac7709 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -66,7 +66,7 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; -constexpr int sm_multiplier = 4; +constexpr int sm_oversubscription = 4; // clang-format off diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 3171e7e1df8..55f81f7eaff 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -51,7 +51,7 @@ namespace hip { /** @file batch_struct.hpp * * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the requried Hip scalar type. + * while also shallow-casting to the required Hip scalar type. * * A specialization is needed for every format of every kind of linear algebra * object. These are intended to be called on the host. diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index c52b732f610..6d840f2ee86 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -119,6 +119,14 @@ struct batch_dim { } + /** + * The default empty constructor + */ + batch_dim() + : common_size_(dim{}), + num_batch_entries_(0) + {} + /** * Creates a batch_dim object which stores a uniform size for all batch * entries. @@ -128,9 +136,8 @@ struct batch_dim { * * @note Use this constructor when uniform batches need to be stored. 
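 * For example (a minimal sketch), a batch of two entries that all share the
 * common size 4-by-2 is described by
 *   auto size = gko::batch_dim<2>(2, gko::dim<2>{4, 2});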
*/ - explicit batch_dim(const size_type num_batch_entries = 0, - const dim& common_size = - dim{}) + explicit batch_dim(const size_type num_batch_entries, + const dim& common_size) : common_size_(common_size), num_batch_entries_(num_batch_entries) {} @@ -154,7 +161,7 @@ inline batch_dim<2, DimensionType> transpose( const batch_dim<2, DimensionType>& input) { return batch_dim<2, DimensionType>(input.get_num_batch_entries(), - gko::transpose(input.get_common_size())); + transpose(input.get_common_size())); } diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp index 6dd9297614a..5d1a2f8ed0d 100644 --- a/include/ginkgo/core/base/batch_lin_op_helpers.hpp +++ b/include/ginkgo/core/base/batch_lin_op_helpers.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index f7c8258121f..c5cc0040047 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -89,6 +89,7 @@ class BatchMultiVector friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class BatchMultiVector>; + friend class BatchMultiVector>; public: using BatchReadableFromMatrixData::read; @@ -102,13 +103,11 @@ class BatchMultiVector using value_type = ValueType; using index_type = int32; using unbatch_type = matrix::Dense; - using mat_data = gko::matrix_data; - using mat_data32 = gko::matrix_data; + using mat_data = matrix_data; + using mat_data64 = matrix_data; using absolute_type = remove_complex>; using complex_type = to_complex>; - using row_major_range = gko::range>; - /** * Creates a BatchMultiVector with the configuration of another * BatchMultiVector. @@ -118,8 +117,6 @@ class BatchMultiVector static std::unique_ptr create_with_config_of( ptr_param other); - friend class BatchMultiVector>; - void convert_to( BatchMultiVector>* result) const override; @@ -127,11 +124,11 @@ class BatchMultiVector void read(const std::vector& data) override; - void read(const std::vector& data) override; + void read(const std::vector& data) override; void write(std::vector& data) const override; - void write(std::vector& data) const override; + void write(std::vector& data) const override; /** * Unbatches the batched multi-vector and creates a std::vector of Dense @@ -175,13 +172,10 @@ class BatchMultiVector * * @return the pointer to the array of values */ - value_type* get_values(size_type batch_id = 0) noexcept - { - return values_.get_data(); - } + value_type* get_values() noexcept { return values_.get_data(); } /** - * @copydoc get_values(size_type) + * @copydoc get_values() * * @note This is the constant version of the function, which can be * significantly more memory efficient than the non-constant version, @@ -224,10 +218,10 @@ class BatchMultiVector /** * Returns the number of elements explicitly stored in the batch matrix, - * cumulative across all the batches. + * cumulative across all the batch entries. * * @return the number of elements explicitly stored in the vector, - * cumulative across all the batches + * cumulative across all the batch entries */ size_type get_num_stored_elements() const noexcept { @@ -235,7 +229,7 @@ class BatchMultiVector } /** - * Returns a single element for a particular batch. + * Returns a single element for a particular batch entry. 
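 * For example, at(1, 0, 2) refers to the element in row 0 and column 2 of
 * the second batch entry.
 *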
* * @param batch the batch index to be queried * @param row the row of the requested element @@ -267,24 +261,24 @@ class BatchMultiVector * However, it is less efficient than the two-parameter variant of this * method. * - * @param batch the batch index to be queried + * @param batch_id the batch entry index to be queried * @param idx a linear index of the requested element * * @note the method has to be called on the same Executor the vector is * stored at (e.g. trying to call this method on a GPU multi-vector * from the OMP results in a runtime error) */ - ValueType& at(size_type batch, size_type idx) noexcept + ValueType& at(size_type batch_id, size_type idx) noexcept { - return values_.get_data()[linearize_index(batch, idx)]; + return values_.get_data()[linearize_index(batch_id, idx)]; } /** * @copydoc BatchMultiVector::at(size_type, size_type, size_type) */ - ValueType at(size_type batch, size_type idx) const noexcept + ValueType at(size_type batch_id, size_type idx) const noexcept { - return values_.get_const_data()[linearize_index(batch, idx)]; + return values_.get_const_data()[linearize_index(batch_id, idx)]; } /** @@ -374,22 +368,11 @@ class BatchMultiVector void fill(ValueType value); private: - inline batch_dim<2> compute_batch_size( - const std::vector*>& matrices) - { - auto common_size = matrices[0]->get_size(); - for (int i = 1; i < matrices.size(); ++i) { - GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); - } - return batch_dim<2>{matrices.size(), common_size}; - } - inline size_type compute_num_elems(const batch_dim<2>& size) { return size.get_cumulative_offset(size.get_num_batch_entries()); } - protected: /** * Sets the size of the BatchMultiVector. @@ -403,14 +386,10 @@ class BatchMultiVector * size. * * @param exec Executor associated to the vector - * @param size size of the vector + * @param size size of the batch multi vector */ BatchMultiVector(std::shared_ptr exec, - const batch_dim<2>& size = batch_dim<2>{}) - : EnablePolymorphicObject>(exec), - batch_size_(size), - values_(exec, compute_num_elems(size)) - {} + const batch_dim<2>& size = batch_dim<2>{}); /** * Creates a BatchMultiVector from an already allocated (and @@ -446,24 +425,12 @@ class BatchMultiVector * * @note This is a utility function that can serve as a first step to port * to batched data-structures and solvers. Even if the matrices are in - * device memory, this method can have siginificant overhead, as new + * device memory, this method can have significant overhead, as new * allocations and deep copies are necessary and hence this constructor must * not be used in performance sensitive applications */ BatchMultiVector(std::shared_ptr exec, - const std::vector*>& matrices) - : EnablePolymorphicObject>(exec), - batch_size_{compute_batch_size(matrices)}, - values_(exec, compute_num_elems(batch_size_)) - { - for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { - auto local_exec = matrices[i]->get_executor(); - exec->copy_from( - local_exec.get(), matrices[i]->get_num_stored_elements(), - matrices[i]->get_const_values(), - this->get_values() + this->get_size().get_cumulative_offset(i)); - } - } + const std::vector*>& matrices); /** * Creates a BatchMultiVector matrix by duplicating BatchMultiVector object @@ -474,26 +441,13 @@ class BatchMultiVector * * @note This is a utility function that can serve as a first step to port * to batched data-structures and solvers. 
Even if the matrices are in - * device memory, this method can have siginificant overhead, as new + * device memory, this method can have significant overhead, as new * allocations and deep copies are necessary and hence this constructor must * not be used in performance sensitive applications. */ BatchMultiVector(std::shared_ptr exec, size_type num_duplications, - const BatchMultiVector* input) - : BatchMultiVector( - exec, gko::batch_dim<2>( - input->get_num_batch_entries() * num_duplications, - input->get_common_size())) - { - size_type offset = 0; - for (size_type i = 0; i < num_duplications; ++i) { - exec->copy_from( - input->get_executor().get(), input->get_num_stored_elements(), - input->get_const_values(), this->get_values() + offset); - offset += input->get_num_stored_elements(); - } - } + const BatchMultiVector* input); /** * Creates a BatchMultiVector matrix by a duplicating a matrix::Dense object @@ -504,18 +458,7 @@ class BatchMultiVector */ BatchMultiVector(std::shared_ptr exec, size_type num_duplications, - const matrix::Dense* input) - : BatchMultiVector( - exec, gko::batch_dim<2>(num_duplications, input->get_size())) - { - size_type offset = 0; - for (size_type i = 0; i < num_duplications; ++i) { - exec->copy_from( - input->get_executor().get(), input->get_num_stored_elements(), - input->get_const_values(), this->get_values() + offset); - offset += input->get_num_stored_elements(); - } - } + const matrix::Dense* input); /** * Creates a BatchMultiVector with the same configuration as the diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index f6ae66d8249..f6d169bceaf 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -59,7 +59,6 @@ class BatchMultiVector : public ::testing::Test { using Mtx = gko::BatchMultiVector; using DenseMtx = gko::matrix::Dense; using ComplexMtx = gko::to_complex; - using RealMtx = gko::remove_complex; BatchMultiVector() : exec(gko::ReferenceExecutor::create()), mtx_0(gko::batch_initialize( @@ -124,7 +123,7 @@ class BatchMultiVector : public ::testing::Test { std::unique_ptr mtx_5; std::unique_ptr mtx_6; - std::ranlux48 rand_engine; + std::default_random_engine rand_engine; }; TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); @@ -165,7 +164,7 @@ TYPED_TEST(BatchMultiVector, ScalesDataWithScalar) } -TYPED_TEST(BatchMultiVector, ScalesDataWithStride) +TYPED_TEST(BatchMultiVector, ScalesDataWithMultipleScalars) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -261,15 +260,12 @@ TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongInputSize) TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; + auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); - auto result2 = - Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result.get()), gko::DimensionMismatch); - ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result2.get()), - gko::DimensionMismatch); } @@ -305,16 +301,12 @@ TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongInputSize) TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; + auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); - auto result2 = - Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2})); 
ASSERT_THROW(this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()), gko::DimensionMismatch); - ASSERT_THROW( - this->mtx_0->compute_conj_dot(this->mtx_1.get(), result2.get()), - gko::DimensionMismatch); } diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 015adbce798..631464a8d27 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -40,7 +40,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include @@ -60,11 +59,11 @@ class BatchMultiVector : public CommonTestFixture { BatchMultiVector() : rand_engine(15) {} template - std::unique_ptr gen_mtx(const size_t batch_size, int num_rows, - int num_cols) + std::unique_ptr gen_mtx(const size_t num_batch_entries, + int num_rows, int num_cols) { return gko::test::generate_uniform_batch_random_matrix( - batch_size, num_rows, num_cols, + num_batch_entries, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, false, ref); } @@ -75,6 +74,8 @@ class BatchMultiVector : public CommonTestFixture { const int num_rows = 252; x = gen_mtx(batch_size, num_rows, num_vecs); y = gen_mtx(batch_size, num_rows, num_vecs); + c_x = gen_mtx(batch_size, num_rows, num_vecs); + c_y = gen_mtx(batch_size, num_rows, num_vecs); if (different_alpha) { alpha = gen_mtx(batch_size, 1, num_vecs); beta = gen_mtx(batch_size, 1, num_vecs); @@ -84,6 +85,8 @@ class BatchMultiVector : public CommonTestFixture { } dx = gko::clone(exec, x); dy = gko::clone(exec, y); + dc_x = gko::clone(exec, c_x); + dc_y = gko::clone(exec, c_y); dalpha = gko::clone(exec, alpha); dbeta = gko::clone(exec, beta); expected = Mtx::create( @@ -92,11 +95,12 @@ class BatchMultiVector : public CommonTestFixture { exec, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs})); } - std::ranlux48 rand_engine; + std::default_random_engine rand_engine; const size_t batch_size = 11; std::unique_ptr x; std::unique_ptr c_x; + std::unique_ptr c_y; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; @@ -105,6 +109,7 @@ class BatchMultiVector : public CommonTestFixture { std::unique_ptr dresult; std::unique_ptr dx; std::unique_ptr dc_x; + std::unique_ptr dc_y; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; @@ -216,11 +221,16 @@ TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto dot_expected = Mtx::create(this->ref, dot_size); auto ddot = Mtx::create(this->exec, dot_size); + auto cdot_expected = ComplexMtx::create(this->ref, dot_size); + auto dc_dot = ComplexMtx::create(this->exec, dot_size); x->compute_dot(y.get(), dot_expected.get()); dx->compute_dot(dy.get(), ddot.get()); + c_x->compute_dot(c_y.get(), cdot_expected.get()); + dc_x->compute_dot(dc_y.get(), dc_dot.get()); GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); + GKO_ASSERT_BATCH_MTX_NEAR(cdot_expected, dc_dot, 5 * r::value); } @@ -246,11 +256,16 @@ TEST_F(BatchMultiVector, ComputeConjDotIsEquivalentToRef) gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]}); auto dot_expected = Mtx::create(this->ref, dot_size); auto ddot = Mtx::create(this->exec, dot_size); + auto cdot_expected = ComplexMtx::create(this->ref, dot_size); + auto dc_dot = ComplexMtx::create(this->exec, dot_size); x->compute_conj_dot(y.get(), dot_expected.get()); dx->compute_conj_dot(dy.get(), ddot.get()); + c_x->compute_conj_dot(c_y.get(), 
cdot_expected.get()); + dc_x->compute_conj_dot(dc_y.get(), dc_dot.get()); GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r::value); + GKO_ASSERT_BATCH_MTX_NEAR(cdot_expected, dc_dot, 5 * r::value); } From 008f04fcc154413d2454903dd9c4aea1a2f1a9cd Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 31 Jul 2023 15:43:18 +0200 Subject: [PATCH 143/583] Rename: batch_entry -> batch_item --- ...batch_multi_vector_kernel_launcher.hpp.inc | 12 +-- .../base/batch_multi_vector_kernels.hpp.inc | 68 ++++++------ core/base/batch_multi_vector.cpp | 33 +++--- core/base/batch_struct.hpp | 25 +++-- core/test/base/batch_dim.cpp | 4 +- core/test/base/batch_multi_vector.cpp | 14 +-- cuda/base/batch_struct.hpp | 4 +- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 101 +++++++++--------- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 24 ++--- dpcpp/base/batch_struct.hpp | 4 +- hip/base/batch_struct.hip.hpp | 4 +- include/ginkgo/core/base/batch_dim.hpp | 26 ++--- .../ginkgo/core/base/batch_multi_vector.hpp | 88 +++++++-------- omp/base/batch_multi_vector_kernels.cpp | 45 ++++---- reference/base/batch_multi_vector_kernels.cpp | 45 ++++---- .../base/batch_multi_vector_kernels.hpp.inc | 30 +++--- reference/base/batch_struct.hpp | 4 +- .../test/base/batch_multi_vector_kernels.cpp | 4 +- test/base/batch_multi_vector_kernels.cpp | 4 +- 19 files changed, 263 insertions(+), 276 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc index 60af1de45af..acd58b37327 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -36,7 +36,7 @@ void scale(std::shared_ptr exec, const BatchMultiVector* const alpha, BatchMultiVector* const x) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); if (alpha->get_common_size()[1] == 1) { @@ -58,7 +58,7 @@ void add_scaled(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector* const y) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const size_type nrhs = x->get_common_size()[1]; const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); @@ -84,7 +84,7 @@ void compute_dot(std::shared_ptr exec, const BatchMultiVector* y, BatchMultiVector* result) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); @@ -104,7 +104,7 @@ void compute_conj_dot(std::shared_ptr exec, const BatchMultiVector* y, BatchMultiVector* result) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); @@ -123,7 +123,7 @@ void compute_norm2(std::shared_ptr exec, const BatchMultiVector* const x, BatchMultiVector>* const result) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); @@ -140,7 +140,7 @@ void copy(std::shared_ptr exec, const BatchMultiVector* x, 
BatchMultiVector* result) { - const auto num_blocks = x->get_num_batch_entries(); + const auto num_blocks = x->get_num_batch_items(); const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); copy_kernel<<get_stream()>>>( diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 5e63f451d19..cdb25d318f0 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __device__ __forceinline__ void scale( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x, Mapping map) + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -57,10 +57,10 @@ __global__ __launch_bounds__( x, Mapping map) { - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto alpha_b = gko::batch::batch_entry(alpha, batch_id); - const auto x_b = gko::batch::batch_entry(x, batch_id); + const auto alpha_b = gko::batch::batch_item(alpha, batch_id); + const auto x_b = gko::batch::batch_item(x, batch_id); scale(alpha_b, x_b, map); } } @@ -68,9 +68,9 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void add_scaled( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, Mapping map) + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -98,11 +98,11 @@ __global__ __launch_bounds__( y, Mapping map) { - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto alpha_b = gko::batch::batch_entry(alpha, batch_id); - const auto x_b = gko::batch::batch_entry(x, batch_id); - const auto y_b = gko::batch::batch_entry(y, batch_id); + const auto alpha_b = gko::batch::batch_item(alpha, batch_id); + const auto x_b = gko::batch::batch_item(x, batch_id); + const auto y_b = gko::batch::batch_item(y, batch_id); add_scaled(alpha_b, x_b, y_b, map); } } @@ -110,10 +110,10 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void gen_one_dot( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, const int rhs_index, - const gko::batch_multi_vector::batch_entry& result, + const gko::batch_multi_vector::batch_item& result, Group subgroup, Mapping conj_map) { ValueType val = zero(); @@ -134,9 +134,9 @@ __device__ __forceinline__ void gen_one_dot( template __device__ __forceinline__ void compute_gen_dot_product( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, + const 
gko::batch_multi_vector::batch_item& result, Mapping conj_map) { constexpr auto tile_size = config::warp_size; @@ -160,11 +160,11 @@ __global__ const gko::batch_multi_vector::uniform_batch result, Mapping map) { - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto x_b = gko::batch::batch_entry(x, batch_id); - const auto y_b = gko::batch::batch_entry(y, batch_id); - const auto r_b = gko::batch::batch_entry(result, batch_id); + const auto x_b = gko::batch::batch_item(x, batch_id); + const auto y_b = gko::batch::batch_item(y, batch_id); + const auto r_b = gko::batch::batch_item(result, batch_id); compute_gen_dot_product(x_b, y_b, r_b, map); } } @@ -172,9 +172,9 @@ __global__ template __device__ __forceinline__ void one_norm2( - const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_item& x, const int rhs_index, - const gko::batch_multi_vector::batch_entry>& + const gko::batch_multi_vector::batch_item>& result, Group subgroup) { @@ -202,8 +202,8 @@ __device__ __forceinline__ void one_norm2( */ template __device__ __forceinline__ void compute_norm2( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry>& + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item>& result) { constexpr auto tile_size = config::warp_size; @@ -234,10 +234,10 @@ __global__ __launch_bounds__( ValueType>> result) { - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries; + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto x_b = gko::batch::batch_entry(x, batch_id); - const auto r_b = gko::batch::batch_entry(result, batch_id); + const auto x_b = gko::batch::batch_item(x, batch_id); + const auto r_b = gko::batch::batch_item(result, batch_id); compute_norm2(x_b, r_b); } } @@ -251,8 +251,8 @@ __global__ __launch_bounds__( */ template __device__ __forceinline__ void copy( - const gko::batch_multi_vector::batch_entry& in, - const gko::batch_multi_vector::batch_entry& out) + const gko::batch_multi_vector::batch_item& in, + const gko::batch_multi_vector::batch_item& out) { for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; iz += blockDim.x) { @@ -269,10 +269,10 @@ __global__ const gko::batch_multi_vector::uniform_batch src, const gko::batch_multi_vector::uniform_batch dst) { - for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries; + for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; batch_id += gridDim.x) { - const auto dst_b = gko::batch::batch_entry(dst, batch_id); - const auto src_b = gko::batch::batch_entry(src, batch_id); + const auto dst_b = gko::batch::batch_item(dst, batch_id); + const auto src_b = gko::batch::batch_item(src, batch_id); copy(src_b, dst_b); } } diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index ac47260d82d..a843ee376c1 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -100,7 +100,7 @@ BatchMultiVector::BatchMultiVector( batch_size_{detail::compute_batch_size(matrices)}, values_(exec, compute_num_elems(batch_size_)) { - for (size_type i = 0; i < this->get_num_batch_entries(); ++i) { + for (size_type i = 0; i < this->get_num_batch_items(); ++i) { auto local_exec = matrices[i]->get_executor(); exec->copy_from( local_exec.get(), matrices[i]->get_num_stored_elements(), @@ -132,7 +132,7 @@ 
BatchMultiVector::BatchMultiVector( std::shared_ptr exec, size_type num_duplications, const BatchMultiVector* input) : BatchMultiVector( - exec, batch_dim<2>(input->get_num_batch_entries() * num_duplications, + exec, batch_dim<2>(input->get_num_batch_items() * num_duplications, input->get_common_size())) { size_type offset = 0; @@ -165,7 +165,7 @@ BatchMultiVector::unbatch() const using unbatch_type = matrix::Dense; auto exec = this->get_executor(); auto unbatch_mats = std::vector>{}; - for (size_type b = 0; b < this->get_num_batch_entries(); ++b) { + for (size_type b = 0; b < this->get_num_batch_items(); ++b) { auto mat = unbatch_type::create(exec, this->get_common_size()); exec->copy_from(exec.get(), mat->get_num_stored_elements(), this->get_const_values() + @@ -218,8 +218,7 @@ template void BatchMultiVector::scale( ptr_param> alpha) { - GKO_ASSERT_EQ(alpha->get_num_batch_entries(), - this->get_num_batch_entries()); + GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); if (alpha->get_common_size()[1] != 1) { // different alpha for each column @@ -237,15 +236,14 @@ void BatchMultiVector::add_scaled( ptr_param> alpha, ptr_param> b) { - GKO_ASSERT_EQ(alpha->get_num_batch_entries(), - this->get_num_batch_entries()); + GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); if (alpha->get_common_size()[1] != 1) { // different alpha for each column GKO_ASSERT_EQUAL_COLS(this->get_common_size(), alpha->get_common_size()); } - GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); auto exec = this->get_executor(); @@ -257,7 +255,7 @@ void BatchMultiVector::add_scaled( inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) { - return batch_dim<2>(sizes.get_num_batch_entries(), + return batch_dim<2>(sizes.get_num_batch_items(), dim<2>(1, sizes.get_common_size()[1])); } @@ -267,10 +265,9 @@ void BatchMultiVector::compute_conj_dot( ptr_param> b, ptr_param> result) const { - GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQ(this->get_num_batch_entries(), - result->get_num_batch_entries()); + GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); @@ -286,10 +283,9 @@ void BatchMultiVector::compute_dot( ptr_param> b, ptr_param> result) const { - GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries()); + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQ(this->get_num_batch_entries(), - result->get_num_batch_entries()); + GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); @@ -304,8 +300,7 @@ template void BatchMultiVector::compute_norm2( ptr_param>> result) const { - GKO_ASSERT_EQ(this->get_num_batch_entries(), - result->get_num_batch_entries()); + GKO_ASSERT_EQ(this->get_num_batch_items(), 
result->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); @@ -375,8 +370,8 @@ void write_impl(const MatrixType* mtx, std::vector& data) { auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx); - data = std::vector(mtx->get_num_batch_entries()); - for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) { + data = std::vector(mtx->get_num_batch_items()); + for (size_type b = 0; b < mtx->get_num_batch_items(); ++b) { data[b] = {mtx->get_common_size(), {}}; for (size_type row = 0; row < data[b].size[0]; ++row) { for (size_type col = 0; col < data[b].size[1]; ++col) { diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index ea1b3ef3f3f..9549c4eaaee 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -47,7 +47,7 @@ namespace batch_multi_vector { * Encapsulates one matrix from a batch of multi-vectors. */ template -struct batch_entry { +struct batch_item { using value_type = ValueType; ValueType* values; int stride; @@ -62,10 +62,10 @@ struct batch_entry { template struct uniform_batch { using value_type = ValueType; - using entry_type = batch_entry; + using entry_type = batch_item; ValueType* values; - size_type num_batch_entries; + size_type num_batch_items; int stride; int num_rows; int num_rhs; @@ -84,8 +84,8 @@ namespace batch { template -GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::batch_entry -to_const(const gko::batch_multi_vector::batch_entry& b) +GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::batch_item +to_const(const gko::batch_multi_vector::batch_item& b) { return {b.values, b.stride, b.num_rows, b.num_rhs}; } @@ -96,8 +96,7 @@ GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::uniform_batch to_const(const gko::batch_multi_vector::uniform_batch& ub) { - return {ub.values, ub.num_batch_entries, ub.stride, ub.num_rows, - ub.num_rhs}; + return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; } @@ -111,18 +110,18 @@ GKO_ATTRIBUTES GKO_INLINE * @param batch_idx The position of the desired object in the batch */ template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_entry -batch_entry(const batch_multi_vector::uniform_batch& batch, - const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_item batch_item( + const batch_multi_vector::uniform_batch& batch, + const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, batch.stride, batch.num_rows, batch.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_entry -batch_entry(ValueType* const batch_values, const int stride, const int num_rows, - const int num_rhs, const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_item batch_item( + ValueType* const batch_values, const int stride, const int num_rows, + const int num_rhs, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, num_rhs}; diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp index 71b954264c3..7914eb4d15e 100644 --- a/core/test/base/batch_dim.cpp +++ b/core/test/base/batch_dim.cpp @@ -43,7 +43,7 @@ TEST(BatchDim, ConstructsCorrectUniformObject) { gko::batch_dim<2> d{4, gko::dim<2>(5)}; - ASSERT_EQ(d.get_num_batch_entries(), 4); + ASSERT_EQ(d.get_num_batch_items(), 4); ASSERT_EQ(d.get_common_size(), gko::dim<2>(5)); } @@ -52,7 +52,7 @@ TEST(BatchDim, ConstructsNullObject) { gko::batch_dim<2> d{}; - 
ASSERT_EQ(d.get_num_batch_entries(), 0); + ASSERT_EQ(d.get_num_batch_items(), 0); ASSERT_EQ(d.get_common_size(), gko::dim<2>{}); } diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 486a8301cf6..844d4825a7a 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -64,7 +64,7 @@ class BatchMultiVector : public ::testing::Test { { ASSERT_NE(m->get_const_values(), nullptr); EXPECT_EQ(m->get_const_values()[0], value_type{-1.0}); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); @@ -82,7 +82,7 @@ class BatchMultiVector : public ::testing::Test { static void assert_empty(gko::BatchMultiVector* m) { - ASSERT_EQ(m->get_num_batch_entries(), 0); + ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_common_size(), gko::dim<2>{}); ASSERT_EQ(m->get_const_values(), nullptr); } @@ -114,7 +114,7 @@ TYPED_TEST(BatchMultiVector, CanGetValuesForEntry) { using value_type = typename TestFixture::value_type; - ASSERT_EQ(this->mtx->get_values_for_entry(1)[0], value_type{1.0}); + ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0}); } @@ -165,7 +165,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) auto m = gko::BatchMultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4))); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 4)); } @@ -294,7 +294,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructed) auto m = gko::batch_initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0), value_type{1}); EXPECT_EQ(m->at(0, 1), value_type{2}); @@ -310,7 +310,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) auto m = gko::batch_initialize>( 2, I({1.0, 2.0}), this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); @@ -351,7 +351,7 @@ TYPED_TEST(BatchMultiVector, CanBeFilledWithValue) m->fill(value_type(2.0)); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 1)); EXPECT_EQ(m->at(0, 0, 0), value_type{2.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 70bc42aecac..600cccc622b 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -65,7 +65,7 @@ template inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { - return {as_cuda_type(op->get_const_values()), op->get_num_batch_entries(), + return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -78,7 +78,7 @@ template inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { - return {as_cuda_type(op->get_values()), op->get_num_batch_entries(), + return {as_cuda_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), 
static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 85870a91df7..3542fc5ebad 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -76,7 +76,7 @@ void scale(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); - const auto num_batches = x_ub.num_batch_entries; + const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -91,8 +91,8 @@ void scale(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); + const auto alpha_b = batch::batch_item(alpha_ub, group_id); + const auto x_b = batch::batch_item(x_ub, group_id); scale_kernel(alpha_b, x_b, item_ct1, [](int col) { return 0; }); }); @@ -103,8 +103,8 @@ void scale(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); + const auto alpha_b = batch::batch_item(alpha_ub, group_id); + const auto x_b = batch::batch_item(x_ub, group_id); scale_kernel(alpha_b, x_b, item_ct1, [](int col) { return col; }); }); @@ -125,7 +125,7 @@ void add_scaled(std::shared_ptr exec, const size_type num_rows = x->get_common_size()[0]; const size_type num_cols = x->get_common_size()[1]; - const auto num_batches = x->get_num_batch_entries(); + const auto num_batches = x->get_num_batch_items(); auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -141,9 +141,9 @@ void add_scaled(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); + const auto alpha_b = batch::batch_item(alpha_ub, group_id); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto y_b = batch::batch_item(y_ub, group_id); add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, [](auto col) { return 0; }); }); @@ -154,9 +154,9 @@ void add_scaled(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_entry(alpha_ub, group_id); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); + const auto alpha_b = batch::batch_item(alpha_ub, group_id); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto y_b = batch::batch_item(y_ub, group_id); add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, [](auto col) { return col; }); }); @@ -178,7 +178,7 @@ void compute_dot(std::shared_ptr exec, const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - const auto num_batches = x_ub.num_batch_entries; + const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = 
device.get_info(); @@ -189,18 +189,17 @@ void compute_dot(std::shared_ptr exec, // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return val; }); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto y_b = batch::batch_item(y_ub, group_id); + const auto res_b = batch::batch_item(res_ub, group_id); + compute_gen_dot_product_kernel(x_b, y_b, res_b, item_ct1, + [](auto val) { return val; }); + }); }); } @@ -218,7 +217,7 @@ void compute_conj_dot(std::shared_ptr exec, const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - const auto num_batches = x_ub.num_batch_entries; + const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -228,18 +227,18 @@ void compute_conj_dot(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto y_b = batch::batch_entry(y_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return conj(val); }); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto y_b = batch::batch_item(y_ub, group_id); + const auto res_b = batch::batch_item(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); + }); }); } @@ -255,7 +254,7 @@ void compute_norm2(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); - const auto num_batches = x_ub.num_batch_entries; + const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -265,15 +264,15 @@ void compute_norm2(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto res_b = batch::batch_entry(res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = 
item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto res_b = batch::batch_item(res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); }); } @@ -289,7 +288,7 @@ void copy(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto result_ub = get_batch_struct(result); - const auto num_batches = x_ub.num_batch_entries; + const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -302,8 +301,8 @@ void copy(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_entry(x_ub, group_id); - const auto result_b = batch::batch_entry(result_ub, group_id); + const auto x_b = batch::batch_item(x_ub, group_id); + const auto result_b = batch::batch_item(result_ub, group_id); copy_kernel(x_b, result_b, item_ct1); }); }); diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 7dfe13d0fda..c328a50465a 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __dpct_inline__ void scale_kernel( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x, + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; @@ -50,9 +50,9 @@ __dpct_inline__ void scale_kernel( template __dpct_inline__ void add_scaled_kernel( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; @@ -69,9 +69,9 @@ __dpct_inline__ void add_scaled_kernel( template __dpct_inline__ void compute_gen_dot_product_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, + const gko::batch_multi_vector::batch_item& result, sycl::nd_item<3>& item_ct1, Mapping conj_map) { constexpr auto tile_size = config::warp_size; @@ -104,8 +104,8 @@ __dpct_inline__ void compute_gen_dot_product_kernel( template __dpct_inline__ void compute_norm2_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry>& + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item>& result, sycl::nd_item<3>& item_ct1) { @@ -138,8 +138,8 @@ __dpct_inline__ void compute_norm2_kernel( template __dpct_inline__ void copy_kernel( - const gko::batch_multi_vector::batch_entry& in, - const gko::batch_multi_vector::batch_entry& out, + const gko::batch_multi_vector::batch_item& in, + const gko::batch_multi_vector::batch_item& out, sycl::nd_item<3>& item_ct1) { for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs; diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 
4f8d8aa0350..ff3a6a87ade 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -64,7 +64,7 @@ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { - return {op->get_const_values(), op->get_num_batch_entries(), + return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -78,7 +78,7 @@ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { - return {op->get_values(), op->get_num_batch_entries(), + return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 55f81f7eaff..1732505bc6f 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -65,7 +65,7 @@ template inline gko::batch_multi_vector::uniform_batch> get_batch_struct(const BatchMultiVector* const op) { - return {as_hip_type(op->get_const_values()), op->get_num_batch_entries(), + return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -78,7 +78,7 @@ template inline gko::batch_multi_vector::uniform_batch> get_batch_struct(BatchMultiVector* const op) { - return {as_hip_type(op->get_values()), op->get_num_batch_entries(), + return {as_hip_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index 6d840f2ee86..3bda352fb9d 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -58,14 +58,14 @@ struct batch_dim { using dimension_type = DimensionType; /** - * Get the number of batch entries stored + * Get the number of batch items stored * - * @return num_batch_entries + * @return num_batch_items */ - size_type get_num_batch_entries() const { return num_batch_entries_; } + size_type get_num_batch_items() const { return num_batch_items_; } /** - * Get the common size of the batches + * Get the common size of the batch items * * @return common_size */ @@ -96,7 +96,7 @@ struct batch_dim { */ friend bool operator==(const batch_dim& x, const batch_dim& y) { - return x.num_batch_entries_ == y.num_batch_entries_ && + return x.num_batch_items_ == y.num_batch_items_ && x.common_size_ == y.common_size_; } @@ -120,29 +120,29 @@ struct batch_dim { /** - * The default empty constructor + * The default constructor */ batch_dim() : common_size_(dim{}), - num_batch_entries_(0) + num_batch_items_(0) {} /** * Creates a batch_dim object which stores a uniform size for all batch * entries. * - * @param num_batch_entries the number of batch entries to be stored - * @param common_size the common size of all the batch entries stored + * @param num_batch_items the number of batch items to be stored + * @param common_size the common size of all the batch items stored * * @note Use this constructor when uniform batches need to be stored. 
*/ - explicit batch_dim(const size_type num_batch_entries, + explicit batch_dim(const size_type num_batch_items, const dim& common_size) - : common_size_(common_size), num_batch_entries_(num_batch_entries) + : common_size_(common_size), num_batch_items_(num_batch_items) {} private: - size_type num_batch_entries_{}; + size_type num_batch_items_{}; dim common_size_{}; }; @@ -160,7 +160,7 @@ template inline batch_dim<2, DimensionType> transpose( const batch_dim<2, DimensionType>& input) { - return batch_dim<2, DimensionType>(input.get_num_batch_entries(), + return batch_dim<2, DimensionType>(input.get_num_batch_items(), transpose(input.get_common_size())); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index c5cc0040047..a502a701307 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -55,7 +55,7 @@ namespace gko { /** * BatchMultiVector stores multiple vectors in a batched fashion and is useful - * for batched operations. For example, if you want to store two batch entries + * for batched operations. For example, if you want to store two batch items * with multi-vectors of size (3 x 2) given below: * * [1 2 ; 3 4 @@ -66,10 +66,10 @@ namespace gko { * [1 2 1 2 1 2 3 4 3 4 3 4]. * * Access functions @at can help access individual - * entries if necessary. + * item if necessary. * - * The values of the batches are stored consecutively and in each batch, the - * vectors are stored in a row-major fashion. + * The values of the different batch items are stored consecutively and in each + * batch item, the multi-vectors are stored in a row-major fashion. * * @tparam ValueType precision of multi-vector elements * @@ -151,17 +151,17 @@ class BatchMultiVector batch_dim<2> get_size() const { return batch_size_; } /** - * Returns the number of batch entries. + * Returns the number of batch items. * - * @return the number of batch entries + * @return the number of batch items */ - size_type get_num_batch_entries() const + size_type get_num_batch_items() const { - return batch_size_.get_num_batch_entries(); + return batch_size_.get_num_batch_items(); } /** - * Returns the common size of the batch entries. + * Returns the common size of the batch items. * * @return the common size stored */ @@ -188,40 +188,40 @@ class BatchMultiVector /** * Returns a pointer to the array of values of the multi-vector for a - * specific batch entry. + * specific batch item. * - * @param batch_id the id of the batch entry. + * @param batch_id the id of the batch item. * * @return the pointer to the array of values */ - value_type* get_values_for_entry(size_type batch_id) noexcept + value_type* get_values_for_item(size_type batch_id) noexcept { - GKO_ASSERT(batch_id < this->get_num_batch_entries()); + GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_data() + this->get_size().get_cumulative_offset(batch_id); } /** - * @copydoc get_values_at_entry(size_type) + * @copydoc get_values_for_item(size_type) * * @note This is the constant version of the function, which can be * significantly more memory efficient than the non-constant version, * so always prefer this version. 
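The class documentation above states that the values of the different batch items are stored consecutively and that each item is kept in row-major order, with per-item offsets obtained from get_cumulative_offset(). For the uniform-size case that batch_dim<2> describes, the resulting flat index reduces to the small formula sketched below (illustrative helper name, not Ginkgo API):

    #include <cassert>
    #include <cstddef>

    // Flat index into a uniform batch of row-major multi-vectors: items are
    // stored back to back, and inside an item element (row, col) sits at
    // row * num_cols + col. Assumes every item has the same common size.
    std::size_t linearize_index(std::size_t batch_id, std::size_t row,
                                std::size_t col, std::size_t num_rows,
                                std::size_t num_cols)
    {
        return batch_id * num_rows * num_cols  // cumulative offset of the item
               + row * num_cols + col;         // row-major offset inside the item
    }

    int main()
    {
        // Two 3x2 items, as in the documentation example above: the element at
        // (batch 1, row 2, col 1) is the last of the 12 stored values.
        assert(linearize_index(1, 2, 1, 3, 2) == 11);
    }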
*/ - const value_type* get_const_values_for_entry( + const value_type* get_const_values_for_item( size_type batch_id) const noexcept { - GKO_ASSERT(batch_id < this->get_num_batch_entries()); + GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_size().get_cumulative_offset(batch_id); } /** * Returns the number of elements explicitly stored in the batch matrix, - * cumulative across all the batch entries. + * cumulative across all the batch items. * * @return the number of elements explicitly stored in the vector, - * cumulative across all the batch entries + * cumulative across all the batch items */ size_type get_num_stored_elements() const noexcept { @@ -229,9 +229,9 @@ class BatchMultiVector } /** - * Returns a single element for a particular batch entry. + * Returns a single element for a particular batch item. * - * @param batch the batch index to be queried + * @param batch_id the batch item index to be queried * @param row the row of the requested element * @param col the column of the requested element * @@ -239,29 +239,29 @@ class BatchMultiVector * stored at (e.g. trying to call this method on a GPU multi-vector * from the OMP results in a runtime error) */ - value_type& at(size_type batch, size_type row, size_type col) + value_type& at(size_type batch_id, size_type row, size_type col) { - GKO_ASSERT(batch < this->get_num_batch_entries()); - return values_.get_data()[linearize_index(batch, row, col)]; + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_data()[linearize_index(batch_id, row, col)]; } /** * @copydoc BatchMultiVector::at(size_type, size_type, size_type) */ - value_type at(size_type batch, size_type row, size_type col) const + value_type at(size_type batch_id, size_type row, size_type col) const { - GKO_ASSERT(batch < this->get_num_batch_entries()); - return values_.get_const_data()[linearize_index(batch, row, col)]; + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_const_data()[linearize_index(batch_id, row, col)]; } /** - * Returns a single element for a particular batch entry. + * Returns a single element for a particular batch item. * * Useful for iterating across all elements of the vector. * However, it is less efficient than the two-parameter variant of this * method. * - * @param batch_id the batch entry index to be queried + * @param batch_id the batch item index to be queried * @param idx a linear index of the requested element * * @note the method has to be called on the same Executor the vector is @@ -370,7 +370,7 @@ class BatchMultiVector private: inline size_type compute_num_elems(const batch_dim<2>& size) { - return size.get_cumulative_offset(size.get_num_batch_entries()); + return size.get_cumulative_offset(size.get_num_batch_items()); } protected: @@ -516,15 +516,15 @@ std::unique_ptr batch_initialize( std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - size_type num_batch_entries = vals.size(); - GKO_THROW_IF_INVALID(num_batch_entries > 0, "Input data is empty"); + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin ? 
vals_begin->size() : 0; auto common_size = dim<2>(common_num_rows, 1); for (auto& val : vals) { GKO_ASSERT_EQ(common_num_rows, val.size()); } - auto b_size = batch_dim<2>(num_batch_entries, common_size); + auto b_size = batch_dim<2>(num_batch_items, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; for (const auto& b : vals) { @@ -569,8 +569,8 @@ std::unique_ptr batch_initialize( std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - size_type num_batch_entries = vals.size(); - GKO_THROW_IF_INVALID(num_batch_entries > 0, "Input data is empty"); + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); auto vals_begin = begin(vals); size_type common_num_rows = vals_begin ? vals_begin->size() : 0; size_type common_num_cols = @@ -583,7 +583,7 @@ std::unique_ptr batch_initialize( GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); } - auto b_size = batch_dim<2>(num_batch_entries, common_size); + auto b_size = batch_dim<2>(num_batch_items, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); size_type batch = 0; for (const auto& b : vals) { @@ -634,11 +634,11 @@ std::unique_ptr batch_initialize( std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - size_type num_batch_entries = num_vectors; - GKO_THROW_IF_INVALID(num_batch_entries > 0 && vals.size() > 0, + size_type num_batch_items = num_vectors; + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, "Input data is empty"); - auto b_size = batch_dim<2>(num_batch_entries, - dim<2>(begin(vals) ? vals.size() : 0, 1)); + auto b_size = + batch_dim<2>(num_batch_items, dim<2>(begin(vals) ? vals.size() : 0, 1)); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); for (size_type batch = 0; batch < num_vectors; batch++) { size_type idx = 0; @@ -665,7 +665,7 @@ std::unique_ptr batch_initialize( * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param num_batch_entries The number of times the input matrix is duplicated + * @param num_batch_items The number of times the input matrix is duplicated * @param vals values used to initialize each vector in the temp. batch * @param exec Executor associated to the vector * @param create_args additional arguments passed to Matrix::create, not @@ -677,19 +677,19 @@ std::unique_ptr batch_initialize( */ template std::unique_ptr batch_initialize( - const size_type num_batch_entries, + const size_type num_batch_items, std::initializer_list> vals, std::shared_ptr exec, TArgs&&... create_args) { using batch_multi_vector = BatchMultiVector; - GKO_THROW_IF_INVALID(num_batch_entries > 0 && vals.size() > 0, + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, "Input data is empty"); auto common_size = dim<2>(begin(vals) ? vals.size() : 0, begin(vals) ? 
begin(vals)->size() : 0); - batch_dim<2> b_size(num_batch_entries, common_size); + batch_dim<2> b_size(num_batch_items, common_size); auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - for (size_type batch = 0; batch < num_batch_entries; batch++) { + for (size_type batch = 0; batch < num_batch_items; batch++) { size_type ridx = 0; for (const auto& row : vals) { size_type cidx = 0; diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index a88443f60b9..057efe5f05c 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -67,9 +67,9 @@ void scale(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); #pragma omp parallel for - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); scale_kernel(alpha_b, x_b); } } @@ -88,10 +88,10 @@ void add_scaled(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto alpha_ub = host::get_batch_struct(alpha); #pragma omp parallel for - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) { + const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -110,11 +110,10 @@ void compute_dot(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); #pragma omp parallel for - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -133,11 +132,10 @@ void compute_conj_dot(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); #pragma omp parallel for - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -154,10 +152,9 @@ void compute_norm2(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto 
res_ub = host::get_batch_struct(result); #pragma omp parallel for - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); compute_norm2_kernel(x_b, res_b); } } @@ -174,9 +171,9 @@ void copy(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); #pragma omp parallel for - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto result_b = gko::batch::batch_entry(result_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto result_b = gko::batch::batch_item(result_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index 967dddb108a..b5cdb03d214 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -66,9 +66,9 @@ void scale(std::shared_ptr exec, { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); scale_kernel(alpha_b, x_b); } } @@ -86,10 +86,10 @@ void add_scaled(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); const auto alpha_ub = host::get_batch_struct(alpha); - for (size_type batch = 0; batch < y->get_num_batch_entries(); ++batch) { - const auto alpha_b = gko::batch::batch_entry(alpha_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) { + const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -107,11 +107,10 @@ void compute_dot(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -129,11 +128,10 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto y_ub 
= host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); - const auto y_b = gko::batch::batch_entry(y_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto y_b = gko::batch::batch_item(y_ub, batch); compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -149,10 +147,9 @@ void compute_norm2(std::shared_ptr exec, { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < result->get_num_batch_entries(); - ++batch) { - const auto res_b = gko::batch::batch_entry(res_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { + const auto res_b = gko::batch::batch_item(res_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); compute_norm2_kernel(x_b, res_b); } } @@ -168,9 +165,9 @@ void copy(std::shared_ptr exec, { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); - for (size_type batch = 0; batch < x->get_num_batch_entries(); ++batch) { - const auto result_b = gko::batch::batch_entry(result_ub, batch); - const auto x_b = gko::batch::batch_entry(x_ub, batch); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto result_b = gko::batch::batch_item(result_ub, batch); + const auto x_b = gko::batch::batch_item(x_ub, batch); copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index 6e3b195e175..a6935866f56 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
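The OpenMP and reference backends above share one dispatch structure: a loop over get_num_batch_items(), extracting one item view per iteration and handing it to an item-level kernel; the OpenMP backend merely adds "#pragma omp parallel for" on that loop. A standalone sketch of the pattern, with illustrative names rather than Ginkgo code:

    #include <cstddef>
    #include <vector>

    // Loop over batch items, form a pointer to each item's contiguous values,
    // and run an item kernel on it. The pragma is ignored without -fopenmp.
    template <typename ItemKernel>
    void run_over_items(std::vector<double>& values, std::size_t num_items,
                        std::size_t item_size, ItemKernel kernel)
    {
    #pragma omp parallel for
        for (std::ptrdiff_t batch = 0;
             batch < static_cast<std::ptrdiff_t>(num_items); ++batch) {
            kernel(values.data() + batch * item_size, item_size);
        }
    }

    int main()
    {
        std::vector<double> values(3 * 4, 1.0);  // 3 items of 4 values each
        run_over_items(values, 3, 4, [](double* item, std::size_t n) {
            for (std::size_t i = 0; i < n; ++i) {
                item[i] *= 2.0;  // e.g. a scale kernel with alpha == 2
            }
        });
        return values[11] == 2.0 ? 0 : 1;
    }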
template inline void scale_kernel( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x) + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -53,9 +53,9 @@ inline void scale_kernel( template inline void add_scaled_kernel( - const gko::batch_multi_vector::batch_entry& alpha, - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y) + const gko::batch_multi_vector::batch_item& alpha, + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -77,9 +77,9 @@ inline void add_scaled_kernel( template inline void compute_dot_product_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result) + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, + const gko::batch_multi_vector::batch_item& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -96,9 +96,9 @@ inline void compute_dot_product_kernel( template inline void compute_conj_dot_product_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry& y, - const gko::batch_multi_vector::batch_entry& result) + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item& y, + const gko::batch_multi_vector::batch_item& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -115,8 +115,8 @@ inline void compute_conj_dot_product_kernel( template inline void compute_norm2_kernel( - const gko::batch_multi_vector::batch_entry& x, - const gko::batch_multi_vector::batch_entry>& + const gko::batch_multi_vector::batch_item& x, + const gko::batch_multi_vector::batch_item>& result) { for (int j = 0; j < x.num_rhs; ++j) { @@ -141,8 +141,8 @@ inline void compute_norm2_kernel( */ template inline void copy_kernel( - const gko::batch_multi_vector::batch_entry& in, - const gko::batch_multi_vector::batch_entry& out) + const gko::batch_multi_vector::batch_item& in, + const gko::batch_multi_vector::batch_item& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index 41262be1d48..21ff280baba 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -66,7 +66,7 @@ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( const BatchMultiVector* const op) { - return {op->get_const_values(), op->get_num_batch_entries(), + return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -80,7 +80,7 @@ template inline gko::batch_multi_vector::uniform_batch get_batch_struct( BatchMultiVector* const op) { - return {op->get_values(), op->get_num_batch_entries(), + return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index f6d169bceaf..62567cc91ee 100644 --- 
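The renamed reference kernels above (scale_kernel, add_scaled_kernel, compute_dot_product_kernel, compute_norm2_kernel) each act on a single batch item and produce one result per right-hand side, i.e. column-wise reductions over the row-major item. A standalone sketch of those per-item semantics on plain arrays (illustrative only, not Ginkgo code):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // One dot product per right-hand side of a row-major (num_rows x num_rhs) item.
    void dot_item(const double* x, const double* y, double* result, int num_rows,
                  int num_rhs)
    {
        for (int c = 0; c < num_rhs; ++c) {
            result[c] = 0.0;
            for (int r = 0; r < num_rows; ++r) {
                result[c] += x[r * num_rhs + c] * y[r * num_rhs + c];
            }
        }
    }

    // One 2-norm per right-hand side of the same item layout.
    void norm2_item(const double* x, double* result, int num_rows, int num_rhs)
    {
        for (int c = 0; c < num_rhs; ++c) {
            double acc = 0.0;
            for (int r = 0; r < num_rows; ++r) {
                acc += x[r * num_rhs + c] * x[r * num_rhs + c];
            }
            result[c] = std::sqrt(acc);
        }
    }

    int main()
    {
        std::vector<double> x{3.0, 1.0, 4.0, 1.0};  // 2 rows, 2 columns
        std::vector<double> res(2);
        norm2_item(x.data(), res.data(), 2, 2);
        assert(std::abs(res[0] - 5.0) < 1e-14);  // column 0: sqrt(9 + 16)
        dot_item(x.data(), x.data(), res.data(), 2, 2);
        assert(res[1] == 2.0);                   // column 1: 1*1 + 1*1
    }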
a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -398,7 +398,7 @@ TYPED_TEST(BatchMultiVector, ConvertsEmptyToPrecision) empty->convert_to(res.get()); - ASSERT_FALSE(res->get_num_batch_entries()); + ASSERT_FALSE(res->get_num_batch_items()); } @@ -413,5 +413,5 @@ TYPED_TEST(BatchMultiVector, MovesEmptyToPrecision) empty->move_to(res.get()); - ASSERT_FALSE(res->get_num_batch_entries()); + ASSERT_FALSE(res->get_num_batch_items()); } diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 631464a8d27..a55ff0792ad 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -59,11 +59,11 @@ class BatchMultiVector : public CommonTestFixture { BatchMultiVector() : rand_engine(15) {} template - std::unique_ptr gen_mtx(const size_t num_batch_entries, + std::unique_ptr gen_mtx(const size_t num_batch_items, int num_rows, int num_cols) { return gko::test::generate_uniform_batch_random_matrix( - num_batch_entries, num_rows, num_cols, + num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, false, ref); } From 18697f44ac497a10c1c277235ed1f538b3f96498 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 31 Jul 2023 17:14:17 +0200 Subject: [PATCH 144/583] Use batch:: namespace,rename to batch::MultiVector --- ...batch_multi_vector_kernel_launcher.hpp.inc | 30 ++-- .../base/batch_multi_vector_kernels.hpp.inc | 62 +++---- core/base/batch_multi_vector.cpp | 121 ++++++------- core/base/batch_multi_vector_kernels.hpp | 30 ++-- core/base/batch_struct.hpp | 23 ++- core/test/base/batch_multi_vector.cpp | 93 +++++----- core/test/utils/assertions.hpp | 2 +- cuda/base/batch_multi_vector_kernels.cu | 2 +- cuda/base/batch_struct.hpp | 8 +- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 34 ++-- dpcpp/base/batch_multi_vector_kernels.hpp.inc | 24 +-- dpcpp/base/batch_struct.hpp | 8 +- hip/base/batch_multi_vector_kernels.hip.cpp | 2 +- hip/base/batch_struct.hip.hpp | 8 +- .../ginkgo/core/base/batch_multi_vector.hpp | 163 +++++++++--------- omp/base/batch_multi_vector_kernels.cpp | 34 ++-- reference/base/batch_multi_vector_kernels.cpp | 64 +++---- .../base/batch_multi_vector_kernels.hpp.inc | 30 ++-- reference/base/batch_struct.hpp | 8 +- .../test/base/batch_multi_vector_kernels.cpp | 112 ++++++------ test/base/batch_multi_vector_kernels.cpp | 42 ++--- test/test_install/test_install.cpp | 2 +- 22 files changed, 452 insertions(+), 450 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc index acd58b37327..6c0c5363baa 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template void scale(std::shared_ptr exec, - const BatchMultiVector* const alpha, - BatchMultiVector* const x) + const batch::MultiVector* const alpha, + batch::MultiVector* const x) { const auto num_blocks = x->get_num_batch_items(); const auto alpha_ub = get_batch_struct(alpha); @@ -54,9 +54,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void add_scaled(std::shared_ptr exec, - const BatchMultiVector* const alpha, - const BatchMultiVector* const x, - BatchMultiVector* const y) + const batch::MultiVector* const alpha, + const batch::MultiVector* const x, + batch::MultiVector* const y) { const auto num_blocks = x->get_num_batch_items(); const size_type nrhs = x->get_common_size()[1]; @@ -80,9 +80,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) + const batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) { const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; @@ -100,9 +100,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_conj_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) + const batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) { const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; @@ -120,8 +120,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_norm2(std::shared_ptr exec, - const BatchMultiVector* const x, - BatchMultiVector>* const result) + const batch::MultiVector* const x, + batch::MultiVector>* const result) { const auto num_blocks = x->get_num_batch_items(); const auto num_rhs = x->get_common_size()[1]; @@ -137,8 +137,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) + const batch::MultiVector* x, + batch::MultiVector* result) { const auto num_blocks = x->get_num_batch_items(); const auto result_ub = get_batch_struct(result); diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index cdb25d318f0..df64e5cfe85 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template __device__ __forceinline__ void scale( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x, Mapping map) + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -49,10 +49,10 @@ __device__ __forceinline__ void scale( template __global__ __launch_bounds__( default_block_size, - sm_oversubscription) void scale_kernel(const gko::batch_multi_vector:: + sm_oversubscription) void scale_kernel(const gko::batch::multi_vector:: uniform_batch alpha, - const gko::batch_multi_vector:: + const gko::batch::multi_vector:: uniform_batch x, Mapping map) @@ -68,9 +68,9 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void add_scaled( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, Mapping map) + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = threadIdx.x; li < max_li; li += blockDim.x) { @@ -85,15 +85,15 @@ __device__ __forceinline__ void add_scaled( template __global__ __launch_bounds__( default_block_size, - sm_oversubscription) void add_scaled_kernel(const gko::batch_multi_vector:: + sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector:: uniform_batch< const ValueType> alpha, - const gko::batch_multi_vector:: + const gko::batch::multi_vector:: uniform_batch< const ValueType> x, - const gko::batch_multi_vector:: + const gko::batch::multi_vector:: uniform_batch y, Mapping map) @@ -110,10 +110,10 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void gen_one_dot( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, const int rhs_index, - const gko::batch_multi_vector::batch_item& result, + const gko::batch::multi_vector::batch_item& result, Group subgroup, Mapping conj_map) { ValueType val = zero(); @@ -134,9 +134,9 @@ __device__ __forceinline__ void gen_one_dot( template __device__ __forceinline__ void compute_gen_dot_product( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, - const gko::batch_multi_vector::batch_item& result, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result, Mapping conj_map) { constexpr auto tile_size = config::warp_size; @@ -155,9 +155,9 @@ __device__ __forceinline__ void compute_gen_dot_product( template __global__ __launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( - const gko::batch_multi_vector::uniform_batch x, - const gko::batch_multi_vector::uniform_batch y, - const gko::batch_multi_vector::uniform_batch result, + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, + const gko::batch::multi_vector::uniform_batch result, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; @@ -172,9 +172,9 @@ __global__ template __device__ __forceinline__ void one_norm2( - const gko::batch_multi_vector::batch_item& x, + const 
gko::batch::multi_vector::batch_item& x, const int rhs_index, - const gko::batch_multi_vector::batch_item>& + const gko::batch::multi_vector::batch_item>& result, Group subgroup) { @@ -202,8 +202,8 @@ __device__ __forceinline__ void one_norm2( */ template __device__ __forceinline__ void compute_norm2( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item>& + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& result) { constexpr auto tile_size = config::warp_size; @@ -222,13 +222,13 @@ __device__ __forceinline__ void compute_norm2( template __global__ __launch_bounds__( default_block_size, - sm_oversubscription) void compute_norm2_kernel(const gko:: - batch_multi_vector:: + sm_oversubscription) void compute_norm2_kernel(const gko::batch:: + multi_vector:: uniform_batch< const ValueType> x, - const gko:: - batch_multi_vector:: + const gko::batch:: + multi_vector:: uniform_batch< remove_complex< ValueType>> @@ -251,8 +251,8 @@ __global__ __launch_bounds__( */ template __device__ __forceinline__ void copy( - const gko::batch_multi_vector::batch_item& in, - const gko::batch_multi_vector::batch_item& out) + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out) { for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; iz += blockDim.x) { @@ -266,8 +266,8 @@ __device__ __forceinline__ void copy( template __global__ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( - const gko::batch_multi_vector::uniform_batch src, - const gko::batch_multi_vector::uniform_batch dst) + const gko::batch::multi_vector::uniform_batch src, + const gko::batch::multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; batch_id += gridDim.x) { diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index a843ee376c1..f17f1479f5f 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -50,7 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { -namespace batch_multi_vector { +namespace batch { +namespace multi_vector { namespace { @@ -63,7 +64,8 @@ GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); } // namespace -} // namespace batch_multi_vector +} // namespace multi_vector + namespace detail { @@ -84,19 +86,19 @@ batch_dim<2> compute_batch_size( template -BatchMultiVector::BatchMultiVector( - std::shared_ptr exec, const batch_dim<2>& size) - : EnablePolymorphicObject>(exec), +MultiVector::MultiVector(std::shared_ptr exec, + const batch_dim<2>& size) + : EnablePolymorphicObject>(exec), batch_size_(size), values_(exec, compute_num_elems(size)) {} template -BatchMultiVector::BatchMultiVector( +MultiVector::MultiVector( std::shared_ptr exec, const std::vector*>& matrices) - : EnablePolymorphicObject>(exec), + : EnablePolymorphicObject>(exec), batch_size_{detail::compute_batch_size(matrices)}, values_(exec, compute_num_elems(batch_size_)) { @@ -111,11 +113,11 @@ BatchMultiVector::BatchMultiVector( template -BatchMultiVector::BatchMultiVector( - std::shared_ptr exec, size_type num_duplications, - const matrix::Dense* input) - : BatchMultiVector( - exec, batch_dim<2>(num_duplications, input->get_size())) +MultiVector::MultiVector(std::shared_ptr exec, + size_type num_duplications, + const matrix::Dense* input) + : MultiVector(exec, + batch_dim<2>(num_duplications, input->get_size())) { size_type offset = 0; for (size_type i = 0; i < num_duplications; ++i) { @@ -128,10 +130,10 @@ BatchMultiVector::BatchMultiVector( template -BatchMultiVector::BatchMultiVector( - std::shared_ptr exec, size_type num_duplications, - const BatchMultiVector* input) - : BatchMultiVector( +MultiVector::MultiVector(std::shared_ptr exec, + size_type num_duplications, + const MultiVector* input) + : MultiVector( exec, batch_dim<2>(input->get_num_batch_items() * num_duplications, input->get_common_size())) { @@ -146,9 +148,9 @@ BatchMultiVector::BatchMultiVector( template -std::unique_ptr> -BatchMultiVector::create_with_config_of( - ptr_param other) +std::unique_ptr> +MultiVector::create_with_config_of( + ptr_param other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with @@ -160,7 +162,7 @@ BatchMultiVector::create_with_config_of( template std::vector>> -BatchMultiVector::unbatch() const +MultiVector::unbatch() const { using unbatch_type = matrix::Dense; auto exec = this->get_executor(); @@ -178,20 +180,20 @@ BatchMultiVector::unbatch() const template -std::unique_ptr> -BatchMultiVector::create_const( +std::unique_ptr> +MultiVector::create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - detail::const_array_view&& values) + gko::detail::const_array_view&& values) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. 
- return std::unique_ptr(new BatchMultiVector{ - exec, sizes, detail::array_const_cast(std::move(values))}); + return std::unique_ptr(new MultiVector{ + exec, sizes, gko::detail::array_const_cast(std::move(values))}); } template -void BatchMultiVector::fill(ValueType value) +void MultiVector::fill(ValueType value) { GKO_ASSERT(this->values_.get_num_elems() > 0); this->values_.fill(value); @@ -199,24 +201,24 @@ void BatchMultiVector::fill(ValueType value) template -void BatchMultiVector::set_size(const batch_dim<2>& value) noexcept +void MultiVector::set_size(const batch_dim<2>& value) noexcept { batch_size_ = value; } template -std::unique_ptr> -BatchMultiVector::create_with_same_config() const +std::unique_ptr> +MultiVector::create_with_same_config() const { - return BatchMultiVector::create(this->get_executor(), - this->get_size()); + return MultiVector::create(this->get_executor(), + this->get_size()); } template -void BatchMultiVector::scale( - ptr_param> alpha) +void MultiVector::scale( + ptr_param> alpha) { GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); @@ -226,15 +228,15 @@ void BatchMultiVector::scale( alpha->get_common_size()); } auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_scale( - make_temporary_clone(exec, alpha).get(), this)); + exec->run(multi_vector::make_scale(make_temporary_clone(exec, alpha).get(), + this)); } template -void BatchMultiVector::add_scaled( - ptr_param> alpha, - ptr_param> b) +void MultiVector::add_scaled( + ptr_param> alpha, + ptr_param> b) { GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); @@ -247,7 +249,7 @@ void BatchMultiVector::add_scaled( GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_add_scaled( + exec->run(multi_vector::make_add_scaled( make_temporary_clone(exec, alpha).get(), make_temporary_clone(exec, b).get(), this)); } @@ -261,9 +263,9 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template -void BatchMultiVector::compute_conj_dot( - ptr_param> b, - ptr_param> result) const +void MultiVector::compute_conj_dot( + ptr_param> b, + ptr_param> result) const { GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); @@ -272,16 +274,16 @@ void BatchMultiVector::compute_conj_dot( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_compute_conj_dot( + exec->run(multi_vector::make_compute_conj_dot( this, make_temporary_clone(exec, b).get(), make_temporary_output_clone(exec, result).get())); } template -void BatchMultiVector::compute_dot( - ptr_param> b, - ptr_param> result) const +void MultiVector::compute_dot( + ptr_param> b, + ptr_param> result) const { GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); @@ -290,15 +292,15 @@ void BatchMultiVector::compute_dot( result->get_common_size(), get_col_sizes(this->get_size()).get_common_size()); auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_compute_dot( + exec->run(multi_vector::make_compute_dot( this, make_temporary_clone(exec, b).get(), make_temporary_output_clone(exec, result).get())); 
} template -void BatchMultiVector::compute_norm2( - ptr_param>> result) const +void MultiVector::compute_norm2( + ptr_param>> result) const { GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); GKO_ASSERT_EQUAL_DIMENSIONS( @@ -306,14 +308,14 @@ void BatchMultiVector::compute_norm2( get_col_sizes(this->get_size()).get_common_size()); auto exec = this->get_executor(); - exec->run(batch_multi_vector::make_compute_norm2( + exec->run(multi_vector::make_compute_norm2( this, make_temporary_output_clone(exec, result).get())); } template -void BatchMultiVector::convert_to( - BatchMultiVector>* result) const +void MultiVector::convert_to( + MultiVector>* result) const { result->values_ = this->values_; result->set_size(this->get_size()); @@ -321,8 +323,8 @@ void BatchMultiVector::convert_to( template -void BatchMultiVector::move_to( - BatchMultiVector>* result) +void MultiVector::move_to( + MultiVector>* result) { this->convert_to(result); } @@ -352,14 +354,14 @@ void read_impl(MatrixType* mtx, const std::vector& data) template -void BatchMultiVector::read(const std::vector& data) +void MultiVector::read(const std::vector& data) { read_impl(this, data); } template -void BatchMultiVector::read(const std::vector& data) +void MultiVector::read(const std::vector& data) { read_impl(this, data); } @@ -387,21 +389,22 @@ void write_impl(const MatrixType* mtx, std::vector& data) template -void BatchMultiVector::write(std::vector& data) const +void MultiVector::write(std::vector& data) const { write_impl(this, data); } template -void BatchMultiVector::write(std::vector& data) const +void MultiVector::write(std::vector& data) const { write_impl(this, data); } -#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class BatchMultiVector<_type> +#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); +} // namespace batch } // namespace gko diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 6eba9eac829..8603a2b9055 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -51,36 +51,36 @@ namespace kernels { #define GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(_type) \ void scale(std::shared_ptr exec, \ - const BatchMultiVector<_type>* alpha, \ - BatchMultiVector<_type>* x) + const batch::MultiVector<_type>* alpha, \ + batch::MultiVector<_type>* x) #define GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(_type) \ void add_scaled(std::shared_ptr exec, \ - const BatchMultiVector<_type>* alpha, \ - const BatchMultiVector<_type>* x, \ - BatchMultiVector<_type>* y) + const batch::MultiVector<_type>* alpha, \ + const batch::MultiVector<_type>* x, \ + batch::MultiVector<_type>* y) #define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(_type) \ void compute_dot(std::shared_ptr exec, \ - const BatchMultiVector<_type>* x, \ - const BatchMultiVector<_type>* y, \ - BatchMultiVector<_type>* result) + const batch::MultiVector<_type>* x, \ + const batch::MultiVector<_type>* y, \ + batch::MultiVector<_type>* result) #define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(_type) \ void compute_conj_dot(std::shared_ptr exec, \ - const BatchMultiVector<_type>* x, \ - const BatchMultiVector<_type>* y, \ - BatchMultiVector<_type>* result) + const batch::MultiVector<_type>* x, \ + const batch::MultiVector<_type>* y, \ + batch::MultiVector<_type>* result) #define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ void 
compute_norm2(std::shared_ptr exec, \ - const BatchMultiVector<_type>* x, \ - BatchMultiVector>* result) + const batch::MultiVector<_type>* x, \ + batch::MultiVector>* result) #define GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(_type) \ void copy(std::shared_ptr exec, \ - const BatchMultiVector<_type>* x, \ - BatchMultiVector<_type>* result) + const batch::MultiVector<_type>* x, \ + batch::MultiVector<_type>* result) #define GKO_DECLARE_ALL_AS_TEMPLATES \ diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index 9549c4eaaee..d22b64f3320 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -40,7 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -namespace batch_multi_vector { +namespace batch { +namespace multi_vector { /** @@ -77,24 +78,20 @@ struct uniform_batch { }; -} // namespace batch_multi_vector - - -namespace batch { +} // namespace multi_vector template -GKO_ATTRIBUTES GKO_INLINE gko::batch_multi_vector::batch_item -to_const(const gko::batch_multi_vector::batch_item& b) +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item to_const( + const multi_vector::batch_item& b) { return {b.values, b.stride, b.num_rows, b.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE - gko::batch_multi_vector::uniform_batch - to_const(const gko::batch_multi_vector::uniform_batch& ub) +GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch to_const( + const multi_vector::uniform_batch& ub) { return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; } @@ -110,8 +107,8 @@ GKO_ATTRIBUTES GKO_INLINE * @param batch_idx The position of the desired object in the batch */ template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_item batch_item( - const batch_multi_vector::uniform_batch& batch, +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item batch_item( + const multi_vector::uniform_batch& batch, const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, @@ -119,7 +116,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_item batch_item( } template -GKO_ATTRIBUTES GKO_INLINE batch_multi_vector::batch_item batch_item( +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item batch_item( ValueType* const batch_values, const int stride, const int num_rows, const int num_rhs, const size_type batch_idx) { diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 844d4825a7a..e87cedca913 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -45,14 +45,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
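// A minimal sketch of the copy/move/clone behaviour the renamed test fixture
// below exercises; the umbrella header <ginkgo/ginkgo.hpp> and the executor
// choice are assumptions, and the values mirror the fixture data.
#include <ginkgo/ginkgo.hpp>

void copy_move_clone_example()
{
    auto exec = gko::ReferenceExecutor::create();
    auto mtx = gko::batch::initialize<gko::batch::MultiVector<double>>(
        {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
         {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
        exec);
    auto copy = gko::batch::MultiVector<double>::create(exec);
    copy->copy_from(mtx.get());   // deep copy; mtx keeps its values
    auto cloned = mtx->clone();   // allocate-and-copy in one step
    mtx->move_to(copy.get());     // mtx's data is handed over to copy
}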
template -class BatchMultiVector : public ::testing::Test { +class MultiVector : public ::testing::Test { protected: using value_type = T; using DenseMtx = gko::matrix::Dense; using size_type = gko::size_type; - BatchMultiVector() + MultiVector() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch_initialize>( + mtx(gko::batch::initialize>( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec)) @@ -60,7 +60,7 @@ class BatchMultiVector : public ::testing::Test { static void assert_equal_to_original_mtx( - const gko::BatchMultiVector* m) + const gko::batch::MultiVector* m) { ASSERT_NE(m->get_const_values(), nullptr); EXPECT_EQ(m->get_const_values()[0], value_type{-1.0}); @@ -80,7 +80,7 @@ class BatchMultiVector : public ::testing::Test { ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } - static void assert_empty(gko::BatchMultiVector* m) + static void assert_empty(gko::batch::MultiVector* m) { ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_common_size(), gko::dim<2>{}); @@ -88,21 +88,21 @@ class BatchMultiVector : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; -TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); +TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes); -TYPED_TEST(BatchMultiVector, CanBeEmpty) +TYPED_TEST(MultiVector, CanBeEmpty) { - auto empty = gko::BatchMultiVector::create(this->exec); + auto empty = gko::batch::MultiVector::create(this->exec); this->assert_empty(empty.get()); } -TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) +TYPED_TEST(MultiVector, KnowsItsSizeAndValues) { ASSERT_NE(this->mtx->get_const_values(), nullptr); @@ -110,7 +110,7 @@ TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues) } -TYPED_TEST(BatchMultiVector, CanGetValuesForEntry) +TYPED_TEST(MultiVector, CanGetValuesForEntry) { using value_type = typename TestFixture::value_type; @@ -118,9 +118,9 @@ TYPED_TEST(BatchMultiVector, CanGetValuesForEntry) } -TYPED_TEST(BatchMultiVector, CanBeCopied) +TYPED_TEST(MultiVector, CanBeCopied) { - auto mtx_copy = gko::BatchMultiVector::create(this->exec); + auto mtx_copy = gko::batch::MultiVector::create(this->exec); mtx_copy->copy_from(this->mtx.get()); @@ -131,9 +131,9 @@ TYPED_TEST(BatchMultiVector, CanBeCopied) } -TYPED_TEST(BatchMultiVector, CanBeMoved) +TYPED_TEST(MultiVector, CanBeMoved) { - auto mtx_copy = gko::BatchMultiVector::create(this->exec); + auto mtx_copy = gko::batch::MultiVector::create(this->exec); this->mtx->move_to(mtx_copy.get()); @@ -141,7 +141,7 @@ TYPED_TEST(BatchMultiVector, CanBeMoved) } -TYPED_TEST(BatchMultiVector, CanBeCloned) +TYPED_TEST(MultiVector, CanBeCloned) { auto mtx_clone = this->mtx->clone(); @@ -150,7 +150,7 @@ TYPED_TEST(BatchMultiVector, CanBeCloned) } -TYPED_TEST(BatchMultiVector, CanBeCleared) +TYPED_TEST(MultiVector, CanBeCleared) { this->mtx->clear(); @@ -158,11 +158,11 @@ TYPED_TEST(BatchMultiVector, CanBeCleared) } -TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) +TYPED_TEST(MultiVector, CanBeConstructedWithSize) { using size_type = gko::size_type; - auto m = gko::BatchMultiVector::create( + auto m = gko::batch::MultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4))); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -170,7 +170,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize) } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) +TYPED_TEST(MultiVector, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; 
@@ -184,7 +184,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) 6.0, -3.0}; // clang-format on - auto m = gko::BatchMultiVector::create( + auto m = gko::batch::MultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), gko::array::view(this->exec, 8, data)); @@ -200,7 +200,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingData) } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) +TYPED_TEST(MultiVector, CanBeConstructedFromExistingConstData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -214,7 +214,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) 6.0, -3.0}; // clang-format on - auto m = gko::BatchMultiVector::create_const( + auto m = gko::batch::MultiVector::create_const( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), gko::array::const_view(this->exec, 8, data)); @@ -230,7 +230,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) +TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -240,14 +240,14 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::BatchMultiVector::create( + auto m = gko::batch::MultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatricesByDuplication) +TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatricesByDuplication) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -257,16 +257,16 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::BatchMultiVector::create( + auto bat_m = gko::batch::MultiVector::create( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); auto m = - gko::BatchMultiVector::create(this->exec, 3, mat1.get()); + gko::batch::MultiVector::create(this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) +TYPED_TEST(MultiVector, CanBeConstructedFromMultiVectorMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -275,23 +275,24 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices) this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::BatchMultiVector::create( + auto m = gko::batch::MultiVector::create( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::BatchMultiVector::create( + auto m_ref = gko::batch::MultiVector::create( this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), mat1.get(), mat2.get()}); - auto m2 = gko::BatchMultiVector::create(this->exec, 3, m.get()); + auto m2 = + gko::batch::MultiVector::create(this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } -TYPED_TEST(BatchMultiVector, CanBeListConstructed) +TYPED_TEST(MultiVector, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch::initialize>( {{1.0, 2.0}, 
{1.0, 3.0}}, this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -303,11 +304,11 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructed) } -TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) +TYPED_TEST(MultiVector, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch::initialize>( 2, I({1.0, 2.0}), this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -319,12 +320,12 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies) } -TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) +TYPED_TEST(MultiVector, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch::initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, this->exec); @@ -343,10 +344,10 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed) } -TYPED_TEST(BatchMultiVector, CanBeFilledWithValue) +TYPED_TEST(MultiVector, CanBeFilledWithValue) { using value_type = typename TestFixture::value_type; - auto m = gko::BatchMultiVector::create( + auto m = gko::batch::MultiVector::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(3, 1))); m->fill(value_type(2.0)); @@ -362,7 +363,7 @@ TYPED_TEST(BatchMultiVector, CanBeFilledWithValue) } -TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) +TYPED_TEST(MultiVector, CanBeUnbatchedIntoDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -380,10 +381,10 @@ TYPED_TEST(BatchMultiVector, CanBeUnbatchedIntoDenseMatrices) } -TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) +TYPED_TEST(MultiVector, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::BatchMultiVector::create(this->exec); + auto m = gko::batch::MultiVector::create(this->exec); // clang-format off m->read({gko::matrix_data{{2, 2}, {{0, 0, 1.0}, @@ -409,10 +410,10 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromMatrixData) } -TYPED_TEST(BatchMultiVector, CanBeReadFromSparseMatrixData) +TYPED_TEST(MultiVector, CanBeReadFromSparseMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::BatchMultiVector::create(this->exec); + auto m = gko::batch::MultiVector::create(this->exec); // clang-format off m->read({gko::matrix_data{{2, 2}, @@ -437,7 +438,7 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromSparseMatrixData) } -TYPED_TEST(BatchMultiVector, GeneratesCorrectMatrixData) +TYPED_TEST(MultiVector, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; using tpl = typename gko::matrix_data::nonzero_type; diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 44da77244f7..bae78912a6c 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -752,7 +752,7 @@ ::testing::AssertionResult batch_matrices_near( std::initializer_list> second, double tolerance) { auto second_mtx = - batch_initialize>>( + batch::initialize>>( second, first->get_executor()->get_master()); return batch_matrices_near( first_expression, detail::remove_list_wrapper(second_expression), diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 3e44b006552..7729d006b75 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -57,7 +57,7 @@ namespace gko { namespace kernels { 
namespace cuda { /** - * @brief The BatchMultiVector matrix format namespace. + * @brief The MultiVector matrix format namespace. * * @ingroup batch_multi_vector */ diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 600cccc622b..715332418fb 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -62,8 +62,8 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch> -get_batch_struct(const BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch> +get_batch_struct(const batch::MultiVector* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -75,8 +75,8 @@ get_batch_struct(const BatchMultiVector* const op) * Generates a uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch> -get_batch_struct(BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch> +get_batch_struct(batch::MultiVector* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 3542fc5ebad..f307b6ba240 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -58,8 +58,8 @@ namespace gko { namespace kernels { namespace dpcpp { /** - * @brief The BatchMultiVector matrix format namespace. - * @ref BatchMultiVector + * @brief The MultiVector matrix format namespace. + * @ref MultiVector * @ingroup batch_multi_vector */ namespace batch_multi_vector { @@ -70,8 +70,8 @@ namespace batch_multi_vector { template void scale(std::shared_ptr exec, - const BatchMultiVector* const alpha, - BatchMultiVector* const x) + const batch::MultiVector* const alpha, + batch::MultiVector* const x) { const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); @@ -118,9 +118,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void add_scaled(std::shared_ptr exec, - const BatchMultiVector* const alpha, - const BatchMultiVector* const x, - BatchMultiVector* const y) + const batch::MultiVector* const alpha, + const batch::MultiVector* const x, + batch::MultiVector* const y) { const size_type num_rows = x->get_common_size()[0]; const size_type num_cols = x->get_common_size()[1]; @@ -170,9 +170,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_dot(std::shared_ptr exec, - const BatchMultiVector* const x, - const BatchMultiVector* const y, - BatchMultiVector* const result) + const batch::MultiVector* const x, + const batch::MultiVector* const y, + batch::MultiVector* const result) { const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); @@ -209,9 +209,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_conj_dot(std::shared_ptr exec, - const BatchMultiVector* const x, - const BatchMultiVector* const y, - BatchMultiVector* const result) + const batch::MultiVector* const x, + const batch::MultiVector* const y, + batch::MultiVector* const result) { const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); @@ -248,8 +248,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_norm2(std::shared_ptr exec, - const BatchMultiVector* const x, - BatchMultiVector>* const result) + const batch::MultiVector* const x, + batch::MultiVector>* const result) { 
const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); @@ -282,8 +282,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) + const batch::MultiVector* x, + batch::MultiVector* result) { const auto x_ub = get_batch_struct(x); const auto result_ub = get_batch_struct(result); diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index c328a50465a..22d00d780f9 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __dpct_inline__ void scale_kernel( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; @@ -50,9 +50,9 @@ __dpct_inline__ void scale_kernel( template __dpct_inline__ void add_scaled_kernel( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; @@ -69,9 +69,9 @@ __dpct_inline__ void add_scaled_kernel( template __dpct_inline__ void compute_gen_dot_product_kernel( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, - const gko::batch_multi_vector::batch_item& result, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result, sycl::nd_item<3>& item_ct1, Mapping conj_map) { constexpr auto tile_size = config::warp_size; @@ -104,8 +104,8 @@ __dpct_inline__ void compute_gen_dot_product_kernel( template __dpct_inline__ void compute_norm2_kernel( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item>& + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& result, sycl::nd_item<3>& item_ct1) { @@ -138,8 +138,8 @@ __dpct_inline__ void compute_norm2_kernel( template __dpct_inline__ void copy_kernel( - const gko::batch_multi_vector::batch_item& in, - const gko::batch_multi_vector::batch_item& out, + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out, sycl::nd_item<3>& item_ct1) { for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs; diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index ff3a6a87ade..9c752a94b4f 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -61,8 +61,8 @@ namespace dpcpp { * Generates an immutable uniform batch struct from a batch of multi-vectors. 
*/ template -inline gko::batch_multi_vector::uniform_batch get_batch_struct( - const BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch get_batch_struct( + const batch::MultiVector* const op) { return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -75,8 +75,8 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( * Generates a uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch get_batch_struct( - BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch get_batch_struct( + batch::MultiVector* const op) { return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index bb465ac7709..f59d873840c 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -58,7 +58,7 @@ namespace gko { namespace kernels { namespace hip { /** - * @brief The BatchMultiVector matrix format namespace. + * @brief The MultiVector matrix format namespace. * * @ingroup batch_multi_vector */ diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 1732505bc6f..442260e50e6 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -62,8 +62,8 @@ namespace hip { * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch> -get_batch_struct(const BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch> +get_batch_struct(const batch::MultiVector* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -75,8 +75,8 @@ get_batch_struct(const BatchMultiVector* const op) * Generates a uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch> -get_batch_struct(BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch> get_batch_struct( + batch::MultiVector* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index a502a701307..0e011f6b3ef 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -52,9 +52,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace batch { /** - * BatchMultiVector stores multiple vectors in a batched fashion and is useful + * MultiVector stores multiple vectors in a batched fashion and is useful * for batched operations. 
For example, if you want to store two batch items * with multi-vectors of size (3 x 2) given below: * @@ -77,50 +78,49 @@ namespace gko { * @ingroup batched */ template -class BatchMultiVector - : public EnablePolymorphicObject>, - public EnablePolymorphicAssignment>, - public EnableCreateMethod>, - public ConvertibleTo>>, +class MultiVector + : public EnablePolymorphicObject>, + public EnablePolymorphicAssignment>, + public EnableCreateMethod>, + public ConvertibleTo>>, public BatchReadableFromMatrixData, public BatchReadableFromMatrixData, public BatchWritableToMatrixData, public BatchWritableToMatrixData { - friend class EnableCreateMethod; - friend class EnablePolymorphicObject; - friend class BatchMultiVector>; - friend class BatchMultiVector>; + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + friend class MultiVector>; + friend class MultiVector>; public: using BatchReadableFromMatrixData::read; using BatchReadableFromMatrixData::read; - using EnablePolymorphicAssignment::convert_to; - using EnablePolymorphicAssignment::move_to; - using ConvertibleTo< - BatchMultiVector>>::convert_to; - using ConvertibleTo>>::move_to; + using EnablePolymorphicAssignment::convert_to; + using EnablePolymorphicAssignment::move_to; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; using value_type = ValueType; using index_type = int32; using unbatch_type = matrix::Dense; using mat_data = matrix_data; using mat_data64 = matrix_data; - using absolute_type = remove_complex>; - using complex_type = to_complex>; + using absolute_type = remove_complex>; + using complex_type = to_complex>; /** - * Creates a BatchMultiVector with the configuration of another - * BatchMultiVector. + * Creates a MultiVector with the configuration of another + * MultiVector. * * @param other The other multi-vector whose configuration needs to copied. */ - static std::unique_ptr create_with_config_of( - ptr_param other); + static std::unique_ptr create_with_config_of( + ptr_param other); void convert_to( - BatchMultiVector>* result) const override; + MultiVector>* result) const override; - void move_to(BatchMultiVector>* result) override; + void move_to(MultiVector>* result) override; void read(const std::vector& data) override; @@ -246,7 +246,7 @@ class BatchMultiVector } /** - * @copydoc BatchMultiVector::at(size_type, size_type, size_type) + * @copydoc MultiVector::at(size_type, size_type, size_type) */ value_type at(size_type batch_id, size_type row, size_type col) const { @@ -274,7 +274,7 @@ class BatchMultiVector } /** - * @copydoc BatchMultiVector::at(size_type, size_type, size_type) + * @copydoc MultiVector::at(size_type, size_type, size_type) */ ValueType at(size_type batch_id, size_type idx) const noexcept { @@ -286,13 +286,13 @@ class BatchMultiVector * * @param alpha the scalar * - * @note If alpha is 1x1 BatchMultiVector matrix, the entire multi-vector - * (all batches) is scaled by alpha. If it is a BatchMultiVector row + * @note If alpha is 1x1 MultiVector matrix, the entire multi-vector + * (all batches) is scaled by alpha. If it is a MultiVector row * vector of values, then i-th column of the vector is scaled with the * i-th element of alpha (the number of columns of alpha has to match * the number of columns of the multi-vector). */ - void scale(ptr_param> alpha); + void scale(ptr_param> alpha); /** * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy). 
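// A minimal usage sketch of the renamed interface declared above; the
// umbrella header <ginkgo/ginkgo.hpp> and the ReferenceExecutor are
// assumptions, and the values mirror the test fixtures in this patch.
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    // Two batch items, each holding a 2 x 3 multi-vector.
    auto x = gko::batch::initialize<gko::batch::MultiVector<double>>(
        {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
         {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
        exec);
    // One 1x1 scalar per batch item scales every entry of that item.
    auto alpha = gko::batch::initialize<gko::batch::MultiVector<double>>(
        {{2.0}, {-2.0}}, exec);
    x->scale(alpha.get());
    return 0;
}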
@@ -300,48 +300,48 @@ class BatchMultiVector * @param alpha the scalar * @param b a multi-vector of the same dimension as this * - * @note If alpha is 1x1 BatchMultiVector matrix, the entire multi-vector - * (all batches) is scaled by alpha. If it is a BatchMultiVector row + * @note If alpha is 1x1 MultiVector matrix, the entire multi-vector + * (all batches) is scaled by alpha. If it is a MultiVector row * vector of values, then i-th column of the vector is scaled with the * i-th element of alpha (the number of columns of alpha has to match * the number of columns of the multi-vector). */ - void add_scaled(ptr_param> alpha, - ptr_param> b); + void add_scaled(ptr_param> alpha, + ptr_param> b); /** * Computes the column-wise dot product of each multi-vector in this batch * and its corresponding entry in `b`. * - * @param b a BatchMultiVector of same dimension as this - * @param result a BatchMultiVector row vector, used to store the dot + * @param b a MultiVector of same dimension as this + * @param result a MultiVector row vector, used to store the dot * product */ - void compute_dot(ptr_param> b, - ptr_param> result) const; + void compute_dot(ptr_param> b, + ptr_param> result) const; /** * Computes the column-wise conjugate dot product of each multi-vector in * this batch and its corresponding entry in `b`. If the vector has complex * value_type, then the conjugate of this is taken. * - * @param b a BatchMultiVector of same dimension as this - * @param result a BatchMultiVector row vector, used to store the dot + * @param b a MultiVector of same dimension as this + * @param result a MultiVector row vector, used to store the dot * product (the number of column in the vector must match the * number of columns of this) */ - void compute_conj_dot(ptr_param> b, - ptr_param> result) const; + void compute_conj_dot(ptr_param> b, + ptr_param> result) const; /** * Computes the Euclidean (L^2) norm of each multi-vector in this batch. * - * @param result a BatchMultiVector, used to store the norm + * @param result a MultiVector, used to store the norm * (the number of columns in the vector must match the number * of columns of this) */ void compute_norm2( - ptr_param>> result) const; + ptr_param>> result) const; /** * Creates a constant (immutable) batch multi-vector from a constant @@ -356,12 +356,12 @@ class BatchMultiVector * array (if it resides on the same executor as the vector) or a copy of the * array on the correct executor. */ - static std::unique_ptr> create_const( + static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values); /** - * Fills the input BatchMultiVector with a given value + * Fills the input MultiVector with a given value * * @param value the value to be filled */ @@ -375,7 +375,7 @@ class BatchMultiVector protected: /** - * Sets the size of the BatchMultiVector. + * Sets the size of the MultiVector. * * @param value the new size of the operator */ @@ -388,11 +388,11 @@ class BatchMultiVector * @param exec Executor associated to the vector * @param size size of the batch multi vector */ - BatchMultiVector(std::shared_ptr exec, - const batch_dim<2>& size = batch_dim<2>{}); + MultiVector(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}); /** - * Creates a BatchMultiVector from an already allocated (and + * Creates a MultiVector from an already allocated (and * initialized) array. 
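// A minimal sketch of the BLAS-like operations documented above, under the
// same assumptions as the previous example: x, y and alpha are conforming
// batches, with alpha holding one 1x1 scalar per batch item.
#include <ginkgo/ginkgo.hpp>

void blas_like_ops(std::shared_ptr<const gko::Executor> exec,
                   gko::batch::MultiVector<double>* x,
                   const gko::batch::MultiVector<double>* y,
                   const gko::batch::MultiVector<double>* alpha)
{
    // x := x + alpha * y, applied item-wise.
    x->add_scaled(alpha, y);
    // Dot products and norms are column-wise, so the results have one row
    // per batch item and as many columns as x.
    auto result_size = gko::batch_dim<2>(
        x->get_num_batch_items(), gko::dim<2>(1, x->get_common_size()[1]));
    auto dots = gko::batch::MultiVector<double>::create(exec, result_size);
    x->compute_dot(y, dots.get());
    auto norms = gko::batch::MultiVector<double>::create(exec, result_size);
    x->compute_norm2(norms.get());
}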
* * @tparam ValuesArray type of array of values @@ -406,9 +406,9 @@ class BatchMultiVector * original array data will not be used in the vector. */ template - BatchMultiVector(std::shared_ptr exec, - const batch_dim<2>& size, ValuesArray&& values) - : EnablePolymorphicObject>(exec), + MultiVector(std::shared_ptr exec, const batch_dim<2>& size, + ValuesArray&& values) + : EnablePolymorphicObject>(exec), batch_size_(size), values_{exec, std::forward(values)} { @@ -418,7 +418,7 @@ class BatchMultiVector } /** - * Creates a BatchMultiVector from a vector of matrices + * Creates a MultiVector from a vector of matrices * * @param exec Executor associated to the vector * @param matrices The matrix::Dense objects that need to be batched. @@ -429,11 +429,11 @@ class BatchMultiVector * allocations and deep copies are necessary and hence this constructor must * not be used in performance sensitive applications */ - BatchMultiVector(std::shared_ptr exec, - const std::vector*>& matrices); + MultiVector(std::shared_ptr exec, + const std::vector*>& matrices); /** - * Creates a BatchMultiVector matrix by duplicating BatchMultiVector object + * Creates a MultiVector matrix by duplicating MultiVector object * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate @@ -445,29 +445,29 @@ class BatchMultiVector * allocations and deep copies are necessary and hence this constructor must * not be used in performance sensitive applications. */ - BatchMultiVector(std::shared_ptr exec, - size_type num_duplications, - const BatchMultiVector* input); + MultiVector(std::shared_ptr exec, + size_type num_duplications, + const MultiVector* input); /** - * Creates a BatchMultiVector matrix by a duplicating a matrix::Dense object + * Creates a MultiVector matrix by a duplicating a matrix::Dense object * * @param exec Executor associated to the vector * @param num_duplications The number of times to duplicate * @param input the matrix to be duplicated. */ - BatchMultiVector(std::shared_ptr exec, - size_type num_duplications, - const matrix::Dense* input); + MultiVector(std::shared_ptr exec, + size_type num_duplications, + const matrix::Dense* input); /** - * Creates a BatchMultiVector with the same configuration as the + * Creates a MultiVector with the same configuration as the * callers object. * - * @returns a BatchMultiVector with the same configuration as the + * @returns a MultiVector with the same configuration as the * caller. */ - std::unique_ptr create_with_same_config() const; + std::unique_ptr create_with_same_config() const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept @@ -491,11 +491,11 @@ class BatchMultiVector /** * Creates and initializes a batch of single column-vectors. * - * This function first creates a temporary BatchMultiVector, fills it with + * This function first creates a temporary MultiVector, fills it with * passed in values, and then converts the vector to the requested type. 
* * @tparam Matrix matrix type to initialize - * (BatchMultiVector has to implement the ConvertibleTo + * (MultiVector has to implement the ConvertibleTo * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) @@ -506,16 +506,16 @@ class BatchMultiVector * including the Executor, which is passed as the first * argument * - * @ingroup BatchMultiVector + * @ingroup MultiVector * @ingroup mat_formats */ template -std::unique_ptr batch_initialize( +std::unique_ptr initialize( std::initializer_list> vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_multi_vector = BatchMultiVector; + using batch_multi_vector = MultiVector; size_type num_batch_items = vals.size(); GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); auto vals_begin = begin(vals); @@ -544,7 +544,7 @@ std::unique_ptr batch_initialize( /** * Creates and initializes a batch of multi-vectors. * - * This function first creates a temporary BatchMultiVector, fills it with + * This function first creates a temporary MultiVector, fills it with * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize @@ -558,17 +558,17 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchMultiVector + * @ingroup MultiVector * @ingroup mat_formats */ template -std::unique_ptr batch_initialize( +std::unique_ptr initialize( std::initializer_list>> vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_multi_vector = BatchMultiVector; + using batch_multi_vector = MultiVector; size_type num_batch_items = vals.size(); GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); auto vals_begin = begin(vals); @@ -612,7 +612,7 @@ std::unique_ptr batch_initialize( * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize - * (BatchMultiVector has to implement the ConvertibleTo + * (MultiVector has to implement the ConvertibleTo * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) @@ -624,16 +624,16 @@ std::unique_ptr batch_initialize( * including the Executor, which is passed as the first * argument * - * @ingroup BatchMultiVector + * @ingroup MultiVector * @ingroup mat_formats */ template -std::unique_ptr batch_initialize( +std::unique_ptr initialize( const size_type num_vectors, std::initializer_list vals, std::shared_ptr exec, TArgs&&... create_args) { - using batch_multi_vector = BatchMultiVector; + using batch_multi_vector = MultiVector; size_type num_batch_items = num_vectors; GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, "Input data is empty"); @@ -660,7 +660,7 @@ std::unique_ptr batch_initialize( * passed in values, and then converts the vector to the requested type. * * @tparam Matrix matrix type to initialize - * (BatchMultiVector has to implement the ConvertibleTo + * (MultiVector has to implement the ConvertibleTo * interface) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) @@ -676,13 +676,13 @@ std::unique_ptr batch_initialize( * @ingroup mat_formats */ template -std::unique_ptr batch_initialize( +std::unique_ptr initialize( const size_type num_batch_items, std::initializer_list> vals, std::shared_ptr exec, TArgs&&... 
create_args) { - using batch_multi_vector = BatchMultiVector; + using batch_multi_vector = MultiVector; GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, "Input data is empty"); auto common_size = dim<2>(begin(vals) ? vals.size() : 0, @@ -706,6 +706,7 @@ std::unique_ptr batch_initialize( } +} // namespace batch } // namespace gko diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 057efe5f05c..deef105db0d 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -49,8 +49,8 @@ namespace gko { namespace kernels { namespace omp { /** - * @brief The BatchMultiVector matrix format namespace. - * @ref BatchMultiVector + * @brief The batch::MultiVector matrix format namespace. + * @ref batch::MultiVector * @ingroup batch_multi_vector */ namespace batch_multi_vector { @@ -61,8 +61,8 @@ namespace batch_multi_vector { template void scale(std::shared_ptr exec, - const BatchMultiVector* const alpha, - BatchMultiVector* const x) + const batch::MultiVector* const alpha, + batch::MultiVector* const x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); @@ -80,9 +80,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void add_scaled(std::shared_ptr exec, - const BatchMultiVector* const alpha, - const BatchMultiVector* const x, - BatchMultiVector* const y) + const batch::MultiVector* const alpha, + const batch::MultiVector* const x, + batch::MultiVector* const y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -102,9 +102,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_dot(std::shared_ptr exec, - const BatchMultiVector* const x, - const BatchMultiVector* const y, - BatchMultiVector* const result) + const batch::MultiVector* const x, + const batch::MultiVector* const y, + batch::MultiVector* const result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -124,9 +124,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_conj_dot(std::shared_ptr exec, - const BatchMultiVector* const x, - const BatchMultiVector* const y, - BatchMultiVector* const result) + const batch::MultiVector* const x, + const batch::MultiVector* const y, + batch::MultiVector* const result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); @@ -146,8 +146,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_norm2(std::shared_ptr exec, - const BatchMultiVector* const x, - BatchMultiVector>* const result) + const batch::MultiVector* const x, + batch::MultiVector>* const result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); @@ -165,8 +165,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) + const batch::MultiVector* x, + batch::MultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index b5cdb03d214..076fd87778d 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -49,8 +49,8 @@ namespace gko { namespace kernels { namespace reference { /** - * @brief The BatchMultiVector matrix format namespace. 
- * @ref BatchMultiVector + * @brief The batch::MultiVector matrix format namespace. + * @ref batch::MultiVector * @ingroup batch_multi_vector */ namespace batch_multi_vector { @@ -61,14 +61,14 @@ namespace batch_multi_vector { template void scale(std::shared_ptr exec, - const BatchMultiVector* alpha, - BatchMultiVector* x) + const batch::MultiVector* alpha, + batch::MultiVector* x) { const auto x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto alpha_b = batch::batch_item(alpha_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); scale_kernel(alpha_b, x_b); } } @@ -79,17 +79,17 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void add_scaled(std::shared_ptr exec, - const BatchMultiVector* alpha, - const BatchMultiVector* x, - BatchMultiVector* y) + const batch::MultiVector* alpha, + const batch::MultiVector* x, + batch::MultiVector* y) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); const auto alpha_ub = host::get_batch_struct(alpha); for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) { - const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto alpha_b = batch::batch_item(alpha_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); + const auto y_b = batch::batch_item(y_ub, batch); add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -100,17 +100,17 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) + const batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto res_b = batch::batch_item(res_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); + const auto y_b = batch::batch_item(y_ub, batch); compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -121,17 +121,17 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void compute_conj_dot(std::shared_ptr exec, - const BatchMultiVector* x, - const BatchMultiVector* y, - BatchMultiVector* result) + const batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto res_b = batch::batch_item(res_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); + const auto y_b = batch::batch_item(y_ub, batch); compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -142,14 +142,14 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void 
compute_norm2(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector>* result) + const batch::MultiVector* x, + batch::MultiVector>* result) { const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto res_b = batch::batch_item(res_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); compute_norm2_kernel(x_b, res_b); } } @@ -160,14 +160,14 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void copy(std::shared_ptr exec, - const BatchMultiVector* x, - BatchMultiVector* result) + const batch::MultiVector* x, + batch::MultiVector* result) { const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto result_b = gko::batch::batch_item(result_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto result_b = batch::batch_item(result_ub, batch); + const auto x_b = batch::batch_item(x_ub, batch); copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc index a6935866f56..a14b18ec9f7 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp.inc @@ -32,8 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template inline void scale_kernel( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x) + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -53,9 +53,9 @@ inline void scale_kernel( template inline void add_scaled_kernel( - const gko::batch_multi_vector::batch_item& alpha, - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y) + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y) { if (alpha.num_rhs == 1) { for (int i = 0; i < x.num_rows; ++i) { @@ -77,9 +77,9 @@ inline void add_scaled_kernel( template inline void compute_dot_product_kernel( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, - const gko::batch_multi_vector::batch_item& result) + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -96,9 +96,9 @@ inline void compute_dot_product_kernel( template inline void compute_conj_dot_product_kernel( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item& y, - const gko::batch_multi_vector::batch_item& result) + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result) { for (int c = 0; c < result.num_rhs; c++) { result.values[c] = gko::zero(); @@ -115,8 +115,8 @@ inline void compute_conj_dot_product_kernel( template inline void compute_norm2_kernel( - const gko::batch_multi_vector::batch_item& x, - const gko::batch_multi_vector::batch_item>& + const 
gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& result) { for (int j = 0; j < x.num_rhs; ++j) { @@ -141,8 +141,8 @@ inline void compute_norm2_kernel( */ template inline void copy_kernel( - const gko::batch_multi_vector::batch_item& in, - const gko::batch_multi_vector::batch_item& out) + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out) { for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) { const int i = iz / in.num_rhs; diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index 21ff280baba..ce7c7af5605 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -63,8 +63,8 @@ namespace host { * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch get_batch_struct( - const BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch get_batch_struct( + const batch::MultiVector* const op) { return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -77,8 +77,8 @@ inline gko::batch_multi_vector::uniform_batch get_batch_struct( * Generates a uniform batch struct from a batch of multi-vectors. */ template -inline gko::batch_multi_vector::uniform_batch get_batch_struct( - BatchMultiVector* const op) +inline batch::multi_vector::uniform_batch get_batch_struct( + batch::MultiVector* const op) { return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 62567cc91ee..82429660b32 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -52,16 +52,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
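// A minimal sketch of the kernel-level interface these reference tests call
// directly: the kernel namespace keeps its batch_multi_vector name, but the
// arguments are now gko::batch::MultiVector pointers. The internal kernels
// header path and the executor choice are assumptions.
#include <ginkgo/ginkgo.hpp>

#include "core/base/batch_multi_vector_kernels.hpp"

void kernel_copy_example()
{
    auto exec = gko::ReferenceExecutor::create();
    auto src = gko::batch::initialize<gko::batch::MultiVector<double>>(
        {{1.0, 2.0}, {1.0, 3.0}}, exec);
    auto dst = gko::batch::MultiVector<double>::create(exec, src->get_size());
    gko::kernels::reference::batch_multi_vector::copy(exec, src.get(),
                                                      dst.get());
}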
template -class BatchMultiVector : public ::testing::Test { +class MultiVector : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::BatchMultiVector; + using Mtx = gko::batch::MultiVector; using DenseMtx = gko::matrix::Dense; using ComplexMtx = gko::to_complex; - BatchMultiVector() + MultiVector() : exec(gko::ReferenceExecutor::create()), - mtx_0(gko::batch_initialize( + mtx_0(gko::batch::initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, exec)), @@ -69,15 +69,15 @@ class BatchMultiVector : public ::testing::Test { {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), mtx_01(gko::initialize( {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), - mtx_1( - gko::batch_initialize({{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, - {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - exec)), + mtx_1(gko::batch::initialize( + {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)), mtx_10(gko::initialize( {I({1.0, -1.0, 2.2}), I({-2.0, 2.0, -0.5})}, exec)), mtx_11(gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)), - mtx_2(gko::batch_initialize( + mtx_2(gko::batch::initialize( {{{1.0, 1.5}, {6.0, 1.0}, {-0.25, 1.0}}, {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}}, exec)), @@ -85,22 +85,22 @@ class BatchMultiVector : public ::testing::Test { {I({1.0, 1.5}), I({6.0, 1.0}), I({-0.25, 1.0})}, exec)), mtx_21(gko::initialize( {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}, exec)), - mtx_3(gko::batch_initialize( + mtx_3(gko::batch::initialize( {{I({1.0, 1.5}), I({6.0, 1.0})}, {{2.0, -2.0}, {1.0, 3.0}}}, exec)), mtx_30(gko::initialize({I({1.0, 1.5}), I({6.0, 1.0})}, exec)), mtx_31(gko::initialize( {I({2.0, -2.0}), I({1.0, 3.0})}, exec)), - mtx_4(gko::batch_initialize( + mtx_4(gko::batch::initialize( {{{1.0, 1.5, 3.0}, {6.0, 1.0, 5.0}, {6.0, 1.0, 5.5}}, {{2.0, -2.0, 1.5}, {4.0, 3.0, 2.2}, {-1.25, 3.0, 0.5}}}, exec)), - mtx_5(gko::batch_initialize( + mtx_5(gko::batch::initialize( {{{1.0, 1.5}, {6.0, 1.0}, {7.0, -4.5}}, {I({2.0, -2.0}), I({1.0, 3.0}), I({4.0, 3.0})}}, exec)), - mtx_6(gko::batch_initialize( + mtx_6(gko::batch::initialize( {{{1.0, 0.0, 3.0}, {0.0, 3.0, 0.0}, {0.0, 1.0, 5.0}}, {{2.0, 0.0, 5.0}, {0.0, 1.0, 0.0}, {0.0, -1.0, 8.0}}}, exec)) @@ -126,14 +126,14 @@ class BatchMultiVector : public ::testing::Test { std::default_random_engine rand_engine; }; -TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes); +TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes); -TYPED_TEST(BatchMultiVector, ScalesData) +TYPED_TEST(MultiVector, ScalesData) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize( + auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); auto ualpha = alpha->unbatch(); @@ -147,11 +147,11 @@ TYPED_TEST(BatchMultiVector, ScalesData) } -TYPED_TEST(BatchMultiVector, ScalesDataWithScalar) +TYPED_TEST(MultiVector, ScalesDataWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); auto ualpha = alpha->unbatch(); this->mtx_1->scale(alpha.get()); @@ -164,11 +164,11 @@ TYPED_TEST(BatchMultiVector, ScalesDataWithScalar) } -TYPED_TEST(BatchMultiVector, ScalesDataWithMultipleScalars) +TYPED_TEST(MultiVector, ScalesDataWithMultipleScalars) { using Mtx = typename TestFixture::Mtx; using T = typename 
TestFixture::value_type; - auto alpha = gko::batch_initialize( + auto alpha = gko::batch::initialize( {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); auto ualpha = alpha->unbatch(); @@ -182,11 +182,11 @@ TYPED_TEST(BatchMultiVector, ScalesDataWithMultipleScalars) } -TYPED_TEST(BatchMultiVector, AddsScaled) +TYPED_TEST(MultiVector, AddsScaled) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize( + auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); auto ualpha = alpha->unbatch(); @@ -200,11 +200,11 @@ TYPED_TEST(BatchMultiVector, AddsScaled) } -TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) +TYPED_TEST(MultiVector, AddsScaledWithScalar) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch_initialize({{2.0}, {-2.0}}, this->exec); + auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); auto ualpha = alpha->unbatch(); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); @@ -217,10 +217,10 @@ TYPED_TEST(BatchMultiVector, AddsScaledWithScalar) } -TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) +TYPED_TEST(MultiVector, AddScaledFailsOnWrongSizes) { using Mtx = typename TestFixture::Mtx; - auto alpha = gko::batch_initialize( + auto alpha = gko::batch::initialize( {{2.0, 3.0, 4.0, 5.0}, {-2.0, 2.0, 4.0, 5.0}}, this->exec); ASSERT_THROW(this->mtx_1->add_scaled(alpha.get(), this->mtx_2.get()), @@ -228,7 +228,7 @@ TYPED_TEST(BatchMultiVector, AddScaledFailsOnWrongSizes) } -TYPED_TEST(BatchMultiVector, ComputesDot) +TYPED_TEST(MultiVector, ComputesDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -246,7 +246,7 @@ TYPED_TEST(BatchMultiVector, ComputesDot) } -TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongInputSize) +TYPED_TEST(MultiVector, ComputeDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -257,7 +257,7 @@ TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongInputSize) } -TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongResultSize) +TYPED_TEST(MultiVector, ComputeDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; @@ -269,7 +269,7 @@ TYPED_TEST(BatchMultiVector, ComputeDotFailsOnWrongResultSize) } -TYPED_TEST(BatchMultiVector, ComputesConjDot) +TYPED_TEST(MultiVector, ComputesConjDot) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; @@ -287,7 +287,7 @@ TYPED_TEST(BatchMultiVector, ComputesConjDot) } -TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongInputSize) +TYPED_TEST(MultiVector, ComputeConjDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; auto result = @@ -298,7 +298,7 @@ TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongInputSize) } -TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongResultSize) +TYPED_TEST(MultiVector, ComputeConjDotFailsOnWrongResultSize) { using Mtx = typename TestFixture::Mtx; @@ -310,13 +310,13 @@ TYPED_TEST(BatchMultiVector, ComputeConjDotFailsOnWrongResultSize) } -TYPED_TEST(BatchMultiVector, ComputesNorm2) +TYPED_TEST(MultiVector, ComputesNorm2) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using T_nc = gko::remove_complex; - using NormVector = gko::BatchMultiVector; - auto mtx(gko::batch_initialize( + using NormVector = gko::batch::MultiVector; + auto mtx(gko::batch::initialize( {{I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}}, {I{-4.0, 2.0}, I{-3.0, -2.0}, I{0.0, 1.0}}}, 
this->exec)); @@ -332,7 +332,7 @@ TYPED_TEST(BatchMultiVector, ComputesNorm2) } -TYPED_TEST(BatchMultiVector, CopiesData) +TYPED_TEST(MultiVector, CopiesData) { gko::kernels::reference::batch_multi_vector::copy( this->exec, this->mtx_0.get(), this->mtx_1.get()); @@ -341,14 +341,14 @@ TYPED_TEST(BatchMultiVector, CopiesData) } -TYPED_TEST(BatchMultiVector, ConvertsToPrecision) +TYPED_TEST(MultiVector, ConvertsToPrecision) { - using BatchMultiVector = typename TestFixture::Mtx; + using MultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchMultiVector = typename gko::BatchMultiVector; - auto tmp = OtherBatchMultiVector::create(this->exec); - auto res = BatchMultiVector::create(this->exec); + using OtherMultiVector = typename gko::batch::MultiVector; + auto tmp = OtherMultiVector::create(this->exec); + auto res = MultiVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} @@ -364,14 +364,14 @@ TYPED_TEST(BatchMultiVector, ConvertsToPrecision) } -TYPED_TEST(BatchMultiVector, MovesToPrecision) +TYPED_TEST(MultiVector, MovesToPrecision) { - using BatchMultiVector = typename TestFixture::Mtx; + using MultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchMultiVector = typename gko::BatchMultiVector; - auto tmp = OtherBatchMultiVector::create(this->exec); - auto res = BatchMultiVector::create(this->exec); + using OtherMultiVector = typename gko::batch::MultiVector; + auto tmp = OtherMultiVector::create(this->exec); + auto res = MultiVector::create(this->exec); // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} @@ -387,14 +387,14 @@ TYPED_TEST(BatchMultiVector, MovesToPrecision) } -TYPED_TEST(BatchMultiVector, ConvertsEmptyToPrecision) +TYPED_TEST(MultiVector, ConvertsEmptyToPrecision) { - using BatchMultiVector = typename TestFixture::Mtx; + using MultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchMultiVector = typename gko::BatchMultiVector; - auto empty = OtherBatchMultiVector::create(this->exec); - auto res = BatchMultiVector::create(this->exec); + using OtherMultiVector = typename gko::batch::MultiVector; + auto empty = OtherMultiVector::create(this->exec); + auto res = MultiVector::create(this->exec); empty->convert_to(res.get()); @@ -402,14 +402,14 @@ TYPED_TEST(BatchMultiVector, ConvertsEmptyToPrecision) } -TYPED_TEST(BatchMultiVector, MovesEmptyToPrecision) +TYPED_TEST(MultiVector, MovesEmptyToPrecision) { - using BatchMultiVector = typename TestFixture::Mtx; + using MultiVector = typename TestFixture::Mtx; using T = typename TestFixture::value_type; using OtherT = typename gko::next_precision; - using OtherBatchMultiVector = typename gko::BatchMultiVector; - auto empty = OtherBatchMultiVector::create(this->exec); - auto res = BatchMultiVector::create(this->exec); + using OtherMultiVector = typename gko::batch::MultiVector; + auto empty = OtherMultiVector::create(this->exec); + auto res = MultiVector::create(this->exec); empty->move_to(res.get()); diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index a55ff0792ad..abd7b02fd1a 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -50,13 +50,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "test/utils/executor.hpp" -class BatchMultiVector : public CommonTestFixture { +class MultiVector : public CommonTestFixture { protected: - using Mtx = gko::BatchMultiVector; - using NormVector = gko::BatchMultiVector>; - using ComplexMtx = gko::BatchMultiVector>; + using Mtx = gko::batch::MultiVector; + using NormVector = gko::batch::MultiVector>; + using ComplexMtx = gko::batch::MultiVector>; - BatchMultiVector() : rand_engine(15) {} + MultiVector() : rand_engine(15) {} template std::unique_ptr gen_mtx(const size_t num_batch_items, @@ -80,8 +80,8 @@ class BatchMultiVector : public CommonTestFixture { alpha = gen_mtx(batch_size, 1, num_vecs); beta = gen_mtx(batch_size, 1, num_vecs); } else { - alpha = gko::batch_initialize(batch_size, {2.0}, ref); - beta = gko::batch_initialize(batch_size, {-0.5}, ref); + alpha = gko::batch::initialize(batch_size, {2.0}, ref); + beta = gko::batch::initialize(batch_size, {-0.5}, ref); } dx = gko::clone(exec, x); dy = gko::clone(exec, y); @@ -117,7 +117,7 @@ class BatchMultiVector : public CommonTestFixture { }; -TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) +TEST_F(MultiVector, SingleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(1); @@ -128,7 +128,7 @@ TEST_F(BatchMultiVector, SingleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) +TEST_F(MultiVector, MultipleVectorAddScaledIsEquivalentToRef) { set_up_vector_data(20); @@ -139,7 +139,7 @@ TEST_F(BatchMultiVector, MultipleVectorAddScaledIsEquivalentToRef) } -TEST_F(BatchMultiVector, +TEST_F(MultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -151,7 +151,7 @@ TEST_F(BatchMultiVector, } -TEST_F(BatchMultiVector, SingleVectorScaleIsEquivalentToRef) +TEST_F(MultiVector, SingleVectorScaleIsEquivalentToRef) { set_up_vector_data(1); @@ -162,7 +162,7 @@ TEST_F(BatchMultiVector, SingleVectorScaleIsEquivalentToRef) } -TEST_F(BatchMultiVector, MultipleVectorScaleIsEquivalentToRef) +TEST_F(MultiVector, MultipleVectorScaleIsEquivalentToRef) { set_up_vector_data(20); @@ -173,7 +173,7 @@ TEST_F(BatchMultiVector, MultipleVectorScaleIsEquivalentToRef) } -TEST_F(BatchMultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) +TEST_F(MultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); @@ -184,7 +184,7 @@ TEST_F(BatchMultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeNorm2SingleIsEquivalentToRef) +TEST_F(MultiVector, ComputeNorm2SingleIsEquivalentToRef) { set_up_vector_data(1); auto norm_size = @@ -199,7 +199,7 @@ TEST_F(BatchMultiVector, ComputeNorm2SingleIsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeNorm2IsEquivalentToRef) +TEST_F(MultiVector, ComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); auto norm_size = @@ -214,7 +214,7 @@ TEST_F(BatchMultiVector, ComputeNorm2IsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) +TEST_F(MultiVector, ComputeDotIsEquivalentToRef) { set_up_vector_data(20); auto dot_size = @@ -234,7 +234,7 @@ TEST_F(BatchMultiVector, ComputeDotIsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) +TEST_F(MultiVector, ComputeDotSingleIsEquivalentToRef) { set_up_vector_data(1); auto dot_size = @@ -249,7 +249,7 @@ TEST_F(BatchMultiVector, ComputeDotSingleIsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeConjDotIsEquivalentToRef) +TEST_F(MultiVector, ComputeConjDotIsEquivalentToRef) { 
set_up_vector_data(20); auto dot_size = @@ -269,7 +269,7 @@ TEST_F(BatchMultiVector, ComputeConjDotIsEquivalentToRef) } -TEST_F(BatchMultiVector, ComputeConjDotSingleIsEquivalentToRef) +TEST_F(MultiVector, ComputeConjDotSingleIsEquivalentToRef) { set_up_vector_data(1); auto dot_size = @@ -284,7 +284,7 @@ TEST_F(BatchMultiVector, ComputeConjDotSingleIsEquivalentToRef) } -TEST_F(BatchMultiVector, CopySingleIsEquivalentToRef) +TEST_F(MultiVector, CopySingleIsEquivalentToRef) { set_up_vector_data(1); @@ -297,7 +297,7 @@ TEST_F(BatchMultiVector, CopySingleIsEquivalentToRef) } -TEST_F(BatchMultiVector, CopyIsEquivalentToRef) +TEST_F(MultiVector, CopyIsEquivalentToRef) { set_up_vector_data(20); diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index ed62e3ca3d3..d2c273b4e0f 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -219,7 +219,7 @@ int main() // core/base/batch_multi_vector.hpp { using type1 = float; - using batch_multi_vector_type = gko::BatchMultiVector; + using batch_multi_vector_type = gko::batch::MultiVector; auto test = batch_multi_vector_type::create(exec); } From 3cc3925acbc4eebc5296f754d986c165896edda2 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 1 Aug 2023 14:57:30 +0200 Subject: [PATCH 145/583] Rename to extract_batch_item --- .../base/batch_multi_vector_kernels.hpp.inc | 24 +++++----- core/base/batch_struct.hpp | 13 +++--- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 45 ++++++++++--------- omp/base/batch_multi_vector_kernels.cpp | 30 ++++++------- reference/base/batch_multi_vector_kernels.cpp | 30 ++++++------- 5 files changed, 74 insertions(+), 68 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index df64e5cfe85..19c3c330f45 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -59,8 +59,8 @@ __global__ __launch_bounds__( { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto alpha_b = gko::batch::batch_item(alpha, batch_id); - const auto x_b = gko::batch::batch_item(x, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); scale(alpha_b, x_b, map); } } @@ -100,9 +100,9 @@ __global__ __launch_bounds__( { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto alpha_b = gko::batch::batch_item(alpha, batch_id); - const auto x_b = gko::batch::batch_item(x, batch_id); - const auto y_b = gko::batch::batch_item(y, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); add_scaled(alpha_b, x_b, y_b, map); } } @@ -162,9 +162,9 @@ __global__ { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto x_b = gko::batch::batch_item(x, batch_id); - const auto y_b = gko::batch::batch_item(y, batch_id); - const auto r_b = gko::batch::batch_item(result, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); compute_gen_dot_product(x_b, y_b, r_b, map); } } @@ -236,8 +236,8 @@ __global__ 
__launch_bounds__( { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { - const auto x_b = gko::batch::batch_item(x, batch_id); - const auto r_b = gko::batch::batch_item(result, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); compute_norm2(x_b, r_b); } } @@ -271,8 +271,8 @@ __global__ { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; batch_id += gridDim.x) { - const auto dst_b = gko::batch::batch_item(dst, batch_id); - const auto src_b = gko::batch::batch_item(src, batch_id); + const auto dst_b = gko::batch::extract_batch_item(dst, batch_id); + const auto src_b = gko::batch::extract_batch_item(src, batch_id); copy(src_b, dst_b); } } diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index d22b64f3320..caca4577cf7 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -107,18 +107,19 @@ GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch to_const( * @param batch_idx The position of the desired object in the batch */ template -GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item batch_item( - const multi_vector::uniform_batch& batch, - const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item +extract_batch_item(const multi_vector::uniform_batch& batch, + const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, batch.stride, batch.num_rows, batch.num_rhs}; } template -GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item batch_item( - ValueType* const batch_values, const int stride, const int num_rows, - const int num_rhs, const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item +extract_batch_item(ValueType* const batch_values, const int stride, + const int num_rows, const int num_rhs, + const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, num_rhs}; diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index f307b6ba240..5c52ab5a50f 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -91,8 +91,9 @@ void scale(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_item(alpha_ub, group_id); - const auto x_b = batch::batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); scale_kernel(alpha_b, x_b, item_ct1, [](int col) { return 0; }); }); @@ -103,8 +104,9 @@ void scale(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_item(alpha_ub, group_id); - const auto x_b = batch::batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); scale_kernel(alpha_b, x_b, item_ct1, [](int col) { return col; }); }); @@ -141,9 +143,10 @@ void add_scaled(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_item(alpha_ub, group_id); - const auto x_b = 
batch::batch_item(x_ub, group_id); - const auto y_b = batch::batch_item(y_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, [](auto col) { return 0; }); }); @@ -154,9 +157,10 @@ void add_scaled(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto alpha_b = batch::batch_item(alpha_ub, group_id); - const auto x_b = batch::batch_item(x_ub, group_id); - const auto y_b = batch::batch_item(y_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, [](auto col) { return col; }); }); @@ -194,9 +198,9 @@ void compute_dot(std::shared_ptr exec, config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_item(x_ub, group_id); - const auto y_b = batch::batch_item(y_ub, group_id); - const auto res_b = batch::batch_item(res_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); + const auto res_b = batch::extract_batch_item(res_ub, group_id); compute_gen_dot_product_kernel(x_b, y_b, res_b, item_ct1, [](auto val) { return val; }); }); @@ -232,9 +236,9 @@ void compute_conj_dot(std::shared_ptr exec, config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_item(x_ub, group_id); - const auto y_b = batch::batch_item(y_ub, group_id); - const auto res_b = batch::batch_item(res_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); + const auto res_b = batch::extract_batch_item(res_ub, group_id); compute_gen_dot_product_kernel( x_b, y_b, res_b, item_ct1, [](auto val) { return conj(val); }); @@ -269,8 +273,8 @@ void compute_norm2(std::shared_ptr exec, config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_item(x_ub, group_id); - const auto res_b = batch::batch_item(res_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto res_b = batch::extract_batch_item(res_ub, group_id); compute_norm2_kernel(x_b, res_b, item_ct1); }); }); @@ -301,8 +305,9 @@ void copy(std::shared_ptr exec, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); - const auto x_b = batch::batch_item(x_ub, group_id); - const auto result_b = batch::batch_item(result_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto result_b = + batch::extract_batch_item(result_ub, group_id); copy_kernel(x_b, result_b, item_ct1); }); }); diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index deef105db0d..6067e762c98 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -68,8 +68,8 @@ void scale(std::shared_ptr exec, const auto alpha_ub = host::get_batch_struct(alpha); #pragma omp parallel for for 
(size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); scale_kernel(alpha_b, x_b); } } @@ -89,9 +89,9 @@ void add_scaled(std::shared_ptr exec, const auto alpha_ub = host::get_batch_struct(alpha); #pragma omp parallel for for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) { - const auto alpha_b = gko::batch::batch_item(alpha_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); + const auto y_b = gko::batch::extract_batch_item(y_ub, batch); add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -111,9 +111,9 @@ void compute_dot(std::shared_ptr exec, const auto res_ub = host::get_batch_struct(result); #pragma omp parallel for for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto res_b = gko::batch::extract_batch_item(res_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); + const auto y_b = gko::batch::extract_batch_item(y_ub, batch); compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -133,9 +133,9 @@ void compute_conj_dot(std::shared_ptr exec, const auto res_ub = host::get_batch_struct(result); #pragma omp parallel for for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); - const auto y_b = gko::batch::batch_item(y_ub, batch); + const auto res_b = gko::batch::extract_batch_item(res_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); + const auto y_b = gko::batch::extract_batch_item(y_ub, batch); compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -153,8 +153,8 @@ void compute_norm2(std::shared_ptr exec, const auto res_ub = host::get_batch_struct(result); #pragma omp parallel for for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = gko::batch::batch_item(res_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto res_b = gko::batch::extract_batch_item(res_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); compute_norm2_kernel(x_b, res_b); } } @@ -172,8 +172,8 @@ void copy(std::shared_ptr exec, const auto result_ub = host::get_batch_struct(result); #pragma omp parallel for for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto result_b = gko::batch::batch_item(result_ub, batch); - const auto x_b = gko::batch::batch_item(x_ub, batch); + const auto result_b = gko::batch::extract_batch_item(result_ub, batch); + const auto x_b = gko::batch::extract_batch_item(x_ub, batch); copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index 076fd87778d..89476e61453 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -67,8 +67,8 @@ void scale(std::shared_ptr exec, const auto 
x_ub = host::get_batch_struct(x); const auto alpha_ub = host::get_batch_struct(alpha); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto alpha_b = batch::batch_item(alpha_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); + const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); scale_kernel(alpha_b, x_b); } } @@ -87,9 +87,9 @@ void add_scaled(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto alpha_ub = host::get_batch_struct(alpha); for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) { - const auto alpha_b = batch::batch_item(alpha_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); - const auto y_b = batch::batch_item(y_ub, batch); + const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); + const auto y_b = batch::extract_batch_item(y_ub, batch); add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -108,9 +108,9 @@ void compute_dot(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = batch::batch_item(res_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); - const auto y_b = batch::batch_item(y_ub, batch); + const auto res_b = batch::extract_batch_item(res_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); + const auto y_b = batch::extract_batch_item(y_ub, batch); compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -129,9 +129,9 @@ void compute_conj_dot(std::shared_ptr exec, const auto y_ub = host::get_batch_struct(y); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = batch::batch_item(res_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); - const auto y_b = batch::batch_item(y_ub, batch); + const auto res_b = batch::extract_batch_item(res_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); + const auto y_b = batch::extract_batch_item(y_ub, batch); compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -148,8 +148,8 @@ void compute_norm2(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto res_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { - const auto res_b = batch::batch_item(res_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); + const auto res_b = batch::extract_batch_item(res_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); compute_norm2_kernel(x_b, res_b); } } @@ -166,8 +166,8 @@ void copy(std::shared_ptr exec, const auto x_ub = host::get_batch_struct(x); const auto result_ub = host::get_batch_struct(result); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto result_b = batch::batch_item(result_ub, batch); - const auto x_b = batch::batch_item(x_ub, batch); + const auto result_b = batch::extract_batch_item(result_ub, batch); + const auto x_b = batch::extract_batch_item(x_ub, batch); copy_kernel(x_b, result_b); } } From f41b0df4797363f9b7e4727b112b6c89920a7c2e Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Tue, 1 Aug 2023 13:00:43 +0000 Subject: [PATCH 146/583] Format files Co-authored-by: Pratik Nayak --- .../base/batch_multi_vector_kernels.hpp.inc | 29 +++++------ 
dpcpp/base/batch_multi_vector_kernels.dp.cpp | 52 ++++++++++--------- test/base/batch_multi_vector_kernels.cpp | 7 ++- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 19c3c330f45..9f77598ff5a 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -47,15 +47,10 @@ __device__ __forceinline__ void scale( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void scale_kernel(const gko::batch::multi_vector:: - uniform_batch - alpha, - const gko::batch::multi_vector:: - uniform_batch - x, - Mapping map) +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { @@ -154,11 +149,11 @@ __device__ __forceinline__ void compute_gen_dot_product( template __global__ - __launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( - const gko::batch::multi_vector::uniform_batch x, - const gko::batch::multi_vector::uniform_batch y, - const gko::batch::multi_vector::uniform_batch result, - Mapping map) +__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, + const gko::batch::multi_vector::uniform_batch result, + Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { @@ -265,9 +260,9 @@ __device__ __forceinline__ void copy( template __global__ - __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( - const gko::batch::multi_vector::uniform_batch src, - const gko::batch::multi_vector::uniform_batch dst) +__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( + const gko::batch::multi_vector::uniform_batch src, + const gko::batch::multi_vector::uniform_batch dst) { for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; batch_id += gridDim.x) { diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 5c52ab5a50f..10e47ba080e 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -193,9 +193,9 @@ void compute_dot(std::shared_ptr exec, // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::extract_batch_item(x_ub, group_id); @@ -231,18 +231,19 @@ void compute_conj_dot(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - 
const auto y_b = batch::extract_batch_item(y_ub, group_id); - const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return conj(val); }); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); + const auto res_b = + batch::extract_batch_item(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); + }); }); } @@ -267,16 +268,17 @@ void compute_norm2(std::shared_ptr exec, const dim3 grid(num_batches); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = + batch::extract_batch_item(x_ub, group_id); + const auto res_b = batch::extract_batch_item( + res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); }); } diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index abd7b02fd1a..2d0c79d0664 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -59,8 +59,8 @@ class MultiVector : public CommonTestFixture { MultiVector() : rand_engine(15) {} template - std::unique_ptr gen_mtx(const size_t num_batch_items, - int num_rows, int num_cols) + std::unique_ptr gen_mtx(const size_t num_batch_items, int num_rows, + int num_cols) { return gko::test::generate_uniform_batch_random_matrix( num_batch_items, num_rows, num_cols, @@ -139,8 +139,7 @@ TEST_F(MultiVector, MultipleVectorAddScaledIsEquivalentToRef) } -TEST_F(MultiVector, - MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) +TEST_F(MultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef) { set_up_vector_data(20, true); From 9f150ba084c068d459f4c27e84718ab158e79852 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 2 Aug 2023 00:41:59 +0200 Subject: [PATCH 147/583] Add Dense matrix view creation --- core/base/batch_multi_vector.cpp | 73 +++++++++++-------- core/test/base/batch_multi_vector.cpp | 12 ++- .../ginkgo/core/base/batch_multi_vector.hpp | 22 +++++- 3 files changed, 74 insertions(+), 33 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index f17f1479f5f..88a203300de 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -85,6 +85,38 @@ batch_dim<2> compute_batch_size( } // namespace detail +template +std::unique_ptr> +MultiVector::create_view_for_item(size_type item_id) +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create( + exec, this->get_common_size(), + make_array_view(exec, 
num_rows * stride, + this->get_values_for_item(item_id)), + stride); + return mat; +} + + +template +std::unique_ptr> +MultiVector::create_const_view_for_item(size_type item_id) const +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create_const( + exec, this->get_common_size(), + make_const_array_view(exec, num_rows * stride, + this->get_const_values_for_item(item_id)), + stride); + return mat; +} + + template MultiVector::MultiVector(std::shared_ptr exec, const batch_dim<2>& size) @@ -164,18 +196,13 @@ template std::vector>> MultiVector::unbatch() const { - using unbatch_type = matrix::Dense; auto exec = this->get_executor(); - auto unbatch_mats = std::vector>{}; + auto unbatched_mats = std::vector>{}; for (size_type b = 0; b < this->get_num_batch_items(); ++b) { - auto mat = unbatch_type::create(exec, this->get_common_size()); - exec->copy_from(exec.get(), mat->get_num_stored_elements(), - this->get_const_values() + - this->get_size().get_cumulative_offset(b), - mat->get_values()); - unbatch_mats.emplace_back(std::move(mat)); + unbatched_mats.emplace_back( + this->create_const_view_for_item(b)->clone()); } - return unbatch_mats; + return unbatched_mats; } @@ -336,19 +363,15 @@ void read_impl(MatrixType* mtx, const std::vector& data) GKO_THROW_IF_INVALID(data.size() > 0, "Input data is empty"); auto common_size = data[0].size; - auto batch_size = batch_dim<2>(data.size(), common_size); - for (const auto& b : data) { - auto b_size = b.size; - GKO_ASSERT_EQUAL_DIMENSIONS(common_size, b_size); - } + auto num_batch_items = data.size(); + auto batch_size = batch_dim<2>(num_batch_items, common_size); auto tmp = MatrixType::create(mtx->get_executor()->get_master(), batch_size); - tmp->fill(zero()); - for (size_type b = 0; b < data.size(); ++b) { - for (const auto& elem : data[b].nonzeros) { - tmp->at(b, elem.row, elem.column) = elem.value; - } + for (size_type b = 0; b < num_batch_items; ++b) { + assert(data[b].size == common_size); + tmp->create_view_for_item(b)->read(data[b]); } + tmp->move_to(mtx); } @@ -370,20 +393,10 @@ void MultiVector::read(const std::vector& data) template void write_impl(const MatrixType* mtx, std::vector& data) { - auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx); - data = std::vector(mtx->get_num_batch_items()); for (size_type b = 0; b < mtx->get_num_batch_items(); ++b) { data[b] = {mtx->get_common_size(), {}}; - for (size_type row = 0; row < data[b].size[0]; ++row) { - for (size_type col = 0; col < data[b].size[1]; ++col) { - if (tmp->at(b, row, col) != - zero()) { - data[b].nonzeros.emplace_back(row, col, - tmp->at(b, row, col)); - } - } - } + mtx->create_const_view_for_item(b)->write(data[b]); } } diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index e87cedca913..055c2b899d0 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -55,7 +55,9 @@ class MultiVector : public ::testing::Test { mtx(gko::batch::initialize>( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - exec)) + exec)), + dense_mtx(gko::initialize>( + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)) {} @@ -89,6 +91,7 @@ class MultiVector : public ::testing::Test { std::shared_ptr exec; std::unique_ptr> mtx; + std::unique_ptr> dense_mtx; }; TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes); @@ -118,6 +121,13 @@ TYPED_TEST(MultiVector, CanGetValuesForEntry) } 
+TYPED_TEST(MultiVector, CanCreateDenseItemView) +{ + GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->dense_mtx, + 0.0); +} + + TYPED_TEST(MultiVector, CanBeCopied) { auto mtx_copy = gko::batch::MultiVector::create(this->exec); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 0e011f6b3ef..77171569320 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -130,6 +130,24 @@ class MultiVector void write(std::vector& data) const override; + /** + * Creates a mutable view (of matrix::Dense type) of one item of the Batch + * MultiVector object. Does not perform any deep copies, but only returns a + * view of the data. + * + * @param item_id The index of the batch item + * + * @return a matrix::Dense object with the data from the batch item at the + * given index. + */ + std::unique_ptr create_view_for_item(size_type item_id); + + /** + * @copydoc create_view_for_item(size_type) + */ + std::unique_ptr create_const_view_for_item( + size_type item_id) const; + /** * Unbatches the batched multi-vector and creates a std::vector of Dense * matrices @@ -208,8 +226,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From 93e401b11bc1ab105a8c865ba8e7885d1f5c6288 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 2 Aug 2023 17:24:47 +0200 Subject: [PATCH 148/583] Move read/write/unbatch to Ginkgo internal --- core/base/batch_multi_vector.cpp | 125 ------------- core/base/batch_utilities.hpp | 167 ++++++++++++++++++ core/test/base/batch_multi_vector.cpp | 67 ++++--- core/test/utils/assertions.hpp | 13 +- core/test/utils/batch_helpers.hpp | 76 ++------ .../ginkgo/core/base/batch_lin_op_helpers.hpp | 109 ------------ .../ginkgo/core/base/batch_multi_vector.hpp | 75 +------- include/ginkgo/ginkgo.hpp | 1 - .../test/base/batch_multi_vector_kernels.cpp | 38 ++-- test/base/batch_multi_vector_kernels.cpp | 10 +- 10 files changed, 248 insertions(+), 433 deletions(-) create mode 100644 core/base/batch_utilities.hpp delete mode 100644 include/ginkgo/core/base/batch_lin_op_helpers.hpp diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 88a203300de..23591cd1ffe 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -126,59 +126,6 @@ MultiVector::MultiVector(std::shared_ptr exec, {} -template -MultiVector::MultiVector( - std::shared_ptr exec, - const std::vector*>& matrices) - : EnablePolymorphicObject>(exec), - batch_size_{detail::compute_batch_size(matrices)}, - values_(exec, compute_num_elems(batch_size_)) -{ - for (size_type i = 0; i < this->get_num_batch_items(); ++i) { - auto local_exec = matrices[i]->get_executor(); - exec->copy_from( - local_exec.get(), matrices[i]->get_num_stored_elements(), - matrices[i]->get_const_values(), - this->get_values() + this->get_size().get_cumulative_offset(i)); - } -} - - -template -MultiVector::MultiVector(std::shared_ptr exec, - size_type num_duplications, - const matrix::Dense* input) - : MultiVector(exec, - batch_dim<2>(num_duplications, input->get_size())) -{ - size_type offset = 0; - for (size_type i = 0; i < num_duplications; ++i) { - 
exec->copy_from(input->get_executor().get(), - input->get_num_stored_elements(), - input->get_const_values(), this->get_values() + offset); - offset += input->get_num_stored_elements(); - } -} - - -template -MultiVector::MultiVector(std::shared_ptr exec, - size_type num_duplications, - const MultiVector* input) - : MultiVector( - exec, batch_dim<2>(input->get_num_batch_items() * num_duplications, - input->get_common_size())) -{ - size_type offset = 0; - for (size_type i = 0; i < num_duplications; ++i) { - exec->copy_from(input->get_executor().get(), - input->get_num_stored_elements(), - input->get_const_values(), this->get_values() + offset); - offset += input->get_num_stored_elements(); - } -} - - template std::unique_ptr> MultiVector::create_with_config_of( @@ -192,20 +139,6 @@ MultiVector::create_with_config_of( } -template -std::vector>> -MultiVector::unbatch() const -{ - auto exec = this->get_executor(); - auto unbatched_mats = std::vector>{}; - for (size_type b = 0; b < this->get_num_batch_items(); ++b) { - unbatched_mats.emplace_back( - this->create_const_view_for_item(b)->clone()); - } - return unbatched_mats; -} - - template std::unique_ptr> MultiVector::create_const( @@ -357,64 +290,6 @@ void MultiVector::move_to( } -template -void read_impl(MatrixType* mtx, const std::vector& data) -{ - GKO_THROW_IF_INVALID(data.size() > 0, "Input data is empty"); - - auto common_size = data[0].size; - auto num_batch_items = data.size(); - auto batch_size = batch_dim<2>(num_batch_items, common_size); - auto tmp = - MatrixType::create(mtx->get_executor()->get_master(), batch_size); - for (size_type b = 0; b < num_batch_items; ++b) { - assert(data[b].size == common_size); - tmp->create_view_for_item(b)->read(data[b]); - } - - tmp->move_to(mtx); -} - - -template -void MultiVector::read(const std::vector& data) -{ - read_impl(this, data); -} - - -template -void MultiVector::read(const std::vector& data) -{ - read_impl(this, data); -} - - -template -void write_impl(const MatrixType* mtx, std::vector& data) -{ - data = std::vector(mtx->get_num_batch_items()); - for (size_type b = 0; b < mtx->get_num_batch_items(); ++b) { - data[b] = {mtx->get_common_size(), {}}; - mtx->create_const_view_for_item(b)->write(data[b]); - } -} - - -template -void MultiVector::write(std::vector& data) const -{ - write_impl(this, data); -} - - -template -void MultiVector::write(std::vector& data) const -{ - write_impl(this, data); -} - - #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp new file mode 100644 index 00000000000..f7f28a616d8 --- /dev/null +++ b/core/base/batch_utilities.hpp @@ -0,0 +1,167 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_BATCH_UTILITIES_HPP_ +#define GKO_CORE_BASE_BATCH_UTILITIES_HPP_ + + +#include + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { +namespace multivector { + + +template +std::unique_ptr> duplicate( + std::shared_ptr exec, size_type num_duplications, + const batch::MultiVector* input) +{ + auto num_batch_items = input->get_num_batch_items(); + auto tmp = batch::MultiVector::create( + exec, batch_dim<2>(num_batch_items * num_duplications, + input->get_common_size())); + + for (size_type i = 0; i < num_duplications; ++i) { + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(i * num_batch_items + b) + ->copy_from(input->create_const_view_for_item(b).get()); + } + } + + return std::move(tmp); +} + + +template +std::unique_ptr> create_from_dense( + std::shared_ptr exec, const size_type num_duplications, + const matrix::Dense* input) +{ + auto num_batch_items = num_duplications; + auto tmp = batch::MultiVector::create( + exec, batch_dim<2>(num_batch_items, input->get_size())); + + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(b)->copy_from(input); + } + + return std::move(tmp); +} + + +template +std::unique_ptr> create_from_dense( + std::shared_ptr exec, + const std::vector*>& input) +{ + auto num_batch_items = input.size(); + auto tmp = batch::MultiVector::create( + exec, batch_dim<2>(num_batch_items, input[0]->get_size())); + + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(b)->copy_from(input[b]); + } + + return std::move(tmp); +} + + +template +std::vector>> unbatch( + const batch::MultiVector* batch_multivec) +{ + auto exec = batch_multivec->get_executor(); + auto unbatched_mats = + std::vector>>{}; + for (size_type b = 0; b < batch_multivec->get_num_batch_items(); ++b) { + unbatched_mats.emplace_back( + batch_multivec->create_const_view_for_item(b)->clone()); + } + return unbatched_mats; +} + + +template +std::unique_ptr> read( + std::shared_ptr exec, + const std::vector>& data) +{ + auto num_batch_items = data.size(); + auto tmp = MultiVector::create( + exec, batch_dim<2>(num_batch_items, data[0].size)); + + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(b)->read(data[b]); + } + + return std::move(tmp); +} + + +template +std::vector> write( + const MultiVector* mvec) +{ + auto data = std::vector>( + mvec->get_num_batch_items()); + + for (size_type 
b = 0; b < mvec->get_num_batch_items(); ++b) { + data[b] = {mvec->get_common_size(), {}}; + mvec->create_const_view_for_item(b)->write(data[b]); + } + + return data; +} + + +} // namespace multivector +} // namespace batch +} // namespace gko + + +#endif // GKO_CORE_BASE_BATCH_UTILITIES_HPP_ diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 055c2b899d0..85168a406cc 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -41,7 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/batch_helpers.hpp" template @@ -250,7 +252,7 @@ TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::MultiVector::create( + auto m = gko::batch::multivector::create_from_dense( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); @@ -267,16 +269,16 @@ TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::batch::MultiVector::create( + auto bat_m = gko::batch::multivector::create_from_dense( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); auto m = - gko::batch::MultiVector::create(this->exec, 3, mat1.get()); + gko::batch::multivector::create_from_dense(this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(MultiVector, CanBeConstructedFromMultiVectorMatrices) +TYPED_TEST(MultiVector, CanBeConstructedByDuplicatingMultiVectors) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -285,14 +287,14 @@ TYPED_TEST(MultiVector, CanBeConstructedFromMultiVectorMatrices) this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::MultiVector::create( + auto m = gko::batch::multivector::create_from_dense( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::batch::MultiVector::create( + auto m_ref = gko::batch::multivector::create_from_dense( this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), mat1.get(), mat2.get()}); auto m2 = - gko::batch::MultiVector::create(this->exec, 3, m.get()); + gko::batch::multivector::duplicate(this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } @@ -383,7 +385,7 @@ TYPED_TEST(MultiVector, CanBeUnbatchedIntoDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto dense_mats = this->mtx->unbatch(); + auto dense_mats = gko::batch::multivector::unbatch(this->mtx.get()); ASSERT_EQ(dense_mats.size(), 2); GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); @@ -394,22 +396,19 @@ TYPED_TEST(MultiVector, CanBeUnbatchedIntoDenseMatrices) TYPED_TEST(MultiVector, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::batch::MultiVector::create(this->exec); - // clang-format off - m->read({gko::matrix_data{{2, 2}, - {{0, 0, 1.0}, - {0, 1, 3.0}, - {1, 0, 0.0}, - {1, 1, 5.0}}}, - gko::matrix_data{{2, 2}, - {{0, 0, -1.0}, - {0, 1, 0.5}, - {1, 0, 0.0}, - {1, 1, 9.0}}}}); - // clang-format on + using index_type = int; - ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + auto vec_data = std::vector>{}; + 
vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); + + auto m = gko::batch::multivector::read(this->exec, + vec_data); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); @@ -423,18 +422,15 @@ TYPED_TEST(MultiVector, CanBeReadFromMatrixData) TYPED_TEST(MultiVector, CanBeReadFromSparseMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::batch::MultiVector::create(this->exec); + using index_type = int; + auto vec_data = std::vector>{}; + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); - // clang-format off - m->read({gko::matrix_data{{2, 2}, - {{0, 0, 1.0}, - {0, 1, 3.0}, - {1, 1, 5.0}}}, - gko::matrix_data{{2, 2}, - {{0, 0, -1.0}, - {0, 1, 0.5}, - {1, 1, 9.0}}}}); - // clang-format on + auto m = gko::batch::multivector::read(this->exec, + vec_data); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); @@ -451,10 +447,11 @@ TYPED_TEST(MultiVector, CanBeReadFromSparseMatrixData) TYPED_TEST(MultiVector, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; + using index_type = int; using tpl = typename gko::matrix_data::nonzero_type; - std::vector> data; - this->mtx->write(data); + auto data = + gko::batch::multivector::write(this->mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); ASSERT_EQ(data[0].nonzeros.size(), 6); diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index bae78912a6c..153907cf2cf 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -55,6 +55,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/batch_utilities.hpp" #include "core/base/extended_float.hpp" @@ -714,15 +715,11 @@ ::testing::AssertionResult batch_matrices_near( const Mat2* second, double tolerance) { auto exec = first->get_executor()->get_master(); - std::vector< - matrix_data> - first_data; - std::vector< - matrix_data> - second_data; + using value_type1 = typename Mat1::value_type; + using value_type2 = typename Mat2::value_type; - first->write(first_data); - second->write(second_data); + auto first_data = gko::batch::multivector::write(first); + auto second_data = gko::batch::multivector::write(second); if (first_data.size() != second_data.size()) { return ::testing::AssertionFailure() diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index 3b9e673922e..4cf9d4973e2 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -35,13 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include +#include #include "core/test/utils/assertions.hpp" +#include "core/test/utils/matrix_generator.hpp" namespace gko { @@ -68,72 +71,27 @@ std::vector> share(std::vector>&& objs) */ template -std::unique_ptr generate_uniform_batch_random_matrix( - const size_type batch_size, const size_type num_rows, +std::unique_ptr generate_random_batch_matrix( + const size_type num_batch_items, const size_type num_rows, const size_type num_cols, NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist, Engine&& engine, - const bool with_all_diagonals, std::shared_ptr exec, - MatrixArgs&&... args) + std::shared_ptr exec, MatrixArgs&&... args) { using value_type = typename MatrixType::value_type; using index_type = typename MatrixType::index_type; - - // generate sparsity pattern - matrix_data in_data{gko::dim<2>{num_rows, num_cols}, - {}}; - - for (size_type row = 0; row < num_rows; ++row) { - // randomly generate number of nonzeros in this row - std::vector col_idx(num_cols); - std::iota(begin(col_idx), end(col_idx), size_type(0)); - const auto nnz_row = static_cast(nonzero_dist(engine)); - size_type nnz_in_row = - std::max(size_type(0), std::min(nnz_row, num_cols)); - std::shuffle(std::begin(col_idx), std::end(col_idx), engine); - - if (with_all_diagonals) { - if (nnz_in_row == 0) { - nnz_in_row = 1; - } - bool has_diagonal = false; - for (size_type icol = 0; icol < nnz_in_row; icol++) { - if (col_idx[icol] == row) { - has_diagonal = true; - } - } - if (!has_diagonal) { - col_idx[0] = row; - } - } - - std::for_each(std::begin(col_idx), std::begin(col_idx) + nnz_in_row, - [&](size_type col) { - in_data.nonzeros.emplace_back(row, col, 1.0); - }); - } - - std::vector> batch_mtx; - batch_mtx.reserve(batch_size); - - for (int batch = 0; batch < batch_size; batch++) { - matrix_data data = in_data; - for (size_type nnz = 0; nnz < data.nonzeros.size(); ++nnz) { - value_type val = - gko::detail::get_rand_value(value_dist, engine); - if (data.nonzeros[nnz].column == data.nonzeros[nnz].row && - val == zero()) { - val = 1.0; - } - data.nonzeros[nnz].value = val; - } - - data.ensure_row_major_order(); - batch_mtx.emplace_back(std::move(data)); + auto result = MatrixType::create( + exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)), + std::forward(args)...); + + // TODO: Need to preserve sparsity pattern across batch items for batched + // sparse matrix formats + for (size_type b = 0; b < num_batch_items; b++) { + auto rand_mat = + generate_random_matrix( + num_rows, num_cols, nonzero_dist, value_dist, engine, exec); + result->create_view_for_item(b)->copy_from(rand_mat.get()); } - // convert to the correct matrix type - auto result = MatrixType::create(exec, std::forward(args)...); - result->read(batch_mtx); return result; } diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp deleted file mode 100644 index 5d1a2f8ed0d..00000000000 --- a/include/ginkgo/core/base/batch_lin_op_helpers.hpp +++ /dev/null @@ -1,109 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ -#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ - - -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace gko { - - -/** - * A BatchLinOp implementing this interface can read its data from a matrix_data - * structure. - * - * @ingroup BatchLinOp - */ -template -class BatchReadableFromMatrixData { -public: - using value_type = ValueType; - using index_type = IndexType; - - virtual ~BatchReadableFromMatrixData() = default; - - /** - * Reads a batch matrix from a std::vector of matrix_data objects. - * - * @param data the std::vector of matrix_data objects - */ - virtual void read( - const std::vector>& data) = 0; -}; - - -/** - * A BatchLinOp implementing this interface can write its data to a std::vector - * of matrix_data objects. - * - * @ingroup BatchLinOp - */ -template -class BatchWritableToMatrixData { -public: - using value_type = ValueType; - using index_type = IndexType; - - virtual ~BatchWritableToMatrixData() = default; - - /** - * Writes a matrix to a matrix_data structure. - * - * @param data the matrix_data structure - */ - virtual void write( - std::vector>& data) const = 0; -}; - - -} // namespace gko - - -#endif // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 77171569320..8003f5499f1 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -40,7 +40,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include -#include #include #include #include @@ -82,19 +81,13 @@ class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, public EnableCreateMethod>, - public ConvertibleTo>>, - public BatchReadableFromMatrixData, - public BatchReadableFromMatrixData, - public BatchWritableToMatrixData, - public BatchWritableToMatrixData { + public ConvertibleTo>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; friend class MultiVector>; public: - using BatchReadableFromMatrixData::read; - using BatchReadableFromMatrixData::read; using EnablePolymorphicAssignment::convert_to; using EnablePolymorphicAssignment::move_to; using ConvertibleTo>>::convert_to; @@ -103,8 +96,6 @@ class MultiVector using value_type = ValueType; using index_type = int32; using unbatch_type = matrix::Dense; - using mat_data = matrix_data; - using mat_data64 = matrix_data; using absolute_type = remove_complex>; using complex_type = to_complex>; @@ -122,14 +113,6 @@ class MultiVector void move_to(MultiVector>* result) override; - void read(const std::vector& data) override; - - void read(const std::vector& data) override; - - void write(std::vector& data) const override; - - void write(std::vector& data) const override; - /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. Does not perform any deep copies, but only returns a @@ -148,19 +131,6 @@ class MultiVector std::unique_ptr create_const_view_for_item( size_type item_id) const; - /** - * Unbatches the batched multi-vector and creates a std::vector of Dense - * matrices - * - * @note This is an expensive operation as new memory needs to be allocated - * and the data from the batched multi-vector needs to copied to the - * individual matrices. This is mainly intended as a utility function - * for debugging and testing purposes. - * - * @return a std::vector containing the matrix::Dense objects. - */ - std::vector> unbatch() const; - /** * Returns the batch size. * @@ -435,49 +405,6 @@ class MultiVector GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); } - /** - * Creates a MultiVector from a vector of matrices - * - * @param exec Executor associated to the vector - * @param matrices The matrix::Dense objects that need to be batched. - * - * @note This is a utility function that can serve as a first step to port - * to batched data-structures and solvers. Even if the matrices are in - * device memory, this method can have significant overhead, as new - * allocations and deep copies are necessary and hence this constructor must - * not be used in performance sensitive applications - */ - MultiVector(std::shared_ptr exec, - const std::vector*>& matrices); - - /** - * Creates a MultiVector matrix by duplicating MultiVector object - * - * @param exec Executor associated to the vector - * @param num_duplications The number of times to duplicate - * @param input the vector to be duplicated. - * - * @note This is a utility function that can serve as a first step to port - * to batched data-structures and solvers. Even if the matrices are in - * device memory, this method can have significant overhead, as new - * allocations and deep copies are necessary and hence this constructor must - * not be used in performance sensitive applications. 
- */ - MultiVector(std::shared_ptr exec, - size_type num_duplications, - const MultiVector* input); - - /** - * Creates a MultiVector matrix by a duplicating a matrix::Dense object - * - * @param exec Executor associated to the vector - * @param num_duplications The number of times to duplicate - * @param input the matrix to be duplicated. - */ - MultiVector(std::shared_ptr exec, - size_type num_duplications, - const matrix::Dense* input); - /** * Creates a MultiVector with the same configuration as the * callers object. diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index eebb31772ea..179a8a01a46 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -40,7 +40,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 82429660b32..4f922c37703 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -48,7 +48,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_multi_vector_kernels.hpp" +#include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/batch_helpers.hpp" template @@ -135,13 +137,13 @@ TYPED_TEST(MultiVector, ScalesData) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); - auto ualpha = alpha->unbatch(); + auto ualpha = gko::batch::multivector::unbatch(alpha.get()); this->mtx_0->scale(alpha.get()); this->mtx_00->scale(ualpha[0].get()); this->mtx_01->scale(ualpha[1].get()); - auto res = this->mtx_0->unbatch(); + auto res = gko::batch::multivector::unbatch(this->mtx_0.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_01.get(), 0.); } @@ -152,13 +154,13 @@ TYPED_TEST(MultiVector, ScalesDataWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = alpha->unbatch(); + auto ualpha = gko::batch::multivector::unbatch(alpha.get()); this->mtx_1->scale(alpha.get()); this->mtx_10->scale(ualpha[0].get()); this->mtx_11->scale(ualpha[1].get()); - auto res = this->mtx_1->unbatch(); + auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -170,13 +172,13 @@ TYPED_TEST(MultiVector, ScalesDataWithMultipleScalars) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = alpha->unbatch(); + auto ualpha = gko::batch::multivector::unbatch(alpha.get()); this->mtx_1->scale(alpha.get()); this->mtx_10->scale(ualpha[0].get()); this->mtx_11->scale(ualpha[1].get()); - auto res = this->mtx_1->unbatch(); + auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -188,13 +190,13 @@ TYPED_TEST(MultiVector, AddsScaled) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = alpha->unbatch(); + auto ualpha = 
gko::batch::multivector::unbatch(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = this->mtx_1->unbatch(); + auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -205,13 +207,13 @@ TYPED_TEST(MultiVector, AddsScaledWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = alpha->unbatch(); + auto ualpha = gko::batch::multivector::unbatch(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = this->mtx_1->unbatch(); + auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -234,13 +236,13 @@ TYPED_TEST(MultiVector, ComputesDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = result->unbatch(); + auto ures = gko::batch::multivector::unbatch(result.get()); this->mtx_0->compute_dot(this->mtx_1.get(), result.get()); this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); - auto res = result->unbatch(); + auto res = gko::batch::multivector::unbatch(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); } @@ -275,13 +277,13 @@ TYPED_TEST(MultiVector, ComputesConjDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = result->unbatch(); + auto ures = gko::batch::multivector::unbatch(result.get()); this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()); this->mtx_00->compute_conj_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_conj_dot(this->mtx_11.get(), ures[1].get()); - auto res = result->unbatch(); + auto res = gko::batch::multivector::unbatch(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); } @@ -357,8 +359,8 @@ TYPED_TEST(MultiVector, ConvertsToPrecision) this->mtx_1->convert_to(tmp.get()); tmp->convert_to(res.get()); - auto ures = res->unbatch(); - auto umtx = this->mtx_1->unbatch(); + auto ures = gko::batch::multivector::unbatch(res.get()); + auto umtx = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); } @@ -380,8 +382,8 @@ TYPED_TEST(MultiVector, MovesToPrecision) this->mtx_1->move_to(tmp.get()); tmp->move_to(res.get()); - auto ures = res->unbatch(); - auto umtx = this->mtx_1->unbatch(); + auto ures = gko::batch::multivector::unbatch(res.get()); + auto umtx = gko::batch::multivector::unbatch(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); } diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 2d0c79d0664..be625853656 100644 --- 
a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_multi_vector_kernels.hpp" +#include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" @@ -59,13 +60,14 @@ class MultiVector : public CommonTestFixture { MultiVector() : rand_engine(15) {} template - std::unique_ptr gen_mtx(const size_t num_batch_items, int num_rows, - int num_cols) + std::unique_ptr gen_mtx(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) { - return gko::test::generate_uniform_batch_random_matrix( + return gko::test::generate_random_batch_matrix( num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(-1.0, 1.0), rand_engine, false, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } void set_up_vector_data(gko::size_type num_vecs, From 0df4d692992b2fd639230914ab9f8486fdef44f3 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 2 Aug 2023 22:56:49 +0200 Subject: [PATCH 149/583] Remove warnings from CI builds --- .github/workflows/intel.yml | 2 +- .gitlab-ci.yml | 6 ++++++ .gitlab/scripts.yml | 2 ++ .gitlab/variables.yml | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 0d8acd52a34..9fd85708737 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -35,7 +35,7 @@ jobs: spack find --loaded mkdir build cd build - cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON + cmake .. 
-DCMAKE_INSTALL_PREFIX=install_ginkgo -DGINKGO_COMPILER_FLAGS="-ffp-model=precise" -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON make -j8 ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4ad66eca652..d2cae1ddf5f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -613,6 +613,7 @@ build/dpcpp/2022-1/cpu/release/static: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" SYCL_DEVICE_FILTER: "*:cpu" @@ -631,6 +632,7 @@ build/dpcpp/igpu/release/shared: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" @@ -647,6 +649,7 @@ build/dpcpp/igpu/release/shared: # C_COMPILER: "gcc" # CXX_COMPILER: "dpcpp" # BUILD_DPCPP: "ON" +# GKO_COMPILER_FLAGS: "-ffp-model=precise" # BUILD_TYPE: "Debug" # BUILD_SHARED_LIBS: "ON" # DPCPP_SINGLE_MODE: "ON" @@ -663,6 +666,7 @@ build/dpcpp/dgpu/release/static: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OF" DPCPP_SINGLE_MODE: "ON" @@ -678,6 +682,7 @@ build/dpcpp/level_zero_dgpu/release/shared: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" @@ -695,6 +700,7 @@ warnings: BUILD_CUDA: "ON" BUILD_HIP: "ON" CXX_FLAGS: "-Werror=pedantic -pedantic-errors" + GKO_COMPILER_FLAGS: "-Wpedantic" allow_failure: yes # Ensure kernel modules do not depend on core diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 7b1c30c27c0..4f699cb53fc 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -38,6 +38,7 @@ -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} -DGINKGO_BUILD_HIP=${BUILD_HIP} @@ -82,6 +83,7 @@ -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} -DGINKGO_BUILD_HIP=${BUILD_HIP} diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml index 6ae62b8c899..183bdef9e4e 100644 --- a/.gitlab/variables.yml +++ b/.gitlab/variables.yml @@ -13,6 +13,7 @@ BUILD_HIP: "OFF" BUILD_HWLOC: "ON" BUILD_MPI: "OFF" + GKO_COMPILER_FLAGS: "" MPI_AS_ROOT: "OFF" FAST_TESTS: "OFF" NONDEFAULT_STREAM: "OFF" From 12a7e66f62b1d50aef24158a670cd70ad87eab6b Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Wed, 2 Aug 2023 21:45:29 +0000 Subject: [PATCH 150/583] Format files Co-authored-by: Pratik Nayak --- core/base/batch_utilities.hpp | 4 +--- include/ginkgo/core/base/batch_multi_vector.hpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/core/base/batch_utilities.hpp 
b/core/base/batch_utilities.hpp index f7f28a616d8..e5dc22faeda 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -34,14 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_BATCH_UTILITIES_HPP_ -#include - - #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 8003f5499f1..d91274526d3 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -196,8 +196,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From bd28e2b040c60778a685a8ecf142e903a52c390a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 2 Aug 2023 23:49:00 +0200 Subject: [PATCH 151/583] Fix warning in exception --- include/ginkgo/core/base/exception_helpers.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index a9a93f15fe8..4a12865f374 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -733,7 +733,7 @@ inline T ensure_allocated_impl(T ptr, const std::string& file, int line, */ #define GKO_THROW_IF_INVALID(_condition, _message) \ { \ - if (!_condition) { \ + if (!(_condition)) { \ throw ::gko::InvalidStateError(__FILE__, __LINE__, __func__, \ _message); \ } \ From 24d58587eee7a26b01e3d6c263cec799024a430d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 7 Aug 2023 09:42:23 +0200 Subject: [PATCH 152/583] remove CUDA 9.2 support --- .gitlab-ci.yml | 13 ------ .gitlab/image.yml | 6 --- README.md | 6 +-- cmake/cuda.cmake | 9 ---- cmake/hip.cmake | 15 ------- .../base/device_matrix_data_kernels.hpp.inc | 44 +++++++------------ common/cuda_hip/matrix/csr_kernels.hpp.inc | 38 +++++++--------- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 9 +--- common/cuda_hip/multigrid/pgm_kernels.hpp.inc | 15 ++----- common/unified/matrix/csr_kernels.cpp | 8 ++-- cuda/solver/common_trs_kernels.cuh | 2 +- hip/CMakeLists.txt | 2 +- .../identify_stream_usage.cpp | 4 -- 13 files changed, 44 insertions(+), 127 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d2cae1ddf5f..709f2b4f53a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,19 +93,6 @@ trigger_pipeline: # Build jobs # Job with example runs. 
-# cuda 9.2 and friends -build/cuda92/nompi/gcc/all/release/shared: - extends: - - .build_and_test_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_HIP: "ON" - BUILD_TYPE: "Release" - # cuda 10.1 and friends # Build CUDA NVIDIA without omp # Make sure that our jobs run when HWLOC is diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 50dfbe9d2f8..cad06674aee 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -24,12 +24,6 @@ - cpu - controller -.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017: - image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017 - tags: - - private_ci - - nvidia-gpu - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019: image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019 tags: diff --git a/README.md b/README.md index be865e933f2..ba9082839bd 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ or _NVHPC Package 22.7+_ +* _CUDA 10.1+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) @@ -58,7 +58,7 @@ The Ginkgo HIP module has the following __additional__ requirements: * _ROCm 4.5+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) - * _9.2 <= CUDA < 11_ backend + * _10.1 <= CUDA < 11_ backend * if the hipFFT package is available, it is used to implement the FFT LinOps. The Ginkgo DPC++ module has the following __additional__ requirements: @@ -90,7 +90,7 @@ following: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ +* _CUDA 10.1+_ * _Microsoft Visual Studio_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c5ba334e983..88a1b4e777a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -85,12 +85,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_ "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.") endif() -if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION - MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." 
- "Consider using a different CUDA host compiler or CUDA version.") - endif() -endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 5b7a268c7b6..bb141450b25 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21) set(CMAKE_HIP_ARCHITECTURES OFF) endif() -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2) - message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.") -endif() - if(NOT DEFINED ROCM_PATH) if(DEFINED ENV{ROCM_PATH}) set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") @@ -197,16 +192,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") # Remove false positive CUDA warnings when calling one() and zero() list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda) - if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2" - AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." - "Consider using a different CUDA host compiler or CUDA version.") - endif() - endif() # select GPU architecture include(cmake/Modules/CudaArchitectureSelector.cmake) cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc index 5930902ed37..faf0ad15146 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc @@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, array& col_idxs) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. 
So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto value_ptr = - reinterpret_cast(values.get_const_data()); + using device_value_type = device_type; + auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_num_elems(); // count nonzeros - auto nnz = - thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { - return is_nonzero(fake_complex_unpack(value)); - }); + auto nnz = thrust::count_if( + thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { return is_nonzero(value); }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr exec, // copy nonzeros auto it = thrust::make_zip_iterator(thrust::make_tuple( row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr)); - auto out_it = thrust::make_zip_iterator(thrust::make_tuple( - new_row_idxs.get_data(), new_col_idxs.get_data(), - reinterpret_cast(new_values.get_data()))); - thrust::copy_if( - thrust_policy(exec), it, it + size, out_it, - [] __device__(tuple_type entry) { - return is_nonzero(fake_complex_unpack(thrust::get<2>(entry))); - }); + auto out_it = thrust::make_zip_iterator( + thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(), + as_device_type(new_values.get_data()))); + thrust::copy_if(thrust_policy(exec), it, it + size, out_it, + [] __device__(tuple_type entry) { + return is_nonzero(thrust::get<2>(entry)); + }); // swap out storage values = std::move(new_values); row_idxs = std::move(new_row_idxs); @@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr exec, size_type, array& values, array& row_idxs, array& col_idxs) { - using device_value_type = device_member_type; const auto size = values.get_num_elems(); const auto rows = row_idxs.get_const_data(); const auto cols = col_idxs.get_const_data(); @@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr exec, size_type, // reduce duplicates auto in_locs = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - auto in_vals = - reinterpret_cast(values.get_const_data()); + auto in_vals = as_device_type(values.get_const_data()); auto out_locs = thrust::make_zip_iterator(thrust::make_tuple( new_row_idxs.get_data(), new_col_idxs.get_data())); - auto out_vals = - reinterpret_cast(new_values.get_data()); + auto out_vals = as_device_type(new_values.get_data()); thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size, in_vals, out_locs, out_vals); // swap out storage @@ -127,13 +117,9 @@ template void sort_row_major(std::shared_ptr exec, device_matrix_data& data) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. 
So we need to use our own fake_complex type - using device_value_type = device_member_type; auto it = thrust::make_zip_iterator( thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs())); - auto vals = reinterpret_cast(data.get_values()); + auto vals = as_device_type(data.get_values()); thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(), vals); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index c370075c8a8..3f02337747e 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -872,11 +872,7 @@ void convert_to_fbcsr(std::shared_ptr exec, } auto in_rows = in_row_idxs.get_data(); auto in_cols = in_col_idxs.get_data(); - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(in_values.get_data()); + auto in_vals = as_device_type(in_values.get_data()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -924,17 +920,17 @@ void convert_to_fbcsr(std::shared_ptr exec, // fill in values components::fill_array(exec, block_value_array.get_data(), num_blocks * bs * bs, zero()); - thrust::for_each_n( - thrust_policy(exec), iota, num_blocks, - [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { - const auto block_begin = block_ptrs[i]; - const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; - for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); - } - }); + thrust::for_each_n(thrust_policy(exec), iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, + in_vals, values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = + i < num_blocks - 1 ? 
block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = in_vals[nz]; + } + }); } @@ -1130,13 +1126,10 @@ void fallback_transpose(std::shared_ptr exec, const auto nnz = output->get_num_stored_elements(); const auto in_row_ptrs = input->get_const_row_ptrs(); const auto in_col_idxs = input->get_const_col_idxs(); - // workaround for CUDA 9.2 Thrust unconstrained constructor issues - const auto in_vals = reinterpret_cast*>( - input->get_const_values()); + const auto in_vals = as_device_type(input->get_const_values()); const auto out_row_ptrs = output->get_row_ptrs(); const auto out_col_idxs = output->get_col_idxs(); - const auto out_vals = - reinterpret_cast*>(output->get_values()); + const auto out_vals = as_device_type(output->get_values()); array out_row_idxs{exec, nnz}; components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows, out_col_idxs); @@ -1156,8 +1149,7 @@ void fallback_sort(std::shared_ptr exec, { const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - const auto vals = - reinterpret_cast*>(to_sort->get_values()); + const auto vals = as_device_type(to_sort->get_values()); const auto nnz = to_sort->get_num_stored_elements(); const auto num_rows = to_sort->get_size()[0]; array row_idx_array(exec, nnz); diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index d71d593b0a2..607ec5046ea 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } auto in_rows = data.get_row_idxs(); auto in_cols = data.get_col_idxs(); - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(data.get_values()); + auto in_vals = as_device_type(data.get_values()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -232,8 +228,7 @@ void fill_in_matrix_data(std::shared_ptr exec, const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { block_values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = - fake_complex_unpack(in_vals[nz]); + (in_rows[nz] % bs)] = in_vals[nz]; } }); } diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc index d8b6c4786b0..b08e86efaaa 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc @@ -45,11 +45,7 @@ template void sort_row_major(std::shared_ptr exec, size_type nnz, IndexType* row_idxs, IndexType* col_idxs, ValueType* vals) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. 
So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); // Because reduce_by_key is not deterministic, so we do not need // stable_sort_by_key @@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr exec, const IndexType* col_idxs, const ValueType* vals, matrix::Coo* coarse_coo) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto key_it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); - auto coarse_vals_it = - reinterpret_cast(coarse_coo->get_values()); + auto coarse_vals_it = as_device_type(coarse_coo->get_values()); auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple( coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs())); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index f4e034998bd..1704fdd1f9c 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -154,8 +154,8 @@ void convert_to_sellp(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + slice_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? in_values[i] : zero(values[out_idx]); out_idx += slice_size; } }, @@ -185,8 +185,8 @@ void convert_to_ell(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + ell_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? 
in_values[i] : zero(values[out_idx]); out_idx += ell_stride; } }, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index bfdb4a5f854..f42b11f510d 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -198,7 +198,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { }; -#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) +#else template struct CudaSolveStruct : gko::solver::SolveStruct { diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 1573169527d..5ec1718ca4d 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -142,7 +142,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") endif() target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES}) elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - find_package(CUDA 9.2 REQUIRED) + find_package(CUDA 10.1 REQUIRED) target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES}) endif() diff --git a/third_party/identify_stream_usage/identify_stream_usage.cpp b/third_party/identify_stream_usage/identify_stream_usage.cpp index a88de4ee427..5cdd4d30b09 100644 --- a/third_party/identify_stream_usage/identify_stream_usage.cpp +++ b/third_party/identify_stream_usage/identify_stream_usage.cpp @@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel, size_t sharedMem, cudaStream_t stream), ARG(func, gridDim, blockDim, args, sharedMem, stream)); -#if CUDA_VERSION >= 10000 - DEFINE_OVERLOAD(cudaLaunchHostFunc, ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData), ARG(stream, fn, userData)); -#endif - // Memory transfer APIS: // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY DEFINE_OVERLOAD(cudaMemPrefetchAsync, From f63484b51363cb3da9b530b45453efe9ab4ae9ae Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 7 May 2023 12:55:12 +0200 Subject: [PATCH 153/583] modernize CUDA setup --- CMakeLists.txt | 5 +- benchmark/CMakeLists.txt | 8 +-- cmake/DownloadNonCMakeCMakeLists.txt.in | 2 +- cmake/GinkgoConfig.cmake.in | 6 +- cmake/Modules/FindNVTX.cmake | 4 +- cmake/cuda.cmake | 62 +------------------ cuda/CMakeLists.txt | 7 +-- cuda/get_info.cmake | 5 -- cuda/test/solver/CMakeLists.txt | 4 +- doc/CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- examples/cb-gmres/CMakeLists.txt | 2 +- examples/custom-logger/CMakeLists.txt | 2 +- examples/custom-matrix-format/CMakeLists.txt | 2 +- .../custom-stopping-criterion/CMakeLists.txt | 2 +- .../external-lib-interfacing/CMakeLists.txt | 2 +- examples/ginkgo-overhead/CMakeLists.txt | 2 +- examples/ginkgo-ranges/CMakeLists.txt | 2 +- examples/heat-equation/CMakeLists.txt | 2 +- .../ilu-preconditioned-solver/CMakeLists.txt | 2 +- examples/inverse-iteration/CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- examples/iterative-refinement/CMakeLists.txt | 2 +- examples/kokkos_assembly/CMakeLists.txt | 2 +- examples/minimal-cuda-solver/CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- .../mixed-multigrid-solver/CMakeLists.txt | 2 +- examples/mixed-precision-ir/CMakeLists.txt | 2 +- examples/mixed-spmv/CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- .../nine-pt-stencil-solver/CMakeLists.txt | 2 +- examples/papi-logging/CMakeLists.txt | 2 +- examples/par-ilu-convergence/CMakeLists.txt | 2 +- examples/performance-debugging/CMakeLists.txt | 2 +- examples/poisson-solver/CMakeLists.txt | 2 +- examples/preconditioned-solver/CMakeLists.txt | 2 +- examples/preconditioner-export/CMakeLists.txt | 2 +- 
.../schroedinger-splitting/CMakeLists.txt | 2 +- examples/simple-solver-logging/CMakeLists.txt | 2 +- examples/simple-solver/CMakeLists.txt | 2 +- .../three-pt-stencil-solver/CMakeLists.txt | 2 +- test/test_exportbuild/CMakeLists.txt | 2 +- test/test_install/CMakeLists.txt | 2 +- test/test_pkgconfig/CMakeLists.txt | 2 +- test/test_subdir/CMakeLists.txt | 2 +- 46 files changed, 52 insertions(+), 125 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d70ac404ce..89c2b65d74b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,4 @@ -cmake_minimum_required(VERSION 3.13) - -# Use *_ROOT environment variables for find_package calls -cmake_policy(SET CMP0074 NEW) +cmake_minimum_required(VERSION 3.16) # Let CAS handle the CUDA architecture flags (for now) # Windows still gives CMP0104 warning if putting it in cuda. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 434474fd336..f29620fac41 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,8 +20,8 @@ function(ginkgo_benchmark_cusparse_linops type def) endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) - target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE}) - target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) + target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1) endfunction() @@ -122,8 +122,8 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cpp) - target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS}) - target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_libraries(cuda_timer ginkgo CUDA::cudart) + target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) endif() if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) diff --git a/cmake/DownloadNonCMakeCMakeLists.txt.in b/cmake/DownloadNonCMakeCMakeLists.txt.in index c2d848e8d49..bae2281e63b 100644 --- a/cmake/DownloadNonCMakeCMakeLists.txt.in +++ b/cmake/DownloadNonCMakeCMakeLists.txt.in @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(${package_name}) include(ExternalProject) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index fe2ac05d7e5..a1e209a0c79 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -122,11 +122,7 @@ set(GINKGO_INTERFACE_CXX_FLAGS "@GINKGO_INTERFACE_CXX_FLAGS@") set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(GINKGO_CUDA_COMPILER_VERSION @CMAKE_CUDA_COMPILER_VERSION@) set(GINKGO_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@") - -set(GINKGO_CUBLAS_LIBRARIES "@CUBLAS@") -set(GINKGO_CUSPARSE_LIBRARIES "@CUSPARSE@") -set(GINKGO_CUDA_LIBRARIES "@CUDA_RUNTIME_LIBS@") -set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@") +set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CUDAToolkit_INCLUDE_DIRS@") set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@") set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@") diff --git 
a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake index 7078c9dcb36..879c66f2d59 100644 --- a/cmake/Modules/FindNVTX.cmake +++ b/cmake/Modules/FindNVTX.cmake @@ -27,8 +27,8 @@ # ``NVTX_FOUND`` # If false, do not try to use the NVTX library. -find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvtx3) -find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS}/nvtx3) +find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS}) mark_as_advanced(NVTX3_INCLUDE_DIR) mark_as_advanced(NVTX_INCLUDE_DIR) include(FindPackageHandleStandardArgs) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 88a1b4e777a..9d0b435be9f 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,76 +1,16 @@ enable_language(CUDA) -if(MSVC) - # MSVC can not find CUDA automatically - # Use CUDA_COMPILER PATH to define the CUDA TOOLKIT ROOT DIR - string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER}) - if("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" STREQUAL "") - set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/include") - endif() - if("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" STREQUAL "") - set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/lib/x64") - endif() -endif() +find_package(CUDAToolkit REQUIRED) include(cmake/Modules/CudaArchitectureSelector.cmake) -set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - # Detect the CUDA architecture flags and propagate to all the project cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} UNSUPPORTED "20" "21") -if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") - find_package(NVHPC REQUIRED - HINTS - $ENV{NVIDIA_PATH} - ${CMAKE_CUDA_COMPILER}/../../.. - ) - - set(CUDA_RUNTIME_LIBS_DYNAMIC ${NVHPC_CUDART_LIBRARY}) - set(CUDA_RUNTIME_LIBS_STATIC ${NVHPC_CUDART_LIBRARY_STATIC}) - set(CUBLAS ${NVHPC_CUBLAS_LIBRARY}) - set(CUSPARSE ${NVHPC_CUSPARSE_LIBRARY}) - set(CURAND ${NVHPC_CURAND_LIBRARY}) - set(CUFFT ${NVHPC_CUFFT_LIBRARY}) -else() - find_library(CUDA_RUNTIME_LIBS_DYNAMIC cudart - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CUDA_RUNTIME_LIBS_STATIC cudart_static - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - - # CUDA 10.1/10.2 put cublas, cublasLt, cudnn in /usr/lib/-linux-gnu/, but - # others (<= 10.0 or >= 11) put them in cuda own directory - # If the environment installs several cuda including 10.1/10.2, cmake will find - # the 10.1/10.2 .so files when searching others cuda in the default path. - # CMake already puts /usr/lib/-linux-gnu/ after cuda own directory in the - # `CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES`, so we always put NO_DEFAULT_PATH here. - find_library(CUBLAS cublas - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} NO_DEFAULT_PATH) - find_library(CUSPARSE cusparse - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CURAND curand - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CUFFT cufft - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) -endif() - find_package(NVTX REQUIRED) -# MSVC nvcc uses static cudartlibrary by default, and other platforms use shared cudartlibrary. -# add `-cudart shared` or `-cudart=shared` according system into CMAKE_CUDA_FLAGS -# to force nvcc to use dynamic cudart library in MSVC. 
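
A note on the cmake/cuda.cmake hunk above: this commit drops the hand-rolled find_library() lookups and cached path variables in favor of CMake's FindCUDAToolkit module, whose imported targets carry their own include directories and link options. A rough sketch of the resulting downstream usage pattern follows; the "demo" target and demo.cpp source are placeholder names for illustration only, not part of this patch:

    cmake_minimum_required(VERSION 3.17)  # FindCUDAToolkit ships with CMake 3.17+
    project(demo LANGUAGES CXX)

    find_package(CUDAToolkit REQUIRED)

    add_executable(demo demo.cpp)
    # The imported targets carry their include directories and link flags,
    # so no CUDA_INCLUDE_DIRS / CUDA_RUNTIME_LIBS bookkeeping is needed.
    target_link_libraries(demo PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse)

This is the same pattern the benchmark/CMakeLists.txt and cuda/CMakeLists.txt hunks in this commit switch to, replacing the CUBLAS/CUSPARSE/CURAND/CUFFT library path variables with the corresponding CUDA:: targets.
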
-if(MSVC) - if("${CMAKE_CUDA_FLAGS}" MATCHES "-cudart(=| )shared") - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE) - else() - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_STATIC}" CACHE STRING "Path to a library" FORCE) - endif() -else() - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE) -endif() - if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index dccc9e91401..7cf9053e2cf 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -114,13 +114,12 @@ target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_C target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) -target_include_directories(ginkgo_cuda - SYSTEM PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +target_include_directories(ginkgo_cuda SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) -target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE} ${CURAND} ${CUFFT} nvtx::nvtx) +target_link_libraries(ginkgo_cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cufft nvtx::nvtx) # NVTX3 is header-only and requires dlopen/dlclose in static builds target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS}) target_compile_options(ginkgo_cuda @@ -133,7 +132,7 @@ list(GET CUDA_RUNTIME_LIBS 0 CUDA_FIRST_LIB) get_filename_component(GKO_CUDA_LIBDIR "${CUDA_FIRST_LIB}" DIRECTORY) ginkgo_default_includes(ginkgo_cuda) -ginkgo_install_library(ginkgo_cuda "${GKO_CUDA_LIBDIR}") +ginkgo_install_library(ginkgo_cuda "${CUDAToolkit_LIBRARY_DIR}") if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA) diff --git a/cuda/get_info.cmake b/cuda/get_info.cmake index 3d91ea9f23a..7955c3f636b 100644 --- a/cuda/get_info.cmake +++ b/cuda/get_info.cmake @@ -8,9 +8,4 @@ ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER_VERSION") ginkgo_print_flags(${detailed_log} "CMAKE_CUDA_FLAGS") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_HOST_COMPILER") -ginkgo_print_variable(${detailed_log} "CUDA_INCLUDE_DIRS") -ginkgo_print_module_footer(${detailed_log} "CUDA Libraries:") -ginkgo_print_variable(${detailed_log} "CUBLAS") -ginkgo_print_variable(${detailed_log} "CUDA_RUNTIME_LIBS") -ginkgo_print_variable(${detailed_log} "CUSPARSE") ginkgo_print_module_footer(${detailed_log} "") diff --git a/cuda/test/solver/CMakeLists.txt b/cuda/test/solver/CMakeLists.txt index 0220d94c8d9..65187e68e1b 100644 --- a/cuda/test/solver/CMakeLists.txt +++ b/cuda/test/solver/CMakeLists.txt @@ -1,2 +1,2 @@ -ginkgo_create_test(lower_trs_kernels ADDITIONAL_INCLUDES ${CUDA_INCLUDE_DIRS}) -ginkgo_create_test(upper_trs_kernels ADDITIONAL_INCLUDES ${CUDA_INCLUDE_DIRS}) +ginkgo_create_test(lower_trs_kernels ADDITIONAL_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) +ginkgo_create_test(upper_trs_kernels ADDITIONAL_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 8b975bb6544..8965b42add4 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,4 +1,4 @@ 
-cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) include(helpers.cmake) find_package(Doxygen REQUIRED) find_package(Perl REQUIRED) diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt index 744df84a74b..324400e9cb4 100644 --- a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt +++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(adaptiveprecision-blockjacobi) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/cb-gmres/CMakeLists.txt b/examples/cb-gmres/CMakeLists.txt index d616b16c882..826100b8bd2 100644 --- a/examples/cb-gmres/CMakeLists.txt +++ b/examples/cb-gmres/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(cb-gmres) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/custom-logger/CMakeLists.txt b/examples/custom-logger/CMakeLists.txt index f986dd52e76..8278d3e72ba 100644 --- a/examples/custom-logger/CMakeLists.txt +++ b/examples/custom-logger/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(custom-logger) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index 47eeda0143c..26034b7dce2 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(custom-matrix-format CXX CUDA) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/custom-stopping-criterion/CMakeLists.txt b/examples/custom-stopping-criterion/CMakeLists.txt index 811baa59a9c..b429fba7c59 100644 --- a/examples/custom-stopping-criterion/CMakeLists.txt +++ b/examples/custom-stopping-criterion/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(custom-stopping-criterion) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/external-lib-interfacing/CMakeLists.txt b/examples/external-lib-interfacing/CMakeLists.txt index 4501ace4088..56d7b92ea0f 100644 --- a/examples/external-lib-interfacing/CMakeLists.txt +++ b/examples/external-lib-interfacing/CMakeLists.txt @@ -1,7 +1,7 @@ if(GINKGO_BUILD_EXTLIB_EXAMPLE) # This is just an example of the CMakeLists.txt file that can be used after the # correct version of deal.ii has been installed. 
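
The example CMakeLists.txt hunks in this commit only raise the minimum CMake version to 3.16; the surrounding logic of each example is untouched. For orientation, a minimal stand-alone consumer of an installed Ginkgo looks roughly like the sketch below, assuming the GINKGO_BUILD_EXAMPLES guard used throughout the examples; "my-example" and its source file are placeholder names:

    cmake_minimum_required(VERSION 3.16)
    project(my-example CXX)

    # When not built as part of the Ginkgo source tree, locate an installed Ginkgo.
    if(NOT GINKGO_BUILD_EXAMPLES)
        find_package(Ginkgo REQUIRED)
    endif()

    add_executable(my-example my-example.cpp)
    target_link_libraries(my-example Ginkgo::ginkgo)
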
- cmake_minimum_required(VERSION 3.9) + cmake_minimum_required(VERSION 3.16) project(DEAL_II_EXAMPLE LANGUAGES CXX) find_package(MPI 3.1 COMPONENTS CXX REQUIRED) diff --git a/examples/ginkgo-overhead/CMakeLists.txt b/examples/ginkgo-overhead/CMakeLists.txt index fcd7a81c230..350b58312fc 100644 --- a/examples/ginkgo-overhead/CMakeLists.txt +++ b/examples/ginkgo-overhead/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(ginkgo-overhead) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/ginkgo-ranges/CMakeLists.txt b/examples/ginkgo-ranges/CMakeLists.txt index 6e30c4f9af4..734a4567376 100644 --- a/examples/ginkgo-ranges/CMakeLists.txt +++ b/examples/ginkgo-ranges/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(ginkgo-ranges) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/heat-equation/CMakeLists.txt b/examples/heat-equation/CMakeLists.txt index f4790edaa8d..89dfb9e513b 100644 --- a/examples/heat-equation/CMakeLists.txt +++ b/examples/heat-equation/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(heat-equation) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/ilu-preconditioned-solver/CMakeLists.txt b/examples/ilu-preconditioned-solver/CMakeLists.txt index e6c840f38f8..0d1d215860e 100644 --- a/examples/ilu-preconditioned-solver/CMakeLists.txt +++ b/examples/ilu-preconditioned-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(ilu-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/inverse-iteration/CMakeLists.txt b/examples/inverse-iteration/CMakeLists.txt index deb72accffd..c73da656587 100644 --- a/examples/inverse-iteration/CMakeLists.txt +++ b/examples/inverse-iteration/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(inverse-iteration) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt index fc1205fbd0d..3a05cb56a81 100644 --- a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt +++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(ir-ilu-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt index fe94a94455b..f8c06ddcafa 100644 --- a/examples/iterative-refinement/CMakeLists.txt +++ b/examples/iterative-refinement/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(iterative-refinement) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/kokkos_assembly/CMakeLists.txt b/examples/kokkos_assembly/CMakeLists.txt index bfee201c91d..9e229c29f58 100644 --- a/examples/kokkos_assembly/CMakeLists.txt +++ b/examples/kokkos_assembly/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.16) project(kokkos-assembly CXX) # We only need to find Ginkgo if we build this example 
stand-alone diff --git a/examples/minimal-cuda-solver/CMakeLists.txt b/examples/minimal-cuda-solver/CMakeLists.txt index 3add4bb30ad..2d81e558eec 100644 --- a/examples/minimal-cuda-solver/CMakeLists.txt +++ b/examples/minimal-cuda-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(minimal-cuda-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt index d710f10f146..a66a8410bfb 100644 --- a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt +++ b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(mixed-multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/mixed-multigrid-solver/CMakeLists.txt b/examples/mixed-multigrid-solver/CMakeLists.txt index 17ec2fa398e..af73c94c334 100644 --- a/examples/mixed-multigrid-solver/CMakeLists.txt +++ b/examples/mixed-multigrid-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(mixed-multigrid-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt index 01094a5376b..156ede4fe13 100644 --- a/examples/mixed-precision-ir/CMakeLists.txt +++ b/examples/mixed-precision-ir/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(mixed-precision-ir) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/mixed-spmv/CMakeLists.txt b/examples/mixed-spmv/CMakeLists.txt index 0e4378ca82f..2e2ed9bb074 100644 --- a/examples/mixed-spmv/CMakeLists.txt +++ b/examples/mixed-spmv/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(mixed-spmv) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt index 411b57b2c83..99ba03167f5 100644 --- a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt +++ b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(multigrid-preconditioned-solver-customized) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/multigrid-preconditioned-solver/CMakeLists.txt b/examples/multigrid-preconditioned-solver/CMakeLists.txt index 90277398b85..75c56b80062 100644 --- a/examples/multigrid-preconditioned-solver/CMakeLists.txt +++ b/examples/multigrid-preconditioned-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/nine-pt-stencil-solver/CMakeLists.txt b/examples/nine-pt-stencil-solver/CMakeLists.txt index 35610ba758a..511bb334d7c 100644 --- a/examples/nine-pt-stencil-solver/CMakeLists.txt +++ b/examples/nine-pt-stencil-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) 
+cmake_minimum_required(VERSION 3.16) project(nine-pt-stencil-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/papi-logging/CMakeLists.txt b/examples/papi-logging/CMakeLists.txt index 6927675e2ec..3695e12b814 100644 --- a/examples/papi-logging/CMakeLists.txt +++ b/examples/papi-logging/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(papi-logging) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/par-ilu-convergence/CMakeLists.txt b/examples/par-ilu-convergence/CMakeLists.txt index 23b7afd1e75..8679ccdf526 100644 --- a/examples/par-ilu-convergence/CMakeLists.txt +++ b/examples/par-ilu-convergence/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(par-ilu-convergence) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/performance-debugging/CMakeLists.txt b/examples/performance-debugging/CMakeLists.txt index 715cd99fe1b..7f6317a491f 100644 --- a/examples/performance-debugging/CMakeLists.txt +++ b/examples/performance-debugging/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(performance-debugging) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/poisson-solver/CMakeLists.txt b/examples/poisson-solver/CMakeLists.txt index bd5383876d5..83791b5cfda 100644 --- a/examples/poisson-solver/CMakeLists.txt +++ b/examples/poisson-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(poisson-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/preconditioned-solver/CMakeLists.txt b/examples/preconditioned-solver/CMakeLists.txt index a412885f219..b8d9bb8fc9f 100644 --- a/examples/preconditioned-solver/CMakeLists.txt +++ b/examples/preconditioned-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/preconditioner-export/CMakeLists.txt b/examples/preconditioner-export/CMakeLists.txt index 1cfd6d7ff84..83a20952d51 100644 --- a/examples/preconditioner-export/CMakeLists.txt +++ b/examples/preconditioner-export/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(preconditioner-export) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/schroedinger-splitting/CMakeLists.txt b/examples/schroedinger-splitting/CMakeLists.txt index 1e49a1f88b4..555fb59b554 100644 --- a/examples/schroedinger-splitting/CMakeLists.txt +++ b/examples/schroedinger-splitting/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(schroedinger-splitting) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/simple-solver-logging/CMakeLists.txt b/examples/simple-solver-logging/CMakeLists.txt index befead38e7d..2272413f52a 100644 --- a/examples/simple-solver-logging/CMakeLists.txt +++ b/examples/simple-solver-logging/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(simple-solver-logging) # We only need to find Ginkgo if we build this example 
stand-alone diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt index dd0faec5f53..d2a30ac084f 100644 --- a/examples/simple-solver/CMakeLists.txt +++ b/examples/simple-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(simple-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/examples/three-pt-stencil-solver/CMakeLists.txt b/examples/three-pt-stencil-solver/CMakeLists.txt index fc0691dd7c9..164c9e08302 100644 --- a/examples/three-pt-stencil-solver/CMakeLists.txt +++ b/examples/three-pt-stencil-solver/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(three-pt-stencil-solver) # We only need to find Ginkgo if we build this example stand-alone diff --git a/test/test_exportbuild/CMakeLists.txt b/test/test_exportbuild/CMakeLists.txt index 52a8d3851cd..71633b91c35 100644 --- a/test/test_exportbuild/CMakeLists.txt +++ b/test/test_exportbuild/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(GinkgoExportBuildTest LANGUAGES CXX) find_package(Ginkgo REQUIRED) diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index a36a936e867..070410ec48c 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(TestInstall LANGUAGES CXX) diff --git a/test/test_pkgconfig/CMakeLists.txt b/test/test_pkgconfig/CMakeLists.txt index 883ad134f05..e904f997f26 100644 --- a/test/test_pkgconfig/CMakeLists.txt +++ b/test/test_pkgconfig/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(GinkgoExportBuildWithPkgConfigTest LANGUAGES CXX) find_package(PkgConfig REQUIRED) diff --git a/test/test_subdir/CMakeLists.txt b/test/test_subdir/CMakeLists.txt index 2017b69366f..dcf846f4adc 100644 --- a/test/test_subdir/CMakeLists.txt +++ b/test/test_subdir/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.14) +cmake_minimum_required(VERSION 3.16) project(GinkgoSubdirTest LANGUAGES CXX) file(CREATE_LINK "${CMAKE_CURRENT_SOURCE_DIR}/../.." 
"${CMAKE_CURRENT_BINARY_DIR}/ginkgo" SYMBOLIC) From ed0a8b31eb6577fc2be8b4a70dc9b1ddf2f462eb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 17 Jul 2023 11:00:50 +0200 Subject: [PATCH 154/583] find CUDAToolkit in installed ginkgo --- cmake/GinkgoConfig.cmake.in | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index a1e209a0c79..1ba77bd9f19 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -176,6 +176,7 @@ endif() # For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614 if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA) enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) find_package(NVTX REQUIRED) endif() From 954f5340d971c791dd9a7c476a1919e19449bb74 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 17 Jul 2023 11:04:17 +0200 Subject: [PATCH 155/583] throw error on insufficient CMake version --- cuda/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 7cf9053e2cf..4c09bf96645 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,3 +1,4 @@ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) From 37d09da5c8dad11aceb98a6425845927ae7db8a7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 17 Jul 2023 11:05:52 +0200 Subject: [PATCH 156/583] fix version requirement --- cuda/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4c09bf96645..358ce8092c0 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.17 FATAL_ERROR) add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) From fec9b94f8180c542caa6b8a6e13171e2752537fb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 18 Jul 2023 22:43:19 +0200 Subject: [PATCH 157/583] set CUDA host compiler before enabling language Otherwise, the host compiler might not be used --- cmake/cuda.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9d0b435be9f..20e734f2d9f 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,3 +1,9 @@ +if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) +elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) + unset(CMAKE_CUDA_HOST_COMPILER CACHE) +endif() + enable_language(CUDA) find_package(CUDAToolkit REQUIRED) @@ -11,12 +17,6 @@ cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS find_package(NVTX REQUIRED) -if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) -elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) - unset(CMAKE_CUDA_HOST_COMPILER CACHE) -endif() - if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER) message(WARNING "The CMake CXX compiler and CUDA host compiler do not match. 
" "If you encounter any build error, especially while linking, try to use " From 25b24ca0dd1250dbe5c4e8cdc3efdcf862aa661f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 7 Aug 2023 15:37:51 +0200 Subject: [PATCH 158/583] review updates - Set host compiler via environment variable - update CMake version requirement with CUDA to 3.18 - remove RPATH for CUDA Co-authored-by: Marcel Koch --- .gitlab/scripts.yml | 11 +++++------ CMakeLists.txt | 8 -------- README.md | 3 ++- benchmark/CMakeLists.txt | 2 -- benchmark/utils/cuda_linops.cpp | 2 -- cmake/GinkgoConfig.cmake.in | 12 +----------- cmake/cuda.cmake | 7 ------- cmake/hip.cmake | 5 ----- cuda/CMakeLists.txt | 13 ++----------- cuda/get_info.cmake | 2 +- cuda/test/solver/CMakeLists.txt | 4 ++-- 11 files changed, 13 insertions(+), 56 deletions(-) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 4f699cb53fc..becf0ed5b8d 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -22,8 +22,7 @@ script: - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} - if [ -n "${CUDA_ARCH}" ]; then - CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; - CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); + export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; fi - if [[ "${MPI_AS_ROOT}" == "ON" ]];then export OMPI_ALLOW_RUN_AS_ROOT=1; @@ -32,12 +31,12 @@ - if [[ "${BUILD_MPI}" == "ON" ]]; then MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX}; fi + - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER} - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX} -GNinja - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} @@ -65,7 +64,6 @@ - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} - if [ -n "${CUDA_ARCH}" ]; then CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; - CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); fi - if [ -n "${SYCL_DEVICE_TYPE}" ]; then export SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" ]; then export SYCL_DEVICE_FILTER; fi @@ -77,6 +75,7 @@ - if [[ "${BUILD_MPI}" == "ON" ]]; then MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX}; fi + - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER} - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX} -GNinja -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} diff --git a/CMakeLists.txt b/CMakeLists.txt index 89c2b65d74b..195a6a1df69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,5 @@ cmake_minimum_required(VERSION 3.16) -# Let CAS handle the CUDA architecture flags (for now) -# Windows still gives CMP0104 warning if putting it in cuda. 
-if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - cmake_policy(SET CMP0104 OLD) -endif() - - project(Ginkgo LANGUAGES C CXX VERSION 1.7.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "master") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) @@ -67,7 +60,6 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING "Set the required NVCC compiler flags, mainly used for warnings. Current default is an empty string") set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU architectures. See README.md for more detail.") -option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING diff --git a/README.md b/README.md index ba9082839bd..b3b7d8660b7 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Prerequisites For Ginkgo core library: -* _cmake 3.13+_ +* _cmake 3.16+_ * C++14 compliant compiler, one of: * _gcc 5.5+_ * _clang 3.9+_ @@ -47,6 +47,7 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: +* _cmake 3.18+_ * _CUDA 10.1+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index f29620fac41..641c6f363ec 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -21,8 +21,6 @@ function(ginkgo_benchmark_cusparse_linops type def) # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) - target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1) endfunction() function(ginkgo_benchmark_hipsparse_linops type def) diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index dd1dda5c774..e2221614d9c 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -438,9 +438,7 @@ class CusparseCsrEx trans_(CUSPARSE_OPERATION_NON_TRANSPOSE), buffer_(exec) { -#ifdef ALLOWMP algmode_ = CUSPARSE_ALG_MERGE_PATH; -#endif // ALLOWMP } private: diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 1ba77bd9f19..093690e16f8 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -62,7 +62,7 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@) set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) set(GINKGO_CUDA_ARCHITECTURES "@GINKGO_CUDA_ARCHITECTURES@") -set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@) +set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") set(GINKGO_CUDA_ARCH_FLAGS "@GINKGO_CUDA_ARCH_FLAGS@") set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") @@ -144,16 +144,6 @@ set(VTune_PATH "@VTune_PATH@") # NOTE: we do not export 
benchmarks, examples, tests or devel tools # so `third_party` libraries are currently unneeded. -# propagate CUDA_HOST_COMPILER if needed -if (GINKGO_BUILD_CUDA OR (GINKGO_BUILD_HIP - AND GINKGO_HIP_PLATFORM MATCHES "${GINKGO_HIP_PLATFORM_NVIDIA_REGEX}")) - if (GINKGO_CUDA_HOST_COMPILER AND NOT CMAKE_CUDA_HOST_COMPILER - AND EXISTS "${GINKGO_CUDA_HOST_COMPILER}") - message(STATUS "Ginkgo: Setting CUDA host compiler to ${GINKGO_CUDA_HOST_COMPILER}") - set(CMAKE_CUDA_HOST_COMPILER "${GINKGO_CUDA_HOST_COMPILER}" CACHE STRING "" FORCE) - endif() -endif() - if(GINKGO_HAVE_PAPI_SDE) find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde) endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 20e734f2d9f..378003df50d 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,9 +1,3 @@ -if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) -elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) - unset(CMAKE_CUDA_HOST_COMPILER CACHE) -endif() - enable_language(CUDA) find_package(CUDAToolkit REQUIRED) @@ -24,4 +18,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_ "The CXX compiler is ${CMAKE_CXX_COMPILER} with version ${CMAKE_CXX_COMPILER_VERSION}.\n" "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.") endif() - diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bb141450b25..e1897b42c9c 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -180,11 +180,6 @@ endif() set(GINKGO_HIP_NVCC_ARCH "") if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) - elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) - unset(CMAKE_CUDA_HOST_COMPILER CACHE) - endif() if (CMAKE_CUDA_HOST_COMPILER) list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") endif() diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 358ce8092c0..764f47afb83 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. 
matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) @@ -115,7 +115,6 @@ target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_C target_compile_options(ginkgo_cuda PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) -target_include_directories(ginkgo_cuda SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda @@ -123,17 +122,9 @@ target_include_directories(ginkgo_cuda target_link_libraries(ginkgo_cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cufft nvtx::nvtx) # NVTX3 is header-only and requires dlopen/dlclose in static builds target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS}) -target_compile_options(ginkgo_cuda - PRIVATE "$<$:${GINKGO_CUDA_ARCH_FLAGS}>") -# we handle CUDA architecture flags for now, disable CMake handling -if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - set_target_properties(ginkgo_cuda PROPERTIES CUDA_ARCHITECTURES OFF) -endif() -list(GET CUDA_RUNTIME_LIBS 0 CUDA_FIRST_LIB) -get_filename_component(GKO_CUDA_LIBDIR "${CUDA_FIRST_LIB}" DIRECTORY) ginkgo_default_includes(ginkgo_cuda) -ginkgo_install_library(ginkgo_cuda "${CUDAToolkit_LIBRARY_DIR}") +ginkgo_install_library(ginkgo_cuda) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA) diff --git a/cuda/get_info.cmake b/cuda/get_info.cmake index 7955c3f636b..6d9b6c1f4d0 100644 --- a/cuda/get_info.cmake +++ b/cuda/get_info.cmake @@ -1,11 +1,11 @@ ginkgo_print_module_header(${detailed_log} "CUDA") ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCHITECTURES") ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_COMPILER_FLAGS") -ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_DEFAULT_HOST_COMPILER") ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCH_FLAGS") ginkgo_print_module_footer(${detailed_log} "CUDA variables:") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER_VERSION") ginkgo_print_flags(${detailed_log} "CMAKE_CUDA_FLAGS") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_HOST_COMPILER") +ginkgo_print_variable(${detailed_log} "CUDAToolkit_LIBRARY_DIR") ginkgo_print_module_footer(${detailed_log} "") diff --git a/cuda/test/solver/CMakeLists.txt b/cuda/test/solver/CMakeLists.txt index 65187e68e1b..f8cd67c1241 100644 --- a/cuda/test/solver/CMakeLists.txt +++ b/cuda/test/solver/CMakeLists.txt @@ -1,2 +1,2 @@ -ginkgo_create_test(lower_trs_kernels ADDITIONAL_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) -ginkgo_create_test(upper_trs_kernels ADDITIONAL_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) +ginkgo_create_cuda_test(lower_trs_kernels) +ginkgo_create_cuda_test(upper_trs_kernels) From 6a12cac0169709a45b347411303b80e8caba79fa Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 9 Aug 2023 20:51:04 +0200 Subject: [PATCH 159/583] fix test names --- cuda/test/solver/{lower_trs_kernels.cpp => lower_trs_kernels.cu} | 0 cuda/test/solver/{upper_trs_kernels.cpp => upper_trs_kernels.cu} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cuda/test/solver/{lower_trs_kernels.cpp => lower_trs_kernels.cu} (100%) rename cuda/test/solver/{upper_trs_kernels.cpp => upper_trs_kernels.cu} (100%) diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cu similarity index 100% rename from cuda/test/solver/lower_trs_kernels.cpp rename to cuda/test/solver/lower_trs_kernels.cu diff 
--git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cu similarity index 100% rename from cuda/test/solver/upper_trs_kernels.cpp rename to cuda/test/solver/upper_trs_kernels.cu From 9245700aecfbb220584d2593c374c59ba76774a6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 9 Aug 2023 20:52:25 +0200 Subject: [PATCH 160/583] set CMAKE_CUDA_ARCHITECTURES from CAS --- cmake/GinkgoConfig.cmake.in | 3 +- cmake/Modules/CudaArchitectureSelector.cmake | 36 ++++++++++++++++++++ cmake/create_test.cmake | 1 - cmake/cuda.cmake | 9 ++--- cuda/get_info.cmake | 3 +- examples/custom-matrix-format/CMakeLists.txt | 8 ----- test/test_install/CMakeLists.txt | 11 +----- 7 files changed, 44 insertions(+), 27 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 093690e16f8..13888ae0b10 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -61,9 +61,8 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@) set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) -set(GINKGO_CUDA_ARCHITECTURES "@GINKGO_CUDA_ARCHITECTURES@") +set(GINKGO_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@") set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") -set(GINKGO_CUDA_ARCH_FLAGS "@GINKGO_CUDA_ARCH_FLAGS@") set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@") diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake index 1838ed4b932..017fd2f0f1d 100644 --- a/cmake/Modules/CudaArchitectureSelector.cmake +++ b/cmake/Modules/CudaArchitectureSelector.cmake @@ -65,6 +65,15 @@ # The command has the same result as ``cas_target_cuda_architectures``. It does # not add the compiler flags to the target, but stores the compiler flags in # the variable (string). +# +# cas_variable_cmake_cuda_architectures( +# [] # variable for storing architecture list +# [] # list of architecture specifications +# ) +# +# The command prepares an architecture list supported by the CMake +# ``CUDA_ARCHITECTURES`` target property and ``CMAKE_CUDA_ARCHITECTURES`` +# variable. 
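As an illustration of the new command, a stand-alone use could look like the following sketch; the variable name is arbitrary, "Auto" is just one possible specification, and the guard mirrors how the cmake/cuda.cmake hunk later in this patch consumes the result:

    include(cmake/Modules/CudaArchitectureSelector.cmake)
    # translate a CAS-style specification ("Auto", "All" or explicit entries
    # such as "60" or "Volta") into the format expected by CMAKE_CUDA_ARCHITECTURES
    cas_variable_cmake_cuda_architectures(detected_archs "Auto")
    if(NOT CMAKE_CUDA_ARCHITECTURES)
        set(CMAKE_CUDA_ARCHITECTURES "${detected_archs}")
    endif()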
The architecture specification # # # ``ARCHITECTURES`` specification list @@ -404,3 +413,30 @@ function(cas_variable_cuda_architectures variable) cas_get_compiler_flags(flags ${ARGN}) set(${variable} "${flags}" PARENT_SCOPE) endfunction() + + +function(cas_variable_cmake_cuda_architectures variable) + cas_get_onboard_architectures(onboard_archs) + cas_get_supported_architectures(supported_archs) + if(("${ARGN}" STREQUAL "All") OR ("${ARGN}" STREQUAL "Auto" AND (NOT onboard_archs))) + set(archs "${supported_archs}") + elseif("${ARGN}" STREQUAL "Auto") + set(archs "${onboard_archs}") + else() + set(archs) + foreach(arch IN LISTS ARGN) + if(arch MATCHES "${cas_spec_regex}") + if(CMAKE_MATCH_1) + list(APPEND archs ${CMAKE_MATCH_1}-real) + endif() + if(CMAKE_MATCH_3) + list(APPEND archs ${CMAKE_MATCH_3}-virtual) + endif() + else() + cas_get_architectures_by_name("${arch}" arch) + list(APPEND archs ${arch}) + endif() + endforeach() + endif() + set("${variable}" "${archs}" PARENT_SCOPE) +endfunction() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 3fbafe35858..58a49ca066c 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -119,7 +119,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) target_compile_options(${test_target_name} PRIVATE - $<$:${GINKGO_CUDA_ARCH_FLAGS}> $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) if(MSVC) target_compile_options(${test_target_name} diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 378003df50d..33d785b8c52 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -4,10 +4,11 @@ find_package(CUDAToolkit REQUIRED) include(cmake/Modules/CudaArchitectureSelector.cmake) -# Detect the CUDA architecture flags and propagate to all the project -cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS - ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} - UNSUPPORTED "20" "21") +# Detect the CUDA architecture and propagate to all the project +cas_variable_cmake_cuda_architectures(cuda_detected_archs ${GINKGO_CUDA_ARCHITECTURES}) +if(NOT CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "${cuda_detected_archs}") +endif() find_package(NVTX REQUIRED) diff --git a/cuda/get_info.cmake b/cuda/get_info.cmake index 6d9b6c1f4d0..eeadaf9725c 100644 --- a/cuda/get_info.cmake +++ b/cuda/get_info.cmake @@ -1,7 +1,6 @@ ginkgo_print_module_header(${detailed_log} "CUDA") -ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCHITECTURES") +ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_ARCHITECTURES") ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_COMPILER_FLAGS") -ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCH_FLAGS") ginkgo_print_module_footer(${detailed_log} "CUDA variables:") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER") ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER_VERSION") diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index 26034b7dce2..0f7c5f2f32c 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -18,13 +18,5 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu) target_link_libraries(custom-matrix-format Ginkgo::ginkgo OpenMP::OpenMP_CXX) -# inherit CUDA architecture flags from Ginkgo -target_compile_options(custom-matrix-format - PRIVATE "$<$:${GINKGO_CUDA_ARCH_FLAGS}>") -# we handle CUDA architecture flags for now, 
disable CMake handling -if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - set_target_properties(custom-matrix-format PROPERTIES CUDA_ARCHITECTURES OFF) -endif() - # workaround for clang-cuda/g++ interaction set_target_properties(custom-matrix-format PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index 070410ec48c..513af67e923 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -38,12 +38,6 @@ if(GINKGO_BUILD_CUDA) enable_language(CUDA) configure_file(test_install.cpp test_install.cu COPYONLY) add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu) - target_compile_options(test_install_cuda - PRIVATE "$<$:${GINKGO_CUDA_ARCH_FLAGS}>") - # we handle CUDA architecture flags for now, disable CMake handling - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - set_target_properties(test_install_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - endif() target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1) target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE}) target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo) @@ -60,16 +54,13 @@ if(GINKGO_BUILD_HIP) else() set (GINKGO_PIC_OPTION "$<$:-fPIC>") endif() - if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") - set(TESTINSTALL_CUDA_ARCH_FLAGS "${GINKGO_CUDA_ARCH_FLAGS}") - endif() if (CMAKE_CUDA_HOST_COMPILER) set(TESTINSTALL_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") endif() hip_add_executable(test_install_hip test_install.cpp HIPCC_OPTIONS "-std=c++14" CLANG_OPTIONS "${GINKGO_PIC_OPTION}" - NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_ARCH_FLAGS}" "${TESTINSTALL_CUDA_HOST_COMPILER}") + NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_HOST_COMPILER}") target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo) target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1) From 00e680d26bea29c79ec4a63c7a1ecc3a88be3291 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 10:06:48 +0200 Subject: [PATCH 161/583] update container names --- .gitlab-ci.yml | 6 +++--- .gitlab/image.yml | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 709f2b4f53a..c976e1b15da 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -339,7 +339,7 @@ build/cuda114/nompi/gcc/cuda/debug/shared: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko_cuda114-openmpi-gnu11-llvm12 + - .use_gko_cuda114-openmpi-gnu10-llvm12 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -543,7 +543,7 @@ build/nocuda/nompi/gcc/omp/release/static: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018 + - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 variables: BUILD_OMP: "ON" BUILD_TYPE: "Release" @@ -554,7 +554,7 @@ build/nocuda-nomixed/nompi/clang/omp/release/static: - .build_and_test_template - .default_variables - .full_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018 + - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 variables: C_COMPILER: "clang" CXX_COMPILER: "clang++" diff --git a/.gitlab/image.yml b/.gitlab/image.yml index cad06674aee..72fb51ad372 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -17,8 +17,8 @@ - cpu - amdci -.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018: - image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2018 +.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019: + image: 
ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2019 tags: - private_ci - cpu @@ -50,8 +50,8 @@ - private_ci - horeka -.use_gko_cuda114-openmpi-gnu11-llvm12: - image: ginkgohub/cuda:114-openmpi-gnu11-llvm12 +.use_gko_cuda114-openmpi-gnu10-llvm12: + image: ginkgohub/cuda:114-openmpi-gnu10-llvm12 tags: - private_ci - nvidia-gpu From a8985cc0fa9739c58311861c4f48f3bce87f6a4e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 13:48:02 +0200 Subject: [PATCH 162/583] remove duplicate pipeline --- .gitlab-ci.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c976e1b15da..ae7fa86fd38 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -526,17 +526,6 @@ build/nocuda/openmpi/clang/omp/debug/static: FAST_TESTS: "ON" BUILD_SHARED_LIBS: "OFF" -test/nocuda/openmpi/clang/omp/debug/static: - extends: - - .build_and_test_template - - .default_variables - - .full_test_condition - - .use_gko-nocuda-openmpi-gnu9-llvm8 - variables: - USE_NAME: "nocuda-openmpi-clang-${CI_PIPELINE_ID}" - dependencies: null - needs: [ "build/nocuda/openmpi/clang/omp/debug/static" ] - # nocuda with the oldest supported compiler build/nocuda/nompi/gcc/omp/release/static: extends: From 0b7b439e692e7539d50b68e078623abbe8f2c7ae Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 13:48:35 +0200 Subject: [PATCH 163/583] adapt remaining CMake flags --- .gitlab/scripts.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index becf0ed5b8d..5cd36de0b9f 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -63,7 +63,7 @@ script: - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} - if [ -n "${CUDA_ARCH}" ]; then - CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; + export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; fi - if [ -n "${SYCL_DEVICE_TYPE}" ]; then export SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" ]; then export SYCL_DEVICE_FILTER; fi @@ -77,11 +77,9 @@ fi - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER} - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX} - -GNinja - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -GNinja -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} From e2009071896673bafd59c3c73913639fc4a79b9d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 13:57:44 +0200 Subject: [PATCH 164/583] review updates - Only detect available GPUs if requested - Remove unnecessary include paths - Remove unnecessary config variables Co-authored-by: Marcel Koch Co-authored-by: Terry Cojean --- benchmark/CMakeLists.txt | 1 - cmake/GinkgoConfig.cmake.in | 1 - cmake/Modules/CudaArchitectureSelector.cmake | 10 +++++++--- cmake/cuda.cmake | 5 ++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 641c6f363ec..44a0a3d1d9e 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -121,7 +121,6 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(c 
GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer ginkgo CUDA::cudart) - target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) endif() if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 13888ae0b10..f4eace2fdbc 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -121,7 +121,6 @@ set(GINKGO_INTERFACE_CXX_FLAGS "@GINKGO_INTERFACE_CXX_FLAGS@") set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(GINKGO_CUDA_COMPILER_VERSION @CMAKE_CUDA_COMPILER_VERSION@) set(GINKGO_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@") -set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CUDAToolkit_INCLUDE_DIRS@") set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@") set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@") diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake index 017fd2f0f1d..f863b144ab7 100644 --- a/cmake/Modules/CudaArchitectureSelector.cmake +++ b/cmake/Modules/CudaArchitectureSelector.cmake @@ -416,12 +416,16 @@ endfunction() function(cas_variable_cmake_cuda_architectures variable) - cas_get_onboard_architectures(onboard_archs) cas_get_supported_architectures(supported_archs) - if(("${ARGN}" STREQUAL "All") OR ("${ARGN}" STREQUAL "Auto" AND (NOT onboard_archs))) + if("${ARGN}" STREQUAL "All") set(archs "${supported_archs}") elseif("${ARGN}" STREQUAL "Auto") - set(archs "${onboard_archs}") + cas_get_onboard_architectures(onboard_archs) + if (onboard_archs) + set(archs "${onboard_archs}") + else() + set(archs "${supported_archs}") + endif() else() set(archs) foreach(arch IN LISTS ARGN) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 33d785b8c52..2e1c82db6b0 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -4,10 +4,9 @@ find_package(CUDAToolkit REQUIRED) include(cmake/Modules/CudaArchitectureSelector.cmake) -# Detect the CUDA architecture and propagate to all the project -cas_variable_cmake_cuda_architectures(cuda_detected_archs ${GINKGO_CUDA_ARCHITECTURES}) if(NOT CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES "${cuda_detected_archs}") + # Detect the CUDA architecture and propagate it to the entire project + cas_variable_cmake_cuda_architectures(CMAKE_CUDA_ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}) endif() find_package(NVTX REQUIRED) From 5c6ac16365063c0c619be4d5f42a15a859575031 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 14:41:54 +0200 Subject: [PATCH 165/583] temporarily disable PAPI --- CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 195a6a1df69..3886efb7c14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,10 +191,11 @@ check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H) # Automatically find PAPI and search for the required 'sde' component set(GINKGO_HAVE_PAPI_SDE 0) -find_package(PAPI OPTIONAL_COMPONENTS sde) -if(PAPI_sde_FOUND) - set(GINKGO_HAVE_PAPI_SDE 1) -endif() +# PAPI is temporarily disabled +#find_package(PAPI OPTIONAL_COMPONENTS sde) +#if(PAPI_sde_FOUND) +# set(GINKGO_HAVE_PAPI_SDE 1) +#endif() # Automatically find TAU set(GINKGO_HAVE_TAU 0) From f545ace68fb279f78956ccdeca34054af7c21c59 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 15:31:20 +0200 Subject: [PATCH 166/583] review updates - Bump CMake requirement in example - 
Add NVHPC CMake requirements to README Co-authored-by: Yuhsiang M. Tsai --- README.md | 2 +- examples/custom-matrix-format/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b3b7d8660b7..44428386b83 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: -* _cmake 3.18+_ +* _cmake 3.18+_ (If CUDA was installed through the NVIDIA HPC Toolkit, we require _cmake 3.22+_) * _CUDA 10.1+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index 0f7c5f2f32c..b5182fb6bbc 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.18) project(custom-matrix-format CXX CUDA) # We only need to find Ginkgo if we build this example stand-alone From 64378eadbec004680e0663280c36b0d5f5802e4c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 18:32:36 +0200 Subject: [PATCH 167/583] collect build time statistics --- .gitlab/scripts.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 5cd36de0b9f..cf6baad6fab 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -52,6 +52,7 @@ -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR} - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install + - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr - if [ "${EXPORT_BUILD_DIR}" == "ON" ]; then ninja test_exportbuild; fi - LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ninja test_pkgconfig dependencies: [] @@ -94,6 +95,7 @@ -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR} - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install + - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - ctest -V --timeout 6000 From 84af8b24a83e81ed115e7f8faf84c8b76829b80a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 18:33:17 +0200 Subject: [PATCH 168/583] split up mixed-precision builds for slow ROCm debug builds --- hip/matrix/csr_kernels.instantiate.hip.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index 498f3ec1795..dcfa4c7b8c8 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -48,11 +48,15 @@ namespace csr { GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, + int64); // split 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); // split From d95f4440c3cc14b0578820e3b90ea8c3ad38d142 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 18:46:16 +0200 Subject: [PATCH 169/583] split up HIP csr mixed precision instantiations --- core/base/mixed_precision_types.hpp | 83 ++++++++++++++-------- hip/matrix/csr_kernels.instantiate.hip.cpp | 56 +++++++++++++-- 2 files changed, 103 insertions(+), 36 deletions(-) diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 84b0af21c5e..9579caaac4f 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -39,42 +39,65 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, float, __VA_ARGS__); \ - template _macro(float, double, double, __VA_ARGS__); \ - template _macro(double, float, float, __VA_ARGS__); \ - template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + template _macro(float, float, double, __VA_ARGS__); \ + template _macro(float, double, float, __VA_ARGS__); \ + template _macro(float, double, double, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ + template _macro(double, float, float, __VA_ARGS__); \ + template _macro(double, float, double, __VA_ARGS__); \ + template _macro(double, double, float, __VA_ARGS__); \ + template _macro(double, double, double, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) + #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) 
\ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ + template _macro(double, double, double, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \ + template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) + #endif +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) + + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64) diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index dcfa4c7b8c8..9a6c29206de 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -47,16 +47,60 @@ namespace csr { // begin GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL, int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); + + +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL, int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, - int32); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, - int64); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split 
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); + + // split GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); // split From cd26e282027fe7da340582d8eb990c76b0b71263 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 20:34:04 +0200 Subject: [PATCH 170/583] allow specifying allocator for benchmarks --- benchmark/utils/general.hpp | 54 +++++++++++++++++++++++++++++--- core/device_hooks/cuda_hooks.cpp | 4 +++ core/device_hooks/hip_hooks.cpp | 3 ++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 5c6d849fe36..335ed687002 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -58,6 +59,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include "benchmark/utils/json.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -69,6 +73,10 @@ DEFINE_string(executor, "reference", "The executor used to run the benchmarks, one of: reference, " "omp, cuda, hip"); +DEFINE_string(allocator, "default", + "The allocator used in the executor. Only relevant for CUDA and " + "HIP executors, one of: default, async, host, unified"); + DEFINE_uint32(device_id, 0, "ID of the device where to run the code"); DEFINE_bool(overwrite, false, @@ -329,6 +337,40 @@ void backup_results(rapidjson::Document& results) } +inline std::shared_ptr create_cuda_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + +inline std::shared_ptr create_hip_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + // executor mapping const std::map(bool)>> executor_factory{ @@ -337,12 +379,14 @@ const std::map(bool)>> {"cuda", [](bool) { return gko::CudaExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create()); + gko::OmpExecutor::create(), + create_cuda_allocator()); }}, {"hip", [](bool) { return gko::HipExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create()); + gko::OmpExecutor::create(), + create_hip_allocator()); }}, {"dpcpp", [](bool use_gpu_timer) { auto property = dpcpp_queue_property::in_order; @@ -369,14 +413,16 @@ const std::map Date: Thu, 10 Aug 2023 21:03:25 +0200 Subject: [PATCH 171/583] fix CUDA_VERSION availability --- cuda/base/memory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index f605d9135ea..b5bfb14ac74 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 
OF SUCH DAMAGE. #include +#include #include From 41eb9d7aca3b3983279dbcb83ddebaea23666934 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 10 Aug 2023 21:46:26 +0200 Subject: [PATCH 172/583] increase repetitions for sparse_blas --- benchmark/sparse_blas/sparse_blas.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index cfa56ef81fe..d906e9f9e12 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -127,9 +127,12 @@ void apply_sparse_blas(const char* operation_name, allocator); auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - test_case[operation_name]["components"], allocator, 1); + test_case[operation_name]["components"], allocator, + repetitions); exec->add_logger(gen_logger); - op->run(); + for (unsigned i = 0; i < repetitions; i++) { + op->run(); + } exec->remove_logger(gen_logger); } op->write_stats(test_case[operation_name], allocator); From c79874e86576e3c999e80470a0212f149eeda764 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Thu, 6 Apr 2023 13:08:49 +0200 Subject: [PATCH 173/583] Update papi_sde to current status. Use externals. --- CMakeLists.txt | 20 +++- cmake/DownloadNonCMakeCMakeLists.txt.in | 23 ++++- cmake/get_info.cmake | 13 ++- cmake/information_helpers.cmake | 5 +- cmake/install_helpers.cmake | 5 - cmake/package_helpers.cmake | 3 +- core/CMakeLists.txt | 5 +- core/test/log/CMakeLists.txt | 2 +- core/test/log/papi.cpp | 6 +- include/ginkgo/core/log/papi.hpp | 33 ++++--- reference/test/log/CMakeLists.txt | 2 +- third_party/CMakeLists.txt | 4 + third_party/hwloc/CMakeLists.txt | 1 + third_party/papi_sde/CMakeLists.txt | 37 +++++++ third_party/papi_sde/papi_sde_interface.h | 113 ---------------------- 15 files changed, 119 insertions(+), 153 deletions(-) create mode 100644 third_party/papi_sde/CMakeLists.txt delete mode 100644 third_party/papi_sde/papi_sde_interface.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3886efb7c14..6dc01ed27ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ if(MSVC OR WIN32 OR CYGWIN OR APPLE) else() option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) endif() +option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Default is OFF." OFF) option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." 
ON) @@ -191,11 +192,10 @@ check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H) # Automatically find PAPI and search for the required 'sde' component set(GINKGO_HAVE_PAPI_SDE 0) -# PAPI is temporarily disabled -#find_package(PAPI OPTIONAL_COMPONENTS sde) -#if(PAPI_sde_FOUND) -# set(GINKGO_HAVE_PAPI_SDE 1) -#endif() +find_package(PAPI OPTIONAL_COMPONENTS sde) +if(PAPI_sde_FOUND) + set(GINKGO_HAVE_PAPI_SDE 1) +endif() # Automatically find TAU set(GINKGO_HAVE_TAU 0) @@ -232,6 +232,12 @@ else() set(GINKGO_HAVE_HWLOC 0) message(STATUS "HWLOC is being forcibly switched off") endif() +if(GINKGO_BUILD_PAPI_SDE) + set(GINKGO_HAVE_PAPI_SDE 1) +else() + set(GINKGO_HAVE_PAPI_SDE 0) + message(STATUS "PAPI SDE is being forcibly switched off") +endif() set(GINKGO_HAVE_GPU_AWARE_MPI OFF) set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) @@ -272,6 +278,10 @@ endif() if(GINKGO_BUILD_HWLOC) find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC endif() +if(GINKGO_BUILD_PAPI_SDE) + # No need for QUIET as we ship FindPAPI + find_package(PAPI OPTIONAL_COMPONENTS sde) +endif() add_subdirectory(third_party) # Third-party tools and libraries if(MSVC) diff --git a/cmake/DownloadNonCMakeCMakeLists.txt.in b/cmake/DownloadNonCMakeCMakeLists.txt.in index bae2281e63b..55e2f833985 100644 --- a/cmake/DownloadNonCMakeCMakeLists.txt.in +++ b/cmake/DownloadNonCMakeCMakeLists.txt.in @@ -3,12 +3,25 @@ project(${package_name}) include(ExternalProject) ExternalProject_Add(${package_name} - URL "${package_url}" - URL_HASH "${package_hash}" + URL "${package_url}" + URL_HASH "${package_hash}" DOWNLOAD_NO_PROGRESS TRUE SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" - CONFIGURE_COMMAND "${config_command}" "${ARGN}" - INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" UPDATE_DISCONNECTED ${GINKGO_SKIP_DEPENDENCY_UPDATE} ) + +ExternalProject_Add_Step(${package_name} custom_configure + COMMAND "${config_command}" "${ARGN}" + WORKING_DIRECTORY "${working_dir}" + DEPENDEES download) +ExternalProject_Add_Step(${package_name} custom_build + COMMAND make + WORKING_DIRECTORY "${working_dir}" + DEPENDEES custom_configure) +ExternalProject_Add_Step(${package_name} custom_install + COMMAND make all install + WORKING_DIRECTORY "${working_dir}" + DEPENDEES custom_build) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 2cf8dd06c3f..2dd068abb50 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -190,16 +190,21 @@ ginkgo_print_module_footer(${detailed_log} "") ginkgo_print_generic_header(${minimal_log} " Components:") ginkgo_print_generic_header(${detailed_log} " Components:") -if(PAPI_sde_FOUND) +ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_PAPI_SDE") +ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_PAPI_SDE") +if(TARGET PAPI::PAPI) ginkgo_print_variable(${detailed_log} "PAPI_VERSION") ginkgo_print_variable(${detailed_log} "PAPI_INCLUDE_DIR") ginkgo_print_flags(${detailed_log} "PAPI_LIBRARY") endif() + ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_HWLOC") ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC") -ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") -ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") -ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +if(TARGET hwloc) + ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") + ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") + ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +endif() _minimal( " diff --git 
a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index 9a6a4481bf5..cef920a09ce 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -78,8 +78,9 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS) list(TRANSFORM GINKGO_LIBS_INTERFACE_LIBS REPLACE "\\$" "\\1") ginkgo_interface_libraries_recursively("${GINKGO_LIBS_INTERFACE_LIBS}") elseif(EXISTS "${_libs}") - if ("${_libs}" MATCHES "${PROJECT_BINARY_DIR}.*hwloc.so") - list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${CMAKE_INSTALL_FULL_LIBDIR}/libhwloc.so") + if ("${_libs}" MATCHES "${PROJECT_BINARY_DIR}.*(papi|sde|pfm|hwloc).so") + get_filename_component(_lib_name "${_libs}" NAME) + list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${CMAKE_INSTALL_FULL_LIBDIR}/${_lib_name}") else() list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}") endif() diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index 58cc730bb14..8bec34d7a41 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -80,11 +80,6 @@ function(ginkgo_install) install(FILES "${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp" DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/ginkgo" ) - if (GINKGO_HAVE_PAPI_SDE) - install(FILES "${Ginkgo_SOURCE_DIR}/third_party/papi_sde/papi_sde_interface.h" - DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/third_party/papi_sde" - ) - endif() if (GINKGO_HAVE_HWLOC AND NOT HWLOC_FOUND) get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY) diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake index e1d196ad553..1abc1a72587 100644 --- a/cmake/package_helpers.cmake +++ b/cmake/package_helpers.cmake @@ -7,9 +7,10 @@ set(NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT # \param package_name Name of the package # \param package_url Url of the package # \param package_tag Tag or version of the package to be downloaded. +# \param working_dir The directory where the configure/build should happen. # \param config_command The command for the configuration step. 
# -function(ginkgo_load_and_configure_package package_name package_url package_hash config_command) +function(ginkgo_load_and_configure_package package_name package_url package_hash working_dir config_command) set(GINKGO_THIRD_PARTY_BUILD_TYPE "Debug") if (CMAKE_BUILD_TYPE MATCHES "[Rr][Ee][Ll][Ee][Aa][Ss][Ee]") set(GINKGO_THIRD_PARTY_BUILD_TYPE "Release") diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index d224a7e0f90..e7c2bf7ce45 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -113,9 +113,8 @@ target_link_libraries(ginkgo set(GKO_RPATH_ADDITIONS "") if(GINKGO_HAVE_PAPI_SDE) - target_link_libraries(ginkgo PUBLIC PAPI::PAPI) - list(GET PAPI_LIBRARIES 0 PAPI_FIRST_LIB) - get_filename_component(GKO_PAPI_LIBDIR "${PAPI_FIRST_LIB}" DIRECTORY) + target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE) + get_filename_component(GKO_PAPI_LIBDIR "${PAPI_SDE_LIBRARIES}" DIRECTORY) list(APPEND GKO_RPATH_ADDITIONS "${GKO_PAPI_LIBDIR}") endif() diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt index 964572bd48c..8efd7fafc46 100644 --- a/core/test/log/CMakeLists.txt +++ b/core/test/log/CMakeLists.txt @@ -1,7 +1,7 @@ ginkgo_create_test(convergence) ginkgo_create_test(logger) if (GINKGO_HAVE_PAPI_SDE) - ginkgo_create_test(papi PAPI::PAPI) + ginkgo_create_test(papi ADDITIONAL_LIBRARIES PAPI::PAPI) endif() ginkgo_create_test(performance_hint) ginkgo_create_test(profiler_hook) diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index 8ab0bb6421d..d089902c30c 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -71,7 +71,11 @@ class Papi : public ::testing::Test { } } - void TearDown() { eventset = PAPI_NULL; } + void TearDown() { + logger = nullptr; + PAPI_destroy_eventset(&eventset); + PAPI_shutdown(); + } template const std::string init(const gko::log::Logger::mask_type& event, diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp index 5d07879d116..9645e775b3d 100644 --- a/include/ginkgo/core/log/papi.hpp +++ b/include/ginkgo/core/log/papi.hpp @@ -46,16 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include #include #include -#include "third_party/papi_sde/papi_sde_interface.h" - - namespace gko { namespace log { @@ -213,7 +210,10 @@ class Papi : public Logger { create(std::shared_ptr, const Logger::mask_type& enabled_events = Logger::all_events_mask) { - return std::shared_ptr(new Papi(enabled_events)); + return std::shared_ptr(new Papi(enabled_events), [](auto logger){ + papi_sde_shutdown(logger->get_handle()); + delete logger; + }); } /** @@ -224,7 +224,11 @@ class Papi : public Logger { static std::shared_ptr create( const Logger::mask_type& enabled_events = Logger::all_events_mask) { - return std::shared_ptr(new Papi(enabled_events)); + return std::shared_ptr(new Papi(enabled_events), [](auto logger){ + papi_sde_shutdown(logger->get_handle()); + delete logger; + } +); } /** @@ -235,6 +239,13 @@ class Papi : public Logger { */ const std::string get_handle_name() const { return name; } + /** + * Returns the corresponding papi_handle_t for this logger + * + * @return the corresponding papi_handle_t for this logger + */ + const papi_handle_t get_handle() const { return papi_handle; } + protected: [[deprecated("use single-parameter constructor")]] explicit Papi( std::shared_ptr exec, @@ -265,12 +276,10 @@ class Papi : public Logger { ~papi_queue() { - if (PAPI_is_initialized()) { - for (auto e : data) { - std::ostringstream oss; - oss << counter_name << "::" << e.first; - papi_sde_unregister_counter(*handle, oss.str().c_str()); - } + for (auto e : data) { + std::ostringstream oss; + oss << counter_name << "::" << e.first; + papi_sde_unregister_counter(*handle, oss.str().c_str()); } data.clear(); } diff --git a/reference/test/log/CMakeLists.txt b/reference/test/log/CMakeLists.txt index 2d9e8f188cb..44faca51f90 100644 --- a/reference/test/log/CMakeLists.txt +++ b/reference/test/log/CMakeLists.txt @@ -1,4 +1,4 @@ ginkgo_create_test(convergence) if (GINKGO_HAVE_PAPI_SDE) - ginkgo_create_test(papi PAPI::PAPI) + ginkgo_create_test(papi ADDITIONAL_LIBRARIES PAPI::PAPI) endif() diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index c714a51c187..be35785d730 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -7,6 +7,10 @@ if(GINKGO_BUILD_HWLOC AND (NOT HWLOC_FOUND)) add_subdirectory(hwloc) endif() +if(GINKGO_BUILD_PAPI_SDE AND (NOT PAPI_FOUND)) + add_subdirectory(papi_sde) +endif() + if(GINKGO_DEVEL_TOOLS) set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF") add_subdirectory(git-cmake-format) diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt index 9cbbb46482e..f86d6bf0e5b 100644 --- a/third_party/hwloc/CMakeLists.txt +++ b/third_party/hwloc/CMakeLists.txt @@ -2,6 +2,7 @@ message(STATUS "Configuring and building HWLOC") set(TPL_HWLOC_PATH "${PROJECT_BINARY_DIR}/third_party/hwloc") ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/release/hwloc/v2.4/hwloc-2.4.1.tar.gz" "SHA1=b94950e8958e1125ca75ecac0bc0259ee3d108c4" + "" "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi" ) diff --git a/third_party/papi_sde/CMakeLists.txt b/third_party/papi_sde/CMakeLists.txt new file mode 100644 index 00000000000..f9866d1b553 --- /dev/null +++ b/third_party/papi_sde/CMakeLists.txt @@ -0,0 +1,37 @@ +message(STATUS "Configuring and building PAPI-SDE") +set(TPL_PAPI_PATH "${PROJECT_BINARY_DIR}/third_party/papi_sde/src") +ginkgo_load_and_configure_package(papi_external + 
"https://bitbucket.org/terry_cojean/papi/get/77cdd0ba8db98d86c1459dd5f55013aba242d5d5.tar.gz" + "SHA1=540c18a14eeafb83cd60cbbf0a96706111dbff3b" + "${TPL_PAPI_PATH}/src" + "./configure" "--prefix=${TPL_PAPI_PATH}/install" + "--with-components=sde" "--with-libsde=yes" "--with-tests=no" + "--with-static-lib=no" "--with-shared-lib=yes" + ) + +add_library(PAPI SHARED IMPORTED GLOBAL) +add_library(PAPI::PAPI ALIAS PAPI) +add_dependencies(PAPI papi_external) +# NOTE: if changing this (e.g. to `.a`), please update the special case in +# `cmake/information_helpers.cmake` +set(PAPI_LIBRARIES "${TPL_PAPI_PATH}/install/lib/libpapi.so" + CACHE FILEPATH "The path to PAPI libraries" FORCE) +set(PAPI_INCLUDE_DIRS "${TPL_PAPI_PATH}/install/include" CACHE PATH + "The directory containing the PAPI header, papi.h" FORCE) +set_target_properties(PAPI PROPERTIES IMPORTED_LOCATION "${PAPI_LIBRARIES}") +set_target_properties(PAPI PROPERTIES INTERFACE_LINK_LIBRARIES "${PAPI_LIBRARIES}") +set_target_properties(PAPI PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}") + + +add_library(PAPI_SDE SHARED IMPORTED GLOBAL) +add_library(PAPI::PAPI_SDE ALIAS PAPI_SDE) +add_dependencies(PAPI_SDE papi_external) +# NOTE: if changing this (e.g. to `.a`), please update the special case in +# `cmake/information_helpers.cmake` +set(PAPI_SDE_LIBRARIES "${TPL_PAPI_PATH}/install/lib/libsde.so" + CACHE FILEPATH "The path to PAPI SDE libraries" FORCE) +set(PAPI_SDE_INCLUDE_DIRS "${TPL_PAPI_PATH}/install/include" CACHE PATH + "The directory containing the PAPI SDE header, sde_lib.h" FORCE) +set_target_properties(PAPI_SDE PROPERTIES IMPORTED_LOCATION "${PAPI_SDE_LIBRARIES}") +set_target_properties(PAPI_SDE PROPERTIES INTERFACE_LINK_LIBRARIES "${PAPI_SDE_LIBRARIES}") +set_target_properties(PAPI_SDE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${PAPI_SDE_INCLUDE_DIRS}") diff --git a/third_party/papi_sde/papi_sde_interface.h b/third_party/papi_sde/papi_sde_interface.h deleted file mode 100644 index 6a28d0089a3..00000000000 --- a/third_party/papi_sde/papi_sde_interface.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef PAPI_SDE_INTERFACE_H -#define PAPI_SDE_INTERFACE_H - -#include -#include - -#define PAPI_SDE_RO 0x00 -#define PAPI_SDE_RW 0x01 -#define PAPI_SDE_DELTA 0x00 -#define PAPI_SDE_INSTANT 0x10 - -#define PAPI_SDE_long_long 0x0 -#define PAPI_SDE_int 0x1 -#define PAPI_SDE_double 0x2 -#define PAPI_SDE_float 0x3 - -#define PAPI_SDE_SUM 0x0 -#define PAPI_SDE_MAX 0x1 -#define PAPI_SDE_MIN 0x2 - - -#define GET_FLOAT_SDE(x) *((float *)&x) -#define GET_DOUBLE_SDE(x) *((double *)&x) -/* - * GET_SDE_RECORDER_ADDRESS() USAGE EXAMPLE: - * If SDE recorder logs values of type 'double': - * double *ptr = GET_SDE_RECORDER_ADDRESS(papi_event_value[6], double); - * for (j=0; j Date: Thu, 6 Apr 2023 13:22:41 +0200 Subject: [PATCH 174/583] Adapt hwloc to the new scheme --- third_party/hwloc/CMakeLists.txt | 20 ++++++++------------ third_party/papi_sde/CMakeLists.txt | 4 ++-- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt index f86d6bf0e5b..5534c07f4f2 100644 --- a/third_party/hwloc/CMakeLists.txt +++ b/third_party/hwloc/CMakeLists.txt @@ -1,24 +1,20 @@ message(STATUS "Configuring and building HWLOC") -set(TPL_HWLOC_PATH "${PROJECT_BINARY_DIR}/third_party/hwloc") +set(TPL_HWLOC_PATH "${PROJECT_BINARY_DIR}/third_party/hwloc/src") ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/release/hwloc/v2.4/hwloc-2.4.1.tar.gz" 
"SHA1=b94950e8958e1125ca75ecac0bc0259ee3d108c4" - "" - "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi" + "${TPL_HWLOC_PATH}" + "./configure" "--prefix=${TPL_HWLOC_PATH}/install" + "--disable-nvml" "--disable-cuda" "--disable-rsmi" ) add_library(hwloc SHARED IMPORTED GLOBAL) add_dependencies(hwloc hwloc_external) -file(MAKE_DIRECTORY ${TPL_HWLOC_PATH}/lib/) -file(GLOB HWLOC_LIBS "${TPL_HWLOC_PATH}/build/hwloc/.libs/libhwloc.so*") -configure_file("${TPL_HWLOC_PATH}/build/include/hwloc/autogen/config.h" "${TPL_HWLOC_PATH}/src/include/hwloc/autogen/config.h" COPYONLY) -foreach(lib ${HWLOC_LIBS}) - get_filename_component(lib_name ${lib} NAME) - configure_file("${lib}" "${TPL_HWLOC_PATH}/lib/${lib_name}" COPYONLY) -endforeach() # NOTE: if changing this (e.g. to `.a`), please update the special case in # `cmake/information_helpers.cmake` -set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/lib/libhwloc.so" CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) -set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/src/include" CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) +set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/install/lib/libhwloc.so" + CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) +set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/install/include" + CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES}) set_target_properties(hwloc PROPERTIES INTERFACE_LINK_LIBRARIES ${HWLOC_LIBRARIES}) set_target_properties(hwloc PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HWLOC_INCLUDE_DIRS}") diff --git a/third_party/papi_sde/CMakeLists.txt b/third_party/papi_sde/CMakeLists.txt index f9866d1b553..4b58e368662 100644 --- a/third_party/papi_sde/CMakeLists.txt +++ b/third_party/papi_sde/CMakeLists.txt @@ -5,8 +5,8 @@ ginkgo_load_and_configure_package(papi_external "SHA1=540c18a14eeafb83cd60cbbf0a96706111dbff3b" "${TPL_PAPI_PATH}/src" "./configure" "--prefix=${TPL_PAPI_PATH}/install" - "--with-components=sde" "--with-libsde=yes" "--with-tests=no" - "--with-static-lib=no" "--with-shared-lib=yes" + "--with-components=sde" "--with-libsde=yes" "--with-tests=no" + "--with-static-lib=no" "--with-shared-lib=yes" ) add_library(PAPI SHARED IMPORTED GLOBAL) From 58c40d6d409b430a11e54ee8cbee7ad6a9d839b8 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Thu, 6 Apr 2023 13:25:03 +0200 Subject: [PATCH 175/583] Enable PAPI_SDE for a pipeline. --- .gitlab-ci.yml | 2 ++ .gitlab/scripts.yml | 2 ++ .gitlab/variables.yml | 1 + 3 files changed, 5 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ae7fa86fd38..1cd8c0335f8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -113,6 +113,7 @@ build/cuda101/nompi/clang/cuda_wo_omp/release/shared: CUDA_ARCH: 35 # Job with example runs. 
+# Also explicitly test PAPI SDE build/cuda101/openmpi/gcc/all/debug/shared: extends: - .build_template @@ -126,6 +127,7 @@ build/cuda101/openmpi/gcc/all/debug/shared: MPI_AS_ROOT: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Debug" + BUILD_PAPI_SDE: "ON" RUN_EXAMPLES: "ON" CUDA_ARCH: 35 diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index cf6baad6fab..b007caff35f 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -43,6 +43,7 @@ -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} + -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} -DGINKGO_TEST_NONDEFAULT_STREAM=${NONDEFAULT_STREAM} @@ -87,6 +88,7 @@ -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} + -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml index 183bdef9e4e..2316b5abc71 100644 --- a/.gitlab/variables.yml +++ b/.gitlab/variables.yml @@ -12,6 +12,7 @@ BUILD_CUDA: "OFF" BUILD_HIP: "OFF" BUILD_HWLOC: "ON" + BUILD_PAPI_SDE: "OFF" BUILD_MPI: "OFF" GKO_COMPILER_FLAGS: "" MPI_AS_ROOT: "OFF" From be0f5e2da52bdb7b405724a7fce8614503cf83cf Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 14 Jul 2023 15:49:46 +0200 Subject: [PATCH 176/583] Review updates --- CMakeLists.txt | 15 ++++++--------- cmake/GinkgoConfig.cmake.in | 2 +- third_party/CMakeLists.txt | 2 +- third_party/papi_sde/CMakeLists.txt | 6 +++--- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dc01ed27ef..1d18b18d00a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ if(MSVC OR WIN32 OR CYGWIN OR APPLE) else() option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) endif() -option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Default is OFF." OFF) +option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Default is ON. Requires a system package." ON) option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." 
ON) @@ -232,12 +232,6 @@ else() set(GINKGO_HAVE_HWLOC 0) message(STATUS "HWLOC is being forcibly switched off") endif() -if(GINKGO_BUILD_PAPI_SDE) - set(GINKGO_HAVE_PAPI_SDE 1) -else() - set(GINKGO_HAVE_PAPI_SDE 0) - message(STATUS "PAPI SDE is being forcibly switched off") -endif() set(GINKGO_HAVE_GPU_AWARE_MPI OFF) set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) @@ -278,9 +272,12 @@ endif() if(GINKGO_BUILD_HWLOC) find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC endif() +set(GINKGO_HAVE_PAPI_SDE 0) if(GINKGO_BUILD_PAPI_SDE) - # No need for QUIET as we ship FindPAPI - find_package(PAPI OPTIONAL_COMPONENTS sde) + find_package(PAPI 7.0.1.0 COMPONENTS sde) + if (PAPI_FOUND AND PAPI_SDE_FOUND) + set(GINKGO_HAVE_PAPI_SDE 1) + endif() endif() add_subdirectory(third_party) # Third-party tools and libraries diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index f4eace2fdbc..0d7ce5455f1 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -143,7 +143,7 @@ set(VTune_PATH "@VTune_PATH@") # so `third_party` libraries are currently unneeded. if(GINKGO_HAVE_PAPI_SDE) - find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde) + find_package(PAPI REQUIRED COMPONENTS sde) endif() if(GINKGO_HAVE_HWLOC) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index be35785d730..062f520b8e8 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -7,7 +7,7 @@ if(GINKGO_BUILD_HWLOC AND (NOT HWLOC_FOUND)) add_subdirectory(hwloc) endif() -if(GINKGO_BUILD_PAPI_SDE AND (NOT PAPI_FOUND)) +if(GINKGO_WITH_PAPI_SDE AND (NOT PAPI_FOUND)) add_subdirectory(papi_sde) endif() diff --git a/third_party/papi_sde/CMakeLists.txt b/third_party/papi_sde/CMakeLists.txt index 4b58e368662..5b300d973a5 100644 --- a/third_party/papi_sde/CMakeLists.txt +++ b/third_party/papi_sde/CMakeLists.txt @@ -1,9 +1,9 @@ message(STATUS "Configuring and building PAPI-SDE") set(TPL_PAPI_PATH "${PROJECT_BINARY_DIR}/third_party/papi_sde/src") ginkgo_load_and_configure_package(papi_external - "https://bitbucket.org/terry_cojean/papi/get/77cdd0ba8db98d86c1459dd5f55013aba242d5d5.tar.gz" - "SHA1=540c18a14eeafb83cd60cbbf0a96706111dbff3b" - "${TPL_PAPI_PATH}/src" + "https://github.com/icl-utk-edu/papi/archive/d2dd17a07a3c175fbb26ce5528671e3a7e00b80f.tar.gz" + "SHA1=55019037c47aff216ff831b4191e7147f6932464" + "${TPL_PAPI_PATH}/build" "./configure" "--prefix=${TPL_PAPI_PATH}/install" "--with-components=sde" "--with-libsde=yes" "--with-tests=no" "--with-static-lib=no" "--with-shared-lib=yes" From 011f8c85155477eea348db8f38a65261e98ea654 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 11:09:05 +0200 Subject: [PATCH 177/583] Try to not bundle SDE but rely on external only. 
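With the bundled copy removed, PAPI and its sde component have to come from the system; only the build system changes, the logger interface stays the same. For orientation, a minimal application-side sketch of using the SDE-backed logger. This is an illustration, not part of the patch: the value type, the event mask, and the GKO_HAVE_PAPI_SDE guard macro from the installed ginkgo/config.hpp are assumptions of the example, not something this commit prescribes.

    #include <ginkgo/ginkgo.hpp>

    #if GKO_HAVE_PAPI_SDE
    #include <ginkgo/core/log/papi.hpp>
    #endif

    int main()
    {
    #if GKO_HAVE_PAPI_SDE
        auto exec = gko::ReferenceExecutor::create();
        // register Ginkgo's software-defined events with PAPI and record them
        auto logger = gko::log::Papi<double>::create(
            gko::log::Logger::all_events_mask);
        exec->add_logger(logger);
        // ... run solvers here; external PAPI tools can read the counters
        // exposed under the name returned by logger->get_handle_name()
        exec->remove_logger(logger.get());
    #endif
    }
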
--- third_party/CMakeLists.txt | 4 ---- third_party/papi_sde/CMakeLists.txt | 37 ----------------------------- 2 files changed, 41 deletions(-) delete mode 100644 third_party/papi_sde/CMakeLists.txt diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 062f520b8e8..c714a51c187 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -7,10 +7,6 @@ if(GINKGO_BUILD_HWLOC AND (NOT HWLOC_FOUND)) add_subdirectory(hwloc) endif() -if(GINKGO_WITH_PAPI_SDE AND (NOT PAPI_FOUND)) - add_subdirectory(papi_sde) -endif() - if(GINKGO_DEVEL_TOOLS) set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF") add_subdirectory(git-cmake-format) diff --git a/third_party/papi_sde/CMakeLists.txt b/third_party/papi_sde/CMakeLists.txt deleted file mode 100644 index 5b300d973a5..00000000000 --- a/third_party/papi_sde/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -message(STATUS "Configuring and building PAPI-SDE") -set(TPL_PAPI_PATH "${PROJECT_BINARY_DIR}/third_party/papi_sde/src") -ginkgo_load_and_configure_package(papi_external - "https://github.com/icl-utk-edu/papi/archive/d2dd17a07a3c175fbb26ce5528671e3a7e00b80f.tar.gz" - "SHA1=55019037c47aff216ff831b4191e7147f6932464" - "${TPL_PAPI_PATH}/build" - "./configure" "--prefix=${TPL_PAPI_PATH}/install" - "--with-components=sde" "--with-libsde=yes" "--with-tests=no" - "--with-static-lib=no" "--with-shared-lib=yes" - ) - -add_library(PAPI SHARED IMPORTED GLOBAL) -add_library(PAPI::PAPI ALIAS PAPI) -add_dependencies(PAPI papi_external) -# NOTE: if changing this (e.g. to `.a`), please update the special case in -# `cmake/information_helpers.cmake` -set(PAPI_LIBRARIES "${TPL_PAPI_PATH}/install/lib/libpapi.so" - CACHE FILEPATH "The path to PAPI libraries" FORCE) -set(PAPI_INCLUDE_DIRS "${TPL_PAPI_PATH}/install/include" CACHE PATH - "The directory containing the PAPI header, papi.h" FORCE) -set_target_properties(PAPI PROPERTIES IMPORTED_LOCATION "${PAPI_LIBRARIES}") -set_target_properties(PAPI PROPERTIES INTERFACE_LINK_LIBRARIES "${PAPI_LIBRARIES}") -set_target_properties(PAPI PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE_DIRS}") - - -add_library(PAPI_SDE SHARED IMPORTED GLOBAL) -add_library(PAPI::PAPI_SDE ALIAS PAPI_SDE) -add_dependencies(PAPI_SDE papi_external) -# NOTE: if changing this (e.g. 
to `.a`), please update the special case in -# `cmake/information_helpers.cmake` -set(PAPI_SDE_LIBRARIES "${TPL_PAPI_PATH}/install/lib/libsde.so" - CACHE FILEPATH "The path to PAPI SDE libraries" FORCE) -set(PAPI_SDE_INCLUDE_DIRS "${TPL_PAPI_PATH}/install/include" CACHE PATH - "The directory containing the PAPI SDE header, sde_lib.h" FORCE) -set_target_properties(PAPI_SDE PROPERTIES IMPORTED_LOCATION "${PAPI_SDE_LIBRARIES}") -set_target_properties(PAPI_SDE PROPERTIES INTERFACE_LINK_LIBRARIES "${PAPI_SDE_LIBRARIES}") -set_target_properties(PAPI_SDE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${PAPI_SDE_INCLUDE_DIRS}") From fad4621c5af3bb8cbc0a6a915477265008257471 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Tue, 8 Aug 2023 09:20:57 +0000 Subject: [PATCH 178/583] Format files Co-authored-by: Terry Cojean --- core/test/log/papi.cpp | 3 ++- include/ginkgo/core/log/papi.hpp | 9 +++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index d089902c30c..2ed266449f6 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -71,7 +71,8 @@ class Papi : public ::testing::Test { } } - void TearDown() { + void TearDown() + { logger = nullptr; PAPI_destroy_eventset(&eventset); PAPI_shutdown(); diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp index 9645e775b3d..bf22f7c876f 100644 --- a/include/ginkgo/core/log/papi.hpp +++ b/include/ginkgo/core/log/papi.hpp @@ -44,8 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include - - #include @@ -210,7 +208,7 @@ class Papi : public Logger { create(std::shared_ptr, const Logger::mask_type& enabled_events = Logger::all_events_mask) { - return std::shared_ptr(new Papi(enabled_events), [](auto logger){ + return std::shared_ptr(new Papi(enabled_events), [](auto logger) { papi_sde_shutdown(logger->get_handle()); delete logger; }); @@ -224,11 +222,10 @@ class Papi : public Logger { static std::shared_ptr create( const Logger::mask_type& enabled_events = Logger::all_events_mask) { - return std::shared_ptr(new Papi(enabled_events), [](auto logger){ + return std::shared_ptr(new Papi(enabled_events), [](auto logger) { papi_sde_shutdown(logger->get_handle()); delete logger; - } -); + }); } /** From 4320ad15d4430a60d434613524c3559336fc6688 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 17:57:40 +0200 Subject: [PATCH 179/583] Improve PAPI finder: store binaries, include SDE --- cmake/Modules/FindPAPI.cmake | 77 ++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/cmake/Modules/FindPAPI.cmake b/cmake/Modules/FindPAPI.cmake index 95f26a24684..04962970e35 100644 --- a/cmake/Modules/FindPAPI.cmake +++ b/cmake/Modules/FindPAPI.cmake @@ -57,6 +57,7 @@ if(NOT PAPI_LIBRARY) select_library_configurations(PAPI) endif() +set(WORK_DIR "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/FindPAPI") if(PAPI_INCLUDE_DIR) if(EXISTS "${PAPI_INCLUDE_DIR}/papi.h") file(STRINGS "${PAPI_INCLUDE_DIR}/papi.h" papi_version_str REGEX "^#define[\t ]+PAPI_VERSION[\t ]+.*") @@ -70,7 +71,9 @@ if(PAPI_INCLUDE_DIR) # find the components enable_language(C) foreach(component IN LISTS PAPI_FIND_COMPONENTS) - file(WRITE "${PROJECT_BINARY_DIR}/papi_${component}_detect.c" + set(SRC_FILE "${WORK_DIR}/papi_${component}_detect.c") + set(BIN_FILE "${WORK_DIR}/papi_${component}_detect.bin") + file(WRITE "${SRC_FILE}" " #include int main() { @@ -78,17 +81,18 @@ if(PAPI_INCLUDE_DIR) retval = 
PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT && retval > 0) return -1; - if (PAPI_get_component_index(\"${component}\") < 0) + if (PAPI_get_component_index(\"${component}\") == PAPI_ENOCMP) return 0; return 1; }" ) try_run(PAPI_${component}_FOUND gko_result_unused - "${PROJECT_BINARY_DIR}" - "${PROJECT_BINARY_DIR}/papi_${component}_detect.c" + "${WORK_DIR}" + "${SRC_FILE}" CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${PAPI_INCLUDE_DIR} LINK_LIBRARIES ${PAPI_LIBRARY} + COPY_FILE ${BIN_FILE} ) if (NOT PAPI_${component}_FOUND EQUAL 1) @@ -105,6 +109,33 @@ find_package_handle_standard_args(PAPI VERSION_VAR PAPI_VERSION_STRING HANDLE_COMPONENTS) +if(PAPI_sde_FOUND) + # PAPI SDE is another library and header, let's try to find them + find_path(PAPI_SDE_INCLUDE_DIR NAMES sde_lib.h) + mark_as_advanced(PAPI_SDE_INCLUDE_DIR) + + if(NOT PAPI_SDE_LIBRARY) + find_library(PAPI_SDE_LIBRARY_RELEASE NAMES + sde + ) + mark_as_advanced(PAPI_SDE_LIBRARY_RELEASE) + + find_library(PAPI_SDE_LIBRARY_DEBUG NAMES + sded + sde-d + ) + mark_as_advanced(PAPI_SDE_LIBRARY_DEBUG) + + include(SelectLibraryConfigurations) + select_library_configurations(PAPI_SDE) + endif() + + # FIXME: with CMake>=3.17, use NAME_MISMATCHED to get rid of the warning + find_package_handle_standard_args(PAPI_SDE + REQUIRED_VARS PAPI_SDE_LIBRARY PAPI_SDE_INCLUDE_DIR + VERSION_VAR PAPI_VERSION_STRING) +endif() + if(PAPI_FOUND) set(PAPI_LIBRARIES ${PAPI_LIBRARY}) set(PAPI_INCLUDE_DIRS ${PAPI_INCLUDE_DIR}) @@ -142,3 +173,41 @@ if(PAPI_FOUND) endif() endif() endif() + +if (PAPI_SDE_FOUND AND NOT TARGET PAPI::PAPI_SDE) + set(PAPI_SDE_LIBRARIES ${PAPI_SDE_LIBRARY}) + set(PAPI_SDE_INCLUDE_DIRS ${PAPI_SDE_INCLUDE_DIR}) + unset(PAPI_SDE_LIBRARY) + unset(PAPI_SDE_INCLUDE_DIR) + + if(NOT TARGET PAPI::PAPI_SDE) + add_library(PAPI::PAPI_SDE UNKNOWN IMPORTED) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${PAPI_SDE_INCLUDE_DIRS}") + + if(EXISTS "${PAPI_SDE_LIBRARIES}") + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES "${PAPI_SDE_LIBRARIES}" + IMPORTED_LOCATION "${PAPI_SDE_LIBRARIES}") + endif() + if(PAPI_SDE_LIBRARY_RELEASE) + set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}" + IMPORTED_LOCATION_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}") + unset(PAPI_SDE_LIBRARY_RELEASE) + endif() + if(PAPI_SDE_LIBRARY_DEBUG) + set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}" + IMPORTED_LOCATION_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}") + unset(PAPI_SDE_LIBRARY_DEBUG) + endif() + endif() +endif() From 03577c3a6fe423ab8929075699982e80073861c4 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 17:58:02 +0200 Subject: [PATCH 180/583] Also drop bundled hwloc. Rely on system only. 
--- CMakeLists.txt | 18 ++++++++++-------- third_party/CMakeLists.txt | 4 ---- third_party/hwloc/CMakeLists.txt | 20 -------------------- 3 files changed, 10 insertions(+), 32 deletions(-) delete mode 100644 third_party/hwloc/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d18b18d00a..94e0c6318ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) if(MSVC OR WIN32 OR CYGWIN OR APPLE) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" OFF) else() - option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) + option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. A system HWLOC is required, otherwise HWLOC support will be disabled." ON) endif() option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Default is ON. Requires a system package." ON) option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) @@ -226,12 +226,6 @@ if(GINKGO_BUILD_HWLOC AND (MSVC OR WIN32 OR CYGWIN OR APPLE)) set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" FORCE) message(WARNING "Ginkgo does not support HWLOC on Windows/MacOS, switch GINKGO_BUILD_HWLOC to OFF") endif() -if(GINKGO_BUILD_HWLOC) - set(GINKGO_HAVE_HWLOC 1) -else() - set(GINKGO_HAVE_HWLOC 0) - message(STATUS "HWLOC is being forcibly switched off") -endif() set(GINKGO_HAVE_GPU_AWARE_MPI OFF) set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) @@ -269,8 +263,14 @@ if(GINKGO_BUILD_BENCHMARKS) find_package(gflags 2.2.2 QUIET) find_package(RapidJSON 1.1.0 QUIET) endif() + +# System provided, third party libraries (not bundled!) 
+set(GINKGO_HAVE_HWLOC 0) if(GINKGO_BUILD_HWLOC) - find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC + find_package(HWLOC 2.1) + if (HWLOC_FOUND) + set(GINKGO_HAVE_HWLOC 1) + endif() endif() set(GINKGO_HAVE_PAPI_SDE 0) if(GINKGO_BUILD_PAPI_SDE) @@ -279,6 +279,8 @@ if(GINKGO_BUILD_PAPI_SDE) set(GINKGO_HAVE_PAPI_SDE 1) endif() endif() + +# Bundled third party libraries add_subdirectory(third_party) # Third-party tools and libraries if(MSVC) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index c714a51c187..a54d4d506ee 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -3,10 +3,6 @@ if(GINKGO_BUILD_TESTS AND (NOT GTest_FOUND)) add_subdirectory(gtest) endif() -if(GINKGO_BUILD_HWLOC AND (NOT HWLOC_FOUND)) - add_subdirectory(hwloc) -endif() - if(GINKGO_DEVEL_TOOLS) set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF") add_subdirectory(git-cmake-format) diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt deleted file mode 100644 index 5534c07f4f2..00000000000 --- a/third_party/hwloc/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -message(STATUS "Configuring and building HWLOC") -set(TPL_HWLOC_PATH "${PROJECT_BINARY_DIR}/third_party/hwloc/src") -ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/release/hwloc/v2.4/hwloc-2.4.1.tar.gz" - "SHA1=b94950e8958e1125ca75ecac0bc0259ee3d108c4" - "${TPL_HWLOC_PATH}" - "./configure" "--prefix=${TPL_HWLOC_PATH}/install" - "--disable-nvml" "--disable-cuda" "--disable-rsmi" - ) - -add_library(hwloc SHARED IMPORTED GLOBAL) -add_dependencies(hwloc hwloc_external) -# NOTE: if changing this (e.g. to `.a`), please update the special case in -# `cmake/information_helpers.cmake` -set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/install/lib/libhwloc.so" - CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) -set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/install/include" - CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) -set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES}) -set_target_properties(hwloc PROPERTIES INTERFACE_LINK_LIBRARIES ${HWLOC_LIBRARIES}) -set_target_properties(hwloc PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HWLOC_INCLUDE_DIRS}") From 58522a4e86808c432790c07677aeff8f9015c543 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 18:19:25 +0200 Subject: [PATCH 181/583] Remove non CMake external package management --- CMakeLists.txt | 1 - cmake/DownloadNonCMakeCMakeLists.txt.in | 27 ----------- cmake/package_helpers.cmake | 60 ------------------------- 3 files changed, 88 deletions(-) delete mode 100644 cmake/DownloadNonCMakeCMakeLists.txt.in delete mode 100644 cmake/package_helpers.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 94e0c6318ae..706006000c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,7 +255,6 @@ if(GINKGO_BUILD_MPI) endif() # Try to find the third party packages before using our subdirectories -include(cmake/package_helpers.cmake) if(GINKGO_BUILD_TESTS) find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest endif() diff --git a/cmake/DownloadNonCMakeCMakeLists.txt.in b/cmake/DownloadNonCMakeCMakeLists.txt.in deleted file mode 100644 index 55e2f833985..00000000000 --- a/cmake/DownloadNonCMakeCMakeLists.txt.in +++ /dev/null @@ -1,27 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(${package_name}) - -include(ExternalProject) -ExternalProject_Add(${package_name} - URL "${package_url}" - URL_HASH 
"${package_hash}" - DOWNLOAD_NO_PROGRESS TRUE - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_DISCONNECTED ${GINKGO_SKIP_DEPENDENCY_UPDATE} - ) - -ExternalProject_Add_Step(${package_name} custom_configure - COMMAND "${config_command}" "${ARGN}" - WORKING_DIRECTORY "${working_dir}" - DEPENDEES download) -ExternalProject_Add_Step(${package_name} custom_build - COMMAND make - WORKING_DIRECTORY "${working_dir}" - DEPENDEES custom_configure) -ExternalProject_Add_Step(${package_name} custom_install - COMMAND make all install - WORKING_DIRECTORY "${working_dir}" - DEPENDEES custom_build) diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake deleted file mode 100644 index 1abc1a72587..00000000000 --- a/cmake/package_helpers.cmake +++ /dev/null @@ -1,60 +0,0 @@ -set(NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT - "${CMAKE_CURRENT_LIST_DIR}/DownloadNonCMakeCMakeLists.txt.in") - - -# Load a package from the url provided and run configure (Non-CMake projects) -# -# \param package_name Name of the package -# \param package_url Url of the package -# \param package_tag Tag or version of the package to be downloaded. -# \param working_dir The directory where the configure/build should happen. -# \param config_command The command for the configuration step. -# -function(ginkgo_load_and_configure_package package_name package_url package_hash working_dir config_command) - set(GINKGO_THIRD_PARTY_BUILD_TYPE "Debug") - if (CMAKE_BUILD_TYPE MATCHES "[Rr][Ee][Ll][Ee][Aa][Ss][Ee]") - set(GINKGO_THIRD_PARTY_BUILD_TYPE "Release") - endif() - configure_file(${NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT} - download/CMakeLists.txt) - set(TOOLSET "") - if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") - set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" "${TOOLSET}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download) - if(result) - message(FATAL_ERROR - "CMake step for ${package_name}/download failed: ${result}") - return() - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download) - if(result) - message(FATAL_ERROR - "Build step for ${package_name}/download failed: ${result}") - return() - endif() -endfunction() - - -# Download a file and verify the download -# -# \param url The url of file to be downloaded -# \param filename The name of the file -# \param hash_type The type of hash, See CMake file() documentation for more details. -# \param hash The hash itself, See CMake file() documentation for more details. 
-# -function(ginkgo_download_file url filename hash_type hash) - file(DOWNLOAD ${url} ${filename} - TIMEOUT 60 # seconds - EXPECTED_HASH "${hash_type}=${hash}" - TLS_VERIFY ON) - if(EXISTS ${filename}) - message(STATUS "${filename} downloaded from ${url}") - else() - message(FATAL_ERROR "Download of ${filename} failed.") - endif() -endfunction(ginkgo_download_file) From e4fd30ac66087f455708b77a6cbdbc619c4ee44c Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 18:44:13 +0200 Subject: [PATCH 182/583] Also drop RPATH management for hwloc and PAPI --- cmake/information_helpers.cmake | 7 +------ cmake/install_helpers.cmake | 4 ---- core/CMakeLists.txt | 2 -- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index cef920a09ce..7ac7fdfeda5 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -78,12 +78,7 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS) list(TRANSFORM GINKGO_LIBS_INTERFACE_LIBS REPLACE "\\$" "\\1") ginkgo_interface_libraries_recursively("${GINKGO_LIBS_INTERFACE_LIBS}") elseif(EXISTS "${_libs}") - if ("${_libs}" MATCHES "${PROJECT_BINARY_DIR}.*(papi|sde|pfm|hwloc).so") - get_filename_component(_lib_name "${_libs}" NAME) - list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${CMAKE_INSTALL_FULL_LIBDIR}/${_lib_name}") - else() - list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}") - endif() + list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}") elseif("${_libs}" STREQUAL "${CMAKE_DL_LIBS}") list(APPEND GINKGO_INTERFACE_LIBS_FOUND "-l${_libs}") endif() diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index 8bec34d7a41..601fc89a3db 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -30,10 +30,6 @@ function(ginkgo_add_install_rpath name) endif() if (GINKGO_INSTALL_RPATH_DEPENDENCIES) set(RPATH_DEPENDENCIES "${ARGN}") - if(GINKGO_HAVE_HWLOC AND HWLOC_FOUND) - get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY) - list(APPEND RPATH_DEPENDENCIES "${HWLOC_LIBRARIES}") - endif() endif() if (GINKGO_INSTALL_RPATH) set_property(TARGET "${name}" PROPERTY INSTALL_RPATH diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index e7c2bf7ce45..8ec4502d9c7 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -114,8 +114,6 @@ set(GKO_RPATH_ADDITIONS "") if(GINKGO_HAVE_PAPI_SDE) target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE) - get_filename_component(GKO_PAPI_LIBDIR "${PAPI_SDE_LIBRARIES}" DIRECTORY) - list(APPEND GKO_RPATH_ADDITIONS "${GKO_PAPI_LIBDIR}") endif() if(GINKGO_HAVE_TAU) From 62f1ff75d4142ab865e203f86844a66170c2f024 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 8 Aug 2023 18:52:26 +0200 Subject: [PATCH 183/583] Automatically detect HWLOC/PAPI. Notify force disabled. 
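Because the defaults of GINKGO_BUILD_HWLOC and GINKGO_BUILD_PAPI_SDE are now derived from what the configure step detects, two otherwise identical source trees can produce builds with different optional features. A small illustrative check (not part of the patch) that downstream code can use to see what a given installation was configured with; it assumes the GKO_HAVE_HWLOC and GKO_HAVE_PAPI_SDE macros exported through the installed ginkgo/config.hpp:

    #include <ginkgo/config.hpp>

    #include <iostream>

    int main()
    {
        // both macros expand to 0 or 1 depending on the detected system libraries
        std::cout << "hwloc support:    " << GKO_HAVE_HWLOC << '\n'
                  << "PAPI SDE support: " << GKO_HAVE_PAPI_SDE << '\n';
    }
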
--- CMakeLists.txt | 21 +++++++++++---------- cmake/autodetect_system_libs.cmake | 7 +++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 cmake/autodetect_system_libs.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 706006000c9..3306f1b9ac7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ include(cmake/hip_path.cmake) include(cmake/autodetect_executors.cmake) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/") +include(cmake/autodetect_system_libs.cmake) # Ginkgo configuration options option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF) @@ -77,9 +78,9 @@ option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) if(MSVC OR WIN32 OR CYGWIN OR APPLE) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" OFF) else() - option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. A system HWLOC is required, otherwise HWLOC support will be disabled." ON) + option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Enabled if a system installation is found." ${HWLOC_FOUND}) endif() -option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Default is ON. Requires a system package." ON) +option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND}) option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) @@ -190,13 +191,6 @@ endif() include(CheckIncludeFileCXX) check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H) -# Automatically find PAPI and search for the required 'sde' component -set(GINKGO_HAVE_PAPI_SDE 0) -find_package(PAPI OPTIONAL_COMPONENTS sde) -if(PAPI_sde_FOUND) - set(GINKGO_HAVE_PAPI_SDE 1) -endif() - # Automatically find TAU set(GINKGO_HAVE_TAU 0) find_package(PerfStubs QUIET) @@ -269,13 +263,20 @@ if(GINKGO_BUILD_HWLOC) find_package(HWLOC 2.1) if (HWLOC_FOUND) set(GINKGO_HAVE_HWLOC 1) + else() + message(WARNING "HWLOC could not be found. HWLOC support will be disabled.") + set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "HWLOC support was disabled because a system package could not be found." FORCE) endif() endif() + set(GINKGO_HAVE_PAPI_SDE 0) if(GINKGO_BUILD_PAPI_SDE) find_package(PAPI 7.0.1.0 COMPONENTS sde) - if (PAPI_FOUND AND PAPI_SDE_FOUND) + if (PAPI_SDE_FOUND) set(GINKGO_HAVE_PAPI_SDE 1) + else() + message(WARNING "PAPI (SDE) could not be found. PAPI_SDE support will be disabled.") + set(GINKGO_BUILD_PAPI_SDE OFF CACHE BOOL "PAPI_SDE support was disabled because a system package could not be found." FORCE) endif() endif() diff --git a/cmake/autodetect_system_libs.cmake b/cmake/autodetect_system_libs.cmake new file mode 100644 index 00000000000..6f59a759aa8 --- /dev/null +++ b/cmake/autodetect_system_libs.cmake @@ -0,0 +1,7 @@ +if (NOT DEFINED GINKGO_BUILD_HWLOC) + find_package(HWLOC 2.1) +endif() + +if (NOT DEFINED GINKGO_BUILD_PAPI_SDE) + find_package(PAPI 7.0.1.0 COMPONENTS sde) +endif() From b22c0452af0bcc089e461e275698a025ccab838c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 14 Aug 2023 17:08:50 +0200 Subject: [PATCH 184/583] set the pthread preference first --- CMakeLists.txt | 4 +++- cmake/GinkgoConfig.cmake.in | 3 +++ core/base/mixed_precision_types.hpp | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3306f1b9ac7..26bc992c457 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,8 @@ cmake_minimum_required(VERSION 3.16) project(Ginkgo LANGUAGES C CXX VERSION 1.7.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "master") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) +# Cuda and Hip also look for Threads. Set it before any find_package to ensure the Threads setting is not changed. +set(THREADS_PREFER_PTHREAD_FLAG ON) # Determine which modules can be compiled include(cmake/hip_path.cmake) @@ -98,7 +100,7 @@ endif() if(GINKGO_BUILD_OMP) find_package(OpenMP 3.0 REQUIRED) endif() -set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) include(cmake/build_type_helpers.cmake) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 0d7ce5455f1..352cf1dde8d 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -139,6 +139,9 @@ set(GINKGO_HAVE_VTUNE "@GINKGO_HAVE_VTUNE@") set(GINKGO_HAVE_METIS "@GINKGO_HAVE_METIS@") set(VTune_PATH "@VTune_PATH@") +# ensure Threads settings +set(THREADS_PREFER_PTHREAD_FLAG ON) + # NOTE: we do not export benchmarks, examples, tests or devel tools # so `third_party` libraries are currently unneeded. diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 9579caaac4f..b5c1e37569b 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -75,7 +75,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); + template _macro(float, float, float, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ template _macro(double, double, double, __VA_ARGS__) From a4b91c65cf59de4a85b92424618b93d3a5b3bc12 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 23 Feb 2022 13:43:03 +0100 Subject: [PATCH 185/583] add build_from_local_range to partition --- core/distributed/partition.cpp | 35 ++++++++++++++++++- include/ginkgo/core/distributed/partition.hpp | 16 +++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index a1db99396e7..ac8f6c7fe28 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -32,7 +32,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
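// Illustration (not part of the original patch): build_from_local_range,
// added further down in this file, gathers every rank's
// [local_start, local_end) pair and collapses the pairs into the contiguous
// range offsets expected by build_from_contiguous. For example, with three
// ranks owning [0, 10), [10, 25) and [25, 30), the all_gather yields
// {0, 10, 10, 25, 25, 30}; after checking that each end matches the next
// rank's start, this is compressed to the offsets {0, 10, 25, 30}.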
#include - #include "core/distributed/partition_kernels.hpp" @@ -90,6 +89,40 @@ Partition::build_from_contiguous( } +template +std::unique_ptr> +Partition::build_from_local_range( + std::shared_ptr exec, LocalIndexType local_start, + LocalIndexType local_end, std::shared_ptr comm) +{ + GlobalIndexType range[2] = {static_cast(local_start), + static_cast(local_end)}; + + // make all range_ends available on each rank + Array ranges_start_end(exec->get_master(), + comm->size() * 2); + ranges_start_end.fill(0); + // comm->all_gather(range, 2, ranges_start_end.get_data(), 2); + mpi::all_gather(range, 2, ranges_start_end.get_data(), 2, comm); + + // remove duplicates + Array ranges(exec->get_master(), comm->size() + 1); + auto ranges_se_data = ranges_start_end.get_const_data(); + ranges.get_data()[0] = ranges_se_data[0]; + for (int i = 1; i < ranges_start_end.get_num_elems() - 1; i += 2) { + GKO_ASSERT_EQ(ranges_se_data[i], ranges_se_data[i + 1]); + ranges.get_data()[i / 2 + 1] = ranges_se_data[i]; + } + ranges.get_data()[ranges.get_num_elems() - 1] = + ranges_se_data[ranges_start_end.get_num_elems() - 1]; + + // move data to correct executor + ranges.set_executor(exec); + + return Partition::build_from_contiguous(exec, ranges); +} + + template std::unique_ptr> Partition::build_from_global_size_uniform( diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index 0096edf999c..c753c1beb3d 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -285,6 +286,21 @@ class Partition std::shared_ptr exec, comm_index_type num_parts, global_index_type global_size); + /** + * Builds a partition from the local range + * + * @param exec the Executor on which the partition should be built + * @param local_start the start index of the local range + * @param local_end the end index of the local range + * + * @return a Partition where each range has the individual local_start + * and local_ends + */ + static std::unique_ptr build_from_local_range( + std::shared_ptr exec, local_index_type local_start, + local_index_type local_end, + std::shared_ptr comm); + private: /** * Creates a partition stored on the given executor with the given number of From f3aa670260f6b6bbf7d388cbcb9e48a29a26cbe8 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 23 Feb 2022 14:40:18 +0100 Subject: [PATCH 186/583] move build_from_local_range into its own header --- core/CMakeLists.txt | 1 + core/distributed/partition.cpp | 34 -------- core/distributed/partition_helpers.cpp | 86 +++++++++++++++++++ include/ginkgo/core/distributed/partition.hpp | 16 ---- .../core/distributed/partition_helpers.hpp | 78 +++++++++++++++++ include/ginkgo/ginkgo.hpp | 1 + 6 files changed, 166 insertions(+), 50 deletions(-) create mode 100644 core/distributed/partition_helpers.cpp create mode 100644 include/ginkgo/core/distributed/partition_helpers.hpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 8ec4502d9c7..7932976d6c9 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -95,6 +95,7 @@ if(GINKGO_BUILD_MPI) PRIVATE mpi/exception.cpp distributed/matrix.cpp + distributed/partition_helpers.cpp distributed/vector.cpp distributed/preconditioner/schwarz.cpp) endif() diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index ac8f6c7fe28..c6e5bfc5fe0 100644 --- 
a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -89,40 +89,6 @@ Partition::build_from_contiguous( } -template -std::unique_ptr> -Partition::build_from_local_range( - std::shared_ptr exec, LocalIndexType local_start, - LocalIndexType local_end, std::shared_ptr comm) -{ - GlobalIndexType range[2] = {static_cast(local_start), - static_cast(local_end)}; - - // make all range_ends available on each rank - Array ranges_start_end(exec->get_master(), - comm->size() * 2); - ranges_start_end.fill(0); - // comm->all_gather(range, 2, ranges_start_end.get_data(), 2); - mpi::all_gather(range, 2, ranges_start_end.get_data(), 2, comm); - - // remove duplicates - Array ranges(exec->get_master(), comm->size() + 1); - auto ranges_se_data = ranges_start_end.get_const_data(); - ranges.get_data()[0] = ranges_se_data[0]; - for (int i = 1; i < ranges_start_end.get_num_elems() - 1; i += 2) { - GKO_ASSERT_EQ(ranges_se_data[i], ranges_se_data[i + 1]); - ranges.get_data()[i / 2 + 1] = ranges_se_data[i]; - } - ranges.get_data()[ranges.get_num_elems() - 1] = - ranges_se_data[ranges_start_end.get_num_elems() - 1]; - - // move data to correct executor - ranges.set_executor(exec); - - return Partition::build_from_contiguous(exec, ranges); -} - - template std::unique_ptr> Partition::build_from_global_size_uniform( diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp new file mode 100644 index 00000000000..8fdd6cc0634 --- /dev/null +++ b/core/distributed/partition_helpers.cpp @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include + + +namespace gko { +namespace distributed { + + +template +std::unique_ptr> +build_partition_from_local_range(std::shared_ptr exec, + LocalIndexType local_start, + LocalIndexType local_end, + mpi::communicator comm) +{ + GlobalIndexType range[2] = {static_cast(local_start), + static_cast(local_end)}; + + // make all range_ends available on each rank + Array ranges_start_end(exec->get_master(), + comm.size() * 2); + ranges_start_end.fill(0); + comm.all_gather(range, 2, ranges_start_end.get_data(), 2); + + // remove duplicates + Array ranges(exec->get_master(), comm.size() + 1); + auto ranges_se_data = ranges_start_end.get_const_data(); + ranges.get_data()[0] = ranges_se_data[0]; + for (int i = 1; i < ranges_start_end.get_num_elems() - 1; i += 2) { + GKO_ASSERT_EQ(ranges_se_data[i], ranges_se_data[i + 1]); + ranges.get_data()[i / 2 + 1] = ranges_se_data[i]; + } + ranges.get_data()[ranges.get_num_elems() - 1] = + ranges_se_data[ranges_start_end.get_num_elems() - 1]; + + // move data to correct executor + ranges.set_executor(exec); + + return Partition::build_from_contiguous( + exec, ranges); +} + +#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE(_local_type, \ + _global_type) \ + std::unique_ptr> \ + build_partition_from_local_range( \ + std::shared_ptr exec, _local_type local_start, \ + _local_type local_end, mpi::communicator comm) +GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE); + + +} // namespace distributed +} // namespace gko diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index c753c1beb3d..0096edf999c 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include @@ -286,21 +285,6 @@ class Partition std::shared_ptr exec, comm_index_type num_parts, global_index_type global_size); - /** - * Builds a partition from the local range - * - * @param exec the Executor on which the partition should be built - * @param local_start the start index of the local range - * @param local_end the end index of the local range - * - * @return a Partition where each range has the individual local_start - * and local_ends - */ - static std::unique_ptr build_from_local_range( - std::shared_ptr exec, local_index_type local_start, - local_index_type local_end, - std::shared_ptr comm); - private: /** * Creates a partition stored on the given executor with the given number of diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp new file mode 100644 index 00000000000..4439d8311e0 --- /dev/null +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include + + +namespace gko { +namespace distributed { + +template +class Partition; + + +/** + * Builds a partition from the local range + * + * @param exec the Executor on which the partition should be built + * @param local_start the start index of the local range + * @param local_end the end index of the local range + * + * @return a Partition where each range has the individual local_start + * and local_ends + */ +template +std::unique_ptr> +build_partition_from_local_range(std::shared_ptr exec, + LocalIndexType local_start, + LocalIndexType local_end, + mpi::communicator comm); + + +} // namespace distributed +} // namespace gko + + +#endif + + +#endif // GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 179a8a01a46..594ad880b8c 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -82,6 +82,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include From bb844ffd4767b6b1ef65d2506d12ec1cf80604e1 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 11:47:58 +0100 Subject: [PATCH 187/583] fixups after rebase --- core/distributed/partition_helpers.cpp | 5 ++++- .../ginkgo/core/distributed/partition_helpers.hpp | 12 +++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 8fdd6cc0634..24f129db36f 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { +namespace experimental { namespace distributed { @@ -52,7 +53,8 @@ build_partition_from_local_range(std::shared_ptr exec, Array ranges_start_end(exec->get_master(), comm.size() * 2); ranges_start_end.fill(0); - comm.all_gather(range, 2, ranges_start_end.get_data(), 2); + comm.all_gather(exec->get_master(), range, 2, ranges_start_end.get_data(), + 2); // remove duplicates Array ranges(exec->get_master(), comm.size() + 1); @@ -83,4 +85,5 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( } // namespace distributed +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 4439d8311e0..1433953c738 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace experimental { namespace distributed { template @@ -51,14 +52,14 @@ class Partition; /** - * Builds a partition from the local range + * Builds a partition from a local range. * - * @param exec the Executor on which the partition should be built - * @param local_start the start index of the local range - * @param local_end the end index of the local range + * @param exec the Executor on which the partition should be built. + * @param local_start the start index of the local range. + * @param local_end the end index of the local range. * * @return a Partition where each range has the individual local_start - * and local_ends + * and local_ends. */ template std::unique_ptr> @@ -69,6 +70,7 @@ build_partition_from_local_range(std::shared_ptr exec, } // namespace distributed +} // namespace experimental } // namespace gko From 149e479d2ca6a11c81eed3375ca16de4198aa7e9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 11:53:33 +0100 Subject: [PATCH 188/583] use span for local range --- core/distributed/partition_helpers.cpp | 21 ++++++++----------- .../core/distributed/partition_helpers.hpp | 5 ++--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 24f129db36f..2f764c2c478 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -42,14 +42,12 @@ namespace distributed { template std::unique_ptr> build_partition_from_local_range(std::shared_ptr exec, - LocalIndexType local_start, - LocalIndexType local_end, - mpi::communicator comm) + span local_range, mpi::communicator comm) { - GlobalIndexType range[2] = {static_cast(local_start), - static_cast(local_end)}; + GlobalIndexType range[2] = {static_cast(local_range.begin), + static_cast(local_range.end)}; - // make all range_ends available on each rank + // make all range_start_ends available on each rank Array ranges_start_end(exec->get_master(), comm.size() * 2); ranges_start_end.fill(0); @@ -74,12 +72,11 @@ build_partition_from_local_range(std::shared_ptr exec, exec, ranges); } -#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE(_local_type, \ - _global_type) \ - std::unique_ptr> \ - build_partition_from_local_range( \ - std::shared_ptr exec, _local_type local_start, \ - _local_type local_end, mpi::communicator comm) +#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE(_local_type, \ + _global_type) \ + std::unique_ptr> \ + build_partition_from_local_range(std::shared_ptr exec, \ + span local_range, mpi::communicator comm) 
GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE); diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 1433953c738..01bc1cc1a18 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "ginkgo/core/base/range.hpp" namespace gko { @@ -64,9 +65,7 @@ class Partition; template std::unique_ptr> build_partition_from_local_range(std::shared_ptr exec, - LocalIndexType local_start, - LocalIndexType local_end, - mpi::communicator comm); + span local_range, mpi::communicator comm); } // namespace distributed From 9718e48c16dfd7bcac21adc6ed45b1ba09990860 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 12:33:27 +0100 Subject: [PATCH 189/583] adds kernel to remove duplicate start/ends --- common/unified/CMakeLists.txt | 1 + .../distributed/partition_helpers_kernels.cpp | 42 +++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 10 +++++ core/distributed/partition_helpers.cpp | 38 ++++++++++------- .../distributed/partition_helpers_kernels.hpp | 37 ++++++++++++++++ reference/CMakeLists.txt | 1 + .../distributed/partition_helpers_kernels.cpp | 33 +++++++++++++++ 7 files changed, 146 insertions(+), 16 deletions(-) create mode 100644 common/unified/distributed/partition_helpers_kernels.cpp create mode 100644 core/distributed/partition_helpers_kernels.hpp create mode 100644 reference/distributed/partition_helpers_kernels.cpp diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt index 5a37eb022f9..67fc839d6a7 100644 --- a/common/unified/CMakeLists.txt +++ b/common/unified/CMakeLists.txt @@ -6,6 +6,7 @@ set(UNIFIED_SOURCES components/format_conversion_kernels.cpp components/precision_conversion_kernels.cpp components/reduce_array_kernels.cpp + distributed/partition_helpers_kernels.cpp distributed/partition_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp new file mode 100644 index 00000000000..d5f4f407cd5 --- /dev/null +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -0,0 +1,42 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition_helpers { + + +template +void compress_start_ends(std::shared_ptr exec, + const array& range_start_ends, + array& ranges) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto size, const auto* range_start_ends, + auto* ranges) { + if (i == 0) { + ranges[0] = range_start_ends[0]; + } + if (i != size - 1) { + ranges[i + 1] = range_start_ends[2 * i + 1]; + } + }, + ranges.get_num_elems() - 1, ranges.get_num_elems(), + range_start_ends.get_const_data(), ranges.get_data()); +} + + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); + + +} // namespace partition_helpers +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 0f898b3ae73..519376dae11 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -45,6 +45,7 @@ OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/components/reduce_array_kernels.hpp" #include "core/distributed/matrix_kernels.hpp" +#include "core/distributed/partition_helpers_kernels.hpp" #include "core/distributed/partition_kernels.hpp" #include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" @@ -255,6 +256,15 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED); } // namespace partition +namespace partition_helpers { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); + + +} + + namespace distributed_vector { diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 2f764c2c478..3a46461bcf0 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -34,9 +34,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/distributed/partition_helpers_kernels.hpp" + + namespace gko { namespace experimental { namespace distributed { +namespace partition_helpers { +namespace { + + +GKO_REGISTER_OPERATION(compress_start_ends, + partition_helpers::compress_start_ends); + + +} +} // namespace partition_helpers template @@ -48,25 +61,18 @@ build_partition_from_local_range(std::shared_ptr exec, static_cast(local_range.end)}; // make all range_start_ends available on each rank - Array ranges_start_end(exec->get_master(), - comm.size() * 2); + auto mpi_exec = (exec == exec->get_master() || mpi::is_gpu_aware()) + ? exec + : exec->get_master(); + array ranges_start_end(mpi_exec, comm.size() * 2); ranges_start_end.fill(0); - comm.all_gather(exec->get_master(), range, 2, ranges_start_end.get_data(), - 2); + comm.all_gather(mpi_exec, range, 2, ranges_start_end.get_data(), 2); + ranges_start_end.set_executor(exec); // remove duplicates - Array ranges(exec->get_master(), comm.size() + 1); - auto ranges_se_data = ranges_start_end.get_const_data(); - ranges.get_data()[0] = ranges_se_data[0]; - for (int i = 1; i < ranges_start_end.get_num_elems() - 1; i += 2) { - GKO_ASSERT_EQ(ranges_se_data[i], ranges_se_data[i + 1]); - ranges.get_data()[i / 2 + 1] = ranges_se_data[i]; - } - ranges.get_data()[ranges.get_num_elems() - 1] = - ranges_se_data[ranges_start_end.get_num_elems() - 1]; - - // move data to correct executor - ranges.set_executor(exec); + array ranges(exec, comm.size() + 1); + exec->run( + partition_helpers::make_compress_start_ends(ranges_start_end, ranges)); return Partition::build_from_contiguous( exec, ranges); diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp new file mode 100644 index 00000000000..374fedf8c1a --- /dev/null +++ b/core/distributed/partition_helpers_kernels.hpp @@ -0,0 +1,37 @@ +#ifndef GINKGO_PARTITION_HELPERS_KERNELS_HPP +#define GINKGO_PARTITION_HELPERS_KERNELS_HPP + + +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(_type) \ + void compress_start_ends(std::shared_ptr exec, \ + const array<_type>& range_start_ends, \ + array<_type>& ranges) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(GlobalIndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition_helpers, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // 
namespace gko + + +#endif // GINKGO_PARTITION_HELPERS_KERNELS_HPP diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 074d5efe818..dd54e3fb52f 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -13,6 +13,7 @@ target_sources(ginkgo_reference components/precision_conversion_kernels.cpp components/prefix_sum_kernels.cpp distributed/matrix_kernels.cpp + distributed/partition_helpers_kernels.cpp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp new file mode 100644 index 00000000000..0451e82e10b --- /dev/null +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -0,0 +1,33 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace partition_helpers { + + +template +void compress_start_ends(std::shared_ptr exec, + const array& range_start_ends, + array& ranges) +{ + if (ranges.get_num_elems()) { + ranges.get_data()[0] = range_start_ends.get_const_data()[0]; + for (size_type i = 0; i < ranges.get_num_elems() - 1; ++i) { + ranges.get_data()[i + 1] = + range_start_ends.get_const_data()[2 * i + 1]; + } + } +} + + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); + + +} // namespace partition_helpers +} // namespace reference +} // namespace kernels +} // namespace gko From ebda15c9e989d38383307830bf9bec15741fc144 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 12:55:27 +0100 Subject: [PATCH 190/583] adds test for removing duplicate start/ends --- test/distributed/CMakeLists.txt | 1 + test/distributed/partition_helper_kernels.cpp | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 test/distributed/partition_helper_kernels.cpp diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt index 1c8e9b1e8fc..32b3810ea31 100644 --- a/test/distributed/CMakeLists.txt +++ b/test/distributed/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_common_test(matrix_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(vector_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_and_reference_test(partition_helper_kernels) diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp new file mode 100644 index 00000000000..c52ba65e5c7 --- /dev/null +++ b/test/distributed/partition_helper_kernels.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +#include +#include + + +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +using comm_index_type = gko::experimental::distributed::comm_index_type; + + +template +class PartitionHelpers : public CommonTestFixture { +protected: + using index_type = IndexType; +}; + +TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); + + +TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithOneRange) +{ + using itype = typename TestFixture::index_type; + gko::array start_ends{this->exec, {0, 3}}; + gko::array expects{this->exec, {0, 3}}; + gko::array result{this->exec, expects.get_num_elems()}; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( + this->exec, start_ends, result); + + GKO_ASSERT_ARRAY_EQ(result, expects); +} + + +TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithMultipleRanges) +{ + using itype = typename TestFixture::index_type; + gko::array start_ends{this->exec, {0, 3, 3, 7, 7, 10}}; + gko::array expects{this->exec, {0, 3, 7, 10}}; + gko::array result{this->exec, expects.get_num_elems()}; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( + this->exec, start_ends, result); + + GKO_ASSERT_ARRAY_EQ(result, expects); +} + + +TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithZeroRange) +{ + using itype = typename TestFixture::index_type; + gko::array start_ends{this->exec}; + gko::array expects{this->exec, {0}}; + gko::array result{this->exec, {0}}; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( + this->exec, start_ends, result); + + GKO_ASSERT_ARRAY_EQ(result, expects); +} From cf7429d74d25e7e5fe0f4ca2e388ffc891e90fb7 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 13:25:23 +0100 Subject: [PATCH 191/583] adds tests for build_from_local_range --- test/mpi/partition_helpers.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 test/mpi/partition_helpers.cpp diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp new file mode 100644 index 00000000000..303d7362856 --- /dev/null +++ b/test/mpi/partition_helpers.cpp @@ -0,0 +1,28 @@ +#include +#include + + +#include "core/test/utils.hpp" +#include "test/utils/mpi/executor.hpp" + + +template +class PartitionHelpers : public CommonMpiTestFixture{ +protected: + using index_type = IndexType; + +}; + +TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); + + +TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges){ + using itype = typename TestFixture::index_type ; + gko::span local_range[] = {{0u, 4u}, {4u, 9u}, {9u, 11u}}; + gko::array expects{this->exec, {0, 4, 9, 11}}; + + 
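    // build the partition collectively: every rank passes its own contiguous local range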
auto part = gko::experimental::distributed::build_partition_from_local_range(this->exec, local_range[this->comm.rank()], this->comm); + + GKO_ASSERT_ARRAY_EQ(expects, + gko::make_const_array_view(this->exec, expects.get_num_elems(), part->get_range_bounds())); +} From 5f9a359a5d738bbd567770cea867bf6ed4eefa5d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 13:25:53 +0100 Subject: [PATCH 192/583] adds note on invalid inputs --- include/ginkgo/core/distributed/partition_helpers.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 01bc1cc1a18..93b04af7f6c 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -56,8 +56,11 @@ class Partition; * Builds a partition from a local range. * * @param exec the Executor on which the partition should be built. - * @param local_start the start index of the local range. - * @param local_end the end index of the local range. + * @param local_range the start and end indices of the local range + * + * @warning The local ranges have to be continuous and ascending. This means + * that for a process `i` with `range[i] = [s_i, e_i)` then for process + * `j = i+1` `range[j] = [s_j = e_i, e_j)`. * * @return a Partition where each range has the individual local_start * and local_ends. @@ -73,7 +76,5 @@ build_partition_from_local_range(std::shared_ptr exec, } // namespace gko -#endif - - +#endif // GINKGO_BUILD_MPI #endif // GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_ From 2c9a0597b6d134ccae69ad27316379426b0f902b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 Dec 2022 14:09:44 +0100 Subject: [PATCH 193/583] fixes reference kernel --- reference/distributed/partition_helpers_kernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 0451e82e10b..0060d20be10 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -13,7 +13,7 @@ void compress_start_ends(std::shared_ptr exec, const array& range_start_ends, array& ranges) { - if (ranges.get_num_elems()) { + if (ranges.get_num_elems() && range_start_ends.get_num_elems()) { ranges.get_data()[0] = range_start_ends.get_const_data()[0]; for (size_type i = 0; i < ranges.get_num_elems() - 1; ++i) { ranges.get_data()[i + 1] = From c4ab56b784e6baeeb14d8c58fa43c7f5d4b28e7c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 9 Dec 2022 15:14:27 +0100 Subject: [PATCH 194/583] allows specifying part ids for contiguous partition constructor --- .../unified/distributed/partition_kernels.cpp | 9 +++++--- core/distributed/partition.cpp | 11 ++++++++-- core/distributed/partition_kernels.hpp | 9 ++++---- include/ginkgo/core/distributed/partition.hpp | 11 ++++++---- reference/distributed/partition_kernels.cpp | 4 +++- .../test/distributed/partition_kernels.cpp | 22 +++++++++++++++++++ test/distributed/partition_kernels.cpp | 16 ++++++++++++++ 7 files changed, 68 insertions(+), 14 deletions(-) diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index cb0f4813da5..dc13fec9f1b 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -66,19 +66,22 @@ void 
count_ranges(std::shared_ptr exec, template void build_from_contiguous(std::shared_ptr exec, const array& ranges, + const array& part_id_mapping, GlobalIndexType* range_bounds, comm_index_type* part_ids) { run_kernel( exec, - [] GKO_KERNEL(auto i, auto ranges, auto bounds, auto ids) { + [] GKO_KERNEL(auto i, auto ranges, auto mapping, auto bounds, auto ids, + bool uses_mapping) { if (i == 0) { bounds[0] = 0; } bounds[i + 1] = ranges[i + 1]; - ids[i] = i; + ids[i] = uses_mapping ? mapping[i] : i; }, - ranges.get_num_elems() - 1, ranges, range_bounds, part_ids); + ranges.get_num_elems() - 1, ranges, part_id_mapping, range_bounds, + part_ids, part_id_mapping.get_num_elems() > 0); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_CONTIGUOUS); diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index c6e5bfc5fe0..575ca83aba6 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -75,14 +75,21 @@ Partition::build_from_mapping( template std::unique_ptr> Partition::build_from_contiguous( - std::shared_ptr exec, const array& ranges) + std::shared_ptr exec, const array& ranges, + const array& part_ids) { + GKO_ASSERT(part_ids.get_num_elems() == 0 || + part_ids.get_num_elems() + 1 == ranges.get_num_elems()); + + array empty(exec); auto local_ranges = make_temporary_clone(exec, &ranges); + auto local_part_ids = make_temporary_clone( + exec, part_ids.get_num_elems() > 0 ? &part_ids : &empty); auto result = Partition::create( exec, static_cast(ranges.get_num_elems() - 1), ranges.get_num_elems() - 1); exec->run(partition::make_build_from_contiguous( - *local_ranges.get(), result->offsets_.get_data(), + *local_ranges, *local_part_ids, result->offsets_.get_data(), result->part_ids_.get_data())); result->finalize_construction(); return result; diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp index 3d66ed113e8..070ff0839b4 100644 --- a/core/distributed/partition_kernels.hpp +++ b/core/distributed/partition_kernels.hpp @@ -49,10 +49,11 @@ namespace kernels { const array& mapping, \ size_type& num_ranges) -#define GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType) \ - void build_from_contiguous(std::shared_ptr exec, \ - const array& ranges, \ - GlobalIndexType* range_bounds, \ +#define GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType) \ + void build_from_contiguous(std::shared_ptr exec, \ + const array& ranges, \ + const array& part_id_mapping, \ + GlobalIndexType* range_bounds, \ comm_index_type* part_ids) #define GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType) \ diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index 0096edf999c..fa8b2739400 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -260,15 +260,18 @@ class Partition * * @param exec the Executor on which the partition should be built * @param ranges the boundaries of the ranges representing each part. - * Part i contains the indices [ranges[i], ranges[i + 1]). - * Has to contain at least one element. - * The first element has to be 0. + * Part parti_id[i] contains the indices + * [ranges[i], ranges[i + 1]). Has to contain at least + * one element. The first element has to be 0. + * @param part_ids the part ids of the provided ranges. If empty, then + * it will assume range i belongs to part i. * * @return a Partition representing the given contiguous partitioning. 
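+ *          For example (the values used in the reference test below),
+ *          `ranges = {0, 5, 5, 7, 9, 10}` with `part_ids = {0, 4, 3, 1, 2}`
+ *          assigns [0, 5) to part 0, the empty range [5, 5) to part 4,
+ *          [5, 7) to part 3, [7, 9) to part 1, and [9, 10) to part 2.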
*/ static std::unique_ptr build_from_contiguous( std::shared_ptr exec, - const array& ranges); + const array& ranges, + const array& part_ids = {}); /** * Builds a partition by evenly distributing the global range. diff --git a/reference/distributed/partition_kernels.cpp b/reference/distributed/partition_kernels.cpp index 6eae93d27d0..e9a2bfe7667 100644 --- a/reference/distributed/partition_kernels.cpp +++ b/reference/distributed/partition_kernels.cpp @@ -55,14 +55,16 @@ void count_ranges(std::shared_ptr exec, template void build_from_contiguous(std::shared_ptr exec, const array& ranges, + const array& part_id_mapping, GlobalIndexType* range_bounds, comm_index_type* part_ids) { + bool uses_mapping = part_id_mapping.get_num_elems() > 0; range_bounds[0] = 0; for (comm_index_type i = 0; i < ranges.get_num_elems() - 1; i++) { auto end = ranges.get_const_data()[i + 1]; range_bounds[i + 1] = end; - part_ids[i] = i; + part_ids[i] = uses_mapping ? part_id_mapping.get_const_data()[i] : i; } } diff --git a/reference/test/distributed/partition_kernels.cpp b/reference/test/distributed/partition_kernels.cpp index 4cc7750a193..f92349ee2eb 100644 --- a/reference/test/distributed/partition_kernels.cpp +++ b/reference/test/distributed/partition_kernels.cpp @@ -171,6 +171,28 @@ TYPED_TEST(Partition, BuildsFromRangeWithSingleElement) } +TYPED_TEST(Partition, BuildsFromRangesWithPartIds) +{ + using global_index_type = typename TestFixture::global_index_type; + using part_type = typename TestFixture::part_type; + gko::array ranges{this->ref, {0, 5, 5, 7, 9, 10}}; + gko::array part_id{this->ref, {0, 4, 3, 1, 2}}; + + auto partition = + part_type::build_from_contiguous(this->ref, ranges, part_id); + + EXPECT_EQ(partition->get_size(), + ranges.get_data()[ranges.get_num_elems() - 1]); + EXPECT_EQ(partition->get_num_ranges(), ranges.get_num_elems() - 1); + EXPECT_EQ(partition->get_num_parts(), ranges.get_num_elems() - 1); + EXPECT_EQ(partition->get_num_empty_parts(), 1); + assert_equal_data(partition->get_range_bounds(), {0, 5, 5, 7, 9, 10}); + assert_equal_data(partition->get_part_ids(), {0, 4, 3, 1, 2}); + assert_equal_data(partition->get_range_starting_indices(), {0, 0, 0, 0, 0}); + assert_equal_data(partition->get_part_sizes(), {5, 2, 1, 2, 0}); +} + + TYPED_TEST(Partition, BuildsFromGlobalSize) { using part_type = typename TestFixture::part_type; diff --git a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp index 686d1432da5..7033abb37ef 100644 --- a/test/distributed/partition_kernels.cpp +++ b/test/distributed/partition_kernels.cpp @@ -276,6 +276,22 @@ TYPED_TEST(Partition, BuildsFromContiguousWithSingleEntry) } +TYPED_TEST(Partition, BuildsFromContiguousWithPartId) +{ + using global_index_type = typename TestFixture::global_index_type; + using part_type = typename TestFixture::part_type; + gko::array ranges{this->ref, + {0, 1234, 3134, 4578, 16435, 60000}}; + gko::array part_id{this->ref, {0, 4, 3, 1, 2}}; + gko::array dranges{this->exec, ranges}; + + auto part = part_type::build_from_contiguous(this->ref, ranges, part_id); + auto dpart = part_type::build_from_contiguous(this->exec, dranges, part_id); + + this->assert_equal(part, dpart); +} + + TYPED_TEST(Partition, BuildsFromGlobalSize) { using global_index_type = typename TestFixture::global_index_type; From 78a8927f7ff41c7698f6c6d0d52cc4131d55c3b2 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 9 Dec 2022 16:14:57 +0100 Subject: [PATCH 195/583] adds sorting kernel for ranges + part-ids --- 
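A short illustration of the new sorting kernel, using the values from the reference test added in this patch:

    range_start_ends = {7, 4, 0, 9, 9, 7, 4, 11}   // all range starts first, then all range ends
    part_ids         = {0, 1, 2, 3}
    // after sort_by_range_start:
    range_start_ends = {0, 4, 7, 9, 4, 7, 9, 11}
    part_ids         = {2, 1, 0, 3}

The start/end pairs are reordered ascending by their start index, and the part ids follow the same permutation.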
.../distributed/partition_helpers_kernels.cpp | 15 +++ core/device_hooks/common_kernels.inc.cpp | 3 +- .../distributed/partition_helpers_kernels.hpp | 16 ++- .../distributed/partition_helpers_kernels.cpp | 24 ++++ reference/test/distributed/CMakeLists.txt | 1 + .../distributed/partition_helpers_kernels.cpp | 104 ++++++++++++++++++ 6 files changed, 158 insertions(+), 5 deletions(-) create mode 100644 reference/test/distributed/partition_helpers_kernels.cpp diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index d5f4f407cd5..a3b47718d88 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -32,10 +32,25 @@ void compress_start_ends(std::shared_ptr exec, } +template +void check_consecutive_ranges() +{} + + GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); +template +void sort_by_range_start(std::shared_ptr exec, + array& range_start_ends, + array& + part_ids) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + } // namespace partition_helpers } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 519376dae11..304f3ae45f8 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -260,9 +260,10 @@ namespace partition_helpers { GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); -} +} // namespace partition_helpers namespace distributed_vector { diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 374fedf8c1a..b9c9984e93b 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -17,10 +17,18 @@ namespace kernels { const array<_type>& range_start_ends, \ array<_type>& ranges) - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(GlobalIndexType) +#define GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(_type) \ + void sort_by_range_start( \ + std::shared_ptr exec, \ + array<_type>& range_start_ends, \ + array& part_ids) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition_helpers, diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 0060d20be10..d4ba757b284 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -1,5 +1,8 @@ #include "core/distributed/partition_helpers_kernels.hpp" +#include + +#include "core/base/iterator_factory.hpp" namespace gko { @@ -27,6 +30,27 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); +template +void sort_by_range_start( + std::shared_ptr exec, + array& range_start_ends, + array& part_ids) +{ + auto part_ids_d = part_ids.get_data(); + auto num_parts = part_ids.get_num_elems(); + auto range_starts = range_start_ends.get_data(); + auto range_ends = range_starts + num_parts; + auto sort_it = + 
detail::make_zip_iterator(range_starts, range_ends, part_ids_d); + std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { + return std::get<0>(a) < std::get<0>(b); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + } // namespace partition_helpers } // namespace reference } // namespace kernels diff --git a/reference/test/distributed/CMakeLists.txt b/reference/test/distributed/CMakeLists.txt index 2985c7b5e11..42ad2d7e1a2 100644 --- a/reference/test/distributed/CMakeLists.txt +++ b/reference/test/distributed/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_test(matrix_kernels) +ginkgo_create_test(partition_helpers_kernels) ginkgo_create_test(partition_kernels) ginkgo_create_test(vector_kernels) diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp new file mode 100644 index 00000000000..688762d2d9a --- /dev/null +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -0,0 +1,104 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include +#include + + +#include + + +#include "core/distributed/partition_helpers_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +using comm_index_type = gko::experimental::distributed::comm_index_type; + + +template +class PartitionHelpers : public ::testing::Test { +protected: + using global_index_type = GlobalIndexType; + + PartitionHelpers() : ref(gko::ReferenceExecutor::create()) {} + + std::shared_ptr ref; + gko::array default_range_start_ends{ + this->ref, {0, 4, 7, 9, 4, 7, 9, 11}}; + gko::array default_part_ids{this->ref, {0, 1, 2, 3}}; +}; + +TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(PartitionHelpers, CanSortByRangeStartIdentity) +{ + using itype = typename TestFixture::global_index_type; + auto range_start_ends = this->default_range_start_ends; + auto part_ids = this->default_part_ids; + + gko::kernels::reference::partition_helpers::sort_by_range_start( + this->ref, range_start_ends, part_ids); + + GKO_ASSERT_ARRAY_EQ(range_start_ends, this->default_range_start_ends); + GKO_ASSERT_ARRAY_EQ(part_ids, this->default_part_ids); +} + + +TYPED_TEST(PartitionHelpers, CanSortByRangeStart) +{ + using global_index_type = typename TestFixture::global_index_type; + gko::array range_start_ends{this->ref, + {7, 4, 0, 9, 9, 7, 4, 11}}; + gko::array result_part_ids{this->ref, {2, 1, 0, 3}}; + auto part_ids = this->default_part_ids; + + gko::kernels::reference::partition_helpers::sort_by_range_start( + this->ref, range_start_ends, part_ids); + + GKO_ASSERT_ARRAY_EQ(range_start_ends, this->default_range_start_ends); + GKO_ASSERT_ARRAY_EQ(part_ids, result_part_ids); +} + +} // namespace From 8d4b61bc8e503bb5ddadfaa3b93f055efc231c30 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 9 Dec 2022 16:56:28 +0100 Subject: [PATCH 196/583] adds consistency check kernel (reference --- .../distributed/partition_helpers_kernels.cpp | 8 ++++++ core/device_hooks/common_kernels.inc.cpp | 1 + .../distributed/partition_helpers_kernels.hpp | 10 ++++++- .../distributed/partition_helpers_kernels.cpp | 25 +++++++++++++++++ .../distributed/partition_helpers_kernels.cpp | 28 +++++++++++++++++++ 5 files changed, 71 insertions(+), 1 deletion(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index a3b47718d88..437d1590a43 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -51,6 +51,14 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); +template +void check_consecutive_ranges(std::shared_ptr exec, + array& range_start_ends, + bool* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); + } // namespace partition_helpers } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 304f3ae45f8..2e37ae53e20 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -261,6 +261,7 @@ namespace partition_helpers { GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); 
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); } // namespace partition_helpers diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index b9c9984e93b..08af876c06f 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -24,11 +24,19 @@ namespace kernels { array& part_ids) +#define GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(_type) \ + void check_consecutive_ranges(std::shared_ptr exec, \ + array<_type>& range_start_ends, \ + bool* result) + + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(GlobalIndexType); \ template \ - GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType) + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition_helpers, diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index d4ba757b284..882d88509f0 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -51,6 +51,31 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); +template +void check_consecutive_ranges(std::shared_ptr exec, + array& range_start_ends, + bool* result) +{ + auto num_parts = range_start_ends.get_num_elems() / 2; + auto range_starts = range_start_ends.get_data(); + auto range_ends = range_starts + num_parts; + auto combined_it = detail::make_zip_iterator(range_starts + 1, range_ends); + + if (num_parts) { + *result = std::all_of(combined_it, combined_it + (num_parts - 1), + [](const auto& start_end) { + return std::get<0>(start_end) == + std::get<1>(start_end); + }); + } else { + *result = true; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); + + } // namespace partition_helpers } // namespace reference } // namespace kernels diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 688762d2d9a..1d34a4fd530 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -101,4 +101,32 @@ TYPED_TEST(PartitionHelpers, CanSortByRangeStart) GKO_ASSERT_ARRAY_EQ(part_ids, result_part_ids); } + +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) +{ + using global_index_type = typename TestFixture::global_index_type; + auto range_start_ends = this->default_range_start_ends; + bool result = false; + + gko::kernels::reference::partition_helpers::check_consecutive_ranges( + this->ref, range_start_ends, &result); + + ASSERT_TRUE(result); +} + + +TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) +{ + using global_index_type = typename TestFixture::global_index_type; + gko::array range_start_ends{this->ref, + {7, 4, 0, 9, 9, 7, 4, 11}}; + bool result = true; + + gko::kernels::reference::partition_helpers::check_consecutive_ranges( + this->ref, range_start_ends, &result); + + ASSERT_FALSE(result); +} + + } // namespace From 55fe5c1c0fb4b1dd578e2af241666f3ca02e46cc Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 9 Dec 2022 17:49:39 +0100 Subject: [PATCH 197/583] wip --- core/distributed/partition_helpers.cpp | 55 +++++++++++++++---- 
include/ginkgo/core/base/mpi.hpp | 12 ++++ .../distributed/partition_helpers_kernels.cpp | 7 +-- test/mpi/partition_helpers.cpp | 22 +++++--- 4 files changed, 73 insertions(+), 23 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 3a46461bcf0..4fee81f82b9 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/fill_array_kernels.hpp" #include "core/distributed/partition_helpers_kernels.hpp" @@ -44,11 +45,14 @@ namespace partition_helpers { namespace { -GKO_REGISTER_OPERATION(compress_start_ends, - partition_helpers::compress_start_ends); +GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array); +GKO_REGISTER_OPERATION(sort_by_range_start, + partition_helpers::sort_by_range_start); +GKO_REGISTER_OPERATION(check_consecutive_ranges, + partition_helpers::check_consecutive_ranges); -} +} // namespace } // namespace partition_helpers @@ -61,18 +65,49 @@ build_partition_from_local_range(std::shared_ptr exec, static_cast(local_range.end)}; // make all range_start_ends available on each rank - auto mpi_exec = (exec == exec->get_master() || mpi::is_gpu_aware()) - ? exec - : exec->get_master(); + auto mpi_exec = exec->get_master(); array ranges_start_end(mpi_exec, comm.size() * 2); - ranges_start_end.fill(0); - comm.all_gather(mpi_exec, range, 2, ranges_start_end.get_data(), 2); + ranges_start_end.fill(invalid_index()); + MPI_Datatype tmp; + MPI_Type_vector(2, 1, comm.size(), + mpi::type_impl::get_type(), &tmp); + MPI_Type_commit(&tmp); + comm.all_gather( + mpi_exec, range, 1, + mpi::contiguous_type(2, mpi::type_impl::get_type()) + .get(), + ranges_start_end.get_data(), 1, tmp); + MPI_Type_free(&tmp); + if (comm.rank() == 0) { + std::cout << ranges_start_end.get_num_elems() << " "; + for (int i = 0; i < comm.size() * 2; ++i) { + std::cout << ranges_start_end.get_data()[i] << " "; + } + std::cout << std::endl; + } + comm.synchronize(); ranges_start_end.set_executor(exec); + // make_sort_by_range_start + array part_ids(exec, comm.size()); + exec->run(partition_helpers::make_fill_seq_array(part_ids.get_data(), + part_ids.get_num_elems())); + exec->run(partition_helpers::make_sort_by_range_start(ranges_start_end, + part_ids)); + + // check for consistency + bool consecutive_ranges = false; + exec->run(partition_helpers::make_check_consecutive_ranges( + ranges_start_end, &consecutive_ranges)); + if (!consecutive_ranges) { + throw Error(__FILE__, __LINE__, "The partition contains gaps."); + } + // remove duplicates array ranges(exec, comm.size() + 1); - exec->run( - partition_helpers::make_compress_start_ends(ranges_start_end, ranges)); + exec->copy(1, ranges_start_end.get_data(), ranges.get_data()); + exec->copy(comm.size(), ranges_start_end.get_data() + comm.size(), + ranges.get_data() + 1); return Partition::build_from_contiguous( exec, ranges); diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index bf985cabeb7..40b38b55781 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -1013,6 +1013,18 @@ class communicator { this->get())); } + + void all_gather(std::shared_ptr exec, + const void* send_buffer, const int send_count, + MPI_Datatype send_type, void* recv_buffer, + const int recv_count, MPI_Datatype recv_type) const + { + auto guard = exec->get_scoped_device_id_guard(); + GKO_ASSERT_NO_MPI_ERRORS( + 
MPI_Allgather(send_buffer, send_count, send_type, recv_buffer, + recv_count, recv_type, this->get())); + } + /** * (Non-blocking) Gather data onto all ranks from all ranks in the * communicator. diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 882d88509f0..0142021d34d 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -17,11 +17,10 @@ void compress_start_ends(std::shared_ptr exec, array& ranges) { if (ranges.get_num_elems() && range_start_ends.get_num_elems()) { + auto num_ranges = ranges.get_num_elems() - 1; ranges.get_data()[0] = range_start_ends.get_const_data()[0]; - for (size_type i = 0; i < ranges.get_num_elems() - 1; ++i) { - ranges.get_data()[i + 1] = - range_start_ends.get_const_data()[2 * i + 1]; - } + std::copy_n(range_start_ends.get_const_data() + num_ranges, num_ranges, + ranges.get_data() + 1); } } diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index 303d7362856..6f886922e45 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -1,28 +1,32 @@ -#include #include +#include #include "core/test/utils.hpp" #include "test/utils/mpi/executor.hpp" -template -class PartitionHelpers : public CommonMpiTestFixture{ +template +class PartitionHelpers : public CommonMpiTestFixture { protected: using index_type = IndexType; - }; TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); -TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges){ - using itype = typename TestFixture::index_type ; +TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) +{ + using itype = typename TestFixture::index_type; gko::span local_range[] = {{0u, 4u}, {4u, 9u}, {9u, 11u}}; gko::array expects{this->exec, {0, 4, 9, 11}}; - auto part = gko::experimental::distributed::build_partition_from_local_range(this->exec, local_range[this->comm.rank()], this->comm); + auto part = + gko::experimental::distributed::build_partition_from_local_range< + gko::int32, itype>(this->exec, local_range[this->comm.rank()], + this->comm); - GKO_ASSERT_ARRAY_EQ(expects, - gko::make_const_array_view(this->exec, expects.get_num_elems(), part->get_range_bounds())); + GKO_ASSERT_ARRAY_EQ( + expects, gko::make_const_array_view(this->exec, expects.get_num_elems(), + part->get_range_bounds())); } From 0c7f4ba7d4f04c5c16a019f0b974e25f9fbd7f0f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 12 Dec 2022 17:41:33 +0100 Subject: [PATCH 198/583] fixes ranges gathering --- core/distributed/partition_helpers.cpp | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 4fee81f82b9..1f380f3631f 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -68,24 +68,12 @@ build_partition_from_local_range(std::shared_ptr exec, auto mpi_exec = exec->get_master(); array ranges_start_end(mpi_exec, comm.size() * 2); ranges_start_end.fill(invalid_index()); - MPI_Datatype tmp; - MPI_Type_vector(2, 1, comm.size(), - mpi::type_impl::get_type(), &tmp); - MPI_Type_commit(&tmp); - comm.all_gather( - mpi_exec, range, 1, - mpi::contiguous_type(2, mpi::type_impl::get_type()) - .get(), - ranges_start_end.get_data(), 1, tmp); - MPI_Type_free(&tmp); - if (comm.rank() == 0) { - std::cout << ranges_start_end.get_num_elems() << " "; - for (int i = 0; i < comm.size() * 2; ++i) { - std::cout << 
ranges_start_end.get_data()[i] << " "; - } - std::cout << std::endl; - } - comm.synchronize(); + std::vector reqs; + reqs.push_back(comm.i_all_gather(mpi_exec, &range[0], 1, + ranges_start_end.get_data(), 1)); + reqs.push_back(comm.i_all_gather( + mpi_exec, &range[1], 1, ranges_start_end.get_data() + comm.size(), 1)); + mpi::wait_all(reqs); ranges_start_end.set_executor(exec); // make_sort_by_range_start @@ -110,7 +98,7 @@ build_partition_from_local_range(std::shared_ptr exec, ranges.get_data() + 1); return Partition::build_from_contiguous( - exec, ranges); + exec, ranges, part_ids); } #define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE(_local_type, \ From 2c448c67cddc72bbbb9cef9bde837d198f5eef07 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 12 Dec 2022 17:41:57 +0100 Subject: [PATCH 199/583] adds MPI tests --- test/mpi/partition_helpers.cpp | 53 ++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index 6f886922e45..506991c6e15 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -6,6 +6,9 @@ #include "test/utils/mpi/executor.hpp" +using comm_index_type = gko::experimental::distributed::comm_index_type; + + template class PartitionHelpers : public CommonMpiTestFixture { protected: @@ -19,7 +22,31 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) { using itype = typename TestFixture::index_type; gko::span local_range[] = {{0u, 4u}, {4u, 9u}, {9u, 11u}}; - gko::array expects{this->exec, {0, 4, 9, 11}}; + gko::array expects_ranges{this->exec, {0, 4, 9, 11}}; + gko::array expects_pid{this->exec, {0, 1, 2}}; + + + auto part = + gko::experimental::distributed::build_partition_from_local_range< + gko::int32, itype>(this->exec, local_range[this->comm.rank()], + this->comm); + + GKO_ASSERT_ARRAY_EQ( + expects_ranges, + gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(), + part->get_range_bounds())); + GKO_ASSERT_ARRAY_EQ( + expects_pid, + gko::make_const_array_view(this->exec, expects_pid.get_num_elems(), + part->get_part_ids())); +} + +TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesUnsorted) +{ + using itype = typename TestFixture::index_type; + gko::span local_range[] = {{4u, 9u}, {9u, 11u}, {0u, 4u}}; + gko::array expects_ranges{this->exec, {0, 4, 9, 11}}; + gko::array expects_pid{this->exec, {2, 0, 1}}; auto part = gko::experimental::distributed::build_partition_from_local_range< @@ -27,6 +54,26 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) this->comm); GKO_ASSERT_ARRAY_EQ( - expects, gko::make_const_array_view(this->exec, expects.get_num_elems(), - part->get_range_bounds())); + expects_ranges, + gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(), + part->get_range_bounds())); + GKO_ASSERT_ARRAY_EQ( + expects_pid, + gko::make_const_array_view(this->exec, expects_pid.get_num_elems(), + part->get_part_ids())); +} + + +TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesThrowsOnGap) +{ + using itype = typename TestFixture::index_type; + gko::span local_range[] = {{4u, 6u}, {9u, 11u}, {0u, 4u}}; + auto build_from_local_ranges = [](auto... 
args) { + return gko::experimental::distributed::build_partition_from_local_range< + gko::int32, itype>(args...); + }; + + ASSERT_THROW(build_from_local_ranges( + this->exec, local_range[this->comm.rank()], this->comm), + gko::Error); } From 20c43b5058e078473b06bb1690ae7a520af3bf1a Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 12 Dec 2022 17:43:18 +0100 Subject: [PATCH 200/583] removes dead code --- .../distributed/partition_helpers_kernels.cpp | 30 ------------------- core/device_hooks/common_kernels.inc.cpp | 1 - .../distributed/partition_helpers_kernels.hpp | 7 ----- .../distributed/partition_helpers_kernels.cpp | 18 ----------- 4 files changed, 56 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 437d1590a43..6858b58d7ca 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -11,36 +11,6 @@ namespace GKO_DEVICE_NAMESPACE { namespace partition_helpers { -template -void compress_start_ends(std::shared_ptr exec, - const array& range_start_ends, - array& ranges) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto size, const auto* range_start_ends, - auto* ranges) { - if (i == 0) { - ranges[0] = range_start_ends[0]; - } - if (i != size - 1) { - ranges[i + 1] = range_start_ends[2 * i + 1]; - } - }, - ranges.get_num_elems() - 1, ranges.get_num_elems(), - range_start_ends.get_const_data(), ranges.get_data()); -} - - -template -void check_consecutive_ranges() -{} - - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); - - template void sort_by_range_start(std::shared_ptr exec, array& range_start_ends, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 2e37ae53e20..51bcf9d9587 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -259,7 +259,6 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED); namespace partition_helpers { -GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 08af876c06f..db7cd429ef9 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -12,11 +12,6 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(_type) \ - void compress_start_ends(std::shared_ptr exec, \ - const array<_type>& range_start_ends, \ - array<_type>& ranges) - #define GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(_type) \ void sort_by_range_start( \ std::shared_ptr exec, \ @@ -31,8 +26,6 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS(GlobalIndexType); \ template \ GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType); \ template \ diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 0142021d34d..c8797682ef1 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -11,24 +11,6 @@ namespace reference { namespace partition_helpers { -template -void 
compress_start_ends(std::shared_ptr exec, - const array& range_start_ends, - array& ranges) -{ - if (ranges.get_num_elems() && range_start_ends.get_num_elems()) { - auto num_ranges = ranges.get_num_elems() - 1; - ranges.get_data()[0] = range_start_ends.get_const_data()[0]; - std::copy_n(range_start_ends.get_const_data() + num_ranges, num_ranges, - ranges.get_data() + 1); - } -} - - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_PARTITION_HELPERS_COMPRESS_START_ENDS); - - template void sort_by_range_start( std::shared_ptr exec, From 80d86c03fc7a3ca8bdb6b5f5e156ebce08125042 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 19 Dec 2022 17:14:17 +0100 Subject: [PATCH 201/583] adds device consecutive ranges check --- .../distributed/partition_helpers_kernels.cpp | 20 ++- test/distributed/partition_helper_kernels.cpp | 132 ++++++++++++++---- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 6858b58d7ca..4c1401666b4 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -3,6 +3,7 @@ #include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -24,7 +25,24 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( template void check_consecutive_ranges(std::shared_ptr exec, array& range_start_ends, - bool* result) GKO_NOT_IMPLEMENTED; + bool* result) +{ + array result_uint32{exec, 1}; + auto num_ranges = range_start_ends.get_num_elems() / 2; + run_kernel_reduction( + exec, + [] GKO_KERNEL(const auto i, const auto* starts, const auto* ends) { + return starts[i + 1] == ends[i]; + }, + [] GKO_KERNEL(const auto a, const auto b) { + return static_cast(a && b); + }, + [] GKO_KERNEL(auto x) { return x; }, static_cast(true), + result_uint32.get_data(), num_ranges - 1, range_start_ends.get_data(), + range_start_ends.get_data() + num_ranges); + *result = + static_cast(exec->copy_val_to_host(result_uint32.get_data())); +} GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index c52ba65e5c7..a1a270cbe14 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -44,7 +44,72 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "test/utils/executor.hpp" -using comm_index_type = gko::experimental::distributed::comm_index_type; +template +std::pair, std::vector> create_ranges( + gko::size_type num_ranges) +{ + std::default_random_engine engine; + std::uniform_int_distribution dist(5, 10); + std::vector range_sizes(num_ranges); + std::generate(range_sizes.begin(), range_sizes.end(), + [&]() { return dist(engine); }); + + std::vector range_offsets(num_ranges + 1, 0); + std::partial_sum(range_sizes.begin(), range_sizes.end(), + range_offsets.begin() + 1); + + std::vector range_starts(num_ranges); + std::vector range_ends(num_ranges); + std::copy_n(range_offsets.begin(), num_ranges, range_starts.begin()); + std::copy_n(range_offsets.begin() + 1, num_ranges, range_ends.begin()); + + return {std::move(range_starts), std::move(range_ends)}; +} + + +std::vector sample_unique(std::size_t min, std::size_t max, + gko::size_type n) +{ + std::default_random_engine engine; + std::vector values(std::clamp(max - min, 0ul, max)); + std::iota(values.begin(), values.end(), min); + + std::shuffle(values.begin(), values.end(), engine); + + values.erase(values.begin() + std::clamp(n, 0ul, values.size()), values.end()); + + return values; +} + + +template +std::vector remove_indices(const std::vector& source, + std::vector idxs) +{ + std::sort(idxs.begin(), idxs.end(), std::greater<>{}); + auto result = source; + for (auto idx : idxs) { + result.erase(result.begin() + idx); + } + return result; +} + + +template +gko::array concat_start_end( + std::shared_ptr exec, + const std::pair, std::vector>& start_ends) +{ + gko::size_type num_ranges = start_ends.first.size(); + gko::array concat(exec, num_ranges * 2); + + exec->copy_from(exec->get_master().get(), num_ranges, + start_ends.first.data(), concat.get_data()); + exec->copy_from(exec->get_master().get(), num_ranges, + start_ends.second.data(), concat.get_data() + num_ranges); + + return concat; +} template @@ -56,43 +121,60 @@ class PartitionHelpers : public CommonTestFixture { TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); -TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithOneRange) +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) { - using itype = typename TestFixture::index_type; - gko::array start_ends{this->exec, {0, 3}}; - gko::array expects{this->exec, {0, 3}}; - gko::array result{this->exec, expects.get_num_elems()}; + using index_type = typename TestFixture::index_type; + auto start_ends = + concat_start_end(this->exec, create_ranges(100)); + bool result = false; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); + + ASSERT_TRUE(result); +} - gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( - this->exec, start_ends, result); - GKO_ASSERT_ARRAY_EQ(result, expects); +TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) +{ + using index_type = typename TestFixture::index_type; + auto full_range_ends = create_ranges(100); + auto removal_idxs = sample_unique(0, full_range_ends.first.size(), 4); + auto start_ends = concat_start_end( + this->ref, + std::make_pair(remove_indices(full_range_ends.first, removal_idxs), + remove_indices(full_range_ends.second, removal_idxs))); + bool result = true; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); + + ASSERT_FALSE(result); } -TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithMultipleRanges) +TYPED_TEST(PartitionHelpers, 
CanCheckConsecutiveRangesWithSingleRange) { - using itype = typename TestFixture::index_type; - gko::array start_ends{this->exec, {0, 3, 3, 7, 7, 10}}; - gko::array expects{this->exec, {0, 3, 7, 10}}; - gko::array result{this->exec, expects.get_num_elems()}; + using index_type = typename TestFixture::index_type; + auto start_ends = concat_start_end( + this->ref,create_ranges(1)); + bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( - this->exec, start_ends, result); + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); - GKO_ASSERT_ARRAY_EQ(result, expects); + ASSERT_TRUE(result); } -TYPED_TEST(PartitionHelpers, CanCompressStartEndsWithZeroRange) +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) { - using itype = typename TestFixture::index_type; - gko::array start_ends{this->exec}; - gko::array expects{this->exec, {0}}; - gko::array result{this->exec, {0}}; + using index_type = typename TestFixture::index_type; + auto start_ends = gko::array(this->exec, {1}); + bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_start_ends( - this->exec, start_ends, result); + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); - GKO_ASSERT_ARRAY_EQ(result, expects); + ASSERT_TRUE(result); } From 2a3c35c7e9b9856c011f2cda541e7089a0dcf3c7 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 19 Dec 2022 17:20:49 +0100 Subject: [PATCH 202/583] add omp sorting kernel --- .../distributed/partition_helpers_kernels.cpp | 10 ----- cuda/CMakeLists.txt | 1 + cuda/distributed/partition_helpers_kernels.cu | 24 ++++++++++++ dpcpp/CMakeLists.txt | 1 + .../partition_helpers_kernels.dp.cpp | 24 ++++++++++++ hip/CMakeLists.txt | 1 + .../partition_helpers_kernels.hip.cpp | 24 ++++++++++++ omp/CMakeLists.txt | 1 + omp/distributed/partition_helpers_kernels.cpp | 39 +++++++++++++++++++ 9 files changed, 115 insertions(+), 10 deletions(-) create mode 100644 cuda/distributed/partition_helpers_kernels.cu create mode 100644 dpcpp/distributed/partition_helpers_kernels.dp.cpp create mode 100644 hip/distributed/partition_helpers_kernels.hip.cpp create mode 100644 omp/distributed/partition_helpers_kernels.cpp diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 4c1401666b4..8b891dd3cb2 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -12,16 +12,6 @@ namespace GKO_DEVICE_NAMESPACE { namespace partition_helpers { -template -void sort_by_range_start(std::shared_ptr exec, - array& range_start_ends, - array& - part_ids) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); - - template void check_consecutive_ranges(std::shared_ptr exec, array& range_start_ends, diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 764f47afb83..4c972d2a584 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -21,6 +21,7 @@ target_sources(ginkgo_cuda base/version.cpp components/prefix_sum_kernels.cu distributed/matrix_kernels.cu + distributed/partition_helpers_kernels.cu distributed/partition_kernels.cu distributed/vector_kernels.cu factorization/cholesky_kernels.cu diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu new file mode 100644 index 
00000000000..57729516508 --- /dev/null +++ b/cuda/distributed/partition_helpers_kernels.cu @@ -0,0 +1,24 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace partition_helpers { + + +template +void sort_by_range_start(std::shared_ptr exec, + array& range_start_ends, + array& + part_ids) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index b70175c6b12..dd0d7c4cdfb 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -18,6 +18,7 @@ target_sources(ginkgo_dpcpp base/version.dp.cpp components/prefix_sum_kernels.dp.cpp distributed/matrix_kernels.dp.cpp + distributed/partition_helpers_kernels.dp.cpp distributed/partition_kernels.dp.cpp distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp new file mode 100644 index 00000000000..e8dbe8444d8 --- /dev/null +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -0,0 +1,24 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace partition_helpers { + + +template +void sort_by_range_start(std::shared_ptr exec, + array& range_start_ends, + array& + part_ids) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 5ec1718ca4d..779db13d36a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -18,6 +18,7 @@ set(GINKGO_HIP_SOURCES base/version.hip.cpp components/prefix_sum_kernels.hip.cpp distributed/matrix_kernels.hip.cpp + distributed/partition_helpers_kernels.hip.cpp distributed/partition_kernels.hip.cpp distributed/vector_kernels.hip.cpp factorization/cholesky_kernels.hip.cpp diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp new file mode 100644 index 00000000000..99f3b711794 --- /dev/null +++ b/hip/distributed/partition_helpers_kernels.hip.cpp @@ -0,0 +1,24 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +namespace partition_helpers { + + +template +void sort_by_range_start(std::shared_ptr exec, + array& range_start_ends, + array& + part_ids) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 02248983385..c689ffc42f3 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -10,6 +10,7 @@ target_sources(ginkgo_omp base/version.cpp components/prefix_sum_kernels.cpp distributed/matrix_kernels.cpp + distributed/partition_helpers_kernels.cpp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp new file mode 100644 index 00000000000..3216782f2ac --- /dev/null +++ 
b/omp/distributed/partition_helpers_kernels.cpp @@ -0,0 +1,39 @@ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +#include "core/base/iterator_factory.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace partition_helpers { + + +template +void sort_by_range_start( + std::shared_ptr exec, + array& range_start_ends, + array& part_ids) +{ + auto part_ids_d = part_ids.get_data(); + auto num_parts = part_ids.get_num_elems(); + auto range_starts = range_start_ends.get_data(); + auto range_ends = range_starts + num_parts; + auto sort_it = + detail::make_zip_iterator(range_starts, range_ends, part_ids_d); + // TODO: use TBB or parallel std with c++17 + std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { + return std::get<0>(a) < std::get<0>(b); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace omp +} // namespace kernels +} // namespace gko From 0474cbfcf64e5fbfa5a3c4a9adc89631badbc893 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 20 Dec 2022 09:23:39 +0100 Subject: [PATCH 203/583] fixes tests --- test/distributed/partition_helper_kernels.cpp | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index a1a270cbe14..eeefdd415a7 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -44,6 +44,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "test/utils/executor.hpp" +using gko::experimental::distributed::comm_index_type; + + +// TODO: remove with c++17 +template +T clamp(const T&v, const T& lo, const T& hi){ + return v < lo ? lo : (v > hi ? 
hi : v); +} + + +template +std::vector create_iota(IndexType min, IndexType max) +{ + std::vector iota(clamp(max - min, 0ul, max)); + std::iota(iota.begin(), iota.end(), min); + return iota; +} + + template std::pair, std::vector> create_ranges( gko::size_type num_ranges) @@ -71,13 +90,9 @@ std::vector sample_unique(std::size_t min, std::size_t max, gko::size_type n) { std::default_random_engine engine; - std::vector values(std::clamp(max - min, 0ul, max)); - std::iota(values.begin(), values.end(), min); - + auto values = create_iota(min, max); std::shuffle(values.begin(), values.end(), engine); - - values.erase(values.begin() + std::clamp(n, 0ul, values.size()), values.end()); - + values.erase(values.begin() + clamp(n, 0ul, values.size()), values.end()); return values; } @@ -141,7 +156,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) auto full_range_ends = create_ranges(100); auto removal_idxs = sample_unique(0, full_range_ends.first.size(), 4); auto start_ends = concat_start_end( - this->ref, + this->exec, std::make_pair(remove_indices(full_range_ends.first, removal_idxs), remove_indices(full_range_ends.second, removal_idxs))); bool result = true; @@ -170,7 +185,7 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) { using index_type = typename TestFixture::index_type; - auto start_ends = gko::array(this->exec, {1}); + auto start_ends = gko::array(this->exec, {1}); bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( From 3272d89cb1e88abeca981654ef3feab87ec29551 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 20 Dec 2022 11:11:42 +0100 Subject: [PATCH 204/583] adds cuda/hip/dpcpp sort kernels --- .../partition_helpers_kernels.hpp.inc | 48 +++++++++++ .../distributed/partition_helpers_kernels.cpp | 32 ++++++++ .../distributed/partition_helpers_kernels.hpp | 32 ++++++++ cuda/distributed/partition_helpers_kernels.cu | 47 +++++++++-- .../partition_helpers_kernels.dp.cpp | 57 ++++++++++++- .../partition_helpers_kernels.hip.cpp | 46 +++++++++-- omp/distributed/partition_helpers_kernels.cpp | 32 ++++++++ .../distributed/partition_helpers_kernels.cpp | 32 ++++++++ test/distributed/partition_helper_kernels.cpp | 80 +++++++++++++++++-- test/mpi/partition_helpers.cpp | 32 ++++++++ 10 files changed, 410 insertions(+), 28 deletions(-) create mode 100644 common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc new file mode 100644 index 00000000000..ca12b9a2bd1 --- /dev/null +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -0,0 +1,48 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +void sort_by_range_start( + std::shared_ptr exec, + array& range_start_ends, + array& part_ids) +{ + auto num_ranges = range_start_ends.get_num_elems() / 2; + auto starts = thrust::device_pointer_cast(range_start_ends.get_data()); + auto ends = starts + num_ranges; + auto zip_it = thrust::make_zip_iterator(thrust::make_tuple( + ends, thrust::device_pointer_cast(part_ids.get_data()))); + thrust::sort_by_key(thrust::device, starts, starts + num_ranges, zip_it); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 8b891dd3cb2..e7e37bfd9db 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index db7cd429ef9..b3bacf694f4 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #ifndef GINKGO_PARTITION_HELPERS_KERNELS_HPP #define GINKGO_PARTITION_HELPERS_KERNELS_HPP diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu index 57729516508..2687517ad7f 100644 --- a/cuda/distributed/partition_helpers_kernels.cu +++ b/cuda/distributed/partition_helpers_kernels.cu @@ -1,21 +1,52 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include "core/distributed/partition_helpers_kernels.hpp" +#include +#include +#include +#include + + namespace gko { namespace kernels { namespace cuda { namespace partition_helpers { -template -void sort_by_range_start(std::shared_ptr exec, - array& range_start_ends, - array& - part_ids) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); +#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" } // namespace partition_helpers diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index e8dbe8444d8..8aae72cd636 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -1,3 +1,41 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +// force-top: on +#include +#include +#include +// force-top: off + #include "core/distributed/partition_helpers_kernels.hpp" @@ -9,10 +47,21 @@ namespace partition_helpers { template -void sort_by_range_start(std::shared_ptr exec, - array& range_start_ends, - array& - part_ids) GKO_NOT_IMPLEMENTED; +void sort_by_range_start( + std::shared_ptr exec, + array& range_start_ends, + array& part_ids) +{ + auto policy = + oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + auto num_ranges = range_start_ends.get_num_elems() / 2; + auto starts = range_start_ends.get_data(); + auto ends = starts + num_ranges; + auto zip_it = + oneapi::dpl::make_zip_iterator(starts, ends, part_ids.get_data()); + std::sort(policy, zip_it, zip_it + num_ranges, + [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); }); +} GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp index 99f3b711794..d6239650979 100644 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ b/hip/distributed/partition_helpers_kernels.hip.cpp @@ -1,21 +1,51 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ #include "core/distributed/partition_helpers_kernels.hpp" +#include +#include +#include +#include + + namespace gko { namespace kernels { namespace hip { namespace partition_helpers { -template -void sort_by_range_start(std::shared_ptr exec, - array& range_start_ends, - array& - part_ids) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); +#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" } // namespace partition_helpers diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 3216782f2ac..9e42e8cc888 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index c8797682ef1..1319c5a3951 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include "core/distributed/partition_helpers_kernels.hpp" #include diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index eeefdd415a7..64fd1e49b77 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -40,16 +40,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/iterator_factory.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" using gko::experimental::distributed::comm_index_type; +template +using range_container = + std::pair, std::vector>; + // TODO: remove with c++17 -template -T clamp(const T&v, const T& lo, const T& hi){ +template +T clamp(const T& v, const T& lo, const T& hi) +{ return v < lo ? lo : (v > hi ? 
hi : v); } @@ -57,15 +63,15 @@ T clamp(const T&v, const T& lo, const T& hi){ template std::vector create_iota(IndexType min, IndexType max) { - std::vector iota(clamp(max - min, 0ul, max)); + std::vector iota( + clamp(max - min, static_cast(0), max)); std::iota(iota.begin(), iota.end(), min); return iota; } template -std::pair, std::vector> create_ranges( - gko::size_type num_ranges) +range_container create_ranges(gko::size_type num_ranges) { std::default_random_engine engine; std::uniform_int_distribution dist(5, 10); @@ -113,7 +119,7 @@ std::vector remove_indices(const std::vector& source, template gko::array concat_start_end( std::shared_ptr exec, - const std::pair, std::vector>& start_ends) + const range_container& start_ends) { gko::size_type num_ranges = start_ends.first.size(); gko::array concat(exec, num_ranges * 2); @@ -127,6 +133,25 @@ gko::array concat_start_end( } +template +std::pair, std::vector> +shuffle_range_and_pid(const range_container& ranges, + const std::vector& pid) +{ + std::default_random_engine engine; + + auto result = std::make_pair(ranges, pid); + + auto num_ranges = result.second.size(); + auto zip_it = gko::detail::make_zip_iterator( + result.first.first.begin(), + result.first.second.begin(), + result.second.begin()); + std::shuffle(zip_it, zip_it + num_ranges, engine); + + return result; +} + template class PartitionHelpers : public CommonTestFixture { protected: @@ -171,8 +196,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) { using index_type = typename TestFixture::index_type; - auto start_ends = concat_start_end( - this->ref,create_ranges(1)); + auto start_ends = concat_start_end(this->ref, create_ranges(1)); bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( @@ -193,3 +217,43 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) ASSERT_TRUE(result); } + + +TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges) +{ + using index_type = typename TestFixture::index_type; + auto start_ends = + concat_start_end(this->exec, create_ranges(100)); + auto part_ids = create_iota(0, 100); + auto part_ids_arr = gko::array( + this->exec, part_ids.begin(), part_ids.end()); + auto expected_start_ends = start_ends; + auto expected_part_ids = part_ids_arr; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + this->exec, start_ends, part_ids_arr); + + GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); + GKO_ASSERT_ARRAY_EQ(expected_part_ids, part_ids_arr); +} + + +TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges) +{ + using index_type = typename TestFixture::index_type; + auto ranges = create_ranges(100); + auto part_ids = create_iota(0, 100); + auto shuffled = shuffle_range_and_pid(ranges, part_ids); + auto expected_start_ends = concat_start_end(this->exec, ranges); + auto expected_part_ids = gko::array( + this->exec, part_ids.begin(), part_ids.end()); + auto start_ends = concat_start_end(this->exec, shuffled.first); + auto part_ids_arr = gko::array( + this->exec, shuffled.second.begin(), shuffled.second.end()); + + gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + this->exec, start_ends, part_ids_arr); + + GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); + GKO_ASSERT_ARRAY_EQ(expected_part_ids, part_ids_arr); +} diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index 506991c6e15..ef56b4d927b 100644 --- 
a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include #include From 6d0d3d52ef160c48b7b31bd48d1500a027dfe3c3 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 20 Dec 2022 11:22:10 +0100 Subject: [PATCH 205/583] adds creator from local sizes --- core/distributed/partition_helpers.cpp | 51 +++++++++++++++++-- .../core/distributed/partition_helpers.hpp | 23 ++++++++- test/mpi/partition_helpers.cpp | 36 ++++++++++--- 3 files changed, 97 insertions(+), 13 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 1f380f3631f..5e1a8b7fa7d 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -32,20 +32,33 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
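// build_partition_from_local_size (defined below) gathers every rank's local
// size with comm.all_gather, prefix-sums the gathered sizes on the host into
// global offsets, and passes those offsets to
// Partition::build_from_contiguous. A minimal usage sketch, assuming an
// executor `exec`, a communicator `comm`, and a per-rank `local_size`
// already exist:
//
//   auto part = gko::experimental::distributed::
//       build_partition_from_local_size<gko::int32, gko::int64>(
//           exec, comm, local_size);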
#include #include +#include #include "core/components/fill_array_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "core/distributed/partition_helpers_kernels.hpp" namespace gko { namespace experimental { namespace distributed { -namespace partition_helpers { +namespace components { namespace { GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array); +GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); + + +} // namespace +} // namespace components + + +namespace partition_helpers { +namespace { + + GKO_REGISTER_OPERATION(sort_by_range_start, partition_helpers::sort_by_range_start); GKO_REGISTER_OPERATION(check_consecutive_ranges, @@ -59,7 +72,7 @@ GKO_REGISTER_OPERATION(check_consecutive_ranges, template std::unique_ptr> build_partition_from_local_range(std::shared_ptr exec, - span local_range, mpi::communicator comm) + mpi::communicator comm, span local_range) { GlobalIndexType range[2] = {static_cast(local_range.begin), static_cast(local_range.end)}; @@ -78,8 +91,8 @@ build_partition_from_local_range(std::shared_ptr exec, // make_sort_by_range_start array part_ids(exec, comm.size()); - exec->run(partition_helpers::make_fill_seq_array(part_ids.get_data(), - part_ids.get_num_elems())); + exec->run(components::make_fill_seq_array(part_ids.get_data(), + part_ids.get_num_elems())); exec->run(partition_helpers::make_sort_by_range_start(ranges_start_end, part_ids)); @@ -105,11 +118,39 @@ build_partition_from_local_range(std::shared_ptr exec, _global_type) \ std::unique_ptr> \ build_partition_from_local_range(std::shared_ptr exec, \ - span local_range, mpi::communicator comm) + mpi::communicator comm, span local_range) GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE); +template +std::unique_ptr> +build_partition_from_local_size(std::shared_ptr exec, + mpi::communicator comm, size_type local_size) +{ + auto local_size_gi = static_cast(local_size); + std::vector sizes(comm.size()); + comm.all_gather(exec, &local_size_gi, 1, sizes.data(), 1); + + std::vector offsets(comm.size() + 1); + offsets[0] = 0; + std::partial_sum(sizes.begin(), sizes.end(), offsets.begin() + 1); + + auto ranges = + make_array_view(exec->get_master(), offsets.size(), offsets.data()); + return Partition::build_from_contiguous( + exec, ranges); +} + +#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_SIZE(_local_type, _global_type) \ + std::unique_ptr> \ + build_partition_from_local_size(std::shared_ptr exec, \ + mpi::communicator comm, \ + size_type local_range) +GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_SIZE); + + } // namespace distributed } // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 93b04af7f6c..8364759567e 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -56,7 +56,8 @@ class Partition; * Builds a partition from a local range. * * @param exec the Executor on which the partition should be built. - * @param local_range the start and end indices of the local range + * @param local_range the start and end indices of the local range. + * @param comm the communicator used to determine the global partition. * * @warning The local ranges have to be continuous and ascending. 
This means * that for a process `i` with `range[i] = [s_i, e_i)` then for process @@ -68,7 +69,25 @@ class Partition; template std::unique_ptr> build_partition_from_local_range(std::shared_ptr exec, - span local_range, mpi::communicator comm); + mpi::communicator comm, span local_range); + + +/** + * Builds a partition from a local size. + * + * @param exec the Executor on which the partition should be built. + * @param local_range the number of the locally owned indices + * @param comm the communicator used to determine the global partition. + * + * @return a Partition where each range has the specified local size. More + * specifically, if this is called on process i with local_size `s_i`, + * then the range `i` has size `s_i`, and range `r_i = [start, start + + * s_i)`, where `start = sum_j^(i-1) s_j`. + */ +template +std::unique_ptr> +build_partition_from_local_size(std::shared_ptr exec, + mpi::communicator comm, size_type local_size); } // namespace distributed diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index ef56b4d927b..72e01da9931 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -60,8 +60,8 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) auto part = gko::experimental::distributed::build_partition_from_local_range< - gko::int32, itype>(this->exec, local_range[this->comm.rank()], - this->comm); + gko::int32, itype>(this->exec, this->comm, + local_range[this->comm.rank()]); GKO_ASSERT_ARRAY_EQ( expects_ranges, @@ -73,6 +73,7 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) part->get_part_ids())); } + TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesUnsorted) { using itype = typename TestFixture::index_type; @@ -82,8 +83,8 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesUnsorted) auto part = gko::experimental::distributed::build_partition_from_local_range< - gko::int32, itype>(this->exec, local_range[this->comm.rank()], - this->comm); + gko::int32, itype>(this->exec, this->comm, + local_range[this->comm.rank()]); GKO_ASSERT_ARRAY_EQ( expects_ranges, @@ -100,12 +101,35 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesThrowsOnGap) { using itype = typename TestFixture::index_type; gko::span local_range[] = {{4u, 6u}, {9u, 11u}, {0u, 4u}}; + // Hack because of multiple template arguments in macro auto build_from_local_ranges = [](auto... 
args) { return gko::experimental::distributed::build_partition_from_local_range< gko::int32, itype>(args...); }; - ASSERT_THROW(build_from_local_ranges( - this->exec, local_range[this->comm.rank()], this->comm), + ASSERT_THROW(build_from_local_ranges(this->exec, this->comm, + local_range[this->comm.rank()]), gko::Error); } + + +TYPED_TEST(PartitionHelpers, CanBuildFromLocalSize) +{ + using itype = typename TestFixture::index_type; + gko::size_type local_range[] = {4, 5, 3}; + gko::array expects_ranges{this->exec, {0, 4, 9, 12}}; + gko::array expects_pid{this->exec, {0, 1, 2}}; + + auto part = gko::experimental::distributed::build_partition_from_local_size< + gko::int32, itype>(this->exec, this->comm, + local_range[this->comm.rank()]); + + GKO_ASSERT_ARRAY_EQ( + expects_ranges, + gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(), + part->get_range_bounds())); + GKO_ASSERT_ARRAY_EQ( + expects_pid, + gko::make_const_array_view(this->exec, expects_pid.get_num_elems(), + part->get_part_ids())); +} From 4213d363b6fdd8632cd7827c794caf9fea72d958 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 20 Dec 2022 11:58:29 +0100 Subject: [PATCH 206/583] review updates Co-authored-by: Tobias Ribizel --- core/distributed/partition_helpers.cpp | 6 +++--- test/mpi/partition_helpers.cpp | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 5e1a8b7fa7d..d1bc28a6fdd 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -48,7 +48,6 @@ namespace { GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array); -GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum); } // namespace @@ -74,8 +73,9 @@ std::unique_ptr> build_partition_from_local_range(std::shared_ptr exec, mpi::communicator comm, span local_range) { - GlobalIndexType range[2] = {static_cast(local_range.begin), - static_cast(local_range.end)}; + std::array range{ + static_cast(local_range.begin), + static_cast(local_range.end)}; // make all range_start_ends available on each rank auto mpi_exec = exec->get_master(); diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index 72e01da9931..dc9c63d28dd 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -47,7 +47,8 @@ class PartitionHelpers : public CommonMpiTestFixture { using index_type = IndexType; }; -TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); +TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes, + TypenameNameGenerator); TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) From 662a8e53734980a4b2fb20b6fd64a8d7ede4c1b3 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 12 Jan 2023 09:07:47 +0100 Subject: [PATCH 207/583] bump copyright --- common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc | 2 +- common/unified/distributed/partition_helpers_kernels.cpp | 2 +- core/distributed/partition_helpers.cpp | 2 +- core/distributed/partition_helpers_kernels.hpp | 2 +- cuda/distributed/partition_helpers_kernels.cu | 2 +- dpcpp/distributed/partition_helpers_kernels.dp.cpp | 2 +- hip/distributed/partition_helpers_kernels.hip.cpp | 2 +- include/ginkgo/core/distributed/partition_helpers.hpp | 2 +- omp/distributed/partition_helpers_kernels.cpp | 2 +- reference/distributed/partition_helpers_kernels.cpp | 2 +- reference/test/distributed/partition_helpers_kernels.cpp | 2 +- test/distributed/partition_helper_kernels.cpp | 2 +- test/mpi/partition_helpers.cpp 
| 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc index ca12b9a2bd1..17ac375c056 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index e7e37bfd9db..a40bda31de4 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index d1bc28a6fdd..fb9a1cea233 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index b3bacf694f4..22a946bfb8f 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu index 2687517ad7f..e37655e357e 100644 --- a/cuda/distributed/partition_helpers_kernels.cu +++ b/cuda/distributed/partition_helpers_kernels.cu @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 8aae72cd636..797b7b5e081 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp index d6239650979..d9ae663f93f 100644 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ b/hip/distributed/partition_helpers_kernels.hip.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 8364759567e..d9b2fee3d14 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 9e42e8cc888..03a46d93f3b 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 1319c5a3951..bff3e26a997 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 1d34a4fd530..abaab32903b 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 64fd1e49b77..441da3b8bd4 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index dc9c63d28dd..6f30761cbb0 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors +Copyright (c) 2017-2023, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without From 309459ea42acabfb24a231266c1f5ca9c6bfb257 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 12 Jan 2023 09:19:19 +0100 Subject: [PATCH 208/583] review updates: - documentation - make partition checks const - test fixes Co-authored-by: Gregor Olenik --- core/distributed/partition.cpp | 4 ++-- include/ginkgo/core/distributed/partition.hpp | 4 ++-- .../ginkgo/core/distributed/partition_helpers.hpp | 12 +++++++----- test/distributed/partition_helper_kernels.cpp | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 575ca83aba6..22f0fdb3d94 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -123,7 +123,7 @@ void Partition::finalize_construction() template -bool Partition::has_connected_parts() +bool Partition::has_connected_parts() const { return this->get_num_parts() - this->get_num_empty_parts() == this->get_num_ranges(); @@ -131,7 +131,7 @@ bool Partition::has_connected_parts() template -bool Partition::has_ordered_parts() +bool Partition::has_ordered_parts() const { if (this->has_connected_parts()) { auto exec = this->get_executor(); diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index fa8b2739400..a40f30f7137 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -231,7 +231,7 @@ class Partition * * @return true if each part has no more than one contiguous range. */ - bool has_connected_parts(); + bool has_connected_parts() const; /** * Checks if the ranges are ordered by their part index. @@ -240,7 +240,7 @@ class Partition * * @return true if the ranges are ordered by their part index. */ - bool has_ordered_parts(); + bool has_ordered_parts() const; /** * Builds a partition from a given mapping global_index -> part_id. diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index d9b2fee3d14..889347674c8 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -56,12 +56,14 @@ class Partition; * Builds a partition from a local range. * * @param exec the Executor on which the partition should be built. - * @param local_range the start and end indices of the local range. * @param comm the communicator used to determine the global partition. + * @param local_range the start and end indices of the local range. * - * @warning The local ranges have to be continuous and ascending. This means - * that for a process `i` with `range[i] = [s_i, e_i)` then for process - * `j = i+1` `range[j] = [s_j = e_i, e_j)`. + * @warning This throws, if the resulting partition would contain gaps. 
+ * That means that for a partition of size `n` every local range `r_i + * = [s_i, e_i)` either `s_i != 0` and another local range `r_j = + * [s_j, e_j = s_i)` exists, or `e_i != n` and another local range + * `r_j = [s_j = e_i, e_j)` exists. * * @return a Partition where each range has the individual local_start * and local_ends. @@ -76,8 +78,8 @@ build_partition_from_local_range(std::shared_ptr exec, * Builds a partition from a local size. * * @param exec the Executor on which the partition should be built. - * @param local_range the number of the locally owned indices * @param comm the communicator used to determine the global partition. + * @param local_range the number of the locally owned indices * * @return a Partition where each range has the specified local size. More * specifically, if this is called on process i with local_size `s_i`, diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 441da3b8bd4..3cc472cd3b6 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -64,7 +64,7 @@ template std::vector create_iota(IndexType min, IndexType max) { std::vector iota( - clamp(max - min, static_cast(0), max)); + clamp(max - min, IndexType(0), max)); std::iota(iota.begin(), iota.end(), min); return iota; } @@ -98,7 +98,7 @@ std::vector sample_unique(std::size_t min, std::size_t max, std::default_random_engine engine; auto values = create_iota(min, max); std::shuffle(values.begin(), values.end(), engine); - values.erase(values.begin() + clamp(n, 0ul, values.size()), values.end()); + values.erase(values.begin() + clamp(n, gko::size_type(0), values.size()), values.end()); return values; } From 71fde250ef4415b310a4cb999563a936bde3556b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 27 Jan 2023 09:53:40 +0100 Subject: [PATCH 209/583] safeguard against negative reduction size --- common/unified/distributed/partition_helpers_kernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index a40bda31de4..5ecff0516c4 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -50,7 +50,8 @@ void check_consecutive_ranges(std::shared_ptr exec, bool* result) { array result_uint32{exec, 1}; - auto num_ranges = range_start_ends.get_num_elems() / 2; + auto num_ranges = std::max(range_start_ends.get_num_elems() / 2, + static_cast(1)); run_kernel_reduction( exec, [] GKO_KERNEL(const auto i, const auto* starts, const auto* ends) { From 6fa9d0460f23bbe4cc756a5ad579861e257e7f02 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 27 Jan 2023 09:55:21 +0100 Subject: [PATCH 210/583] remove unused function --- include/ginkgo/core/base/mpi.hpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 40b38b55781..9699dea4942 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -87,10 +87,13 @@ inline constexpr bool is_gpu_aware() int map_rank_to_device_id(MPI_Comm comm, int num_devices); -#define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ - template <> \ - struct type_impl { \ - static MPI_Datatype get_type() { return mpi_type; } \ +#define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ + template <> \ + struct type_impl { \ + static 
MPI_Datatype get_type() \ + { \ + return mpi_type; \ + } \ } /** @@ -1013,18 +1016,6 @@ class communicator { this->get())); } - - void all_gather(std::shared_ptr exec, - const void* send_buffer, const int send_count, - MPI_Datatype send_type, void* recv_buffer, - const int recv_count, MPI_Datatype recv_type) const - { - auto guard = exec->get_scoped_device_id_guard(); - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Allgather(send_buffer, send_count, send_type, recv_buffer, - recv_count, recv_type, this->get())); - } - /** * (Non-blocking) Gather data onto all ranks from all ranks in the * communicator. From 2c0472f03e348ac34705acdeb4044ce74457abfe Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 27 Jan 2023 09:55:52 +0100 Subject: [PATCH 211/583] fixes partition documentation Co-authored-by: Pratik Nayak --- include/ginkgo/core/distributed/partition.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index a40f30f7137..c336470b923 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -83,11 +83,11 @@ namespace distributed { * ``` * starting_index[0] = 0, * starting_index[1] = 0, - * starting_index[2] = 3, // second range of part 1 + * starting_index[2] = 3, // second range of part 0 * starting_index[3] = 0, - * starting_index[4] = 5, // third range of part 1 + * starting_index[4] = 5, // third range of part 0 * ``` - * which you can use to iterate only over the the second range of part 1 (the + * which you can use to iterate only over the the second range of part 0 (the * third global range) with * ``` * for(int i = 0; i < r[3] - r[2]; ++i){ From 3cfe397d34bacb577a7ced0c7aa6215aae9c09e9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 9 Feb 2023 17:16:22 +0100 Subject: [PATCH 212/583] changes layout of gather ranges --- core/distributed/partition_helpers.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index fb9a1cea233..deda7f4299d 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -78,16 +78,9 @@ build_partition_from_local_range(std::shared_ptr exec, static_cast(local_range.end)}; // make all range_start_ends available on each rank - auto mpi_exec = exec->get_master(); - array ranges_start_end(mpi_exec, comm.size() * 2); + array ranges_start_end(exec, comm.size() * 2); ranges_start_end.fill(invalid_index()); - std::vector reqs; - reqs.push_back(comm.i_all_gather(mpi_exec, &range[0], 1, - ranges_start_end.get_data(), 1)); - reqs.push_back(comm.i_all_gather( - mpi_exec, &range[1], 1, ranges_start_end.get_data() + comm.size(), 1)); - mpi::wait_all(reqs); - ranges_start_end.set_executor(exec); + comm.all_gather(exec, range.data(), 2, ranges_start_end.get_data(), 2); // make_sort_by_range_start array part_ids(exec, comm.size()); From f4c88f645d4706ee5af3e0b853ac263e1df8a228 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 9 Feb 2023 17:16:36 +0100 Subject: [PATCH 213/583] adapts sorting to changed ranges layout --- .../partition_helpers_kernels.hpp.inc | 16 +- .../partition_helpers_kernels.dp.cpp | 8 +- omp/distributed/partition_helpers_kernels.cpp | 12 +- .../distributed/partition_helpers_kernels.cpp | 12 +- .../distributed/partition_helpers_kernels.cpp | 4 +- test/distributed/partition_helper_kernels.cpp | 170 +++++++++--------- 6 files changed, 115 
insertions(+), 107 deletions(-) diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc index 17ac375c056..54d9d142df2 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -37,11 +37,17 @@ void sort_by_range_start( array& part_ids) { auto num_ranges = range_start_ends.get_num_elems() / 2; - auto starts = thrust::device_pointer_cast(range_start_ends.get_data()); - auto ends = starts + num_ranges; - auto zip_it = thrust::make_zip_iterator(thrust::make_tuple( - ends, thrust::device_pointer_cast(part_ids.get_data()))); - thrust::sort_by_key(thrust::device, starts, starts + num_ranges, zip_it); + auto strided_indices = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __host__ __device__(const int i) { return 2 * i; }); + auto start_it = thrust::make_permutation_iterator( + range_start_ends.get_data(), strided_indices); + auto end_it = thrust::make_permutation_iterator( + range_start_ends.get_data() + 1, strided_indices); + auto zip_it = thrust::make_zip_iterator( + thrust::make_tuple(end_it, part_ids.get_data())); + thrust::sort_by_key(thrust::device, start_it, start_it + num_ranges, + zip_it); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 797b7b5e081..3c4d437a750 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -55,10 +55,12 @@ void sort_by_range_start( auto policy = oneapi::dpl::execution::make_device_policy(*exec->get_queue()); auto num_ranges = range_start_ends.get_num_elems() / 2; - auto starts = range_start_ends.get_data(); - auto ends = starts + num_ranges; + auto start_it = oneapi::dpl::make_permutation_iterator( + range_start_ends.get_data(), [](auto i) { return 2 * i; }); + auto end_it = oneapi::dpl::make_permutation_iterator( + range_start_ends.get_data(), [](auto i) { return 2 * i + 1; }); auto zip_it = - oneapi::dpl::make_zip_iterator(starts, ends, part_ids.get_data()); + oneapi::dpl::make_zip_iterator(start_it, end_it, part_ids.get_data()); std::sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); }); } diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 03a46d93f3b..093e8f1ff51 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -49,15 +49,17 @@ void sort_by_range_start( array& range_start_ends, array& part_ids) { + struct range { + GlobalIndexType idxs[2]; + }; + auto part_ids_d = part_ids.get_data(); auto num_parts = part_ids.get_num_elems(); - auto range_starts = range_start_ends.get_data(); - auto range_ends = range_starts + num_parts; - auto sort_it = - detail::make_zip_iterator(range_starts, range_ends, part_ids_d); + auto range_it = reinterpret_cast(range_start_ends.get_data()); + auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); // TODO: use TBB or parallel std with c++17 std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a) < std::get<0>(b); + return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; }); } diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index bff3e26a997..35eca44e49f 100644 
--- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -49,14 +49,16 @@ void sort_by_range_start( array& range_start_ends, array& part_ids) { + struct range { + GlobalIndexType idxs[2]; + }; + auto part_ids_d = part_ids.get_data(); auto num_parts = part_ids.get_num_elems(); - auto range_starts = range_start_ends.get_data(); - auto range_ends = range_starts + num_parts; - auto sort_it = - detail::make_zip_iterator(range_starts, range_ends, part_ids_d); + auto range_it = reinterpret_cast(range_start_ends.get_data()); + auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a) < std::get<0>(b); + return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; }); } diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index abaab32903b..a5fe119d14a 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -64,7 +64,7 @@ class PartitionHelpers : public ::testing::Test { std::shared_ptr ref; gko::array default_range_start_ends{ - this->ref, {0, 4, 7, 9, 4, 7, 9, 11}}; + this->ref, {0, 4, 4, 7, 7, 9, 9, 11}}; gko::array default_part_ids{this->ref, {0, 1, 2, 3}}; }; @@ -90,7 +90,7 @@ TYPED_TEST(PartitionHelpers, CanSortByRangeStart) { using global_index_type = typename TestFixture::global_index_type; gko::array range_start_ends{this->ref, - {7, 4, 0, 9, 9, 7, 4, 11}}; + {7, 9, 4, 7, 0, 4, 9, 11}}; gko::array result_part_ids{this->ref, {2, 1, 0, 3}}; auto part_ids = this->default_part_ids; diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 3cc472cd3b6..c50975c4615 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -63,16 +63,19 @@ T clamp(const T& v, const T& lo, const T& hi) template std::vector create_iota(IndexType min, IndexType max) { - std::vector iota( - clamp(max - min, IndexType(0), max)); + std::vector iota(clamp(max - min, IndexType(0), max)); std::iota(iota.begin(), iota.end(), min); return iota; } template -range_container create_ranges(gko::size_type num_ranges) +std::vector create_ranges(gko::size_type num_ranges) { + struct repeated_value { + repeated_value(IndexType i) : vals{i, i} {} + IndexType vals[2]; + }; std::default_random_engine engine; std::uniform_int_distribution dist(5, 10); std::vector range_sizes(num_ranges); @@ -83,12 +86,11 @@ range_container create_ranges(gko::size_type num_ranges) std::partial_sum(range_sizes.begin(), range_sizes.end(), range_offsets.begin() + 1); - std::vector range_starts(num_ranges); - std::vector range_ends(num_ranges); - std::copy_n(range_offsets.begin(), num_ranges, range_starts.begin()); - std::copy_n(range_offsets.begin() + 1, num_ranges, range_ends.begin()); - - return {std::move(range_starts), std::move(range_ends)}; + std::vector ranges(num_ranges * 2, 0); + auto ranges_it = reinterpret_cast(ranges.data() + 1); + std::copy(range_offsets.begin() + 1, range_offsets.end() - 1, ranges_it); + ranges.back() = range_offsets.back(); + return ranges; } @@ -98,7 +100,8 @@ std::vector sample_unique(std::size_t min, std::size_t max, std::default_random_engine engine; auto values = create_iota(min, max); std::shuffle(values.begin(), values.end(), engine); - values.erase(values.begin() + clamp(n, gko::size_type(0), 
values.size()), values.end()); + values.erase(values.begin() + clamp(n, gko::size_type(0), values.size()), + values.end()); return values; } @@ -117,36 +120,29 @@ std::vector remove_indices(const std::vector& source, template -gko::array concat_start_end( - std::shared_ptr exec, - const range_container& start_ends) +gko::array make_array(std::shared_ptr exec, + const std::vector& v) { - gko::size_type num_ranges = start_ends.first.size(); - gko::array concat(exec, num_ranges * 2); - - exec->copy_from(exec->get_master().get(), num_ranges, - start_ends.first.data(), concat.get_data()); - exec->copy_from(exec->get_master().get(), num_ranges, - start_ends.second.data(), concat.get_data() + num_ranges); - - return concat; + return gko::array(exec, v.begin(), v.end()); } template -std::pair, std::vector> -shuffle_range_and_pid(const range_container& ranges, +std::pair, std::vector> +shuffle_range_and_pid(const std::vector& ranges, const std::vector& pid) { + struct range { + IndexType vals[2]; + }; + std::default_random_engine engine; auto result = std::make_pair(ranges, pid); auto num_ranges = result.second.size(); auto zip_it = gko::detail::make_zip_iterator( - result.first.first.begin(), - result.first.second.begin(), - result.second.begin()); + reinterpret_cast(result.first.data()), result.second.begin()); std::shuffle(zip_it, zip_it + num_ranges, engine); return result; @@ -161,69 +157,69 @@ class PartitionHelpers : public CommonTestFixture { TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); -TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) -{ - using index_type = typename TestFixture::index_type; - auto start_ends = - concat_start_end(this->exec, create_ranges(100)); - bool result = false; - - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); - - ASSERT_TRUE(result); -} - - -TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) -{ - using index_type = typename TestFixture::index_type; - auto full_range_ends = create_ranges(100); - auto removal_idxs = sample_unique(0, full_range_ends.first.size(), 4); - auto start_ends = concat_start_end( - this->exec, - std::make_pair(remove_indices(full_range_ends.first, removal_idxs), - remove_indices(full_range_ends.second, removal_idxs))); - bool result = true; - - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); - - ASSERT_FALSE(result); -} - - -TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) -{ - using index_type = typename TestFixture::index_type; - auto start_ends = concat_start_end(this->ref, create_ranges(1)); - bool result = false; - - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); - - ASSERT_TRUE(result); -} - - -TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) -{ - using index_type = typename TestFixture::index_type; - auto start_ends = gko::array(this->exec, {1}); - bool result = false; - - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); - - ASSERT_TRUE(result); -} +// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) +//{ +// using index_type = typename TestFixture::index_type; +// auto offsets = +// make_array(this->exec, create_ranges(100)); +// bool result = false; +// +// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( +// this->exec, offsets, &result); +// +// ASSERT_TRUE(result); +//} +// 
+// +// TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) +//{ +// using index_type = typename TestFixture::index_type; +// auto full_range_ends = create_ranges(100); +// auto removal_idxs = sample_unique(0, full_range_ends.size(), 4); +// auto start_ends = make_array( +// this->exec, +// std::make_pair(remove_indices(full_range_ends.first, removal_idxs), +// remove_indices(full_range_ends.second, removal_idxs))); +// bool result = true; +// +// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( +// this->exec, start_ends, &result); +// +// ASSERT_FALSE(result); +//} +// +// +// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) +//{ +// using index_type = typename TestFixture::index_type; +// auto start_ends = +// make_array(this->ref, create_ranges(1)); +// bool result = false; +// +// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( +// this->exec, start_ends, &result); +// +// ASSERT_TRUE(result); +//} +// +// +// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) +//{ +// using index_type = typename TestFixture::index_type; +// auto start_ends = gko::array(this->exec, {1}); +// bool result = false; +// +// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( +// this->exec, start_ends, &result); +// +// ASSERT_TRUE(result); +//} TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges) { using index_type = typename TestFixture::index_type; - auto start_ends = - concat_start_end(this->exec, create_ranges(100)); + auto start_ends = make_array(this->exec, create_ranges(100)); auto part_ids = create_iota(0, 100); auto part_ids_arr = gko::array( this->exec, part_ids.begin(), part_ids.end()); @@ -244,10 +240,10 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges) auto ranges = create_ranges(100); auto part_ids = create_iota(0, 100); auto shuffled = shuffle_range_and_pid(ranges, part_ids); - auto expected_start_ends = concat_start_end(this->exec, ranges); + auto expected_start_ends = make_array(this->exec, ranges); auto expected_part_ids = gko::array( this->exec, part_ids.begin(), part_ids.end()); - auto start_ends = concat_start_end(this->exec, shuffled.first); + auto start_ends = make_array(this->exec, shuffled.first); auto part_ids_arr = gko::array( this->exec, shuffled.second.begin(), shuffled.second.end()); From bb5f7ae638e02dec05f682a1f118fcc907a381eb Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 10 Feb 2023 09:45:01 +0100 Subject: [PATCH 214/583] adapts consecutive check to changed ranges layout --- .../distributed/partition_helpers_kernels.cpp | 35 +++--- .../distributed/partition_helpers_kernels.cpp | 18 +-- .../distributed/partition_helpers_kernels.cpp | 2 +- test/distributed/partition_helper_kernels.cpp | 113 +++++++++--------- 4 files changed, 86 insertions(+), 82 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 5ecff0516c4..795be471175 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -50,21 +50,26 @@ void check_consecutive_ranges(std::shared_ptr exec, bool* result) { array result_uint32{exec, 1}; - auto num_ranges = std::max(range_start_ends.get_num_elems() / 2, - static_cast(1)); - run_kernel_reduction( - exec, - [] GKO_KERNEL(const auto i, const auto* starts, const auto* ends) { - return starts[i + 1] == ends[i]; - }, - [] GKO_KERNEL(const auto a, const auto b) 
{ - return static_cast(a && b); - }, - [] GKO_KERNEL(auto x) { return x; }, static_cast(true), - result_uint32.get_data(), num_ranges - 1, range_start_ends.get_data(), - range_start_ends.get_data() + num_ranges); - *result = - static_cast(exec->copy_val_to_host(result_uint32.get_data())); + auto num_ranges = range_start_ends.get_num_elems() / 2; + // need additional guard because DPCPP doesn't return the initial value for + // empty inputs + if (num_ranges > 1) { + run_kernel_reduction( + exec, + [] GKO_KERNEL(const auto i, const auto* ranges) { + return ranges[2 * i] == ranges[2 * i + 1]; + }, + [] GKO_KERNEL(const auto a, const auto b) { + return static_cast(a && b); + }, + [] GKO_KERNEL(auto x) { return x; }, static_cast(true), + result_uint32.get_data(), num_ranges - 1, + range_start_ends.get_data() + 1); + *result = + static_cast(exec->copy_val_to_host(result_uint32.get_data())); + } else { + *result = true; + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 35eca44e49f..741f676df05 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -71,17 +71,19 @@ void check_consecutive_ranges(std::shared_ptr exec, array& range_start_ends, bool* result) { + struct end_start { + GlobalIndexType end; + GlobalIndexType start; + }; + auto num_parts = range_start_ends.get_num_elems() / 2; - auto range_starts = range_start_ends.get_data(); - auto range_ends = range_starts + num_parts; - auto combined_it = detail::make_zip_iterator(range_starts + 1, range_ends); + auto range_it = + reinterpret_cast(range_start_ends.get_data() + 1); if (num_parts) { - *result = std::all_of(combined_it, combined_it + (num_parts - 1), - [](const auto& start_end) { - return std::get<0>(start_end) == - std::get<1>(start_end); - }); + *result = + std::all_of(range_it, range_it + num_parts - 1, + [](const end_start& r) { return r.end == r.start; }); } else { *result = true; } diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index a5fe119d14a..5617883f30a 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -119,7 +119,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) { using global_index_type = typename TestFixture::global_index_type; gko::array range_start_ends{this->ref, - {7, 4, 0, 9, 9, 7, 4, 11}}; + {7, 9, 4, 7, 0, 4, 9, 11}}; bool result = true; gko::kernels::reference::partition_helpers::check_consecutive_ranges( diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index c50975c4615..53310e76b58 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -113,7 +113,7 @@ std::vector remove_indices(const std::vector& source, std::sort(idxs.begin(), idxs.end(), std::greater<>{}); auto result = source; for (auto idx : idxs) { - result.erase(result.begin() + idx); + result.erase(result.begin() + 2 * idx, result.begin() + 2 * idx + 1); } return result; } @@ -157,63 +157,60 @@ class PartitionHelpers : public CommonTestFixture { TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes); -// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) -//{ -// using index_type = typename TestFixture::index_type; -// auto offsets = -// make_array(this->exec, 
create_ranges(100)); -// bool result = false; -// -// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( -// this->exec, offsets, &result); -// -// ASSERT_TRUE(result); -//} -// -// -// TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) -//{ -// using index_type = typename TestFixture::index_type; -// auto full_range_ends = create_ranges(100); -// auto removal_idxs = sample_unique(0, full_range_ends.size(), 4); -// auto start_ends = make_array( -// this->exec, -// std::make_pair(remove_indices(full_range_ends.first, removal_idxs), -// remove_indices(full_range_ends.second, removal_idxs))); -// bool result = true; -// -// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( -// this->exec, start_ends, &result); -// -// ASSERT_FALSE(result); -//} -// -// -// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) -//{ -// using index_type = typename TestFixture::index_type; -// auto start_ends = -// make_array(this->ref, create_ranges(1)); -// bool result = false; -// -// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( -// this->exec, start_ends, &result); -// -// ASSERT_TRUE(result); -//} -// -// -// TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) -//{ -// using index_type = typename TestFixture::index_type; -// auto start_ends = gko::array(this->exec, {1}); -// bool result = false; -// -// gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( -// this->exec, start_ends, &result); -// -// ASSERT_TRUE(result); -//} +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) +{ + using index_type = typename TestFixture::index_type; + auto offsets = make_array(this->exec, create_ranges(100)); + bool result = false; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, offsets, &result); + + ASSERT_TRUE(result); +} + + +TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) +{ + using index_type = typename TestFixture::index_type; + auto full_range_ends = create_ranges(100); + auto removal_idxs = sample_unique(0, full_range_ends.size() / 2, 4); + auto start_ends = + make_array(this->exec, remove_indices(full_range_ends, removal_idxs)); + bool result = true; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); + + ASSERT_FALSE(result); +} + + + TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) +{ + using index_type = typename TestFixture::index_type; + auto start_ends = + make_array(this->ref, create_ranges(1)); + bool result = false; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); + + ASSERT_TRUE(result); +} + + + TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) +{ + using index_type = typename TestFixture::index_type; + auto start_ends = gko::array(this->exec, {1}); + bool result = false; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( + this->exec, start_ends, &result); + + ASSERT_TRUE(result); +} TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges) From 030c75ba18edc6bb178459b182b7fca0d88dbd08 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 10 Feb 2023 10:20:45 +0100 Subject: [PATCH 215/583] adds kernel to compress ranges --- .../distributed/partition_helpers_kernels.cpp | 22 +++++++++ core/device_hooks/common_kernels.inc.cpp | 1 + core/distributed/partition_helpers.cpp | 6 +-- 
.../distributed/partition_helpers_kernels.hpp | 18 +++++--- .../distributed/partition_helpers_kernels.cpp | 16 +++++++ .../distributed/partition_helpers_kernels.cpp | 15 +++++++ test/distributed/partition_helper_kernels.cpp | 45 +++++++++++++++---- 7 files changed, 106 insertions(+), 17 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index 795be471175..cfae171844f 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -75,6 +75,28 @@ void check_consecutive_ranges(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); + +template +void compress_ranges(std::shared_ptr exec, + const array& range_start_ends, + array& range_offsets) +{ + run_kernel( + exec, + [] GKO_KERNEL(const auto i, const auto* start_ends, auto* offsets) { + if (i == 0) { + offsets[0] = start_ends[0]; + } + offsets[i + 1] = start_ends[2 * i + 1]; + }, + range_offsets.get_num_elems() - 1, range_start_ends.get_const_data(), + range_offsets.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES); + + } // namespace partition_helpers } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 51bcf9d9587..c8bbd2e0a31 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -261,6 +261,7 @@ namespace partition_helpers { GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES); } // namespace partition_helpers diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index deda7f4299d..b57c3e5be53 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -62,6 +62,7 @@ GKO_REGISTER_OPERATION(sort_by_range_start, partition_helpers::sort_by_range_start); GKO_REGISTER_OPERATION(check_consecutive_ranges, partition_helpers::check_consecutive_ranges); +GKO_REGISTER_OPERATION(compress_ranges, partition_helpers::compress_ranges); } // namespace @@ -99,9 +100,8 @@ build_partition_from_local_range(std::shared_ptr exec, // remove duplicates array ranges(exec, comm.size() + 1); - exec->copy(1, ranges_start_end.get_data(), ranges.get_data()); - exec->copy(comm.size(), ranges_start_end.get_data() + comm.size(), - ranges.get_data() + 1); + exec->run( + partition_helpers::make_compress_ranges(ranges_start_end, ranges)); return Partition::build_from_contiguous( exec, ranges, part_ids); diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 22a946bfb8f..80e22699b43 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -57,11 +57,19 @@ namespace kernels { bool* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType); \ - template \ - GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(GlobalIndexType) +#define GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES(_type) \ + void compress_ranges(std::shared_ptr exec, \ + const array<_type>& range_start_ends, \ + array<_type>& range_offsets) + + +#define 
GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES(GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition_helpers, diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 741f676df05..08e1c5a49c2 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -93,6 +93,22 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); +template +void compress_ranges(std::shared_ptr exec, + const array& range_start_ends, + array& range_offsets) +{ + range_offsets.get_data()[0] = range_start_ends.get_const_data()[0]; + for (int i = 0; i < range_offsets.get_num_elems() - 1; ++i) { + range_offsets.get_data()[i + 1] = + range_start_ends.get_const_data()[2 * i + 1]; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES); + + } // namespace partition_helpers } // namespace reference } // namespace kernels diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 5617883f30a..9b339fd926f 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -129,4 +129,19 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) } +TYPED_TEST(PartitionHelpers, CanCompressRanges) +{ + using itype = typename TestFixture::global_index_type; + auto range_start_ends = this->default_range_start_ends; + gko::array range_offsets{this->ref, + range_start_ends.get_num_elems() / 2 + 1}; + gko::array expected_range_offsets{this->ref, {0, 4, 7, 9, 11}}; + + gko::kernels::reference::partition_helpers::compress_ranges( + this->ref, range_start_ends, range_offsets); + + GKO_ASSERT_ARRAY_EQ(range_offsets, expected_range_offsets); +} + + } // namespace diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 53310e76b58..fdfeb553ae1 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -47,10 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
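// --- Editorial sketch, not part of the patch above ---
// A minimal host-side illustration of what the new compress_ranges kernel
// computes: collapsing the interleaved [s_0, e_0, s_1, e_1, ...] pairs into
// the offsets array [s_0, e_0, e_1, ...], mirroring the logic of the
// reference kernel shown earlier. The helper name compress_ranges_host and
// the use of std::vector/long are assumptions for the sketch only; the
// sample values follow the reference test (CanCompressRanges).
#include <cassert>
#include <vector>

std::vector<long> compress_ranges_host(const std::vector<long>& start_ends)
{
    std::vector<long> offsets(start_ends.size() / 2 + 1);
    // first offset is the start of the first range,
    // every further offset is the end of the preceding range
    offsets[0] = start_ends[0];
    for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
        offsets[i + 1] = start_ends[2 * i + 1];
    }
    return offsets;
}

int main()
{
    // four consecutive ranges [0,4), [4,7), [7,9), [9,11)
    std::vector<long> start_ends{0, 4, 4, 7, 7, 9, 9, 11};
    assert((compress_ranges_host(start_ends) ==
            std::vector<long>{0, 4, 7, 9, 11}));
}
// --- end of editorial sketch ---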
using gko::experimental::distributed::comm_index_type; -template -using range_container = - std::pair, std::vector>; - // TODO: remove with c++17 template @@ -70,12 +66,8 @@ std::vector create_iota(IndexType min, IndexType max) template -std::vector create_ranges(gko::size_type num_ranges) +std::vector create_range_offsets(gko::size_type num_ranges) { - struct repeated_value { - repeated_value(IndexType i) : vals{i, i} {} - IndexType vals[2]; - }; std::default_random_engine engine; std::uniform_int_distribution dist(5, 10); std::vector range_sizes(num_ranges); @@ -85,7 +77,19 @@ std::vector create_ranges(gko::size_type num_ranges) std::vector range_offsets(num_ranges + 1, 0); std::partial_sum(range_sizes.begin(), range_sizes.end(), range_offsets.begin() + 1); + return range_offsets; +} + +template +std::vector create_ranges( + const std::vector& range_offsets) +{ + struct repeated_value { + repeated_value(IndexType i) : vals{i, i} {} + IndexType vals[2]; + }; + gko::size_type num_ranges = range_offsets.size() - 1; std::vector ranges(num_ranges * 2, 0); auto ranges_it = reinterpret_cast(ranges.data() + 1); std::copy(range_offsets.begin() + 1, range_offsets.end() - 1, ranges_it); @@ -94,6 +98,15 @@ std::vector create_ranges(gko::size_type num_ranges) } +template +std::vector create_ranges(gko::size_type num_ranges) +{ + auto range_offsets = create_range_offsets(num_ranges); + + return create_ranges(range_offsets); +} + + std::vector sample_unique(std::size_t min, std::size_t max, gko::size_type n) { @@ -250,3 +263,17 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges) GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); GKO_ASSERT_ARRAY_EQ(expected_part_ids, part_ids_arr); } + + +TYPED_TEST(PartitionHelpers, CanCompressRanges) +{ + using index_type = typename TestFixture::index_type; + auto expected_offsets = create_range_offsets(100); + auto ranges = make_array(this->exec, create_ranges(expected_offsets)); + gko::array offsets{this->exec, expected_offsets.size()}; + + gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges( + this->exec, ranges, offsets); + + GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets)); +} From 6f1e90e2a7f4836dea02fc74b7a1ef6d59f0dd92 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 10 Feb 2023 10:29:47 +0100 Subject: [PATCH 216/583] review updates: - constness - documentation Co-authored-by: Tobias Ribizel --- common/unified/distributed/partition_helpers_kernels.cpp | 4 ++-- core/distributed/partition_helpers.cpp | 2 +- core/distributed/partition_helpers_kernels.hpp | 2 +- include/ginkgo/core/distributed/partition.hpp | 2 +- reference/distributed/partition_helpers_kernels.cpp | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index cfae171844f..e5565819021 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -46,7 +46,7 @@ namespace partition_helpers { template void check_consecutive_ranges(std::shared_ptr exec, - array& range_start_ends, + const array& range_start_ends, bool* result) { array result_uint32{exec, 1}; @@ -64,7 +64,7 @@ void check_consecutive_ranges(std::shared_ptr exec, }, [] GKO_KERNEL(auto x) { return x; }, static_cast(true), result_uint32.get_data(), num_ranges - 1, - range_start_ends.get_data() + 1); + range_start_ends.get_const_data() + 1); *result = 
static_cast(exec->copy_val_to_host(result_uint32.get_data())); } else { diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index b57c3e5be53..acc4d535519 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -98,7 +98,7 @@ build_partition_from_local_range(std::shared_ptr exec, throw Error(__FILE__, __LINE__, "The partition contains gaps."); } - // remove duplicates + // join (now consecutive) starts and ends into combined array array ranges(exec, comm.size() + 1); exec->run( partition_helpers::make_compress_ranges(ranges_start_end, ranges)); diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 80e22699b43..6d55926db76 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -53,7 +53,7 @@ namespace kernels { #define GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(_type) \ void check_consecutive_ranges(std::shared_ptr exec, \ - array<_type>& range_start_ends, \ + const array<_type>& range_start_ends, \ bool* result) diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index c336470b923..bb36528a4a8 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -260,7 +260,7 @@ class Partition * * @param exec the Executor on which the partition should be built * @param ranges the boundaries of the ranges representing each part. - * Part parti_id[i] contains the indices + * Part part_id[i] contains the indices * [ranges[i], ranges[i + 1]). Has to contain at least * one element. The first element has to be 0. * @param part_ids the part ids of the provided ranges. 
If empty, then diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 08e1c5a49c2..989ac1cddb0 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -68,7 +68,7 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( template void check_consecutive_ranges(std::shared_ptr exec, - array& range_start_ends, + const array& range_start_ends, bool* result) { struct end_start { @@ -77,8 +77,8 @@ void check_consecutive_ranges(std::shared_ptr exec, }; auto num_parts = range_start_ends.get_num_elems() / 2; - auto range_it = - reinterpret_cast(range_start_ends.get_data() + 1); + auto range_it = reinterpret_cast( + range_start_ends.get_const_data() + 1); if (num_parts) { *result = From 0cda7efa3783fdc60f44c7dfdf6487957fa889ab Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 21 Feb 2023 11:50:36 +0100 Subject: [PATCH 217/583] review update: - use stable sort Co-authored-by: Gregor Olenik --- .../cuda_hip/distributed/partition_helpers_kernels.hpp.inc | 4 ++-- dpcpp/distributed/partition_helpers_kernels.dp.cpp | 5 +++-- omp/distributed/partition_helpers_kernels.cpp | 7 ++++--- reference/distributed/partition_helpers_kernels.cpp | 7 ++++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc index 54d9d142df2..e3e8335dd22 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -46,8 +46,8 @@ void sort_by_range_start( range_start_ends.get_data() + 1, strided_indices); auto zip_it = thrust::make_zip_iterator( thrust::make_tuple(end_it, part_ids.get_data())); - thrust::sort_by_key(thrust::device, start_it, start_it + num_ranges, - zip_it); + thrust::stable_sort_by_key(thrust::device, start_it, start_it + num_ranges, + zip_it); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 3c4d437a750..b9823e1df9f 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -61,8 +61,9 @@ void sort_by_range_start( range_start_ends.get_data(), [](auto i) { return 2 * i + 1; }); auto zip_it = oneapi::dpl::make_zip_iterator(start_it, end_it, part_ids.get_data()); - std::sort(policy, zip_it, zip_it + num_ranges, - [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); }); + std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) { + return std::get<0>(a) < std::get<0>(b); + }); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 093e8f1ff51..5fc55862b08 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -58,9 +58,10 @@ void sort_by_range_start( auto range_it = reinterpret_cast(range_start_ends.get_data()); auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); // TODO: use TBB or parallel std with c++17 - std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; - }); + std::stable_sort(sort_it, sort_it + num_parts, + [](const auto& a, const auto& b) { + return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; + }); } 
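// --- Editorial sketch, not part of the patch above ---
// A self-contained illustration of the sort step on the interleaved range
// layout: the [start, end) pairs are ordered by their start value and the
// part ids are permuted along with them, as the zip/permute-iterator based
// kernels above do. The helper name sort_by_range_start_host and the use of
// plain std::vector are assumptions for the sketch only; the sample data
// follows the reference test (CanSortByRangeStart).
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

void sort_by_range_start_host(std::vector<long>& start_ends,
                              std::vector<int>& part_ids)
{
    auto num_ranges = part_ids.size();
    // stable sort of an index permutation by the range starts
    std::vector<std::size_t> order(num_ranges);
    std::iota(order.begin(), order.end(), 0);
    std::stable_sort(order.begin(), order.end(),
                     [&](std::size_t a, std::size_t b) {
                         return start_ends[2 * a] < start_ends[2 * b];
                     });
    // apply the permutation to both the pairs and the part ids
    std::vector<long> sorted_ranges(start_ends.size());
    std::vector<int> sorted_ids(num_ranges);
    for (std::size_t i = 0; i < num_ranges; ++i) {
        sorted_ranges[2 * i] = start_ends[2 * order[i]];
        sorted_ranges[2 * i + 1] = start_ends[2 * order[i] + 1];
        sorted_ids[i] = part_ids[order[i]];
    }
    start_ends = std::move(sorted_ranges);
    part_ids = std::move(sorted_ids);
}

int main()
{
    std::vector<long> start_ends{7, 9, 4, 7, 0, 4, 9, 11};
    std::vector<int> part_ids{0, 1, 2, 3};
    sort_by_range_start_host(start_ends, part_ids);
    assert((start_ends == std::vector<long>{0, 4, 4, 7, 7, 9, 9, 11}));
    assert((part_ids == std::vector<int>{2, 1, 0, 3}));
}
// --- end of editorial sketch ---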
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 989ac1cddb0..7f7dfce756c 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -57,9 +57,10 @@ void sort_by_range_start( auto num_parts = part_ids.get_num_elems(); auto range_it = reinterpret_cast(range_start_ends.get_data()); auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); - std::sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; - }); + std::stable_sort(sort_it, sort_it + num_parts, + [](const auto& a, const auto& b) { + return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; + }); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( From 862e021321a9ae9a0a28c6f720c8b00e8e348f48 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 21 Feb 2023 12:05:35 +0100 Subject: [PATCH 218/583] fixing dpcpp --- .../partition_helpers_kernels.dp.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index b9823e1df9f..6362c243d95 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -45,6 +45,20 @@ namespace kernels { namespace dpcpp { namespace partition_helpers { +struct stride { + // Some version requires [] while some requires (), so I added both + template + Index operator[](const Index& i) const + { + return i * 2; + } + + template + Index operator()(const Index& i) const + { + return operator[](i); + } +}; template void sort_by_range_start( @@ -55,10 +69,11 @@ void sort_by_range_start( auto policy = oneapi::dpl::execution::make_device_policy(*exec->get_queue()); auto num_ranges = range_start_ends.get_num_elems() / 2; + auto start_it = oneapi::dpl::make_permutation_iterator( - range_start_ends.get_data(), [](auto i) { return 2 * i; }); + range_start_ends.get_data(), stride{}); auto end_it = oneapi::dpl::make_permutation_iterator( - range_start_ends.get_data(), [](auto i) { return 2 * i + 1; }); + range_start_ends.get_data() + 1, stride{}); auto zip_it = oneapi::dpl::make_zip_iterator(start_it, end_it, part_ids.get_data()); std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) { From cc702fcb3048b5d67e2adeae142901ee3762832c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 22 Feb 2023 10:51:31 +0100 Subject: [PATCH 219/583] don't mix host and device buffers for MPI --- core/distributed/partition_helpers.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index acc4d535519..b906c0b6e42 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -79,9 +79,13 @@ build_partition_from_local_range(std::shared_ptr exec, static_cast(local_range.end)}; // make all range_start_ends available on each rank - array ranges_start_end(exec, comm.size() * 2); + // note: not all combination of MPI + GPU library seem to support + // mixing host and device buffers, e.g. 
OpenMPI 4.0.5 and Rocm 4.0 + auto mpi_exec = exec->get_master(); + array ranges_start_end(mpi_exec, comm.size() * 2); ranges_start_end.fill(invalid_index()); - comm.all_gather(exec, range.data(), 2, ranges_start_end.get_data(), 2); + comm.all_gather(mpi_exec, range.data(), 2, ranges_start_end.get_data(), 2); + ranges_start_end.set_executor(exec); // make_sort_by_range_start array part_ids(exec, comm.size()); From d8642b9a5823e0996bec17e4c28bfaa7f1a52003 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 19 Apr 2023 09:18:51 +0200 Subject: [PATCH 220/583] adds permutation iterator Co-authored-by: Tobias Ribizel --- core/base/iterator_factory.hpp | 124 +++++++++++++++++++ core/test/base/iterator_factory.cpp | 181 +++++++++++++++++++++++++--- 2 files changed, 291 insertions(+), 14 deletions(-) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 7ebbc510f74..29aa99a4f86 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -366,6 +366,130 @@ void swap(zip_iterator_reference a, } +template +class permute_iterator { +public: + using difference_type = std::ptrdiff_t; + using value_type = typename std::iterator_traits::value_type; + using pointer = typename std::iterator_traits::pointer; + using reference = typename std::iterator_traits::reference; + using iterator_category = std::random_access_iterator_tag; + + explicit permute_iterator() = default; + + explicit permute_iterator(IteratorType it, PermuteFn perm) + : it_{std::move(it)}, idx_{}, perm_{std::move(perm)} + {} + + permute_iterator& operator=(permute_iterator other) + { + it_ = other.it_; + idx_ = other.idx_; + // no perm_ = other.perm_ because lambdas are not copy-assignable + return *this; + } + + permute_iterator& operator+=(difference_type i) + { + idx_ += i; + return *this; + } + + permute_iterator& operator-=(difference_type i) { return *this += -i; } + + permute_iterator& operator++() { return *this += 1; } + + permute_iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + + permute_iterator& operator--() { return *this -= 1; } + + permute_iterator operator--(int) + { + auto tmp = *this; + --(*this); + return tmp; + } + + permute_iterator operator+(difference_type i) const + { + auto tmp = *this; + tmp += i; + return tmp; + } + + friend permute_iterator operator+(difference_type i, + const permute_iterator& iter) + { + return iter + i; + } + + permute_iterator operator-(difference_type i) const + { + auto tmp = *this; + tmp -= i; + return tmp; + } + + difference_type operator-(const permute_iterator& other) const + { + return idx_ - other.idx_; + } + + reference operator*() const { return it_[perm_(idx_)]; } + + reference operator[](difference_type i) const { return *(*this + i); } + + bool operator==(const permute_iterator& other) const + { + return idx_ == other.idx_; + } + + bool operator!=(const permute_iterator& other) const + { + return !(*this == other); + } + + bool operator<(const permute_iterator& other) const + { + return idx_ < other.idx_; + } + + bool operator<=(const permute_iterator& other) const + { + return idx_ <= other.idx_; + } + + bool operator>(const permute_iterator& other) const + { + return !(*this <= other); + } + + bool operator>=(const permute_iterator& other) const + { + return !(*this < other); + } + +private: + IteratorType it_; + difference_type idx_; + PermuteFn perm_; +}; + + +template +permute_iterator make_permute_iterator( + IteratorType it, PermutationFn perm) +{ + return permute_iterator{std::move(it), 
+ std::move(perm)}; +} + + } // namespace detail } // namespace gko diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index 68ed87e07cb..f41181f0c10 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -67,13 +67,13 @@ namespace { template -class IteratorFactory : public ::testing::Test { +class ZipIterator : public ::testing::Test { protected: using value_type = typename std::tuple_element<0, decltype(ValueIndexType())>::type; using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; - IteratorFactory() + ZipIterator() : reversed_index{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2}, ordered_index{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100}, reversed_value{15., 14., 13., 12., 11., 10., 9., 7., @@ -109,11 +109,11 @@ class IteratorFactory : public ::testing::Test { const std::vector ordered_value; }; -TYPED_TEST_SUITE(IteratorFactory, gko::test::ValueIndexTypes, +TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypes, PairTypenameNameGenerator); -TYPED_TEST(IteratorFactory, EmptyIterator) +TYPED_TEST(ZipIterator, EmptyIterator) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -125,7 +125,7 @@ TYPED_TEST(IteratorFactory, EmptyIterator) } -TYPED_TEST(IteratorFactory, SortingReversedWithIterator) +TYPED_TEST(ZipIterator, SortingReversedWithIterator) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -140,7 +140,7 @@ TYPED_TEST(IteratorFactory, SortingReversedWithIterator) } -TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator) +TYPED_TEST(ZipIterator, SortingAlreadySortedWithIterator) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -155,7 +155,7 @@ TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator) } -TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller) +TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -170,7 +170,7 @@ TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller) } -TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2) +TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller2) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -185,7 +185,7 @@ TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2) } -TYPED_TEST(IteratorFactory, IncreasingIterator) +TYPED_TEST(ZipIterator, IncreasingIterator) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -262,7 +262,7 @@ bool check_assertion_exit_code(int exit_code) } -TYPED_TEST(IteratorFactory, IncompatibleIteratorDeathTest) +TYPED_TEST(ZipIterator, IncompatibleIteratorDeathTest) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -286,7 +286,7 @@ TYPED_TEST(IteratorFactory, IncompatibleIteratorDeathTest) #endif -TYPED_TEST(IteratorFactory, DecreasingIterator) +TYPED_TEST(ZipIterator, DecreasingIterator) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -316,7 +316,7 @@ TYPED_TEST(IteratorFactory, DecreasingIterator) } -TYPED_TEST(IteratorFactory, CorrectDereferencing) +TYPED_TEST(ZipIterator, CorrectDereferencing) { using index_type_it = 
typename TestFixture::index_type; using value_type_it = typename TestFixture::value_type; @@ -337,7 +337,7 @@ TYPED_TEST(IteratorFactory, CorrectDereferencing) } -TYPED_TEST(IteratorFactory, CorrectSwapping) +TYPED_TEST(ZipIterator, CorrectSwapping) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -361,7 +361,7 @@ TYPED_TEST(IteratorFactory, CorrectSwapping) } -TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping) +TYPED_TEST(ZipIterator, CorrectHandWrittenSwapping) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; @@ -388,4 +388,157 @@ TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping) } +template +class PermuteIterator : public ::testing::Test { +protected: + using value_type = ValueType; +}; + +TYPED_TEST_SUITE(PermuteIterator, gko::test::ValueAndIndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(PermuteIterator, EmptyIterator) +{ + auto test_iter = gko::detail::make_permute_iterator( + nullptr, [](int i) { return i; }); + + ASSERT_NO_THROW(std::sort(test_iter, test_iter)); +} + + +TYPED_TEST(PermuteIterator, SortingWithIdentityPermutation) +{ + std::vector vec{6, 2, 5, 2, 4}; + std::vector sorted{2, 2, 4, 5, 6}; + + auto test_iter = gko::detail::make_permute_iterator( + vec.begin(), [](int i) { return i; }); + + std::sort(test_iter, test_iter + vec.size()); + + ASSERT_EQ(vec, sorted); +} + + +TYPED_TEST(PermuteIterator, SortingWithReversePermutation) +{ + std::vector vec{6, 2, 5, 2, 4}; + std::vector sorted{6, 5, 4, 2, 2}; + auto test_iter = gko::detail::make_permute_iterator( + vec.begin(), + [size = vec.size()](int i) { return static_cast(size) - 1 - i; }); + + std::sort(test_iter, test_iter + vec.size()); + + ASSERT_EQ(vec, sorted); +} + + +TYPED_TEST(PermuteIterator, SortingWithStridedPermutation) +{ + std::vector vec{6, 8, 2, 9, 5, 1, 2, 7, 4, 0}; + std::vector sorted{2, 8, 2, 9, 4, 1, 5, 7, 6, 0}; + + auto test_iter = gko::detail::make_permute_iterator( + vec.begin(), [](int i) { return 2 * i; }); + + std::sort(test_iter, test_iter + vec.size() / 2); + + ASSERT_EQ(vec, sorted); +} + + +TYPED_TEST(PermuteIterator, IncreasingIterator) +{ + std::vector vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto perm = [size = vec.size()](int i) { + return static_cast(size) - 1 - i; + }; + + auto test_iter = gko::detail::make_permute_iterator(vec.begin(), perm); + auto begin = test_iter; + auto plus_2 = begin + 2; + auto plus_2_rev = 2 + begin; + auto plus_minus_2 = plus_2 - 2; + auto increment_pre_2 = begin; + ++increment_pre_2; + ++increment_pre_2; + auto increment_post_2 = begin; + increment_post_2++; + increment_post_2++; + auto increment_pre_test = begin; + auto increment_post_test = begin; + + // check results for equality + ASSERT_TRUE(begin == plus_minus_2); + ASSERT_TRUE(plus_2 == increment_pre_2); + ASSERT_TRUE(plus_2_rev == increment_pre_2); + ASSERT_TRUE(increment_pre_2 == increment_post_2); + ASSERT_TRUE(begin == increment_post_test++); + ASSERT_TRUE(begin + 1 == ++increment_pre_test); + ASSERT_TRUE(*plus_2 == vec[perm(2)]); + // check other comparison operators and difference + std::vector::iterator, decltype(perm)>> + its{begin, + plus_2, + plus_2_rev, + plus_minus_2, + increment_pre_2, + increment_post_2, + increment_pre_test, + increment_post_test, + begin + 5, + begin + 9}; + std::sort(its.begin(), its.end()); + std::vector dists; + std::vector ref_dists{0, 1, 0, 1, 0, 0, 0, 3, 4}; + for (int i = 0; i < its.size() - 1; i++) { + SCOPED_TRACE(i); + 
dists.push_back(its[i + 1] - its[i]); + auto equal = dists.back() > 0; + ASSERT_EQ(its[i + 1] > its[i], equal); + ASSERT_EQ(its[i] < its[i + 1], equal); + ASSERT_EQ(its[i] != its[i + 1], equal); + ASSERT_EQ(its[i] == its[i + 1], !equal); + ASSERT_EQ(its[i] >= its[i + 1], !equal); + ASSERT_EQ(its[i + 1] <= its[i], !equal); + ASSERT_TRUE(its[i + 1] >= its[i]); + ASSERT_TRUE(its[i] <= its[i + 1]); + } + ASSERT_EQ(dists, ref_dists); +} + + +TYPED_TEST(PermuteIterator, DecreasingIterator) +{ + std::vector vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto perm = [size = vec.size()](int i) { + return static_cast(size) - 1 - i; + }; + + auto test_iter = gko::detail::make_permute_iterator(vec.begin(), perm); + + auto iter = test_iter + 5; + auto minus_2 = iter - 2; + auto minus_plus_2 = minus_2 + 2; + auto decrement_pre_2 = iter; + --decrement_pre_2; + --decrement_pre_2; + auto decrement_post_2 = iter; + decrement_post_2--; + decrement_post_2--; + auto decrement_pre_test = iter; + auto decrement_post_test = iter; + + ASSERT_TRUE(iter == minus_plus_2); + ASSERT_TRUE(minus_2 == decrement_pre_2); + ASSERT_TRUE(decrement_pre_2 == decrement_post_2); + ASSERT_TRUE(iter == decrement_post_test--); + ASSERT_TRUE(iter - 1 == --decrement_pre_test); + ASSERT_TRUE(*minus_2 == vec[perm(3)]); +} + + } // namespace From 8fd45071a9af1c6aa90bb4fc086463b876429a8d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 19 Apr 2023 09:26:08 +0200 Subject: [PATCH 221/583] use permute iterator for STL algorithms --- omp/distributed/partition_helpers_kernels.cpp | 13 ++++---- .../distributed/partition_helpers_kernels.cpp | 33 +++++++++---------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 5fc55862b08..d03c21c0731 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -49,18 +49,17 @@ void sort_by_range_start( array& range_start_ends, array& part_ids) { - struct range { - GlobalIndexType idxs[2]; - }; - auto part_ids_d = part_ids.get_data(); auto num_parts = part_ids.get_num_elems(); - auto range_it = reinterpret_cast(range_start_ends.get_data()); - auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); + auto start_it = detail::make_permute_iterator( + range_start_ends.get_data(), [](const auto i) { return 2 * i; }); + auto end_it = detail::make_permute_iterator( + range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; }); + auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d); // TODO: use TBB or parallel std with c++17 std::stable_sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; + return std::get<0>(a) < std::get<0>(b); }); } diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 7f7dfce756c..b392dd362b4 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -49,17 +49,16 @@ void sort_by_range_start( array& range_start_ends, array& part_ids) { - struct range { - GlobalIndexType idxs[2]; - }; - auto part_ids_d = part_ids.get_data(); auto num_parts = part_ids.get_num_elems(); - auto range_it = reinterpret_cast(range_start_ends.get_data()); - auto sort_it = detail::make_zip_iterator(range_it, part_ids_d); + auto start_it = detail::make_permute_iterator( + range_start_ends.get_data(), [](const auto i) 
{ return 2 * i; }); + auto end_it = detail::make_permute_iterator( + range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; }); + auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d); std::stable_sort(sort_it, sort_it + num_parts, [](const auto& a, const auto& b) { - return std::get<0>(a).idxs[0] < std::get<0>(b).idxs[0]; + return std::get<0>(a) < std::get<0>(b); }); } @@ -72,19 +71,19 @@ void check_consecutive_ranges(std::shared_ptr exec, const array& range_start_ends, bool* result) { - struct end_start { - GlobalIndexType end; - GlobalIndexType start; - }; - auto num_parts = range_start_ends.get_num_elems() / 2; - auto range_it = reinterpret_cast( - range_start_ends.get_const_data() + 1); + auto start_it = + detail::make_permute_iterator(range_start_ends.get_const_data() + 2, + [](const auto i) { return 2 * i; }); + auto end_it = + detail::make_permute_iterator(range_start_ends.get_const_data() + 1, + [](const auto i) { return 2 * i; }); + auto range_it = detail::make_zip_iterator(start_it, end_it); if (num_parts) { - *result = - std::all_of(range_it, range_it + num_parts - 1, - [](const end_start& r) { return r.end == r.start; }); + *result = std::all_of( + range_it, range_it + num_parts - 1, + [](const auto& r) { return std::get<0>(r) == std::get<1>(r); }); } else { *result = true; } From 830ed3b5221b7bc366a8fe1a50ad9d7c278a4efa Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Apr 2023 09:49:42 +0200 Subject: [PATCH 222/583] review updates: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - removes more reinterpret casts - makes permute_iterator copy assignable Co-authored-by: Thomas Grützmacher Co-authored-by: Tobias Ribizel --- core/base/iterator_factory.hpp | 14 +++++---- core/distributed/partition_helpers.cpp | 3 +- core/test/base/iterator_factory.cpp | 2 -- .../distributed/partition_helpers_kernels.cpp | 2 +- test/distributed/partition_helper_kernels.cpp | 30 +++++++++---------- test/mpi/partition_helpers.cpp | 1 - 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 29aa99a4f86..76cf3dcf36d 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
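// Illustrative sketch (stand-in host-side data, names hypothetical): the
// sort_by_range_start kernels above replace the reinterpret_cast-to-struct
// trick with the new iterators. Starts and ends are stored interleaved as
// [s0, e0, s1, e1, ...], so the index map i -> 2 * i exposes the starts (and,
// shifted by one element, the ends) as random-access ranges that can be zipped
// with the part ids and sorted together.
#include <algorithm>
#include <vector>
#include "core/base/iterator_factory.hpp"

void sort_ranges_sketch(std::vector<long>& start_ends, std::vector<int>& part_ids)
{
    auto stride2 = [](const auto i) { return 2 * i; };
    auto start_it = gko::detail::make_permute_iterator(start_ends.data(), stride2);
    auto end_it = gko::detail::make_permute_iterator(start_ends.data() + 1, stride2);
    auto sort_it = gko::detail::make_zip_iterator(start_it, end_it, part_ids.begin());
    // reorders the (start, end, part_id) triples by ascending range start
    std::stable_sort(sort_it, sort_it + part_ids.size(),
                     [](const auto& a, const auto& b) {
                         return std::get<0>(a) < std::get<0>(b);
                     });
}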
#include #include #include +#include namespace gko { @@ -375,17 +376,18 @@ class permute_iterator { using reference = typename std::iterator_traits::reference; using iterator_category = std::random_access_iterator_tag; - explicit permute_iterator() = default; + explicit permute_iterator() : it_{}, idx_{}, perm_{{}} {} explicit permute_iterator(IteratorType it, PermuteFn perm) : it_{std::move(it)}, idx_{}, perm_{std::move(perm)} {} - permute_iterator& operator=(permute_iterator other) + permute_iterator& operator=(const permute_iterator& other) { it_ = other.it_; idx_ = other.idx_; - // no perm_ = other.perm_ because lambdas are not copy-assignable + perm_.clear(); + perm_.emplace_back(other.perm_[0]); return *this; } @@ -440,7 +442,7 @@ class permute_iterator { return idx_ - other.idx_; } - reference operator*() const { return it_[perm_(idx_)]; } + reference operator*() const { return it_[perm_[0](idx_)]; } reference operator[](difference_type i) const { return *(*this + i); } @@ -477,7 +479,9 @@ class permute_iterator { private: IteratorType it_; difference_type idx_; - PermuteFn perm_; + // hack to make lambda function copy assignable + // could be better done with std::optional + std::vector perm_; }; diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index b906c0b6e42..921eeee19fd 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -32,11 +32,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include + + #include #include "core/components/fill_array_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index f41181f0c10..e4d8d39b340 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -411,7 +411,6 @@ TYPED_TEST(PermuteIterator, SortingWithIdentityPermutation) { std::vector vec{6, 2, 5, 2, 4}; std::vector sorted{2, 2, 4, 5, 6}; - auto test_iter = gko::detail::make_permute_iterator( vec.begin(), [](int i) { return i; }); @@ -439,7 +438,6 @@ TYPED_TEST(PermuteIterator, SortingWithStridedPermutation) { std::vector vec{6, 8, 2, 9, 5, 1, 2, 7, 4, 0}; std::vector sorted{2, 8, 2, 9, 4, 1, 5, 7, 6, 0}; - auto test_iter = gko::detail::make_permute_iterator( vec.begin(), [](int i) { return 2 * i; }); diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index b392dd362b4..9cbc425906d 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/distributed/partition_helpers_kernels.hpp" -#include + #include "core/base/iterator_factory.hpp" diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index fdfeb553ae1..bdf750e4675 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -85,14 +85,13 @@ template std::vector create_ranges( const std::vector& range_offsets) { - struct repeated_value { - repeated_value(IndexType i) : vals{i, i} {} - IndexType vals[2]; - }; + assert(range_offsets.size() >= 2); gko::size_type num_ranges = range_offsets.size() - 1; std::vector ranges(num_ranges * 2, 0); - auto ranges_it = reinterpret_cast(ranges.data() + 1); - std::copy(range_offsets.begin() + 1, range_offsets.end() - 1, ranges_it); + for (gko::size_type i = 1; i < num_ranges; ++i) { + ranges[2 * i - 1] = range_offsets[i]; + ranges[2 * i] = range_offsets[i]; + } ranges.back() = range_offsets.back(); return ranges; } @@ -145,17 +144,17 @@ std::pair, std::vector> shuffle_range_and_pid(const std::vector& ranges, const std::vector& pid) { - struct range { - IndexType vals[2]; - }; - std::default_random_engine engine; auto result = std::make_pair(ranges, pid); auto num_ranges = result.second.size(); - auto zip_it = gko::detail::make_zip_iterator( - reinterpret_cast(result.first.data()), result.second.begin()); + auto range_start_it = gko::detail::make_permute_iterator( + result.first.begin(), [](const auto i) { return 2 * i; }); + auto range_end_it = gko::detail::make_permute_iterator( + result.first.begin() + 1, [](const auto i) { return 2 * i; }); + auto zip_it = gko::detail::make_zip_iterator(range_start_it, range_end_it, + result.second.begin()); std::shuffle(zip_it, zip_it + num_ranges, engine); return result; @@ -199,11 +198,10 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) } - TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) { using index_type = typename TestFixture::index_type; - auto start_ends = - make_array(this->ref, create_ranges(1)); + auto start_ends = make_array(this->ref, create_ranges(1)); bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( @@ -213,7 +211,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) } - TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) +TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) { using index_type = typename TestFixture::index_type; auto start_ends = gko::array(this->exec, {1}); diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index 6f30761cbb0..dc99bb0a4ab 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -58,7 +58,6 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges) gko::array expects_ranges{this->exec, {0, 4, 9, 11}}; gko::array expects_pid{this->exec, {0, 1, 2}}; - auto part = gko::experimental::distributed::build_partition_from_local_range< gko::int32, itype>(this->exec, this->comm, From 690e89b1d6b05595d28a68f2740ce6984e88606e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Apr 2023 14:27:45 +0200 Subject: [PATCH 223/583] review updates: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use gko::array instead of gko::vector Co-authored-by: Thomas Grützmacher --- core/distributed/partition_helpers.cpp | 15 +++++++-------- 1 file changed, 7 
insertions(+), 8 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 921eeee19fd..3a3f06cef77 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -127,17 +127,16 @@ build_partition_from_local_size(std::shared_ptr exec, mpi::communicator comm, size_type local_size) { auto local_size_gi = static_cast(local_size); - std::vector sizes(comm.size()); - comm.all_gather(exec, &local_size_gi, 1, sizes.data(), 1); + array sizes(exec->get_master(), comm.size()); + comm.all_gather(exec, &local_size_gi, 1, sizes.get_data(), 1); - std::vector offsets(comm.size() + 1); - offsets[0] = 0; - std::partial_sum(sizes.begin(), sizes.end(), offsets.begin() + 1); + array offsets(exec->get_master(), comm.size() + 1); + offsets.get_data()[0] = 0; + std::partial_sum(sizes.get_data(), sizes.get_data() + comm.size(), + offsets.get_data() + 1); - auto ranges = - make_array_view(exec->get_master(), offsets.size(), offsets.data()); return Partition::build_from_contiguous( - exec, ranges); + exec, offsets); } #define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_SIZE(_local_type, _global_type) \ From 721d8292650265661d17f1869837ec3a5e29cc37 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Apr 2023 15:59:23 +0200 Subject: [PATCH 224/583] adds copy_assignable wrapper class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this allows to rely on default implementation of the constructors and assignment operators in the `permute_iterator`. Except for the default constructor due to AppleClang issues Co-authored-by: Thomas Grützmacher Co-authored-by: Tobias Ribizel --- core/base/copy_assignable.hpp | 62 ++++++++++++++++++++++++++++++++++ core/base/iterator_factory.hpp | 20 +++-------- 2 files changed, 67 insertions(+), 15 deletions(-) create mode 100644 core/base/copy_assignable.hpp diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp new file mode 100644 index 00000000000..2c29e4e64e4 --- /dev/null +++ b/core/base/copy_assignable.hpp @@ -0,0 +1,62 @@ +#ifndef GKO_CORE_BASE_COPY_ASSIGNABLE_HPP +#define GKO_CORE_BASE_COPY_ASSIGNABLE_HPP + + +#include + + +namespace gko { +namespace detail { + + +template +class copy_assignable; + + +/** + * Helper class to make a type copy assignable. + * + * This class wraps an object of a type that has a copy constructor, but not + * a copy assignment. This is most often the case for lambdas. The wrapped + * object can then be copy assigned, by relying on the copy constructor. + * + * @tparam T type with a copy constructor + */ +template +class copy_assignable< + T, typename std::enable_if::value>::type> { +public: + copy_assignable() : obj_{{}} {} + copy_assignable(const copy_assignable& other) = default; + copy_assignable(copy_assignable&& other) noexcept = default; + + copy_assignable(const T& obj) : obj_{obj} {} + copy_assignable(T&& obj) : obj_{std::move(obj)} {} + + copy_assignable& operator=(const copy_assignable& other) + { + obj_.clear(); + obj_.emplace_back(other.get()); + return *this; + } + copy_assignable& operator=(copy_assignable&& other) noexcept = default; + + template + decltype(auto) operator()(Args&&... 
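// Worked example (hypothetical sizes): build_partition_from_local_size above
// all-gathers one local size per rank and forms contiguous offsets with a
// prefix sum: local sizes {4, 5, 2} on ranks 0..2 give offsets {0, 4, 9, 11},
// which build_from_contiguous then reads as rank 0 -> [0, 4),
// rank 1 -> [4, 9) and rank 2 -> [9, 11).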
args) const + { + return obj_[0](std::forward(args)...); + } + + T const& get() const { return obj_[0]; } + T& get() { return obj_[0]; } + +private: + //!< Store wrapped object in a container that has an emplace function + std::vector obj_; +}; + + +} // namespace detail +} // namespace gko + +#endif // GKO_CORE_BASE_COPY_ASSIGNABLE_HPP diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 76cf3dcf36d..1423803555c 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -40,7 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include + +#include namespace gko { @@ -376,21 +377,12 @@ class permute_iterator { using reference = typename std::iterator_traits::reference; using iterator_category = std::random_access_iterator_tag; - explicit permute_iterator() : it_{}, idx_{}, perm_{{}} {} + permute_iterator() = default; explicit permute_iterator(IteratorType it, PermuteFn perm) : it_{std::move(it)}, idx_{}, perm_{std::move(perm)} {} - permute_iterator& operator=(const permute_iterator& other) - { - it_ = other.it_; - idx_ = other.idx_; - perm_.clear(); - perm_.emplace_back(other.perm_[0]); - return *this; - } - permute_iterator& operator+=(difference_type i) { idx_ += i; @@ -442,7 +434,7 @@ class permute_iterator { return idx_ - other.idx_; } - reference operator*() const { return it_[perm_[0](idx_)]; } + reference operator*() const { return it_[perm_(idx_)]; } reference operator[](difference_type i) const { return *(*this + i); } @@ -479,9 +471,7 @@ class permute_iterator { private: IteratorType it_; difference_type idx_; - // hack to make lambda function copy assignable - // could be better done with std::optional - std::vector perm_; + copy_assignable perm_; }; From 84750fc5afe78d2dec1ccacc0b04a7d90ca27065 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 26 Apr 2023 15:46:45 +0200 Subject: [PATCH 225/583] adds invalid state exception Co-authored-by: Pratik Nayak Co-authored-by: Tobias Ribizel --- core/distributed/partition_helpers.cpp | 2 +- test/mpi/partition_helpers.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 3a3f06cef77..70ae3897a52 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -100,7 +100,7 @@ build_partition_from_local_range(std::shared_ptr exec, exec->run(partition_helpers::make_check_consecutive_ranges( ranges_start_end, &consecutive_ranges)); if (!consecutive_ranges) { - throw Error(__FILE__, __LINE__, "The partition contains gaps."); + GKO_INVALID_STATE("The partition contains gaps."); } // join (now consecutive) starts and ends into combined array diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index dc99bb0a4ab..de0b897fd13 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -109,7 +109,7 @@ TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesThrowsOnGap) ASSERT_THROW(build_from_local_ranges(this->exec, this->comm, local_range[this->comm.rank()]), - gko::Error); + gko::InvalidStateError); } From c20213995bbad39bc2be2334f40c8fbd5863df74 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 26 Apr 2023 15:59:25 +0200 Subject: [PATCH 226/583] review updates: - documentation - formatting Co-authored-by: Pratik Nayak --- core/base/iterator_factory.hpp | 11 +++++++++++ test/distributed/partition_helper_kernels.cpp | 1 + 2 files changed, 12 
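// Minimal usage sketch of the copy_assignable helper introduced above
// (variable names and values are hypothetical): a capturing lambda is
// copy-constructible but not copy-assignable, while std::sort may assign
// iterators, and permute_iterator stores its permutation functor by value.
auto shift = [offset = 1](int i) { return i + offset; };
// decltype(shift) a = shift; a = shift;  // would not compile: deleted copy assignment
gko::detail::copy_assignable<decltype(shift)> a{shift};
gko::detail::copy_assignable<decltype(shift)> b{shift};
b = a;            // ok: rebuilds the wrapped lambda from a via its copy constructor
int j = b(41);    // forwards the call to the wrapped lambda, j == 42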
insertions(+) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 1423803555c..e1ed0ca62d0 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -368,6 +368,17 @@ void swap(zip_iterator_reference a, } +/** + * Random access iterator that uses a function to transform the index. + * + * For a function `fn` and an underlying iterator `it`, accessing the + * permute_iterator at index `i` will result in accessing `it[fn(i)]`. + * + * @tparam IteratorType Underlying iterator, has to be random access. + * @tparam PermuteFn A function `difference_type -> difference_type` that + * transforms any given index. It doesn't have to be a strict + * permutation of indices (i.e. not bijective). + */ template class permute_iterator { public: diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index bdf750e4675..44c514093d8 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -160,6 +160,7 @@ shuffle_range_and_pid(const std::vector& ranges, return result; } + template class PartitionHelpers : public CommonTestFixture { protected: From a433592932d00692381118b58d2557b2d65bbefe Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Wed, 26 Apr 2023 14:08:32 +0000 Subject: [PATCH 227/583] Format files Co-authored-by: Marcel Koch --- .../distributed/partition_helpers_kernels.cpp | 1 - core/base/copy_assignable.hpp | 38 +++++++++++++++++-- core/base/iterator_factory.hpp | 3 +- core/distributed/partition.cpp | 1 + core/distributed/partition_helpers.cpp | 4 +- cuda/distributed/partition_helpers_kernels.cu | 1 - include/ginkgo/core/base/mpi.hpp | 11 ++---- .../core/distributed/partition_helpers.hpp | 2 +- omp/distributed/partition_helpers_kernels.cpp | 1 - .../distributed/partition_helpers_kernels.cpp | 1 - .../distributed/partition_helpers_kernels.cpp | 4 +- test/distributed/partition_helper_kernels.cpp | 4 +- 12 files changed, 48 insertions(+), 23 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index e5565819021..dbd20c40c15 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp index 2c29e4e64e4..199a02e8388 100644 --- a/core/base/copy_assignable.hpp +++ b/core/base/copy_assignable.hpp @@ -1,5 +1,37 @@ -#ifndef GKO_CORE_BASE_COPY_ASSIGNABLE_HPP -#define GKO_CORE_BASE_COPY_ASSIGNABLE_HPP +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ +#define GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ #include @@ -59,4 +91,4 @@ class copy_assignable< } // namespace detail } // namespace gko -#endif // GKO_CORE_BASE_COPY_ASSIGNABLE_HPP +#endif // GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index e1ed0ca62d0..bbc1d3b4b2b 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -41,7 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include + +#include "core/base/copy_assignable.hpp" namespace gko { diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 22f0fdb3d94..bfeb5e8c286 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -32,6 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + #include "core/distributed/partition_kernels.hpp" diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 70ae3897a52..9085b7ec2e7 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -30,13 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include #include #include +#include + + #include "core/components/fill_array_kernels.hpp" #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu index e37655e357e..a70e728f845 100644 --- a/cuda/distributed/partition_helpers_kernels.cu +++ b/cuda/distributed/partition_helpers_kernels.cu @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ - #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 9699dea4942..bf985cabeb7 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -87,13 +87,10 @@ inline constexpr bool is_gpu_aware() int map_rank_to_device_id(MPI_Comm comm, int num_devices); -#define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ - template <> \ - struct type_impl { \ - static MPI_Datatype get_type() \ - { \ - return mpi_type; \ - } \ +#define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ + template <> \ + struct type_impl { \ + static MPI_Datatype get_type() { return mpi_type; } \ } /** diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp index 889347674c8..6bc20350a7d 100644 --- a/include/ginkgo/core/distributed/partition_helpers.hpp +++ b/include/ginkgo/core/distributed/partition_helpers.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "ginkgo/core/base/range.hpp" +#include namespace gko { diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index d03c21c0731..2c006a22885 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 9cbc425906d..a9b476d0315 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 9b339fd926f..5a139f4edb5 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include - - #include #include #include @@ -43,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 44c514093d8..d43062d3ccd 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#include "core/distributed/partition_helpers_kernels.hpp" - - #include #include @@ -41,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/iterator_factory.hpp" +#include "core/distributed/partition_helpers_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" From 763131d33c351d64e5d18f9a433a17ba69ae9b4b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 27 Apr 2023 14:29:24 +0200 Subject: [PATCH 228/583] uses placement-new for copy-assignable wrapper Co-authored-by: Tobias Ribizel --- core/base/copy_assignable.hpp | 47 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp index 199a02e8388..40948757701 100644 --- a/core/base/copy_assignable.hpp +++ b/core/base/copy_assignable.hpp @@ -58,20 +58,45 @@ template class copy_assignable< T, typename std::enable_if::value>::type> { public: - copy_assignable() : obj_{{}} {} - copy_assignable(const copy_assignable& other) = default; - copy_assignable(copy_assignable&& other) noexcept = default; + copy_assignable() : obj_(new(buf)(T)()) {} - copy_assignable(const T& obj) : obj_{obj} {} - copy_assignable(T&& obj) : obj_{std::move(obj)} {} + copy_assignable(const copy_assignable& other) + { + if (this != &other) { + *this = other; + } + } + + copy_assignable(copy_assignable&& other) noexcept + { + if (this != &other) { + *this = std::move(other); + } + } + + copy_assignable(const T& obj) : obj_{new(buf)(T)(obj)} {} + + copy_assignable(T&& obj) : obj_{new(buf)(T)(std::move(obj))} {} copy_assignable& operator=(const copy_assignable& other) { - obj_.clear(); - obj_.emplace_back(other.get()); + if (this != &other) { + obj_->~T(); + obj_ = new (buf)(T)(*other.obj_); + } return *this; } - copy_assignable& operator=(copy_assignable&& other) noexcept = default; + + copy_assignable& operator=(copy_assignable&& other) noexcept + { + if (this != &other) { + obj_->~T(); + obj_ = new (buf)(T)(std::move(*other.obj_)); + } + return *this; + } + + ~copy_assignable() { obj_->~T(); } template decltype(auto) operator()(Args&&... 
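// Note on the placement-new variant above: the wrapped object now lives in
// aligned in-class storage instead of a heap-backed std::vector. Assignment
// replaces the stored value by ending the current object's lifetime (when one
// is present) with an explicit destructor call and copy-constructing the
// incoming value into the same buffer:
//     obj_->~T();                          // destroy the current value
//     obj_ = new (buf) T(*other.obj_);     // rebuild it in place from other
// (std::optional would express the same idea more directly once C++17 is
// available.)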
args) const @@ -80,11 +105,13 @@ class copy_assignable< } T const& get() const { return obj_[0]; } + T& get() { return obj_[0]; } private: - //!< Store wrapped object in a container that has an emplace function - std::vector obj_; + //!< Store wrapped object on the stack, should use std::optional in c++17 + T* obj_; + alignas(T) unsigned char buf[sizeof(T)]; }; From 32b7e621bdb49e530494429c2259e3f15174e222 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 27 Apr 2023 15:52:34 +0000 Subject: [PATCH 229/583] Format files Co-authored-by: Marcel Koch --- core/base/copy_assignable.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp index 40948757701..5d3985c5c20 100644 --- a/core/base/copy_assignable.hpp +++ b/core/base/copy_assignable.hpp @@ -58,7 +58,7 @@ template class copy_assignable< T, typename std::enable_if::value>::type> { public: - copy_assignable() : obj_(new(buf)(T)()) {} + copy_assignable() : obj_(new (buf)(T)()) {} copy_assignable(const copy_assignable& other) { @@ -74,9 +74,9 @@ class copy_assignable< } } - copy_assignable(const T& obj) : obj_{new(buf)(T)(obj)} {} + copy_assignable(const T& obj) : obj_{new (buf)(T)(obj)} {} - copy_assignable(T&& obj) : obj_{new(buf)(T)(std::move(obj))} {} + copy_assignable(T&& obj) : obj_{new (buf)(T)(std::move(obj))} {} copy_assignable& operator=(const copy_assignable& other) { From f2af1f998bdf4499342b80dedf584824b9b903f9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Apr 2023 09:52:39 +0200 Subject: [PATCH 230/583] remove undefined lambda default constructor --- core/base/copy_assignable.hpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp index 5d3985c5c20..de552831a86 100644 --- a/core/base/copy_assignable.hpp +++ b/core/base/copy_assignable.hpp @@ -58,7 +58,7 @@ template class copy_assignable< T, typename std::enable_if::value>::type> { public: - copy_assignable() : obj_(new (buf)(T)()) {} + copy_assignable() = default; copy_assignable(const copy_assignable& other) { @@ -74,14 +74,16 @@ class copy_assignable< } } - copy_assignable(const T& obj) : obj_{new (buf)(T)(obj)} {} + copy_assignable(const T& obj) : obj_{new(buf)(T)(obj)} {} - copy_assignable(T&& obj) : obj_{new (buf)(T)(std::move(obj))} {} + copy_assignable(T&& obj) : obj_{new(buf)(T)(std::move(obj))} {} copy_assignable& operator=(const copy_assignable& other) { if (this != &other) { - obj_->~T(); + if (obj_) { + obj_->~T(); + } obj_ = new (buf)(T)(*other.obj_); } return *this; @@ -90,13 +92,20 @@ class copy_assignable< copy_assignable& operator=(copy_assignable&& other) noexcept { if (this != &other) { - obj_->~T(); + if (obj_) { + obj_->~T(); + } obj_ = new (buf)(T)(std::move(*other.obj_)); } return *this; } - ~copy_assignable() { obj_->~T(); } + ~copy_assignable() + { + if (obj_) { + obj_->~T(); + } + } template decltype(auto) operator()(Args&&... 
args) const From 85e43e83a73d6ae4f9228766dda9ccefe268685d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 5 Jul 2023 17:34:12 +0200 Subject: [PATCH 231/583] use workaround for old dpcpp version --- .../partition_helpers_kernels.dp.cpp | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 6362c243d95..46f72c8ef58 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -74,11 +74,31 @@ void sort_by_range_start( range_start_ends.get_data(), stride{}); auto end_it = oneapi::dpl::make_permutation_iterator( range_start_ends.get_data() + 1, stride{}); + + // older versions of oneDPL have a bug when sorting permutation iterators +#if ONEDPL_VERSION_MAJOR >= 2022 && ONEDPL_VERSION_MINOR >= 1 auto zip_it = oneapi::dpl::make_zip_iterator(start_it, end_it, part_ids.get_data()); std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); }); +#else + array starts(exec, num_ranges); + array ends(exec, num_ranges); + + std::copy(policy, start_it, start_it + num_ranges, starts.get_data()); + std::copy(policy, end_it, end_it + num_ranges, ends.get_data()); + + auto zip_it = oneapi::dpl::make_zip_iterator( + starts.get_data(), ends.get_data(), part_ids.get_data()); + std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) { + return std::get<0>(a) < std::get<0>(b); + }); + + std::copy(policy, starts.get_data(), starts.get_data() + num_ranges, + start_it); + std::copy(policy, ends.get_data(), ends.get_data() + num_ranges, end_it); +#endif } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( From 5327dd54c27bfc281ea278e46b3c6761d1f0849e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 7 Jul 2023 09:38:28 +0200 Subject: [PATCH 232/583] fixup after rebase --- test/mpi/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt index 08050bde58f..fc0aec8138a 100644 --- a/test/mpi/CMakeLists.txt +++ b/test/mpi/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3) +ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3) ginkgo_create_common_and_reference_test(vector MPI_SIZE 3) add_subdirectory(preconditioner) From 0e948b5f168d5ae7ee64863c386840dd40f95dee Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 7 Jul 2023 12:23:50 +0200 Subject: [PATCH 233/583] review updates: - initialize pointer - dereference pointer instead of array access - use `bool&` as return type instead of `bool*` Co-authored-by: Tobias Ribizel --- .../distributed/partition_helpers_kernels.cpp | 6 +++--- core/base/copy_assignable.hpp | 12 ++++++------ core/distributed/partition_helpers.cpp | 2 +- core/distributed/partition_helpers_kernels.hpp | 2 +- reference/distributed/partition_helpers_kernels.cpp | 6 +++--- .../test/distributed/partition_helpers_kernels.cpp | 4 ++-- test/distributed/partition_helper_kernels.cpp | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index dbd20c40c15..3c041dd7e4b 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -46,7 +46,7 @@ namespace partition_helpers { template void check_consecutive_ranges(std::shared_ptr exec, const 
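// Note on the fallback branch above (for oneDPL versions without the fix): the
// permutation iterators are not handed to the sorter directly. Instead the
// strided starts and ends are first copied into contiguous temporary arrays, a
// zip of (starts, ends, part_ids) is stable-sorted by range start, and the
// sorted values are copied back into the interleaved storage. This costs two
// extra arrays of length num_ranges but sidesteps the oneDPL bug with sorting
// permutation iterators.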
array& range_start_ends, - bool* result) + bool& result) { array result_uint32{exec, 1}; auto num_ranges = range_start_ends.get_num_elems() / 2; @@ -64,10 +64,10 @@ void check_consecutive_ranges(std::shared_ptr exec, [] GKO_KERNEL(auto x) { return x; }, static_cast(true), result_uint32.get_data(), num_ranges - 1, range_start_ends.get_const_data() + 1); - *result = + result = static_cast(exec->copy_val_to_host(result_uint32.get_data())); } else { - *result = true; + result = true; } } diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp index de552831a86..7f5e4125e10 100644 --- a/core/base/copy_assignable.hpp +++ b/core/base/copy_assignable.hpp @@ -74,9 +74,9 @@ class copy_assignable< } } - copy_assignable(const T& obj) : obj_{new(buf)(T)(obj)} {} + copy_assignable(const T& obj) : obj_{new (buf)(T)(obj)} {} - copy_assignable(T&& obj) : obj_{new(buf)(T)(std::move(obj))} {} + copy_assignable(T&& obj) : obj_{new (buf)(T)(std::move(obj))} {} copy_assignable& operator=(const copy_assignable& other) { @@ -110,16 +110,16 @@ class copy_assignable< template decltype(auto) operator()(Args&&... args) const { - return obj_[0](std::forward(args)...); + return (*obj_)(std::forward(args)...); } - T const& get() const { return obj_[0]; } + T const& get() const { return *obj_; } - T& get() { return obj_[0]; } + T& get() { return *obj_; } private: //!< Store wrapped object on the stack, should use std::optional in c++17 - T* obj_; + T* obj_{}; alignas(T) unsigned char buf[sizeof(T)]; }; diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 9085b7ec2e7..b1fd1dd9bc5 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -100,7 +100,7 @@ build_partition_from_local_range(std::shared_ptr exec, // check for consistency bool consecutive_ranges = false; exec->run(partition_helpers::make_check_consecutive_ranges( - ranges_start_end, &consecutive_ranges)); + ranges_start_end, consecutive_ranges)); if (!consecutive_ranges) { GKO_INVALID_STATE("The partition contains gaps."); } diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 6d55926db76..ed9fa60364f 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -54,7 +54,7 @@ namespace kernels { #define GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(_type) \ void check_consecutive_ranges(std::shared_ptr exec, \ const array<_type>& range_start_ends, \ - bool* result) + bool& result) #define GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES(_type) \ diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index a9b476d0315..b68c10b1d01 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -68,7 +68,7 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( template void check_consecutive_ranges(std::shared_ptr exec, const array& range_start_ends, - bool* result) + bool& result) { auto num_parts = range_start_ends.get_num_elems() / 2; auto start_it = @@ -80,11 +80,11 @@ void check_consecutive_ranges(std::shared_ptr exec, auto range_it = detail::make_zip_iterator(start_it, end_it); if (num_parts) { - *result = std::all_of( + result = std::all_of( range_it, range_it + num_parts - 1, [](const auto& r) { return std::get<0>(r) == std::get<1>(r); }); } else { - *result = true; + result = true; } } diff --git 
a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 5a139f4edb5..f0ce4918d01 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -107,7 +107,7 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) bool result = false; gko::kernels::reference::partition_helpers::check_consecutive_ranges( - this->ref, range_start_ends, &result); + this->ref, range_start_ends, result); ASSERT_TRUE(result); } @@ -121,7 +121,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) bool result = true; gko::kernels::reference::partition_helpers::check_consecutive_ranges( - this->ref, range_start_ends, &result); + this->ref, range_start_ends, result); ASSERT_FALSE(result); } diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index d43062d3ccd..a53505cf1f6 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -175,7 +175,7 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, offsets, &result); + this->exec, offsets, result); ASSERT_TRUE(result); } @@ -191,7 +191,7 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) bool result = true; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); + this->exec, start_ends, result); ASSERT_FALSE(result); } @@ -204,7 +204,7 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); + this->exec, start_ends, result); ASSERT_TRUE(result); } @@ -217,7 +217,7 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) bool result = false; gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, &result); + this->exec, start_ends, result); ASSERT_TRUE(result); } From 2bb5a12e4a5dc364b10b5f0d665598ca49659827 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 10 Aug 2023 08:35:41 +0200 Subject: [PATCH 234/583] use custom stream for thrust policy --- common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc | 4 ++-- cuda/distributed/partition_helpers_kernels.cu | 3 +++ hip/distributed/partition_helpers_kernels.hip.cpp | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc index e3e8335dd22..f92794ec138 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -46,8 +46,8 @@ void sort_by_range_start( range_start_ends.get_data() + 1, strided_indices); auto zip_it = thrust::make_zip_iterator( thrust::make_tuple(end_it, part_ids.get_data())); - thrust::stable_sort_by_key(thrust::device, start_it, start_it + num_ranges, - zip_it); + thrust::stable_sort_by_key(thrust_policy(exec), start_it, + start_it + num_ranges, zip_it); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu index a70e728f845..62dad1efaf1 100644 --- a/cuda/distributed/partition_helpers_kernels.cu +++ 
b/cuda/distributed/partition_helpers_kernels.cu @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "cuda/base/thrust.cuh" + + namespace gko { namespace kernels { namespace cuda { diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp index d9ae663f93f..d4769141676 100644 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ b/hip/distributed/partition_helpers_kernels.hip.cpp @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "hip/base/thrust.hip.hpp" + + namespace gko { namespace kernels { namespace hip { From 755827546c4fb3e162f4216af704b641f0e74012 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 17 Aug 2023 10:01:09 +0200 Subject: [PATCH 235/583] correctly define permutation map --- dpcpp/distributed/partition_helpers_kernels.dp.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 46f72c8ef58..8b0171cd349 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -46,7 +46,14 @@ namespace dpcpp { namespace partition_helpers { struct stride { - // Some version requires [] while some requires (), so I added both +#if ONEDPL_VERSION_MAJOR >= 2022 && ONEDPL_VERSION_MINOR >= 1 + template + Index operator()(const Index& i) const + { + return i * 2; + } +#else + // Some older version require [] while some require (), so I added both template Index operator[](const Index& i) const { @@ -56,8 +63,9 @@ struct stride { template Index operator()(const Index& i) const { - return operator[](i); + return i * 2; } +#endif }; template From 20a8215ef4444ee80fe701f9d66cc8b13735a265 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 14 Aug 2023 10:38:52 +0200 Subject: [PATCH 236/583] allow passing matrix filename as benchmark input --- benchmark/blas/blas.cpp | 2 ++ benchmark/blas/distributed/multi_vector.cpp | 2 ++ .../matrix_generator/matrix_generator.cpp | 2 ++ benchmark/solver/distributed/solver.cpp | 2 ++ benchmark/solver/solver.cpp | 2 ++ benchmark/utils/general.hpp | 19 ++++++++++++++++++- 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 11228ed5818..ce3a98a9a68 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -131,6 +131,8 @@ Parameters for a benchmark case are: stride_C: stride for C matrix in gemm (optional, default m) )"; std::string format = example_config; + // this benchmark doesn't use input matrices + matrix_input = false; initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index be326b08b96..bcf77594c4b 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -61,6 +61,8 @@ Parameters for a benchmark case are: stride_y: stride for in/out vector y (optional, default r) )"; std::string format = example_config; + // this benchmark doesn't use input matrices + matrix_input = false; initialize_argument_parsing(&argc, &argv, header, format); const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index 
138b5a9c2ce..94883f5f63d 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -127,6 +127,8 @@ int main(int argc, char* argv[]) std::string header = "A utility that generates various types of " "matrices.\n"; + // this benchmark doesn't use input matrices + matrix_input = false; initialize_argument_parsing(&argc, &argv, header, input_format); std::clog << gko::version_info::get() << std::endl; diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index 2db71c16ca3..4f583bcd8a8 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -98,6 +98,8 @@ int main(int argc, char* argv[]) "-", where both "local_format" and "non_local_format" can be any of the recognized spmv formats. )"; + // this benchmark needs an additional "optimal" object in the input + matrix_input_additional_json = ",\"optimal\":{\"spmv\":\"csr-csr\"}"; initialize_argument_parsing(&argc, &argv, header, format); const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 9190c99dad0..c5010116dea 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -61,6 +61,8 @@ int main(int argc, char* argv[]) std::string format = example_config + R"( "optimal":"spmv" can be one of the recognized spmv formats )"; + // this benchmark needs an additional "optimal" object in the input + matrix_input_additional_json = ",\"optimal\":{\"spmv\":\"csr\"}"; initialize_argument_parsing(&argc, &argv, header, format); std::stringstream ss_rel_res_goal; diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 335ed687002..d1b39263468 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -95,7 +95,9 @@ DEFINE_string(double_buffer, "", DEFINE_string( input, "", "If set, the value is used as the input for the benchmark (if set to a " - "json string ending with ]) or as input file path (otherwise)."); + "json string ending with ]), as the \"filename\" of a generated JSON input " + "(if the variable points to a MatrixMarket or Ginkgo binary matrix file) " + "or as JSON input file path (otherwise)."); DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); @@ -297,6 +299,12 @@ std::vector split(const std::string& s, char delimiter = ',') } +// allow matrix files as -input value +bool matrix_input = true; +// additional JSON to append to the input_str if the input file is a matrix file +std::string matrix_input_additional_json = ""; + + // returns the stream to be used as input of the application std::istream& get_input_stream() { @@ -308,6 +316,15 @@ std::istream& get_input_stream() if (input_str.back() == ']') { return std::make_unique(input_str); } + if (matrix_input) { + auto first_char = std::ifstream{input_str}.peek(); + // if the input looks like a MatrixMarket or Ginkgo binary file + if (first_char == '%' || first_char == 'G') { + input_str = "[{\"filename\":\"" + input_str + "\"" + + matrix_input_additional_json + "}]"; + return std::make_unique(input_str); + } + } return std::make_unique(input_str); }(); if (stream) { From 56672f8a651e8defa9ce6ecf5eac778564d64da6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 16 Aug 2023 13:33:24 +0200 Subject: [PATCH 237/583] add `-input_matrix` flag --- benchmark/blas/blas.cpp | 2 - benchmark/blas/distributed/multi_vector.cpp | 2 - 
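// Example of the -input shortcut added above (file and binary names are
// illustrative): invoking the solver benchmark as
//     ./solver -input my_matrix.mtx
// peeks at the file's first character ('%' for MatrixMarket, 'G' for Ginkgo
// binary) and expands the flag into the equivalent JSON input
//     [{"filename":"my_matrix.mtx","optimal":{"spmv":"csr"}}]
// using the benchmark-specific matrix_input_additional_json suffix, so matrix
// files can be passed directly instead of hand-written JSON.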
benchmark/conversions/conversions.cpp | 4 +- .../matrix_generator/matrix_generator.cpp | 2 - .../matrix_statistics/matrix_statistics.cpp | 4 +- benchmark/preconditioner/preconditioner.cpp | 4 +- benchmark/solver/distributed/solver.cpp | 8 +-- benchmark/solver/solver.cpp | 8 +-- benchmark/sparse_blas/sparse_blas.cpp | 4 +- benchmark/spmv/distributed/spmv.cpp | 4 +- benchmark/spmv/spmv.cpp | 4 +- benchmark/utils/general.hpp | 44 ++++-------- benchmark/utils/general_matrix.hpp | 72 +++++++++++++++++++ 13 files changed, 106 insertions(+), 56 deletions(-) create mode 100644 benchmark/utils/general_matrix.hpp diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index ce3a98a9a68..11228ed5818 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -131,8 +131,6 @@ Parameters for a benchmark case are: stride_C: stride for C matrix in gemm (optional, default m) )"; std::string format = example_config; - // this benchmark doesn't use input matrices - matrix_input = false; initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index bcf77594c4b..be326b08b96 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -61,8 +61,6 @@ Parameters for a benchmark case are: stride_y: stride for in/out vector y (optional, default r) )"; std::string format = example_config; - // this benchmark doesn't use input matrices - matrix_input = false; initialize_argument_parsing(&argc, &argv, header, format); const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp index b249293116b..d9684321e2d 100644 --- a/benchmark/conversions/conversions.cpp +++ b/benchmark/conversions/conversions.cpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/timer.hpp" @@ -115,7 +115,7 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's conversions.\n"; std::string format_str = example_config; - initialize_argument_parsing(&argc, &argv, header, format_str); + initialize_argument_parsing_matrix(&argc, &argv, header, format_str); std::string extra_information = std::string() + "The formats are " + FLAGS_formats + "\n"; diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index 94883f5f63d..138b5a9c2ce 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -127,8 +127,6 @@ int main(int argc, char* argv[]) std::string header = "A utility that generates various types of " "matrices.\n"; - // this benchmark doesn't use input matrices - matrix_input = false; initialize_argument_parsing(&argc, &argv, header, input_format); std::clog << gko::version_info::get() << std::endl; diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 09cae6a7554..fccf4391ad5 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/types.hpp" @@ -173,7 +173,7 @@ int main(int argc, char* argv[]) "A utility that collects additional statistical properties of the " "matrix.\n"; std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + initialize_argument_parsing_matrix(&argc, &argv, header, format); std::clog << gko::version_info::get() << std::endl; diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index d125b46bb34..e7859e992dc 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" @@ -262,7 +262,7 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring preconditioner performance.\n"; std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = "Running with preconditioners: " + FLAGS_preconditioners + "\n"; diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index 4f583bcd8a8..8b285e343ce 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -40,7 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/solver/solver_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" @@ -98,9 +98,9 @@ int main(int argc, char* argv[]) "-", where both "local_format" and "non_local_format" can be any of the recognized spmv formats. )"; - // this benchmark needs an additional "optimal" object in the input - matrix_input_additional_json = ",\"optimal\":{\"spmv\":\"csr-csr\"}"; - initialize_argument_parsing(&argc, &argv, header, format); + std::string additional_json = ",\"optimal\":{\"spmv\":\"csr-csr\"}"; + initialize_argument_parsing_matrix(&argc, &argv, header, format, + additional_json); const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); const auto rank = comm.rank(); diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index c5010116dea..910bb54d89a 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/solver/solver_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" @@ -61,9 +61,9 @@ int main(int argc, char* argv[]) std::string format = example_config + R"( "optimal":"spmv" can be one of the recognized spmv formats )"; - // this benchmark needs an additional "optimal" object in the input - matrix_input_additional_json = ",\"optimal\":{\"spmv\":\"csr\"}"; - initialize_argument_parsing(&argc, &argv, header, format); + std::string additional_json = ",\"optimal\":{\"spmv\":\"csr\"}"; + initialize_argument_parsing_matrix(&argc, &argv, header, format, + additional_json); std::stringstream ss_rel_res_goal; ss_rel_res_goal << std::scientific << FLAGS_rel_res_goal; diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index d906e9f9e12..8c054709fdf 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/sparse_blas/operations.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/spmv_validation.hpp" #include "benchmark/utils/types.hpp" @@ -161,7 +161,7 @@ int main(int argc, char* argv[]) "A benchmark for measuring performance of Ginkgo's sparse BLAS " "operations.\n"; std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + initialize_argument_parsing_matrix(&argc, &argv, header, format); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 3c2986846b3..9b7e4ad8c8f 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/spmv/spmv_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -102,7 +102,7 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + initialize_argument_parsing_matrix(&argc, &argv, header, format); if (rank == 0) { std::string extra_information = "The formats are [" + diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index df000cecd47..034437907c8 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -39,7 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/spmv/spmv_common.hpp" #include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/spmv_validation.hpp" @@ -64,7 +64,7 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = "The formats are " + FLAGS_formats + "\nThe number of right hand sides is " + diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index d1b39263468..b7ec0e72cf1 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -95,9 +95,7 @@ DEFINE_string(double_buffer, "", DEFINE_string( input, "", "If set, the value is used as the input for the benchmark (if set to a " - "json string ending with ]), as the \"filename\" of a generated JSON input " - "(if the variable points to a MatrixMarket or Ginkgo binary matrix file) " - "or as JSON input file path (otherwise)."); + "json string ending with ]) or as input file path (otherwise)."); DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); @@ -147,6 +145,9 @@ DEFINE_double( "is lower than or equal to 1, the timing region is always 1 repetition."); +std::unique_ptr input_stream; + + /** * Parses arguments through gflags and initialize a documentation string. 
* @@ -186,6 +187,14 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, FLAGS_profiler_hook = "auto"; } } + std::string input_str(FLAGS_input); + if (!input_str.empty()) { + if (input_str.back() == ']') { + input_stream = std::make_unique(input_str); + } else { + input_stream = std::make_unique(input_str); + } + } } /** @@ -299,36 +308,11 @@ std::vector split(const std::string& s, char delimiter = ',') } -// allow matrix files as -input value -bool matrix_input = true; -// additional JSON to append to the input_str if the input file is a matrix file -std::string matrix_input_additional_json = ""; - - // returns the stream to be used as input of the application std::istream& get_input_stream() { - static auto stream = []() -> std::unique_ptr { - std::string input_str(FLAGS_input); - if (input_str.empty()) { - return nullptr; - } - if (input_str.back() == ']') { - return std::make_unique(input_str); - } - if (matrix_input) { - auto first_char = std::ifstream{input_str}.peek(); - // if the input looks like a MatrixMarket or Ginkgo binary file - if (first_char == '%' || first_char == 'G') { - input_str = "[{\"filename\":\"" + input_str + "\"" + - matrix_input_additional_json + "}]"; - return std::make_unique(input_str); - } - } - return std::make_unique(input_str); - }(); - if (stream) { - return *stream; + if (input_stream) { + return *input_stream; } return std::cin; } diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp new file mode 100644 index 00000000000..3791976e3ab --- /dev/null +++ b/benchmark/utils/general_matrix.hpp @@ -0,0 +1,72 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ +#define GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ + + +#include + + +#include + + +#include "benchmark/utils/general.hpp" + + +DEFINE_string(input_matrix, "", + "Filename of a matrix to be used as the single input. 
Overwrites " + "the value of the -input flag"); + + +/** + * @copydoc initialize_argument_parsing + * @param additional_matrix_file_json text to be appended to the + * `{"filename":"..."}` JSON object that + * will be used as input for the benchmark + * if the `-input_matrix` flag is used. + */ +void initialize_argument_parsing_matrix( + int* argc, char** argv[], std::string& header, std::string& format, + std::string additional_matrix_file_json = "") +{ + initialize_argument_parsing(argc, argv, header, format); + std::string input_matrix_str{FLAGS_input_matrix}; + if (!input_matrix_str.empty()) { + auto input_json = "[{\"filename\":\"" + input_matrix_str + "\"" + + additional_matrix_file_json + "}]"; + input_stream = std::make_unique(input_json); + } +} + + +#endif // GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ \ No newline at end of file From eb58d6cb6650d15b1e25300fc21fa58ffa131496 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 16 Aug 2023 13:33:51 +0200 Subject: [PATCH 238/583] test `-input_matrix` flag --- benchmark/test/conversion.py | 12 ++++ benchmark/test/matrix_statistics.py | 7 +++ benchmark/test/preconditioner.py | 7 +++ .../test/reference/conversion.matrix.stderr | 46 +++++++++++++++ .../test/reference/conversion.matrix.stdout | 19 +++++++ .../distributed_solver.matrix.stderr | 17 ++++++ .../distributed_solver.matrix.stdout | 57 +++++++++++++++++++ .../reference/matrix_statistics.matrix.stderr | 8 +++ .../reference/matrix_statistics.matrix.stdout | 38 +++++++++++++ .../reference/preconditioner.matrix.stderr | 42 ++++++++++++++ .../reference/preconditioner.matrix.stdout | 30 ++++++++++ benchmark/test/reference/solver.matrix.stderr | 17 ++++++ benchmark/test/reference/solver.matrix.stdout | 55 ++++++++++++++++++ .../test/reference/sparse_blas.matrix.stderr | 36 ++++++++++++ .../test/reference/sparse_blas.matrix.stdout | 25 ++++++++ benchmark/test/reference/spmv.matrix.stderr | 31 ++++++++++ benchmark/test/reference/spmv.matrix.stdout | 20 +++++++ benchmark/test/solver.py | 7 +++ benchmark/test/solver_distributed.py | 7 +++ benchmark/test/sparse_blas.py | 12 ++++ benchmark/test/spmv.py | 7 +++ benchmark/test/test_framework.py.in | 12 ++-- 22 files changed, 505 insertions(+), 7 deletions(-) create mode 100644 benchmark/test/reference/conversion.matrix.stderr create mode 100644 benchmark/test/reference/conversion.matrix.stdout create mode 100644 benchmark/test/reference/distributed_solver.matrix.stderr create mode 100644 benchmark/test/reference/distributed_solver.matrix.stdout create mode 100644 benchmark/test/reference/matrix_statistics.matrix.stderr create mode 100644 benchmark/test/reference/matrix_statistics.matrix.stdout create mode 100644 benchmark/test/reference/preconditioner.matrix.stderr create mode 100644 benchmark/test/reference/preconditioner.matrix.stdout create mode 100644 benchmark/test/reference/solver.matrix.stderr create mode 100644 benchmark/test/reference/solver.matrix.stdout create mode 100644 benchmark/test/reference/sparse_blas.matrix.stderr create mode 100644 benchmark/test/reference/sparse_blas.matrix.stdout create mode 100644 benchmark/test/reference/spmv.matrix.stderr create mode 100644 benchmark/test/reference/spmv.matrix.stdout diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py index cf2e33983af..2eada100731 100755 --- a/benchmark/test/conversion.py +++ b/benchmark/test/conversion.py @@ -29,6 +29,18 @@ expected_stderr="conversion.simple.stderr", ) +# input matrixfile +test_framework.compare_output( + [ + "-input_matrix", + 
str(test_framework.matrixpath), + "-formats", + "coo,csr", + ], + expected_stdout="conversion.matrix.stdout", + expected_stderr="conversion.matrix.stderr", +) + # check that all conversions work test_framework.compare_output( [ diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py index a29c80a0a7a..6e4d8b1d2f5 100755 --- a/benchmark/test/matrix_statistics.py +++ b/benchmark/test/matrix_statistics.py @@ -23,3 +23,10 @@ expected_stdout="matrix_statistics.simple.stdout", expected_stderr="matrix_statistics.simple.stderr", ) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="matrix_statistics.matrix.stdout", + expected_stderr="matrix_statistics.matrix.stderr", +) diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py index a5a8dd3f13f..e05e5b780ac 100755 --- a/benchmark/test/preconditioner.py +++ b/benchmark/test/preconditioner.py @@ -24,6 +24,13 @@ expected_stderr="preconditioner.simple.stderr", ) +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="preconditioner.matrix.stdout", + expected_stderr="preconditioner.matrix.stderr", +) + # profiler annotations test_framework.compare_output( [ diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr new file mode 100644 index 00000000000..813e04a2100 --- /dev/null +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -0,0 +1,46 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr +Benchmarking conversions. 
+Running test case +{ + "filename": "../../matrices/test/ani1.mtx", + "conversions": {} +} +Matrix is of size (36, 36) +Current state: +[ + { + "filename": "../../matrices/test/ani1.mtx", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "size": 36 + } +] +Current state: +[ + { + "filename": "../../matrices/test/ani1.mtx", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout new file mode 100644 index 00000000000..d3f62ec2b56 --- /dev/null +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -0,0 +1,19 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "conversions": { + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr new file mode 100644 index 00000000000..7eba2fc85dd --- /dev/null +++ b/benchmark/test/reference/distributed_solver.matrix.stderr @@ -0,0 +1,17 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +{ + "filename": "../../matrices/test/ani1.mtx", + "optimal": { + "spmv": "csr-csr" + }, + "solver": {} +} +Matrix is of size (36, 36) + Running solver: cg diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout new file mode 100644 index 00000000000..157c40fdccf --- /dev/null +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -0,0 +1,57 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "dense::row_gather": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_squared_norm2": 1.0, + "dense::compute_sqrt": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 27, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr new file mode 100644 index 00000000000..0f7cc261c47 --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.matrix.stderr @@ -0,0 +1,8 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running test case +{ + "filename": 
"../../matrices/test/ani1.mtx", + "problem": {} +} +Matrix is of size (36, 36) diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout new file mode 100644 index 00000000000..20657f7abd5 --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -0,0 +1,38 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "problem": { + "rows": 36, + "columns": 36, + "nonzeros": 208, + "row_distribution": { + "min": 4, + "q1": 4.5, + "median": 6.0, + "q3": 7.0, + "max": 9, + "mean": 5.777777777777778, + "variance": 2.061728395061728, + "skewness": 0.3366362745126052, + "kurtosis": 2.0507009932231366, + "hyperskewness": 1.9165991338199193, + "hyperflatness": 6.0545648993883665 + }, + "col_distribution": { + "min": 4, + "q1": 4.5, + "median": 6.0, + "q3": 7.0, + "max": 9, + "mean": 5.777777777777778, + "variance": 2.061728395061728, + "skewness": 0.3366362745126052, + "kurtosis": 2.0507009932231366, + "hyperskewness": 1.9165991338199193, + "hyperflatness": 6.0545648993883665 + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr new file mode 100644 index 00000000000..d54c99ac971 --- /dev/null +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -0,0 +1,42 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case +{ + "filename": "../../matrices/test/ani1.mtx", + "preconditioner": {} +} +Matrix is of size (36, 36) +Current state: +[ + { + "filename": "../../matrices/test/ani1.mtx", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout new file mode 100644 index 00000000000..22c0c9cc1c1 --- /dev/null +++ b/benchmark/test/reference/preconditioner.matrix.stdout @@ -0,0 +1,30 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr new file mode 100644 index 00000000000..78b75c301f7 --- /dev/null +++ b/benchmark/test/reference/solver.matrix.stderr @@ -0,0 +1,17 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +{ + "filename": "../../matrices/test/ani1.mtx", + "optimal": { + "spmv": "csr" + }, + "solver": {} +} +Matrix is of size (36, 36) + Running solver: cg diff --git 
a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout new file mode 100644 index 00000000000..4a68d8a599b --- /dev/null +++ b/benchmark/test/reference/solver.matrix.stdout @@ -0,0 +1,55 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 27, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "size": 36 + } +] diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr new file mode 100644 index 00000000000..404a761aec9 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -0,0 +1,36 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are transposeRunning test case +{ + "filename": "../../matrices/test/ani1.mtx", + "sparse_blas": {} +} +Matrix is of size (36, 36), 208 +Current state: +[ + { + "filename": "../../matrices/test/ani1.mtx", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout new file mode 100644 index 00000000000..ae983436081 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.matrix.stdout @@ -0,0 +1,25 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr new file mode 100644 index 00000000000..1096d64c74e --- /dev/null +++ b/benchmark/test/reference/spmv.matrix.stderr @@ -0,0 +1,31 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case +{ + "filename": "../../matrices/test/ani1.mtx", + "spmv": {} +} +Matrix is of size (36, 36) +Current state: +[ + { + "filename": "../../matrices/test/ani1.mtx", + "spmv": { + "coo": { + 
"storage": 3328, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "size": 36, + "nnz": 208, + "optimal": {} + } +] diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout new file mode 100644 index 00000000000..b51f331f918 --- /dev/null +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -0,0 +1,20 @@ + +[ + { + "filename": "../../matrices/test/ani1.mtx", + "spmv": { + "coo": { + "storage": 3328, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "size": 36, + "nnz": 208, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py index e974f849276..025ee92707c 100755 --- a/benchmark/test/solver.py +++ b/benchmark/test/solver.py @@ -24,6 +24,13 @@ expected_stderr="solver.simple.stderr", ) +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="solver.matrix.stdout", + expected_stderr="solver.matrix.stderr", +) + # profiler annotations test_framework.compare_output( [ diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py index c6623723a43..54bbb030077 100644 --- a/benchmark/test/solver_distributed.py +++ b/benchmark/test/solver_distributed.py @@ -27,6 +27,13 @@ expected_stderr="distributed_solver.simple.stderr", ) +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="distributed_solver.matrix.stdout", + expected_stderr="distributed_solver.matrix.stderr", +) + # profiler annotations test_framework.compare_output( [ diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py index 7b0968a710c..724cdb866f0 100755 --- a/benchmark/test/sparse_blas.py +++ b/benchmark/test/sparse_blas.py @@ -29,6 +29,18 @@ expected_stderr="sparse_blas.simple.stderr", ) +# input matrix file +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input_matrix", + str(test_framework.matrixpath), + ], + expected_stdout="sparse_blas.matrix.stdout", + expected_stderr="sparse_blas.matrix.stderr", +) + # profiler annotations (transpose has the smallest number of allocations) test_framework.compare_output( [ diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py index 6e2d9f05d49..865f74bb6d0 100755 --- a/benchmark/test/spmv.py +++ b/benchmark/test/spmv.py @@ -24,6 +24,13 @@ expected_stderr="spmv.simple.stderr", ) +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="spmv.matrix.stdout", + expected_stderr="spmv.matrix.stderr", +) + # profiler annotations test_framework.compare_output( [ diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 6037f8c594e..1c762905c77 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -9,6 +9,7 @@ import sys sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") +matrixpath = pathlib.Path("../../matrices/test/ani1.mtx") generate = False if len(sys.argv) > 2 and sys.argv[2] == "--generate": generate = True @@ -99,8 +100,7 @@ def sanitize_json_in_text(lines: List[str]) -> List[str]: for begin, end, do_sanitize in combined_pairs ] reconstructed = [ - json.dumps(sanitize_json(json.loads(t)), - indent=4) if do_sanitize else t + json.dumps(sanitize_json(json.loads(t)), indent=4) if do_sanitize else t for t, 
do_sanitize in texts ] return "\n".join(reconstructed).split("\n") @@ -135,7 +135,7 @@ def determinize_text( break if keep: output_lines.append(line) - if output_lines[-1] != "": + if len(output_lines) == 0 or output_lines[-1] != "": output_lines.append("") try: return sanitize_json_in_text(output_lines) @@ -215,8 +215,7 @@ def compare_output_impl( print("FAIL: stdout differs") print( "\n".join( - difflib.unified_diff( - expected_stdout_processed, result_stdout_processed) + difflib.unified_diff(expected_stdout_processed, result_stdout_processed) ) ) failed = True @@ -224,8 +223,7 @@ def compare_output_impl( print("FAIL: stderr differs") print( "\n".join( - difflib.unified_diff( - expected_stderr_processed, result_stderr_processed) + difflib.unified_diff(expected_stderr_processed, result_stderr_processed) ) ) failed = True From 409bd3fae0a6052132351e7501820ac20f37e85b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Aug 2023 09:54:59 +0200 Subject: [PATCH 239/583] use absolute path --- benchmark/test/reference/conversion.matrix.stderr | 6 +++--- benchmark/test/reference/conversion.matrix.stdout | 2 +- benchmark/test/reference/distributed_solver.matrix.stderr | 2 +- benchmark/test/reference/distributed_solver.matrix.stdout | 2 +- benchmark/test/reference/matrix_statistics.matrix.stderr | 2 +- benchmark/test/reference/matrix_statistics.matrix.stdout | 2 +- benchmark/test/reference/preconditioner.matrix.stderr | 4 ++-- benchmark/test/reference/preconditioner.matrix.stdout | 2 +- benchmark/test/reference/solver.matrix.stderr | 2 +- benchmark/test/reference/solver.matrix.stdout | 2 +- benchmark/test/reference/sparse_blas.matrix.stderr | 4 ++-- benchmark/test/reference/sparse_blas.matrix.stdout | 2 +- benchmark/test/reference/spmv.matrix.stderr | 4 ++-- benchmark/test/reference/spmv.matrix.stdout | 2 +- benchmark/test/test_framework.py.in | 4 ++-- 15 files changed, 21 insertions(+), 21 deletions(-) diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr index 813e04a2100..1d604175479 100644 --- a/benchmark/test/reference/conversion.matrix.stderr +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -7,14 +7,14 @@ The formats are coo,csr Benchmarking conversions. 
Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "conversions": {} } Matrix is of size (36, 36) Current state: [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "conversions": { "coo-csr": { "time": 1.0, @@ -28,7 +28,7 @@ Current state: Current state: [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "conversions": { "coo-csr": { "time": 1.0, diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout index d3f62ec2b56..e43edda0595 100644 --- a/benchmark/test/reference/conversion.matrix.stdout +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "conversions": { "coo-csr": { "time": 1.0, diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr index 7eba2fc85dd..4f0c6b22edd 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stderr +++ b/benchmark/test/reference/distributed_solver.matrix.stderr @@ -7,7 +7,7 @@ Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "optimal": { "spmv": "csr-csr" }, diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout index 157c40fdccf..34fdda13e55 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stdout +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "optimal": { "spmv": "csr-csr" }, diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr index 0f7cc261c47..af205c778c0 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stderr +++ b/benchmark/test/reference/matrix_statistics.matrix.stderr @@ -2,7 +2,7 @@ This is Ginkgo 1.7.0 (develop) running with core module 1.7.0 (develop) Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "problem": {} } Matrix is of size (36, 36) diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout index 20657f7abd5..a056241669b 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stdout +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "problem": { "rows": 36, "columns": 36, diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr index d54c99ac971..c9ef583d79e 100644 --- a/benchmark/test/reference/preconditioner.matrix.stderr +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -6,14 +6,14 @@ The random seed for right hand sides is 42 Running with preconditioners: none Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "preconditioner": {} } Matrix is of size (36, 36) Current state: [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "preconditioner": { "none": { "generate": { diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout index 22c0c9cc1c1..77979f4c54b 100644 --- a/benchmark/test/reference/preconditioner.matrix.stdout +++ 
b/benchmark/test/reference/preconditioner.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "preconditioner": { "none": { "generate": { diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr index 78b75c301f7..8a1ea117314 100644 --- a/benchmark/test/reference/solver.matrix.stderr +++ b/benchmark/test/reference/solver.matrix.stderr @@ -7,7 +7,7 @@ Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "optimal": { "spmv": "csr" }, diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout index 4a68d8a599b..6a1f8ceb959 100644 --- a/benchmark/test/reference/solver.matrix.stdout +++ b/benchmark/test/reference/solver.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "optimal": { "spmv": "csr" }, diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr index 404a761aec9..5001c604e72 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stderr +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -5,14 +5,14 @@ Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transposeRunning test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "sparse_blas": {} } Matrix is of size (36, 36), 208 Current state: [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "sparse_blas": { "transpose": { "time": 1.0, diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout index ae983436081..4a64c8ea1ce 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stdout +++ b/benchmark/test/reference/sparse_blas.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "sparse_blas": { "transpose": { "time": 1.0, diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr index 1096d64c74e..8d942cd0de5 100644 --- a/benchmark/test/reference/spmv.matrix.stderr +++ b/benchmark/test/reference/spmv.matrix.stderr @@ -7,14 +7,14 @@ The formats are coo The number of right hand sides is 1 Running test case { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "spmv": {} } Matrix is of size (36, 36) Current state: [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "spmv": { "coo": { "storage": 3328, diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout index b51f331f918..47035c27549 100644 --- a/benchmark/test/reference/spmv.matrix.stdout +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -1,7 +1,7 @@ [ { - "filename": "../../matrices/test/ani1.mtx", + "filename": "", "spmv": { "coo": { "storage": 3328, diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 1c762905c77..da1b0bfd618 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -9,7 +9,7 @@ import sys sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") binpath = pathlib.Path("@PROJECT_BINARY_DIR@") -matrixpath = pathlib.Path("../../matrices/test/ani1.mtx") +matrixpath = pathlib.Path("@PROJECT_BINARY_DIR@/matrices/test/ani1.mtx") generate = False if len(sys.argv) > 2 and sys.argv[2] == 
"--generate": generate = True @@ -22,7 +22,7 @@ denumberify_paths = [ "rhs_norm", "max_relative_norm2", ] -empty_string_paths = ["error"] +empty_string_paths = ["error", "filename"] empty_array_paths = [ "recurrent_residuals", "true_residuals", From cc9857693e7f340a7b071d35d3b49b67ae5c5bca Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Aug 2023 10:01:34 +0200 Subject: [PATCH 240/583] review updates * -input and -input_matrix are incompatible * use R-strings for JSON Co-authored-by: Marcel Koch Co-authored-by: Yuhsiang M. Tsai --- benchmark/solver/distributed/solver.cpp | 2 +- benchmark/solver/solver.cpp | 2 +- benchmark/utils/general_matrix.hpp | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index 8b285e343ce..a9b1f9c1c93 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -98,7 +98,7 @@ int main(int argc, char* argv[]) "-", where both "local_format" and "non_local_format" can be any of the recognized spmv formats. )"; - std::string additional_json = ",\"optimal\":{\"spmv\":\"csr-csr\"}"; + std::string additional_json = R"(,"optimal":{"spmv":"csr-csr"})"; initialize_argument_parsing_matrix(&argc, &argv, header, format, additional_json); diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 910bb54d89a..4efc5558a8e 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -61,7 +61,7 @@ int main(int argc, char* argv[]) std::string format = example_config + R"( "optimal":"spmv" can be one of the recognized spmv formats )"; - std::string additional_json = ",\"optimal\":{\"spmv\":\"csr\"}"; + std::string additional_json = R"(,"optimal":{"spmv":"csr"})"; initialize_argument_parsing_matrix(&argc, &argv, header, format, additional_json); diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index 3791976e3ab..e499d5d9326 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -62,11 +62,16 @@ void initialize_argument_parsing_matrix( initialize_argument_parsing(argc, argv, header, format); std::string input_matrix_str{FLAGS_input_matrix}; if (!input_matrix_str.empty()) { - auto input_json = "[{\"filename\":\"" + input_matrix_str + "\"" + + if (input_stream) { + std::cerr + << "-input and -input_matrix cannot be used simultaneously\n"; + std::exit(1); + } + auto input_json = R"([{"filename":")" + input_matrix_str + "\"" + additional_matrix_file_json + "}]"; input_stream = std::make_unique(input_json); } } -#endif // GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ \ No newline at end of file +#endif // GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ From 62de06bb59f3ac1f186d6db4014bc60436fb27bf Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Aug 2023 11:56:12 +0200 Subject: [PATCH 241/583] escape file path using rapidJSON --- benchmark/utils/general_matrix.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index e499d5d9326..2049dadf45f 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -67,9 +67,17 @@ void initialize_argument_parsing_matrix( << "-input and -input_matrix cannot be used simultaneously\n"; std::exit(1); } - auto input_json = R"([{"filename":")" + input_matrix_str + "\"" + - additional_matrix_file_json + "}]"; - input_stream = std::make_unique(input_json); + // 
create JSON for the filename via RapidJSON to ensure the string is + // correctly escaped + rapidjson::Document d; + auto json_template = + R"([{"filename":"")" + additional_matrix_file_json + "}]"; + d.Parse(json_template.c_str()); + d[0]["filename"].SetString(input_matrix_str.c_str(), d.GetAllocator()); + rapidjson::StringBuffer sb; + rapidjson::PrettyWriter writer(sb); + d.Accept(writer); + input_stream = std::make_unique(sb.GetString()); } } From 7d25218ef72856d8e50611aba585d5403f1b1a13 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 18 Aug 2023 14:30:21 +0200 Subject: [PATCH 242/583] fix missing $ in CI_COMMIT_TAG --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1cd8c0335f8..94dedd030c6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,7 @@ include: # [1] https://gitlab.com/gitlab-org/gitlab/-/issues/194023#note_1225906002 - local: '.gitlab/add-interrupt.yml' rules: - - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" && CI_COMMIT_TAG !~ /^v\d+\.\d+\.\d+/ + - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" && $CI_COMMIT_TAG !~ /^v\d+\.\d+\.\d+/ sync: stage: sync From e9a54182f20008bd715cf16e47f8b2fa4bfd87e6 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 22 Aug 2023 23:08:32 +0200 Subject: [PATCH 243/583] fix median case --- benchmark/utils/timer_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp index 888cb496248..a6b9d968713 100644 --- a/benchmark/utils/timer_impl.hpp +++ b/benchmark/utils/timer_impl.hpp @@ -111,7 +111,8 @@ class Timer { return copy.back(); } else if (method == "median") { auto mid = copy.size() / 2; - if (copy.size() % 2) { + if (copy.size() % 2 == 0) { + // contains even elements return (copy.at(mid) + copy.at(mid - 1)) / 2; } else { return copy.at(mid); From 1e1f6dd566adbf6bf2cc1279cf614fcc207a7a49 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Jul 2023 23:45:18 +0200 Subject: [PATCH 244/583] nlohmann_json refactor --- CMakeLists.txt | 2 +- benchmark/CMakeLists.txt | 4 +- benchmark/blas/blas.cpp | 21 +- benchmark/blas/blas_common.hpp | 247 +-- benchmark/blas/distributed/multi_vector.cpp | 30 +- .../CMakeLists.txt | 2 +- benchmark/conversion/conversion.cpp | 194 ++ benchmark/conversions/conversions.cpp | 223 -- .../matrix_generator/matrix_generator.cpp | 36 +- .../matrix_statistics/matrix_statistics.cpp | 183 +- benchmark/preconditioner/preconditioner.cpp | 238 +-- benchmark/solver/distributed/solver.cpp | 32 +- benchmark/solver/solver.cpp | 21 +- benchmark/solver/solver_common.hpp | 406 ++-- benchmark/sparse_blas/operations.cpp | 13 +- benchmark/sparse_blas/operations.hpp | 8 +- benchmark/sparse_blas/sparse_blas.cpp | 191 +- benchmark/spmv/distributed/spmv.cpp | 67 +- benchmark/spmv/spmv.cpp | 33 +- benchmark/spmv/spmv_common.hpp | 289 ++- benchmark/test/reference/blas.profile.stderr | 69 +- benchmark/test/reference/blas.simple.stderr | 69 +- .../test/reference/conversion.all.stderr | 1862 +---------------- .../test/reference/conversion.all.stdout | 74 +- .../test/reference/conversion.matrix.stderr | 42 +- .../test/reference/conversion.matrix.stdout | 16 +- .../test/reference/conversion.profile.stderr | 98 +- .../test/reference/conversion.profile.stdout | 19 +- .../test/reference/conversion.simple.stderr | 42 +- .../test/reference/conversion.simple.stdout | 19 +- .../distributed_solver.matrix.stdout | 3 +- 
.../distributed_solver.profile.stderr | 8 +- .../distributed_solver.profile.stdout | 6 +- .../distributed_solver.simple.stdout | 6 +- .../reference/matrix_statistics.matrix.stderr | 2 +- .../reference/matrix_statistics.matrix.stdout | 4 +- .../reference/matrix_statistics.simple.stderr | 2 +- .../reference/matrix_statistics.simple.stdout | 7 +- .../reference/preconditioner.matrix.stderr | 33 +- .../reference/preconditioner.matrix.stdout | 4 +- .../reference/preconditioner.profile.stderr | 29 +- .../reference/preconditioner.profile.stdout | 7 +- .../reference/preconditioner.simple.stderr | 33 +- .../reference/preconditioner.simple.stdout | 7 +- benchmark/test/reference/solver.matrix.stdout | 3 +- .../test/reference/solver.profile.stderr | 8 +- .../test/reference/solver.profile.stdout | 6 +- benchmark/test/reference/solver.simple.stdout | 6 +- .../test/reference/sparse_blas.matrix.stderr | 29 +- .../test/reference/sparse_blas.profile.stderr | 23 +- .../test/reference/sparse_blas.simple.stderr | 30 +- benchmark/test/reference/spmv.matrix.stderr | 21 +- benchmark/test/reference/spmv.matrix.stdout | 5 +- benchmark/test/reference/spmv.profile.stderr | 32 +- benchmark/test/reference/spmv.profile.stdout | 6 +- benchmark/test/reference/spmv.simple.stderr | 21 +- benchmark/test/reference/spmv.simple.stdout | 6 +- benchmark/utils/general.hpp | 342 +-- benchmark/utils/general_matrix.hpp | 18 +- benchmark/utils/generator.hpp | 118 +- benchmark/utils/iteration_control.hpp | 326 +++ benchmark/utils/json.hpp | 63 +- benchmark/utils/loggers.hpp | 100 +- benchmark/utils/runner.hpp | 209 ++ benchmark/utils/spmv_validation.hpp | 83 - third_party/CMakeLists.txt | 4 +- third_party/nlohmann_json/CMakeLists.txt | 9 + third_party/rapidjson/CMakeLists.txt | 14 - 68 files changed, 1864 insertions(+), 4319 deletions(-) rename benchmark/{conversions => conversion}/CMakeLists.txt (88%) create mode 100644 benchmark/conversion/conversion.cpp delete mode 100644 benchmark/conversions/conversions.cpp create mode 100644 benchmark/utils/iteration_control.hpp create mode 100644 benchmark/utils/runner.hpp delete mode 100644 benchmark/utils/spmv_validation.hpp create mode 100644 third_party/nlohmann_json/CMakeLists.txt delete mode 100644 third_party/rapidjson/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 26bc992c457..fab64e43c76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,7 +256,7 @@ if(GINKGO_BUILD_TESTS) endif() if(GINKGO_BUILD_BENCHMARKS) find_package(gflags 2.2.2 QUIET) - find_package(RapidJSON 1.1.0 QUIET) + find_package(nlohmann_json 3.9.1 QUIET) endif() # System provided, third party libraries (not bundled!) 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 44a0a3d1d9e..e993ee6cf0c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -57,7 +57,7 @@ endfunction() # All remaining arguments will be treated as source files function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def type) add_executable("${name}" ${ARGN}) - target_link_libraries("${name}" ginkgo gflags rapidjson) + target_link_libraries("${name}" ginkgo gflags nlohmann_json::nlohmann_json) # always include the device timer if (GINKGO_BUILD_CUDA) target_compile_definitions("${name}" PRIVATE HAS_CUDA_TIMER=1) @@ -149,7 +149,7 @@ if (GINKGO_BUILD_MPI) endif() add_subdirectory(blas) -add_subdirectory(conversions) +add_subdirectory(conversion) add_subdirectory(matrix_generator) add_subdirectory(matrix_statistics) add_subdirectory(preconditioner) diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 11228ed5818..f7ad8120a80 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -130,26 +130,17 @@ Parameters for a benchmark case are: stride_B: stride for B matrix in gemm (optional, default m) stride_C: stride for C matrix in gemm (optional, default m) )"; - std::string format = example_config; + std::string format = Generator::get_example_config(); initialize_argument_parsing(&argc, &argv, header, format); - std::string extra_information = - "The operations are " + FLAGS_operations + "\n"; + std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - std::cerr - << "Input has to be a JSON array of benchmark configurations:\n" - << format; - std::exit(1); - } + auto test_cases = json::parse(get_input_stream()); - run_blas_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), operation_map, - test_cases, true); + run_test_cases(BlasBenchmark{operation_map}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp index fe0110f82fb..88819a043b0 100644 --- a/benchmark/blas/blas_common.hpp +++ b/benchmark/blas/blas_common.hpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
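Since the benchmarks now parse their test cases with nlohmann_json instead of RapidJSON (see the CMake and blas.cpp changes above), a small self-contained sketch of that parse/pretty-print round trip may be useful; the "blas"/"axpy" result entry is a hypothetical placeholder, not taken from the diff:

    #include <iomanip>
    #include <iostream>
    #include <sstream>

    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main()
    {
        // same shape as the blas example config shown above
        std::istringstream input(R"([{"n": 100}, {"n": 200, "m": 200, "k": 200}])");
        // json::parse accepts any std::istream, e.g. get_input_stream() or std::cin
        auto test_cases = json::parse(input);
        // nested result objects are created on first access via operator[]
        test_cases[0]["blas"]["axpy"]["completed"] = true;
        // std::setw sets the indentation width used when streaming the document
        std::cout << std::setw(4) << test_cases << std::endl;
    }
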
#include "benchmark/utils/general.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -70,14 +72,6 @@ DEFINE_string( "C has dimensions n x m and x and y have dimensions n x r"); -std::string example_config = R"( - [ - { "n": 100 }, - { "n": 200, "m": 200, "k": 200 } - ] -)"; - - class BenchmarkOperation { public: virtual ~BenchmarkOperation() = default; @@ -404,70 +398,101 @@ struct dimensions { }; -dimensions parse_dims(rapidjson::Value& test_case) -{ - auto get_optional = [](rapidjson::Value& obj, const char* name, - gko::size_type default_value) -> gko::size_type { - if (obj.HasMember(name)) { - return obj[name].GetUint64(); - } else { - return default_value; - } - }; - - dimensions result; - result.n = test_case["n"].GetInt64(); - result.k = get_optional(test_case, "k", result.n); - result.m = get_optional(test_case, "m", result.n); - result.r = get_optional(test_case, "r", 1); - if (test_case.HasMember("stride")) { - result.stride_x = test_case["stride"].GetInt64(); - result.stride_y = result.stride_x; - } else { - result.stride_x = get_optional(test_case, "stride_x", result.r); - result.stride_y = get_optional(test_case, "stride_y", result.r); +struct BlasBenchmark : Benchmark { + using map_type = + std::map( + std::shared_ptr, dimensions)>>; + map_type operation_map; + std::vector operations; + std::string name; + bool do_print; + + BlasBenchmark(map_type operation_map, bool do_print = true) + : operation_map{std::move(operation_map)}, + name{"blas"}, + operations{split(FLAGS_operations)}, + do_print{do_print} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return operations; } - result.stride_A = get_optional(test_case, "stride_A", result.k); - result.stride_B = get_optional(test_case, "stride_B", result.m); - result.stride_C = get_optional(test_case, "stride_C", result.m); - return result; -} + bool should_print() const override { return do_print; } -std::string describe(rapidjson::Value& test_case) -{ - std::stringstream ss; - auto optional_output = [&](const char* name) { - if (test_case.HasMember(name) && test_case[name].IsInt64()) { - ss << name << " = " << test_case[name].GetInt64() << " "; - } - }; - optional_output("n"); - optional_output("k"); - optional_output("m"); - optional_output("r"); - optional_output("stride"); - optional_output("stride_x"); - optional_output("stride_y"); - optional_output("stride_A"); - optional_output("stride_B"); - optional_output("stride_C"); - return ss.str(); -} + std::string get_example_config() const override + { + return json::parse(R"([{"n": 100}, {"n": 200, "m": 200, "k": 200}])") + .dump(4); + } + bool validate_config(const json& value) const override + { + return value.contains("n") && value["n"].is_number_integer(); + } -template -void apply_blas(const char* operation_name, std::shared_ptr exec, - std::shared_ptr timer, const OpMap& operation_map, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& blas_case = test_case["blas"]; - add_or_set_member(blas_case, operation_name, - rapidjson::Value(rapidjson::kObjectType), allocator); + std::string describe_config(const json& test_case) const override + { + std::stringstream ss; + auto optional_output = [&](const char* name) { + if (test_case.contains(name) 
&& + test_case[name].is_number_integer()) { + ss << name << " = " << test_case[name].get() << " "; + } + }; + optional_output("n"); + optional_output("k"); + optional_output("m"); + optional_output("r"); + optional_output("stride"); + optional_output("stride_x"); + optional_output("stride_y"); + optional_output("stride_A"); + optional_output("stride_B"); + optional_output("stride_C"); + return ss.str(); + } + + dimensions setup(std::shared_ptr exec, + json& test_case) const override + { + auto get_optional = [](json& obj, const char* name, + gko::size_type default_value) -> gko::size_type { + if (obj.contains(name)) { + return obj[name].get(); + } else { + return default_value; + } + }; + + dimensions result; + result.n = test_case["n"].get(); + result.k = get_optional(test_case, "k", result.n); + result.m = get_optional(test_case, "m", result.n); + result.r = get_optional(test_case, "r", 1); + if (test_case.contains("stride")) { + result.stride_x = test_case["stride"].get(); + result.stride_y = result.stride_x; + } else { + result.stride_x = get_optional(test_case, "stride_x", result.r); + result.stride_y = get_optional(test_case, "stride_y", result.r); + } + result.stride_A = get_optional(test_case, "stride_A", result.k); + result.stride_B = get_optional(test_case, "stride_B", result.m); + result.stride_C = get_optional(test_case, "stride_C", result.m); + return result; + } - auto op = operation_map.at(operation_name)(exec, parse_dims(test_case)); + + void run(std::shared_ptr exec, std::shared_ptr timer, + dimensions& dims, const std::string& operation_name, + json& operation_case) const override + { + auto op = operation_map.at(operation_name)(exec, dims); IterationControl ic(timer); @@ -488,89 +513,9 @@ void apply_blas(const char* operation_name, std::shared_ptr exec, const auto flops = static_cast(op->get_flops()); const auto mem = static_cast(op->get_memory()); const auto repetitions = ic.get_num_repetitions(); - add_or_set_member(blas_case[operation_name], "time", runtime, - allocator); - add_or_set_member(blas_case[operation_name], "flops", flops / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "repetitions", repetitions, - allocator); - - // compute and write benchmark data - add_or_set_member(blas_case[operation_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["blas"][operation_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["blas"][operation_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -template -void run_blas_benchmarks(std::shared_ptr exec, - std::shared_ptr timer, - const OpMap& operation_map, - rapidjson::Document& test_cases, bool do_print) -{ - auto operations = split(FLAGS_operations, ','); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); + operation_case["time"] = runtime; + operation_case["flops"] = flops / runtime; + operation_case["bandwidth"] = mem / runtime; + operation_case["repetitions"] = repetitions; } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - if 
(!test_case.HasMember("blas")) { - test_case.AddMember("blas", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& blas_case = test_case["blas"]; - if (!FLAGS_overwrite && - all_of(begin(operations), end(operations), - [&blas_case](const std::string& s) { - return blas_case.HasMember(s.c_str()); - })) { - continue; - } - if (do_print) { - std::clog << "Running test case\n" << test_case << std::endl; - } - // annotate the test case - auto test_case_range = annotate(describe(test_case)); - for (const auto& operation_name : operations) { - { - auto operation_range = annotate(operation_name.c_str()); - apply_blas(operation_name.c_str(), exec, timer, - operation_map, test_case, allocator); - } - - if (do_print) { - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - - backup_results(test_cases); - } - } - } catch (const std::exception& e) { - std::cerr << "Error setting up benchmark, what(): " << e.what() - << std::endl; - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index be326b08b96..d95e5fb38ac 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -50,6 +50,10 @@ int main(int argc, char* argv[]) { gko::experimental::mpi::environment mpi_env{argc, argv}; + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + const auto do_print = rank == 0; + std::string header = R"(" A benchmark for measuring performance of Ginkgo's BLAS-like " operations. @@ -60,13 +64,10 @@ Parameters for a benchmark case are: stride_x: stride for input vector x (optional, default r) stride_y: stride for in/out vector y (optional, default r) )"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + std::string format = Generator::get_example_config(); + initialize_argument_parsing(&argc, &argv, header, format, do_print); - const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - const auto rank = comm.rank(); - - if (rank == 0) { + if (do_print) { std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); @@ -75,14 +76,7 @@ Parameters for a benchmark case are: auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); std::string json_input = broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - if (!test_cases.IsArray()) { - std::cerr - << "Input has to be a JSON array of benchmark configurations:\n" - << format; - std::exit(1); - } + auto test_cases = json::parse(json_input); std::map( @@ -130,10 +124,10 @@ Parameters for a benchmark case are: exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y); }}}; - run_blas_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), - operation_map, test_cases, rank == 0); + run_test_cases(BlasBenchmark{operation_map, do_print}, exec, + get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/conversions/CMakeLists.txt b/benchmark/conversion/CMakeLists.txt similarity index 88% rename from benchmark/conversions/CMakeLists.txt rename to benchmark/conversion/CMakeLists.txt index 21dd363d3c0..7ecf578c055 100644 --- 
a/benchmark/conversions/CMakeLists.txt +++ b/benchmark/conversion/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(conversion "NO" conversions.cpp) +ginkgo_add_typed_benchmark_executables(conversion "NO" conversion.cpp) diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp new file mode 100644 index 00000000000..b9a5d5c46d6 --- /dev/null +++ b/benchmark/conversion/conversion.cpp @@ -0,0 +1,194 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general_matrix.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" +#include "benchmark/utils/runner.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" + + +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + +using Generator = DefaultSystemGenerator<>; + + +struct ConversionBenchmark : Benchmark> { + std::string name; + std::vector operations; + + ConversionBenchmark() : name{"conversion"} + { + auto ref_exec = gko::ReferenceExecutor::create(); + auto formats = split(FLAGS_formats); + for (const auto& from_format : formats) { + operations.push_back(from_format + "-read"); + auto from_mtx = + formats::matrix_type_factory.at(from_format)(ref_exec); + // all pairs of conversions that are supported by Ginkgo + for (const auto& to_format : formats) { + if (from_format == to_format) { + continue; + } + auto to_mtx = + formats::matrix_type_factory.at(to_format)(ref_exec); + try { + to_mtx->copy_from(from_mtx); + operations.push_back(from_format + "-" + to_format); + } catch (const std::exception& e) { + } + } + } + } + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return operations; + } + + bool should_print() const override { return true; } + + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + bool validate_config(const json& test_case) const override + { + return Generator::validate_config(test_case); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + gko::matrix_data setup(std::shared_ptr exec, + json& test_case) const override + { + gko::matrix_data data; + data = Generator::generate_matrix_data(test_case); + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + return data; + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + gko::matrix_data& data, + const std::string& operation_name, + json& operation_case) const override + { + auto split_it = + std::find(operation_name.begin(), operation_name.end(), '-'); + std::string from_name{operation_name.begin(), split_it}; + std::string to_name{split_it + 1, operation_name.end()}; + auto mtx_from = formats::matrix_type_factory.at(from_name)(exec); + auto readable = + gko::as>(mtx_from.get()); + IterationControl ic{timer}; + if (to_name == "read") { + // warm run + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + readable->read(data); + exec->synchronize(); + } + // timed run + for (auto _ : ic.run()) { + readable->read(data); + } + } else { + readable->read(data); + auto mtx_to = formats::matrix_type_factory.at(to_name)(exec); + + // warm run + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + mtx_to->copy_from(mtx_from); + exec->synchronize(); + } + // timed run + for (auto _ : ic.run()) { + mtx_to->copy_from(mtx_from); + } + } + operation_case["time"] = ic.compute_time(FLAGS_timer_method); + operation_case["repetitions"] = 
ic.get_num_repetitions(); + } +}; + + +int main(int argc, char* argv[]) +{ + std::string header = + "A benchmark for measuring performance of Ginkgo's conversions.\n"; + std::string format_str = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format_str); + + std::string extra_information = + std::string() + "The formats are " + FLAGS_formats; + print_general_information(extra_information); + + auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); + auto formats = split(FLAGS_formats, ','); + + auto test_cases = json::parse(get_input_stream()); + + run_test_cases(ConversionBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); + + std::cout << std::setw(4) << test_cases << std::endl; +} diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp deleted file mode 100644 index d9684321e2d..00000000000 --- a/benchmark/conversions/conversions.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ - -#include - - -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general_matrix.hpp" -#include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" -#include "benchmark/utils/timer.hpp" -#include "benchmark/utils/types.hpp" - - -#ifdef GINKGO_BENCHMARK_ENABLE_TUNING -#include "benchmark/utils/tuning_variables.hpp" -#endif // GINKGO_BENCHMARK_ENABLE_TUNING - - -// This function supposes that management of `FLAGS_overwrite` is done before -// calling it -void convert_matrix(const gko::LinOp* matrix_from, const char* format_to, - const char* conversion_name, - std::shared_ptr exec, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& conversion_case = test_case["conversions"]; - add_or_set_member(conversion_case, conversion_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - - gko::matrix_data data{gko::dim<2>{1, 1}, 1}; - auto matrix_to = share(formats::matrix_factory(format_to, exec, data)); - - auto timer = get_timer(exec, FLAGS_gpu_timer); - IterationControl ic{timer}; - - // warm run - for (auto _ : ic.warmup_run()) { - exec->synchronize(); - matrix_to->copy_from(matrix_from); - exec->synchronize(); - matrix_to->clear(); - } - // timed run - for (auto _ : ic.run()) { - matrix_to->copy_from(matrix_from); - } - add_or_set_member(conversion_case[conversion_name], "time", - ic.compute_time(FLAGS_timer_method), allocator); - add_or_set_member(conversion_case[conversion_name], "repetitions", - ic.get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(conversion_case[conversion_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["conversions"][conversion_name], - "completed", false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["conversions"][conversion_name], - "error", msg_value, allocator); - } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -int main(int argc, char* argv[]) -{ - std::string header = - "A benchmark for measuring performance of Ginkgo's conversions.\n"; - std::string format_str = example_config; - initialize_argument_parsing_matrix(&argc, &argv, header, format_str); - - std::string extra_information = - std::string() + "The formats are " + FLAGS_formats + "\n"; - print_general_information(extra_information); - - auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto formats = split(FLAGS_formats, ','); - - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } - - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - std::clog << "Benchmarking conversions. 
" << std::endl; - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("conversions")) { - test_case.AddMember("conversions", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& conversion_case = test_case["conversions"]; - - std::clog << "Running test case\n" << test_case << std::endl; - gko::matrix_data data; - try { - data = generator.generate_matrix_data(test_case); - } catch (std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } - continue; - } - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << ")" << std::endl; - add_or_set_member(test_case, "size", data.size[0], allocator); - // annotate the test case - auto test_case_range = annotate(generator.describe_config(test_case)); - for (const auto& format_from : formats) { - try { - auto matrix_from = - share(formats::matrix_factory(format_from, exec, data)); - for (const auto& format_to : formats) { - if (format_from == format_to) { - continue; - } - auto conversion_name = - std::string(format_from) + "-" + format_to; - - if (!FLAGS_overwrite && - conversion_case.HasMember(conversion_name.c_str())) { - continue; - } - { - auto conversion_range = - annotate(conversion_name.c_str()); - convert_matrix(matrix_from.get(), format_to.c_str(), - conversion_name.c_str(), exec, test_case, - allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - } - backup_results(test_cases); - } catch (const gko::AllocationError& e) { - for (const auto& format : formats::matrix_type_factory) { - const auto format_to = std::get<0>(format); - auto conversion_name = - std::string(format_from) + "-" + format_to; - add_or_set_member( - test_case["conversions"][conversion_name.c_str()], - "completed", false, allocator); - } - std::cerr << "Error when allocating data for type " - << format_from << ". 
what(): " << e.what() - << std::endl; - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error when running benchmark, what(): " - << e.what() << std::endl; - } - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } - - std::cout << test_cases << std::endl; -} diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index 138b5a9c2ce..193d95f897f 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -85,31 +85,33 @@ std::string input_format = // clang-format on -void validate_option_object(const rapidjson::Value& value) +void validate_option_object(const json& value) { - if (!value.IsObject() || !value.HasMember("filename") || - !value["filename"].IsString() || !value.HasMember("problem") || - !value["problem"].IsObject() || !value["problem"].HasMember("type") || - !value["problem"]["type"].IsString()) { + if (!value.is_object() || !value.contains("filename") || + !value["filename"].is_string() || !value.contains("problem") || + !value["problem"].is_object() || !value["problem"].contains("type") || + !value["problem"]["type"].is_string()) { print_config_error_and_exit(2); } } using generator_function = std::function( - rapidjson::Value&, std::default_random_engine&)>; + json&, std::default_random_engine&)>; // matrix generators gko::matrix_data generate_block_diagonal( - rapidjson::Value& config, std::default_random_engine& engine) + json& config, std::default_random_engine& engine) { - if (!config.HasMember("num_blocks") || !config["num_blocks"].IsUint() || - !config.HasMember("block_size") || !config["block_size"].IsUint()) { + if (!config.contains("num_blocks") || + !config["num_blocks"].is_number_unsigned() || + !config.contains("block_size") || + !config["block_size"].is_number_unsigned()) { print_config_error_and_exit(2); } - auto num_blocks = config["num_blocks"].GetUint(); - auto block_size = config["block_size"].GetUint(); + auto num_blocks = config["num_blocks"].get(); + auto block_size = config["block_size"].get(); auto block = gko::matrix_data( gko::dim<2>(block_size), std::uniform_real_distribution(-1.0, 1.0), engine); @@ -132,20 +134,18 @@ int main(int argc, char* argv[]) std::clog << gko::version_info::get() << std::endl; auto engine = get_engine(); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document configurations; - configurations.ParseStream(jcin); + auto configurations = json::parse(get_input_stream()); - if (!configurations.IsArray()) { + if (!configurations.is_array()) { print_config_error_and_exit(1); } - for (auto& config : configurations.GetArray()) { + for (auto& config : configurations) { try { validate_option_object(config); std::clog << "Generating matrix: " << config << std::endl; - auto filename = config["filename"].GetString(); - auto type = config["problem"]["type"].GetString(); + auto filename = config["filename"].get(); + auto type = config["problem"]["type"].get(); auto mdata = generator[type](config["problem"], engine); std::ofstream ofs(filename); gko::write_raw(ofs, mdata, gko::layout_type::coordinate); diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index fccf4391ad5..40c505c7627 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -38,9 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/types.hpp" @@ -51,9 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // See en.wikipedia.org/wiki/Five-number_summary // Quartile computation uses Method 3 from en.wikipedia.org/wiki/Quartile -void compute_summary(const std::vector& dist, - rapidjson::Value& out, - rapidjson::MemoryPoolAllocator<>& allocator) +void compute_summary(const std::vector& dist, json& out) { const auto q = dist.size() / 4; const auto r = dist.size() % 4; @@ -72,23 +73,14 @@ void compute_summary(const std::vector& dist, }; // clang-format on - add_or_set_member(out, "min", dist[0], allocator); - add_or_set_member( - out, "q1", - coefs[r][0] * static_cast(dist[positions[r][0]]) + - coefs[r][1] * static_cast(dist[positions[r][1]]), - allocator); - add_or_set_member( - out, "median", - coefs[r][2] * static_cast(dist[positions[r][2]]) + - coefs[r][3] * static_cast(dist[positions[r][3]]), - allocator); - add_or_set_member( - out, "q3", - coefs[r][4] * static_cast(dist[positions[r][4]]) + - coefs[r][5] * static_cast(dist[positions[r][5]]), - allocator); - add_or_set_member(out, "max", dist[dist.size() - 1], allocator); + out["min"] = dist.front(); + out["q1"] = coefs[r][0] * static_cast(dist[positions[r][0]]) + + coefs[r][1] * static_cast(dist[positions[r][1]]); + out["median"] = coefs[r][2] * static_cast(dist[positions[r][2]]) + + coefs[r][3] * static_cast(dist[positions[r][3]]); + out["q3"] = coefs[r][4] * static_cast(dist[positions[r][4]]) + + coefs[r][5] * static_cast(dist[positions[r][5]]); + out["max"] = dist.back(); } @@ -108,39 +100,30 @@ double compute_moment(int degree, const std::vector& dist, // See en.wikipedia.org/wiki/Moment_(mathematics) -void compute_moments(const std::vector& dist, - rapidjson::Value& out, - rapidjson::MemoryPoolAllocator<>& allocator) +void compute_moments(const std::vector& dist, json& out) { const auto mean = compute_moment(1, dist); - add_or_set_member(out, "mean", mean, allocator); + out["mean"] = mean; const auto variance = compute_moment(2, dist, mean); - add_or_set_member(out, "variance", variance, allocator); + out["variance"] = variance; const auto dev = std::sqrt(variance); - add_or_set_member(out, "skewness", compute_moment(3, dist, mean, dev), - allocator); - add_or_set_member(out, "kurtosis", compute_moment(4, dist, mean, dev), - allocator); - add_or_set_member(out, "hyperskewness", compute_moment(5, dist, mean, dev), - allocator); - add_or_set_member(out, "hyperflatness", compute_moment(6, dist, mean, dev), - allocator); + out["skewness"] = compute_moment(3, dist, mean, dev); + out["kurtosis"] = compute_moment(4, dist, mean, dev); + out["hyperskewness"] = compute_moment(5, dist, mean, dev); + out["hyperflatness"] = compute_moment(6, dist, mean, dev); } -template void compute_distribution_properties(const std::vector& dist, - rapidjson::Value& out, - Allocator& allocator) + json& out) { - compute_summary(dist, out, allocator); - compute_moments(dist, out, allocator); + compute_summary(dist, out); + compute_moments(dist, out); } -template void extract_matrix_statistics(gko::matrix_data& data, - rapidjson::Value& problem, Allocator& allocator) + json& problem) { std::vector row_dist(data.size[0]); std::vector col_dist(data.size[1]); @@ -149,72 +132,90 @@ void extract_matrix_statistics(gko::matrix_data& data, 
++col_dist[v.column]; } - add_or_set_member(problem, "rows", data.size[0], allocator); - add_or_set_member(problem, "columns", data.size[1], allocator); - add_or_set_member(problem, "nonzeros", data.nonzeros.size(), allocator); + problem["rows"] = data.size[0]; + problem["columns"] = data.size[1]; + problem["nonzeros"] = data.nonzeros.size(); std::sort(begin(row_dist), end(row_dist)); - add_or_set_member(problem, "row_distribution", - rapidjson::Value(rapidjson::kObjectType), allocator); - compute_distribution_properties(row_dist, problem["row_distribution"], - allocator); + problem["row_distribution"] = json::object(); + compute_distribution_properties(row_dist, problem["row_distribution"]); std::sort(begin(col_dist), end(col_dist)); - add_or_set_member(problem, "col_distribution", - rapidjson::Value(rapidjson::kObjectType), allocator); - compute_distribution_properties(col_dist, problem["col_distribution"], - allocator); + problem["col_distribution"] = json::object(); + compute_distribution_properties(col_dist, problem["col_distribution"]); } -int main(int argc, char* argv[]) -{ - std::string header = - "A utility that collects additional statistical properties of the " - "matrix.\n"; - std::string format = example_config; - initialize_argument_parsing_matrix(&argc, &argv, header, format); +using Generator = DefaultSystemGenerator; - std::clog << gko::version_info::get() << std::endl; - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } +struct MatrixStatistics : Benchmark { + std::string name; + std::vector empty; - auto& allocator = test_cases.GetAllocator(); + MatrixStatistics() : name{"problem"} {} - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("problem")) { - test_case.AddMember("problem", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& problem = test_case["problem"]; + const std::string& get_name() const override { return name; } - std::clog << "Running test case\n" << test_case << std::endl; + const std::vector& get_operations() const override + { + return empty; + } - auto matrix = - DefaultSystemGenerator::generate_matrix_data( - test_case); + bool should_print() const override { return true; } - std::clog << "Matrix is of size (" << matrix.size[0] << ", " - << matrix.size[1] << ")" << std::endl; - add_or_set_member(test_case, "size", matrix.size[0], allocator); + std::string get_example_config() const override + { + return Generator::get_example_config(); + } - extract_matrix_statistics(matrix, test_case["problem"], allocator); + bool validate_config(const json& test_case) const override + { + return Generator::validate_config(test_case); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error extracting statistics, what(): " << e.what() - << std::endl; - } + int setup(std::shared_ptr exec, + json& test_case) const override + { + auto data = Generator::generate_matrix_data(test_case); + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + + extract_matrix_statistics(data, 
test_case["problem"]); + return 0; } - std::cout << test_cases << std::endl; + + void run(std::shared_ptr exec, std::shared_ptr timer, + int& data, const std::string& operation_name, + json& operation_case) const override + {} +}; + + +int main(int argc, char* argv[]) +{ + std::string header = + "A utility that collects additional statistical properties of the " + "matrix.\n"; + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format); + + std::clog << gko::version_info::get() << std::endl; + + auto test_cases = json::parse(get_input_stream()); + auto exec = gko::ReferenceExecutor::create(); + + run_test_cases(MatrixStatistics{}, exec, get_timer(exec, false), + test_cases); + + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index e7859e992dc..7c130328d34 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -43,9 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -128,34 +129,85 @@ std::string encode_parameters(const char* precond_name) } -void run_preconditioner(const char* precond_name, - std::shared_ptr exec, - std::shared_ptr system_matrix, - const vec* b, const vec* x, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& precond_object = test_case["preconditioner"]; - auto encoded_name = encode_parameters(precond_name); +struct preconditioner_benchmark_state { + std::unique_ptr x; + std::unique_ptr b; + std::shared_ptr system_matrix; +}; + + +using Generator = DefaultSystemGenerator<>; + - if (!FLAGS_overwrite && - precond_object.HasMember(encoded_name.c_str())) { - return; +struct PreconditionerBenchmark : Benchmark { + std::string name; + std::vector preconditioners; + std::map precond_decoder; + + PreconditionerBenchmark() + : name{"preconditioner"}, preconditioners{split(FLAGS_preconditioners)} + { + for (auto precond : split(FLAGS_preconditioners)) { + preconditioners.push_back(encode_parameters(precond.c_str())); + precond_decoder[preconditioners.back()] = precond; } + } + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return preconditioners; + } + + bool should_print() const override { return true; } + + bool validate_config(const json& value) const override + { + return Generator::validate_config(value); + } - add_or_set_member(precond_object, encoded_name.c_str(), - rapidjson::Value(rapidjson::kObjectType), allocator); - auto& this_precond_data = precond_object[encoded_name.c_str()]; + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } - add_or_set_member(this_precond_data, "generate", - rapidjson::Value(rapidjson::kObjectType), allocator); - add_or_set_member(this_precond_data, "apply", - 
rapidjson::Value(rapidjson::kObjectType), allocator); + preconditioner_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + preconditioner_benchmark_state state; + auto data = Generator::generate_matrix_data(test_case); + + state.system_matrix = + formats::matrix_factory(FLAGS_formats, exec, data); + state.b = Generator::create_multi_vector_random(exec, data.size[0]); + state.x = Generator::create_multi_vector(exec, data.size[0], + gko::zero()); + + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + return state; + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + preconditioner_benchmark_state& state, + const std::string& encoded_precond_name, + json& precond_case) const override + { + auto decoded_precond_name = precond_decoder.at(encoded_precond_name); + precond_case["generate"] = json::object(); + precond_case["apply"] = json::object(); for (auto stage : {"generate", "apply"}) { - add_or_set_member(this_precond_data[stage], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + precond_case[stage]["components"] = json::object(); } IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)}; @@ -163,54 +215,51 @@ void run_preconditioner(const char* precond_name, { // fast run, gets total time - auto x_clone = clone(x); - - auto precond = precond_factory.at(precond_name)(exec); + auto x_clone = clone(state.x); + auto precond = precond_factory.at(decoded_precond_name)(exec); for (auto _ : ic_apply.warmup_run()) { - precond->generate(system_matrix)->apply(b, x_clone); + precond->generate(state.system_matrix)->apply(state.b, x_clone); } std::unique_ptr precond_op; for (auto _ : ic_gen.run()) { - precond_op = precond->generate(system_matrix); + precond_op = precond->generate(state.system_matrix); } - add_or_set_member(this_precond_data["generate"], "time", - ic_gen.compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(this_precond_data["generate"], "repetitions", - ic_gen.get_num_repetitions(), allocator); + precond_case["generate"]["time"] = + ic_gen.compute_time(FLAGS_timer_method); + precond_case["generate"]["repetitions"] = + ic_gen.get_num_repetitions(); for (auto _ : ic_apply.run()) { - precond_op->apply(b, x_clone); + precond_op->apply(state.b, x_clone); } - add_or_set_member(this_precond_data["apply"], "time", - ic_apply.compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(this_precond_data["apply"], "repetitions", - ic_apply.get_num_repetitions(), allocator); + precond_case["apply"]["time"] = + ic_apply.compute_time(FLAGS_timer_method); + precond_case["apply"]["repetitions"] = + ic_apply.get_num_repetitions(); } if (FLAGS_detailed) { // slow run, times each component separately - auto x_clone = clone(x); - auto precond = precond_factory.at(precond_name)(exec); + auto x_clone = clone(state.x); + auto precond = precond_factory.at(decoded_precond_name)(exec); std::unique_ptr precond_op; { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - this_precond_data["generate"]["components"], allocator, + precond_case["generate"]["components"], ic_gen.get_num_repetitions()); exec->add_logger(gen_logger); if (exec->get_master() != exec) { exec->get_master()->add_logger(gen_logger); } for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) { - precond_op = 
precond->generate(system_matrix); + precond_op = precond->generate(state.system_matrix); } if (exec->get_master() != exec) { exec->get_master()->remove_logger(gen_logger); @@ -220,39 +269,22 @@ void run_preconditioner(const char* precond_name, auto apply_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - this_precond_data["apply"]["components"], allocator, + precond_case["apply"]["components"], ic_apply.get_num_repetitions()); exec->add_logger(apply_logger); if (exec->get_master() != exec) { exec->get_master()->add_logger(apply_logger); } for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) { - precond_op->apply(b, x_clone); + precond_op->apply(state.b, x_clone); } if (exec->get_master() != exec) { exec->get_master()->remove_logger(apply_logger); } exec->remove_logger(apply_logger); } - - add_or_set_member(this_precond_data, "completed", true, allocator); - } catch (const std::exception& e) { - auto encoded_name = encode_parameters(precond_name); - add_or_set_member(test_case["preconditioner"], encoded_name.c_str(), - rapidjson::Value(rapidjson::kObjectType), allocator); - add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], - "completed", false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], - "error", msg_value, allocator); - } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; } -} +}; int main(int argc, char* argv[]) @@ -261,11 +293,11 @@ int main(int argc, char* argv[]) FLAGS_formats = "csr"; std::string header = "A benchmark for measuring preconditioner performance.\n"; - std::string format = example_config; + std::string format = Generator::get_example_config(); initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = - "Running with preconditioners: " + FLAGS_preconditioners + "\n"; + "Running with preconditioners: " + FLAGS_preconditioners; print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); @@ -279,76 +311,10 @@ int main(int argc, char* argv[]) std::exit(1); } - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("preconditioner")) { - test_case.AddMember("preconditioner", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& precond_object = test_case["preconditioner"]; - if (!FLAGS_overwrite && - all_of(begin(preconditioners), end(preconditioners), - [&precond_object](const std::string& s) { - return precond_object.HasMember(s.c_str()); - })) { - continue; - } - std::clog << "Running test case\n" << test_case << std::endl; - - // annotate the test case - auto test_case_range = - annotate(generator.describe_config(test_case)); - - auto data = generator.generate_matrix_data(test_case); - - auto system_matrix = - share(formats::matrix_factory(FLAGS_formats, exec, 
data)); - auto b = generator.create_multi_vector_random( - exec, system_matrix->get_size()[0]); - auto x = generator.create_multi_vector( - exec, system_matrix->get_size()[0], gko::zero()); - - std::clog << "Matrix is of size (" << system_matrix->get_size()[0] - << ", " << system_matrix->get_size()[1] << ")" - << std::endl; - add_or_set_member(test_case, "size", data.size[0], allocator); - for (const auto& precond_name : preconditioners) { - { - auto precond_range = annotate(precond_name.c_str()); - run_preconditioner(precond_name.c_str(), exec, - system_matrix, b.get(), x.get(), - test_case, allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - backup_results(test_cases); - } - } catch (const std::exception& e) { - std::cerr << "Error setting up preconditioner, what(): " << e.what() - << std::endl; - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } + run_test_cases(PreconditionerBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index a9b1f9c1c93..d691309ab6a 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -52,7 +52,7 @@ struct Generator : public DistributedDefaultSystemGenerator { std::unique_ptr generate_rhs(std::shared_ptr exec, const gko::LinOp* system_matrix, - rapidjson::Value& config) const + json& config) const { return Vec::create( exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}, @@ -82,9 +82,13 @@ int main(int argc, char* argv[]) FLAGS_repetitions = "1"; FLAGS_min_repetitions = 1; + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + const auto do_print = rank == 0; + std::string header = "A benchmark for measuring Ginkgo's distributed solvers\n"; - std::string format = example_config + R"( + std::string format = solver_example_config + R"( The matrix will either be read from an input file if the filename parameter is given, or generated as a stencil matrix. 
If the filename parameter is given, all processes will read the file and @@ -100,10 +104,7 @@ int main(int argc, char* argv[]) )"; std::string additional_json = R"(,"optimal":{"spmv":"csr-csr"})"; initialize_argument_parsing_matrix(&argc, &argv, header, format, - additional_json); - - const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - const auto rank = comm.rank(); + additional_json, do_print); auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); @@ -114,8 +115,8 @@ int main(int argc, char* argv[]) "Running " + FLAGS_solvers + " with " + std::to_string(FLAGS_max_iters) + " iterations and residual goal of " + ss_rel_res_goal.str() + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; - if (rank == 0) { + std::to_string(FLAGS_nrhs); + if (do_print) { print_general_information(extra_information); } @@ -136,17 +137,12 @@ int main(int argc, char* argv[]) "optimal": {"spmv": "csr-csr"}] )" : broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(json_input); - run_solver_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), - test_cases, Generator(comm), rank == 0); + run_test_cases(SolverBenchmark{Generator{comm}}, exec, + get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 4efc5558a8e..b656102e5df 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) FLAGS_min_repetitions = 1; std::string header = "A benchmark for measuring performance of Ginkgo's solvers.\n"; - std::string format = example_config + R"( + std::string format = solver_example_config + R"( "optimal":"spmv" can be one of the recognized spmv formats )"; std::string additional_json = R"(,"optimal":{"spmv":"csr"})"; @@ -72,29 +72,24 @@ int main(int argc, char* argv[]) "Running " + FLAGS_solvers + " with " + std::to_string(FLAGS_max_iters) + " iterations and residual goal of " + ss_rel_res_goal.str() + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::to_string(FLAGS_nrhs); print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); - rapidjson::Document test_cases; + json test_cases; if (!FLAGS_overhead) { - rapidjson::IStreamWrapper jcin(get_input_stream()); - test_cases.ParseStream(jcin); + test_cases = json::parse(get_input_stream()); } else { // Fake test case to run once auto overhead_json = std::string() + " [{\"filename\": \"overhead.mtx\", \"optimal\": " "{ \"spmv\": \"csr\"}}]"; - test_cases.Parse(overhead_json.c_str()); + test_cases = json::parse(overhead_json); } - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } - - run_solver_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), test_cases, - SolverGenerator{}, true); + run_test_cases(SolverBenchmark{SolverGenerator{}}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index ae9ae6dc1fb..4976e5759d4 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ 
-37,8 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" #include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" +#include "benchmark/utils/runner.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING @@ -107,7 +109,7 @@ DEFINE_bool(overhead, false, "If set, uses dummy data to benchmark Ginkgo overhead"); -std::string example_config = R"( +std::string solver_example_config = R"( [ {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"}, "rhs": "my_file_rhs.mtx"}, @@ -119,28 +121,6 @@ std::string example_config = R"( )"; -// input validation -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of solver configurations:\n" - << example_config << std::endl; - std::exit(1); -} - - -void validate_option_object(const rapidjson::Value& value) -{ - if (!value.IsObject() || - !((value.HasMember("size") && value.HasMember("stencil") && - value["size"].IsInt64() && value["stencil"].IsString()) || - (value.HasMember("filename") && value["filename"].IsString())) || - (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") && - !value["optimal"]["spmv"].IsString())) { - print_config_error_and_exit(); - } -} - - std::shared_ptr create_criterion( std::shared_ptr exec, std::uint32_t max_iters) { @@ -284,21 +264,17 @@ std::unique_ptr generate_solver( } -void write_precond_info(const gko::LinOp* precond, - rapidjson::Value& precond_info, - rapidjson::MemoryPoolAllocator<>& allocator) +void write_precond_info(const gko::LinOp* precond, json& precond_info) { if (const auto jacobi = dynamic_cast*>(precond)) { // extract block sizes const auto bdata = jacobi->get_parameters().block_pointers.get_const_data(); - add_or_set_member(precond_info, "block_sizes", - rapidjson::Value(rapidjson::kArrayType), allocator); + precond_info["block_sizes"] = json::array(); const auto nblocks = jacobi->get_num_blocks(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i], - allocator); + precond_info["block_sizes"].push_back(bdata[i + 1] - bdata[i]); } // extract block precisions @@ -306,24 +282,19 @@ void write_precond_info(const gko::LinOp* precond, jacobi->get_parameters() .storage_optimization.block_wise.get_const_data(); if (pdata) { - add_or_set_member(precond_info, "block_precisions", - rapidjson::Value(rapidjson::kArrayType), - allocator); + precond_info["block_precisions"] = json::array(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_precisions"].PushBack( - static_cast(pdata[i]), allocator); + precond_info["block_precisions"].push_back( + static_cast(pdata[i])); } } // extract condition numbers const auto cdata = jacobi->get_conditioning(); if (cdata) { - add_or_set_member(precond_info, "block_conditioning", - rapidjson::Value(rapidjson::kArrayType), - allocator); + precond_info["block_conditioning"] = json::array(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_conditioning"].PushBack(cdata[i], - allocator); + precond_info["block_conditioning"].push_back(cdata[i]); } } } @@ -335,10 +306,10 @@ struct SolverGenerator : DefaultSystemGenerator<> { std::unique_ptr generate_rhs(std::shared_ptr exec, const gko::LinOp* system_matrix, - rapidjson::Value& config) const + json& config) const { - if (config.HasMember("rhs")) { - 
std::ifstream rhs_fd{config["rhs"].GetString()}; + if (config.contains("rhs")) { + std::ifstream rhs_fd{config["rhs"].get()}; return gko::read(rhs_fd, std::move(exec)); } else { gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs}; @@ -399,45 +370,112 @@ struct SolverGenerator : DefaultSystemGenerator<> { }; -template -void solve_system(const std::string& solver_name, - const std::string& precond_name, - const char* precond_solver_name, - std::shared_ptr exec, - std::shared_ptr timer, - std::shared_ptr system_matrix, - const VectorType* b, const VectorType* x, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) { - return; +template +struct solver_benchmark_state { + using Vec = typename Generator::Vec; + std::shared_ptr system_matrix; + std::unique_ptr b; + std::unique_ptr x; +}; + + +template +struct SolverBenchmark : Benchmark> { + std::string name; + std::vector precond_solvers; + std::map> decoder; + Generator generator; + + SolverBenchmark(Generator generator) : name{"solver"}, generator{generator} + { + auto solvers = split(FLAGS_solvers, ','); + auto preconds = split(FLAGS_preconditioners, ','); + for (const auto& s : solvers) { + for (const auto& p : preconds) { + precond_solvers.push_back(s + (p == "none" ? "" : "-" + p)); + decoder[precond_solvers.back()] = {s, p}; + } + } + } + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return precond_solvers; + } + + bool should_print() const override { return true; } + + std::string get_example_config() const override + { + return solver_example_config; + } + + bool validate_config(const json& value) const override + { + return ((value.contains("size") && value.contains("stencil") && + value["size"].is_number_integer() && + value["stencil"].is_string()) || + (value.contains("filename") && + value["filename"].is_string())) && + (value.contains("optimal") && + value["optimal"].contains("spmv") && + value["optimal"]["spmv"].is_string()); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + solver_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + solver_benchmark_state state; + + if (FLAGS_overhead) { + state.system_matrix = generator.initialize({1.0}, exec); + state.b = generator.initialize( + {std::numeric_limits::quiet_NaN()}, exec); + state.x = generator.initialize({0.0}, exec); + } else { + state.system_matrix = + generator.generate_matrix_with_optimal_format(exec, test_case); + state.b = generator.generate_rhs(exec, state.system_matrix.get(), + test_case); + state.x = generator.generate_initial_guess( + exec, state.system_matrix.get(), state.b.get()); } - add_or_set_member(solver_case, precond_solver_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - auto& solver_json = solver_case[precond_solver_name]; - add_or_set_member(solver_json, "recurrent_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "true_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "implicit_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "iteration_timestamps", - rapidjson::Value(rapidjson::kArrayType), allocator); - if (b->get_size()[1] == 1 && 
!FLAGS_overhead) { - auto rhs_norm = compute_norm2(b); - add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator); + std::clog << "Matrix is of size (" << state.system_matrix->get_size()[0] + << ", " << state.system_matrix->get_size()[1] << ")" + << std::endl; + test_case["rows"] = state.system_matrix->get_size()[0]; + test_case["cols"] = state.system_matrix->get_size()[1]; + return state; + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + solver_benchmark_state& state, + const std::string& encoded_solver_name, + json& solver_case) const override + { + const auto decoded_pair = decoder.at(encoded_solver_name); + auto& solver_name = decoded_pair.first; + auto& precond_name = decoded_pair.second; + solver_case["recurrent_residuals"] = json::array(); + solver_case["true_residuals"] = json::array(); + solver_case["implicit_residuals"] = json::array(); + solver_case["iteration_timestamps"] = json::array(); + if (state.b->get_size()[1] == 1 && !FLAGS_overhead) { + auto rhs_norm = compute_norm2(state.b.get()); + solver_case["rhs_norm"] = rhs_norm; } for (auto stage : {"generate", "apply"}) { - add_or_set_member(solver_json, stage, - rapidjson::Value(rapidjson::kObjectType), - allocator); - add_or_set_member(solver_json[stage], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + solver_case[stage] = json::object(); + solver_case[stage]["components"] = json::object(); } IterationControl ic{timer}; @@ -445,24 +483,24 @@ void solve_system(const std::string& solver_name, // warm run std::shared_ptr solver; for (auto _ : ic.warmup_run()) { - auto x_clone = clone(x); + auto x_clone = clone(state.x); auto precond = precond_factory.at(precond_name)(exec); solver = generate_solver(exec, give(precond), solver_name, FLAGS_warmup_max_iters) - ->generate(system_matrix); - solver->apply(b, x_clone); + ->generate(state.system_matrix); + solver->apply(state.b, x_clone); exec->synchronize(); } // detail run if (FLAGS_detailed && !FLAGS_overhead) { // slow run, get the time of each functions - auto x_clone = clone(x); + auto x_clone = clone(state.x); { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - solver_json["generate"]["components"], allocator, 1); + solver_case["generate"]["components"], 1); exec->add_logger(gen_logger); if (exec != exec->get_master()) { exec->get_master()->add_logger(gen_logger); @@ -471,7 +509,7 @@ void solve_system(const std::string& solver_name, auto precond = precond_factory.at(precond_name)(exec); solver = generate_solver(exec, give(precond), solver_name, FLAGS_max_iters) - ->generate(system_matrix); + ->generate(state.system_matrix); exec->remove_logger(gen_logger); if (exec != exec->get_master()) { @@ -481,25 +519,22 @@ void solve_system(const std::string& solver_name, if (auto prec = dynamic_cast(solver.get())) { - add_or_set_member(solver_json, "preconditioner", - rapidjson::Value(rapidjson::kObjectType), - allocator); + solver_case["preconditioner"] = json::object(); write_precond_info( clone(exec->get_master(), prec->get_preconditioner()).get(), - solver_json["preconditioner"], allocator); + solver_case["preconditioner"]); } { auto apply_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - solver_json["apply"]["components"], allocator, 1); + solver_case["apply"]["components"], 1); exec->add_logger(apply_logger); if (exec != exec->get_master()) { exec->get_master()->add_logger(apply_logger); } - - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); 
exec->remove_logger(apply_logger); if (exec != exec->get_master()) { @@ -508,17 +543,18 @@ void solve_system(const std::string& solver_name, } // slow run, gets the recurrent and true residuals of each iteration - if (b->get_size()[1] == 1) { - x_clone = clone(x); + if (state.b->get_size()[1] == 1) { + x_clone = clone(state.x); auto res_logger = std::make_shared>( - system_matrix, b, solver_json["recurrent_residuals"], - solver_json["true_residuals"], - solver_json["implicit_residuals"], - solver_json["iteration_timestamps"], allocator); + state.system_matrix, state.b, + solver_case["recurrent_residuals"], + solver_case["true_residuals"], + solver_case["implicit_residuals"], + solver_case["iteration_timestamps"]); solver->add_logger(res_logger); - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); if (!res_logger->has_implicit_res_norms()) { - solver_json.RemoveMember("implicit_residuals"); + solver_case.erase("implicit_residuals"); } } exec->synchronize(); @@ -528,16 +564,16 @@ void solve_system(const std::string& solver_name, auto it_logger = std::make_shared(); auto generate_timer = get_timer(exec, FLAGS_gpu_timer); auto apply_timer = ic.get_timer(); - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto status : ic.run(false)) { - x_clone = clone(x); + x_clone = clone(state.x); exec->synchronize(); generate_timer->tic(); auto precond = precond_factory.at(precond_name)(exec); solver = generate_solver(exec, give(precond), solver_name, FLAGS_max_iters) - ->generate(system_matrix); + ->generate(state.system_matrix); generate_timer->toc(); exec->synchronize(); @@ -545,165 +581,33 @@ void solve_system(const std::string& solver_name, solver->add_logger(it_logger); } apply_timer->tic(); - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); apply_timer->toc(); if (ic.get_num_repetitions() == 0) { solver->remove_logger(it_logger); } } - it_logger->write_data(solver_json["apply"], allocator); + it_logger->write_data(solver_case["apply"]); - if (b->get_size()[1] == 1 && !FLAGS_overhead) { + if (state.b->get_size()[1] == 1 && !FLAGS_overhead) { // a solver is considered direct if it didn't log any iterations - if (solver_json["apply"].HasMember("iterations") && - solver_json["apply"]["iterations"].GetInt() == 0) { - auto error = - compute_direct_error(solver.get(), b, x_clone.get()); - add_or_set_member(solver_json, "forward_error", error, - allocator); - } - auto residual = - compute_residual_norm(system_matrix.get(), b, x_clone.get()); - add_or_set_member(solver_json, "residual_norm", residual, - allocator); - } - add_or_set_member(solver_json["generate"], "time", - generate_timer->compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(solver_json["apply"], "time", - apply_timer->compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(solver_json, "repetitions", - apply_timer->get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(solver_json, "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["solver"][precond_solver_name], "completed", - false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["solver"][precond_solver_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -template -void run_solver_benchmarks(std::shared_ptr exec, - 
std::shared_ptr timer, - rapidjson::Document& test_cases, - const SystemGenerator& system_generator, - bool do_print) -{ - auto solvers = split(FLAGS_solvers, ','); - auto preconds = split(FLAGS_preconditioners, ','); - std::vector precond_solvers; - for (const auto& s : solvers) { - for (const auto& p : preconds) { - precond_solvers.push_back(s + (p == "none" ? "" : "-" + p)); - } - } - - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("solver")) { - test_case.AddMember("solver", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && - all_of(begin(precond_solvers), end(precond_solvers), - [&solver_case](const std::string& s) { - return solver_case.HasMember(s.c_str()); - })) { - continue; - } - // annotate the test case - auto test_case_range = - annotate(system_generator.describe_config(test_case)); - - if (do_print) { - std::clog << "Running test case\n" << test_case << std::endl; - } - - using Vec = typename SystemGenerator::Vec; - std::shared_ptr system_matrix; - std::unique_ptr b; - std::unique_ptr x; - if (FLAGS_overhead) { - system_matrix = system_generator.initialize({1.0}, exec); - b = system_generator.initialize( - {std::numeric_limits::quiet_NaN()}, exec); - x = system_generator.initialize({0.0}, exec); - } else { - system_matrix = - system_generator.generate_matrix_with_optimal_format( - exec, test_case); - b = system_generator.generate_rhs(exec, system_matrix.get(), - test_case); - x = system_generator.generate_initial_guess( - exec, system_matrix.get(), b.get()); - } - - if (do_print) { - std::clog << "Matrix is of size (" - << system_matrix->get_size()[0] << ", " - << system_matrix->get_size()[1] << ")" << std::endl; - } - add_or_set_member(test_case, "size", system_matrix->get_size()[0], - allocator); - auto precond_solver_name = begin(precond_solvers); - for (const auto& solver_name : solvers) { - auto solver_range = annotate(solver_name.c_str()); - for (const auto& precond_name : preconds) { - if (do_print) { - std::clog - << "\tRunning solver: " << *precond_solver_name - << std::endl; - } - { - auto precond_range = annotate(precond_name.c_str()); - solve_system(solver_name, precond_name, - precond_solver_name->c_str(), exec, timer, - system_matrix, b.get(), x.get(), test_case, - allocator); - } - if (do_print) { - backup_results(test_cases); - } - ++precond_solver_name; - } - } - } catch (const std::exception& e) { - std::cerr << "Error setting up solver, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); + if (solver_case["apply"].contains("iterations") && + solver_case["apply"]["iterations"].get() == 0) { + auto error = compute_direct_error(solver.get(), state.b.get(), + x_clone.get()); + solver_case["forward_error"] = error; } + auto residual = compute_residual_norm(state.system_matrix.get(), + state.b.get(), x_clone.get()); + solver_case["residual_norm"] = residual; } + solver_case["generate"]["time"] = + generate_timer->compute_time(FLAGS_timer_method); + solver_case["apply"]["time"] = + apply_timer->compute_time(FLAGS_timer_method); + 
solver_case["repetitions"] = apply_timer->get_num_repetitions(); } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; #endif // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 66e5707c559..2ee766d4f83 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/sparse_blas/operations.hpp" -#include "benchmark/utils/json.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" @@ -632,11 +631,9 @@ class SymbolicLuOperation : public BenchmarkOperation { void run() override { gko::factorization::symbolic_lu(mtx_, result_); } - void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) override + void write_stats(json& object) override { - add_or_set_member(object, "factor_nonzeros", - result_->get_num_stored_elements(), allocator); + object["factor_nonzeros"] = result_->get_num_stored_elements(); } private: @@ -680,11 +677,9 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { forest_); } - void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) override + void write_stats(json& object) override { - add_or_set_member(object, "factor_nonzeros", - result_->get_num_stored_elements(), allocator); + object["factor_nonzeros"] = result_->get_num_stored_elements(); } private: diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp index 99cf72b8e59..48034eb8a1f 100644 --- a/benchmark/sparse_blas/operations.hpp +++ b/benchmark/sparse_blas/operations.hpp @@ -36,9 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - +#include "benchmark/utils/json.hpp" #include "benchmark/utils/types.hpp" @@ -79,9 +77,7 @@ class BenchmarkOperation { /** * Allows the operation to write arbitrary information to the JSON output. */ - virtual void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) - {} + virtual void write_stats(json& object) {} }; diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 8c054709fdf..21df4d9c448 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -47,7 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/sparse_blas/operations.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/iteration_control.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/types.hpp" #include "core/test/utils/matrix_generator.hpp" @@ -74,18 +75,64 @@ DEFINE_bool(validate, false, "against the ReferenceExecutor solution."); -void apply_sparse_blas(const char* operation_name, - std::shared_ptr exec, const Mtx* mtx, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - add_or_set_member(test_case, operation_name, - rapidjson::Value(rapidjson::kObjectType), allocator); +using Generator = DefaultSystemGenerator<>; + + +struct SparseBlasBenchmark : Benchmark> { + std::string name; + std::vector operations; + + SparseBlasBenchmark() + : name{"sparse_blas"}, operations{split(FLAGS_operations)} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return operations; + } + + bool should_print() const override { return true; } + + bool validate_config(const json& value) const override + { + return Generator::validate_config(value); + } + + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + std::unique_ptr setup(std::shared_ptr exec, + json& test_case) const override + { + auto data = Generator::generate_matrix_data(test_case); + data.ensure_row_major_order(); + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + + auto mtx = Mtx::create(exec, data.size, data.nonzeros.size()); + mtx->read(data); + return mtx; + } + - auto op = get_operation(operation_name, mtx); + void run(std::shared_ptr exec, std::shared_ptr timer, + std::unique_ptr& mtx, const std::string& operation_name, + json& operation_case) const override + { + auto op = get_operation(operation_name, mtx.get()); - auto timer = get_timer(exec, FLAGS_gpu_timer); IterationControl ic(timer); // warm run @@ -105,54 +152,30 @@ void apply_sparse_blas(const char* operation_name, const auto flops = static_cast(op->get_flops()); const auto mem = static_cast(op->get_memory()); const auto repetitions = ic.get_num_repetitions(); - add_or_set_member(test_case[operation_name], "time", runtime, - allocator); - add_or_set_member(test_case[operation_name], "flops", flops / runtime, - allocator); - add_or_set_member(test_case[operation_name], "bandwidth", mem / runtime, - allocator); - add_or_set_member(test_case[operation_name], "repetitions", repetitions, - allocator); + operation_case["time"] = runtime; + operation_case["flops"] = flops / runtime; + operation_case["bandwidth"] = mem / runtime; + operation_case["repetitions"] = repetitions; if (FLAGS_validate) { auto validation_result = op->validate(); - add_or_set_member(test_case[operation_name], "correct", - validation_result.first, allocator); - add_or_set_member(test_case[operation_name], "error", - validation_result.second, allocator); + operation_case["correct"] = validation_result.first; + operation_case["error"] = validation_result.second; } if (FLAGS_detailed) { - 
add_or_set_member(test_case[operation_name], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + operation_case["components"] = json::object(); auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - test_case[operation_name]["components"], allocator, - repetitions); + operation_case["components"], repetitions); exec->add_logger(gen_logger); for (unsigned i = 0; i < repetitions; i++) { op->run(); } exec->remove_logger(gen_logger); } - op->write_stats(test_case[operation_name], allocator); - - add_or_set_member(test_case[operation_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case[operation_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case[operation_name], "error", msg_value, - allocator); - } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; + op->write_stats(operation_case); } -} +}; int main(int argc, char* argv[]) @@ -160,86 +183,18 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's sparse BLAS " "operations.\n"; - std::string format = example_config; + std::string format = Generator::get_example_config(); initialize_argument_parsing_matrix(&argc, &argv, header, format); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - auto operations = split(FLAGS_operations, ','); - - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember(benchmark_name)) { - test_case.AddMember(rapidjson::Value(benchmark_name, allocator), - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& sp_blas_case = test_case[benchmark_name]; - std::clog << "Running test case\n" << test_case << std::endl; - auto data = generator.generate_matrix_data(test_case); - data.ensure_row_major_order(); - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << "), " << data.nonzeros.size() - << std::endl; - add_or_set_member(test_case, "rows", data.size[0], allocator); - add_or_set_member(test_case, "cols", data.size[1], allocator); - add_or_set_member(test_case, "nonzeros", data.nonzeros.size(), - allocator); - - auto mtx = Mtx::create(exec, data.size, data.nonzeros.size()); - mtx->read(data); - // annotate the test case - auto test_case_range = - annotate(generator.describe_config(test_case)); - for (const auto& operation_name : operations) { - if (FLAGS_overwrite || - !sp_blas_case.HasMember(operation_name.c_str())) { - { - auto operation_range = annotate(operation_name.c_str()); - apply_sparse_blas(operation_name.c_str(), exec, - mtx.get(), sp_blas_case, allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << 
std::endl; - backup_results(test_cases); - } - } - // write the output if we have no strategies - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } + run_test_cases(SparseBlasBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 9b7e4ad8c8f..202aad15c7e 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -58,38 +58,7 @@ DEFINE_string(non_local_formats, "csr", "run. See the 'formats' option for a list of supported versions"); -std::string example_config = R"( - [ - {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, - {"filename": "my_file.mtx"} - ] -)"; - - -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); -} - - -struct Generator : DistributedDefaultSystemGenerator> { - Generator(gko::experimental::mpi::communicator comm) - : DistributedDefaultSystemGenerator>{ - std::move(comm), {}} - {} - - void validate_options(const rapidjson::Value& options) const - { - if (!options.IsObject() || - !((options.HasMember("size") && options.HasMember("stencil") && - options.HasMember("comm_pattern")) || - options.HasMember("filename"))) { - print_config_error_and_exit(); - } - } -}; +using Generator = DistributedDefaultSystemGenerator>; int main(int argc, char* argv[]) @@ -98,18 +67,19 @@ int main(int argc, char* argv[]) const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); const auto rank = comm.rank(); + const auto do_print = rank == 0; std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; - std::string format = example_config; - initialize_argument_parsing_matrix(&argc, &argv, header, format); - - if (rank == 0) { - std::string extra_information = "The formats are [" + - FLAGS_local_formats + "]x[" + - FLAGS_non_local_formats + "]\n" + - "The number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format, "", + do_print); + + if (do_print) { + std::string extra_information = + "The formats are [" + FLAGS_local_formats + "]x[" + + FLAGS_non_local_formats + "]\n" + + "The number of right hand sides is " + std::to_string(FLAGS_nrhs); print_general_information(extra_information); } @@ -125,16 +95,13 @@ int main(int argc, char* argv[]) } std::string json_input = broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(json_input); - run_spmv_benchmark(exec, test_cases, formats, Generator{comm}, - get_mpi_timer(exec, comm, FLAGS_gpu_timer), rank == 0); + run_test_cases(SpmvBenchmark{Generator{comm}, formats, do_print}, + exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), + test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + 
std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 034437907c8..abd1b783019 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -41,48 +41,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" -struct Generator : DefaultSystemGenerator<> { - void validate_options(const rapidjson::Value& options) const - { - if (!options.IsObject() || - !((options.HasMember("size") && options.HasMember("stencil")) || - options.HasMember("filename"))) { - std::cerr - << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); - } - } -}; +using Generator = DefaultSystemGenerator<>; int main(int argc, char* argv[]) { std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; - std::string format = example_config; + std::string format = Generator::get_example_config(); initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = "The formats are " + FLAGS_formats + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::to_string(FLAGS_nrhs); print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto formats = split(FLAGS_formats, ','); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); - run_spmv_benchmark(exec, test_cases, formats, Generator{}, - get_timer(exec, FLAGS_gpu_timer), true); + run_test_cases(SpmvBenchmark{Generator{}, split(FLAGS_formats)}, + exec, get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index 3c8d886df3b..4a7d014de8b 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -36,7 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING @@ -48,57 +50,119 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
DEFINE_uint32(nrhs, 1, "The number of right hand sides"); -// This function supposes that management of `FLAGS_overwrite` is done before -// calling it -template -void apply_spmv(const char* format_name, std::shared_ptr exec, - const Generator& generator, std::shared_ptr timer, - const gko::matrix_data& data, - const VectorType* b, const VectorType* x, - const VectorType* answer, rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& spmv_case = test_case["spmv"]; - add_or_set_member(spmv_case, format_name, - rapidjson::Value(rapidjson::kObjectType), allocator); +template +struct spmv_benchmark_state { + gko::matrix_data data; + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr answer; +}; + + +template +struct SpmvBenchmark : Benchmark> { + using Vec = typename Generator::Vec; + std::string name; + std::vector formats; + bool do_print; + Generator generator; + + SpmvBenchmark(Generator generator, std::vector formats, + bool do_print = true) + : name{"spmv"}, + formats{std::move(formats)}, + generator{generator}, + do_print{do_print} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return formats; + } + + bool should_print() const override { return do_print; } + std::string get_example_config() const override + { + return generator.get_example_config(); + } + + bool validate_config(const json& test_case) const override + { + return generator.validate_config(test_case); + } + + std::string describe_config(const json& test_case) const override + { + return generator.describe_config(test_case); + } + + spmv_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + spmv_benchmark_state state; + state.data = generator.generate_matrix_data(test_case); + + auto nrhs = FLAGS_nrhs; + state.b = generator.create_multi_vector_random( + exec, gko::dim<2>{state.data.size[1], nrhs}); + state.x = generator.create_multi_vector_random( + exec, gko::dim<2>{state.data.size[0], nrhs}); + if (do_print) { + std::clog << "Matrix is of size (" << state.data.size[0] << ", " + << state.data.size[1] << "), " + << state.data.nonzeros.size() << std::endl; + } + test_case["rows"] = state.data.size[0]; + test_case["cols"] = state.data.size[1]; + test_case["nonzeros"] = state.data.nonzeros.size(); + if (FLAGS_detailed) { + state.answer = gko::clone(state.x); + auto system_matrix = + generator.generate_matrix_with_default_format(exec, state.data); + exec->synchronize(); + system_matrix->apply(state.b, state.answer); + exec->synchronize(); + } + return state; + } + + void run(std::shared_ptr exec, std::shared_ptr timer, + spmv_benchmark_state& state, + const std::string& format_name, json& format_case) const override + { auto system_matrix = generator.generate_matrix_with_format( - exec, format_name, data, &spmv_case[format_name], &allocator); + exec, format_name, state.data, &format_case); // check the residual if (FLAGS_detailed) { - auto x_clone = clone(x); + auto x_clone = clone(state.x); exec->synchronize(); - system_matrix->apply(b, x_clone); + system_matrix->apply(state.b, x_clone); exec->synchronize(); auto max_relative_norm2 = - compute_max_relative_norm2(x_clone.get(), answer); - add_or_set_member(spmv_case[format_name], "max_relative_norm2", - max_relative_norm2, allocator); + compute_max_relative_norm2(x_clone.get(), state.answer.get()); + format_case["max_relative_norm2"] = max_relative_norm2; } IterationControl ic{timer}; // warm run for (auto _ : 
ic.warmup_run()) { - auto x_clone = clone(x); + auto x_clone = clone(state.x); exec->synchronize(); - system_matrix->apply(b, x_clone); + system_matrix->apply(state.b, x_clone); exec->synchronize(); } // tuning run #ifdef GINKGO_BENCHMARK_ENABLE_TUNING auto& format_case = spmv_case[format_name]; - if (!format_case.HasMember("tuning")) { - format_case.AddMember( - "tuning", rapidjson::Value(rapidjson::kObjectType), allocator); - } + format_case["tuning"] = json::object(); auto& tuning_case = format_case["tuning"]; - add_or_set_member(tuning_case, "time", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(tuning_case, "values", - rapidjson::Value(rapidjson::kArrayType), allocator); + tuning_case["time"] = json::array(); + tuning_case["values"] = json::array(); // Enable tuning for this portion of code gko::_tuning_flag = true; @@ -112,13 +176,13 @@ void apply_spmv(const char* format_name, std::shared_ptr exec, gko::_tuned_value = val; auto tuning_timer = get_timer(exec, FLAGS_gpu_timer); IterationControl ic_tuning{tuning_timer}; - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto _ : ic_tuning.run()) { - system_matrix->apply(b, x_clone); + system_matrix->apply(state.b, x_clone); } - tuning_case["time"].PushBack( - ic_tuning.compute_time(FLAGS_timer_method), allocator); - tuning_case["values"].PushBack(val, allocator); + tuning_case["time"].push_back( + ic_tuning.compute_time(FLAGS_timer_method)); + tuning_case["values"].push_back(val); } // We put back the flag to false to use the default (non-tuned) values // for the following @@ -126,142 +190,41 @@ void apply_spmv(const char* format_name, std::shared_ptr exec, #endif // GINKGO_BENCHMARK_ENABLE_TUNING // timed run - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto _ : ic.run()) { - system_matrix->apply(b, x_clone); - } - add_or_set_member(spmv_case[format_name], "time", - ic.compute_time(FLAGS_timer_method), allocator); - add_or_set_member(spmv_case[format_name], "repetitions", - ic.get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(spmv_case[format_name], "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["spmv"][format_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["spmv"][format_name], "error", - msg_value, allocator); + system_matrix->apply(state.b, x_clone); } - std::cerr << "Error when processing test case\n" - << test_case << "\n" - << "what(): " << e.what() << std::endl; + format_case["time"] = ic.compute_time(FLAGS_timer_method); + format_case["repetitions"] = ic.get_num_repetitions(); } -} - - -template -void run_spmv_benchmark(std::shared_ptr exec, - rapidjson::Document& test_cases, - const std::vector formats, - const SystemGenerator& system_generator, - std::shared_ptr timer, bool do_print) -{ - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - system_generator.validate_options(test_case); - if (!test_case.HasMember("spmv")) { - test_case.AddMember("spmv", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& spmv_case = test_case["spmv"]; - if (!FLAGS_overwrite && - all_of(begin(formats), 
end(formats), - [&spmv_case](const std::string& s) { - return spmv_case.HasMember(s.c_str()); - })) { - continue; - } - if (do_print) { - std::clog << "Running test case\n" << test_case << std::endl; - } - // annotate the test case - auto test_case_range = - annotate(system_generator.describe_config(test_case)); - - auto data = system_generator.generate_matrix_data(test_case); - - auto nrhs = FLAGS_nrhs; - auto b = system_generator.create_multi_vector_random( - exec, gko::dim<2>{data.size[1], nrhs}); - auto x = system_generator.create_multi_vector_random( - exec, gko::dim<2>{data.size[0], nrhs}); - if (do_print) { - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << ")" << std::endl; - } - add_or_set_member(test_case, "size", data.size[0], allocator); - add_or_set_member(test_case, "nnz", data.nonzeros.size(), - allocator); - auto best_performance = std::numeric_limits::max(); - if (!test_case.HasMember("optimal")) { - test_case.AddMember("optimal", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - // Compute the result from ginkgo::coo as the correct answer - auto answer = gko::clone(x); - if (FLAGS_detailed) { - auto system_matrix = - system_generator.generate_matrix_with_default_format(exec, - data); - exec->synchronize(); - system_matrix->apply(b, answer); - exec->synchronize(); + void postprocess(json& test_case) const override + { + if (!test_case.contains("optimal")) { + test_case["optimal"] = json::object(); + } + auto best_time = std::numeric_limits::max(); + std::string best_format; + // find the fastest among all formats we tested + for (const auto& format : formats) { + if (!test_case[name].contains(format)) { + continue; } - for (const auto& format_name : formats) { - { - auto format_range = annotate(format_name.c_str()); - apply_spmv(format_name.c_str(), exec, system_generator, - timer, data, b.get(), x.get(), answer.get(), - test_case, allocator); - } - if (do_print) { - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - } - if (spmv_case[format_name.c_str()]["completed"].GetBool()) { - auto performance = - spmv_case[format_name.c_str()]["time"].GetDouble(); - if (performance < best_performance) { - best_performance = performance; - add_or_set_member( - test_case["optimal"], "spmv", - rapidjson::Value(format_name.c_str(), allocator) - .Move(), - allocator); - } - } - if (do_print) { - backup_results(test_cases); + auto& format_case = test_case[name][format]; + if (format_case.contains("completed") && + format_case["completed"].template get()) { + auto time = format_case["time"]; + if (time < best_time) { + best_time = time; + best_format = format; } } - } catch (const std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } + } + if (!best_format.empty()) { + test_case["optimal"][name] = best_format; } } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; + #endif // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index abc496b0921..b64f4321287 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -10,6 +10,7 @@ Running test case "blas": {} } DEBUG: begin n = 100 + Running blas: copy DEBUG: begin copy DEBUG: begin allocate 
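The postprocess hook added to spmv_common.hpp above replaces the inline best-format bookkeeping of the old driver: after all formats have run, it walks the per-format results, keeps the fastest completed one, and records it under test_case["optimal"]. A self-contained sketch of that selection, assuming nlohmann::json and using made-up timings, might look like this; pick_optimal is a hypothetical stand-in for SpmvBenchmark::postprocess.

    #include <iomanip>
    #include <iostream>
    #include <limits>
    #include <string>
    #include <vector>

    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Hypothetical stand-in for SpmvBenchmark::postprocess: pick the fastest
    // completed format and record it under test_case["optimal"]["spmv"].
    void pick_optimal(json& test_case, const std::vector<std::string>& formats)
    {
        if (!test_case.contains("optimal")) {
            test_case["optimal"] = json::object();
        }
        auto best_time = std::numeric_limits<double>::max();
        std::string best_format;
        for (const auto& format : formats) {
            if (!test_case["spmv"].contains(format)) {
                continue;
            }
            auto& format_case = test_case["spmv"][format];
            if (format_case.contains("completed") &&
                format_case["completed"].get<bool>()) {
                auto time = format_case["time"].get<double>();
                if (time < best_time) {
                    best_time = time;
                    best_format = format;
                }
            }
        }
        if (!best_format.empty()) {
            test_case["optimal"]["spmv"] = best_format;
        }
    }

    int main()
    {
        // Made-up timings, purely for illustration.
        json test_case;
        test_case["spmv"]["coo"] = {{"time", 2.0}, {"completed", true}};
        test_case["spmv"]["csr"] = {{"time", 1.0}, {"completed", true}};
        pick_optimal(test_case, {"coo", "csr"});
        std::cout << std::setw(4) << test_case << std::endl;  // optimal.spmv == "csr"
    }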
DEBUG: end allocate @@ -24,21 +25,7 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end copy -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] + Running blas: axpy DEBUG: begin axpy DEBUG: begin allocate DEBUG: end allocate @@ -61,28 +48,7 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end axpy -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] + Running blas: scal DEBUG: begin scal DEBUG: begin allocate DEBUG: end allocate @@ -99,33 +65,4 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end scal -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - }, - "scal": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] DEBUG: end n = 100 diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index 9508b0dcf1e..f41b25c6ee1 100644 --- a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -9,69 +9,6 @@ Running test case "n": 100, "blas": {} } -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "n": 100, - "blas": { - "copy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "axpy": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - }, - "scal": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] + Running blas: copy + Running blas: axpy + Running blas: scal diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index 9ab8a899649..1d5df7477ba 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -4,1853 +4,23 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr,ell,sellp,hybrid -Benchmarking conversions. 
Running test case { "size": 100, "stencil": "7pt", - "conversions": {} -} -Matrix is of size (125, 125) -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 
1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": 
false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": 
false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - 
"repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { 
- "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - }, - "hybrid-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - 
"csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - }, - "hybrid-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "hybrid-ell": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - }, - "hybrid-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "hybrid-ell": { - "completed": false, - "error": "" - } - } - } -] -Error when processing test case -{ - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": 
false, - "error": "" - }, - "hybrid-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "hybrid-ell": { - "completed": false, - "error": "" - }, - "hybrid-sellp": { - "completed": false, - "error": "" - } - } -} -what(): -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" - }, - "coo-hybrid": { - "completed": false, - "error": "" - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-ell": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-sellp": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-hybrid": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-coo": { - "completed": false, - "error": "" - }, - "ell-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" - }, - "sellp-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - "completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" - }, - "hybrid-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "hybrid-ell": { - "completed": false, - "error": "" - }, - "hybrid-sellp": { - "completed": false, - "error": "" - } - } - } -] + "conversion": {} +} +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo + Running conversion: csr-ell + Running conversion: csr-sellp + Running conversion: csr-hybrid + Running conversion: ell-read + Running conversion: ell-csr + Running conversion: sellp-read + Running conversion: sellp-csr + Running conversion: hybrid-read + Running conversion: hybrid-csr diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout index cb53bb81a6c..c4b657a42c4 100644 --- a/benchmark/test/reference/conversion.all.stdout +++ b/benchmark/test/reference/conversion.all.stdout @@ -1,25 +1,23 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", - "conversions": { - "coo-csr": { + "conversion": { + "coo-read": { "time": 1.0, "repetitions": 10, "completed": true }, - "coo-ell": { - "completed": false, - "error": "" - }, - "coo-sellp": { - "completed": false, - "error": "" + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true }, - "coo-hybrid": { - "completed": false, - "error": "" + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true }, "csr-coo": { "time": 1.0, @@ -41,57 +39,39 @@ "repetitions": 10, "completed": true }, - "ell-coo": { - "completed": false, - "error": "" + "ell-read": { + "time": 1.0, + "repetitions": 10, + "completed": true }, "ell-csr": { "time": 1.0, "repetitions": 10, "completed": true }, - "ell-sellp": { - "completed": false, - "error": "" - }, - "ell-hybrid": { - "completed": false, - "error": "" - }, - "sellp-coo": { - "completed": false, - "error": "" + "sellp-read": { + "time": 1.0, + "repetitions": 10, + "completed": true }, "sellp-csr": { "time": 1.0, "repetitions": 10, "completed": true }, - "sellp-ell": { - "completed": false, - "error": "" - }, - "sellp-hybrid": { - 
"completed": false, - "error": "" - }, - "hybrid-coo": { - "completed": false, - "error": "" + "hybrid-read": { + "time": 1.0, + "repetitions": 10, + "completed": true }, "hybrid-csr": { "time": 1.0, "repetitions": 10, "completed": true - }, - "hybrid-ell": { - "completed": false, - "error": "" - }, - "hybrid-sellp": { - "completed": false, - "error": "" } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr index 1d604175479..369a363a53e 100644 --- a/benchmark/test/reference/conversion.matrix.stderr +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -4,43 +4,13 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Benchmarking conversions. Running test case { "filename": "", - "conversions": {} + "conversion": {} } -Matrix is of size (36, 36) -Current state: -[ - { - "filename": "", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - }, - "size": 36 - } -] -Current state: -[ - { - "filename": "", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - }, - "size": 36 - } -] +Matrix is of size (36, 36), 208 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout index e43edda0595..7e537fa4919 100644 --- a/benchmark/test/reference/conversion.matrix.stdout +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -2,18 +2,30 @@ [ { "filename": "", - "conversions": { + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, "coo-csr": { "time": 1.0, "repetitions": 10, "completed": true }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, "csr-coo": { "time": 1.0, "repetitions": 10, "completed": true } }, - "size": 36 + "rows": 36, + "cols": 36, + "nonzeros": 208 } ] diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 6733472be8f..089e6be02f9 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -4,15 +4,16 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Benchmarking conversions. 
Running test case { "size": 100, "stencil": "7pt", - "conversions": {} + "conversion": {} } -Matrix is of size (125, 125) -DEBUG: begin stencil(125,7pt) +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100,7pt) + Running conversion: coo-read +DEBUG: begin coo-read DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -21,13 +22,17 @@ DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end coo-read + Running conversion: coo-csr DEBUG: begin coo-csr DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -36,12 +41,8 @@ DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin components::convert_idxs_to_ptrs -DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free +DEBUG: begin components::fill_array +DEBUG: end components::fill_array DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate @@ -49,14 +50,10 @@ DEBUG: begin free DEBUG: end free DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs @@ -68,27 +65,15 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end coo-csr -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free +DEBUG: end coo-csr + Running conversion: csr-read +DEBUG: begin csr-read DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array @@ -109,32 +94,46 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin free DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end csr-read + Running conversion: csr-coo DEBUG: begin csr-coo DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin copy() DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_ptrs_to_idxs DEBUG: end components::convert_ptrs_to_idxs DEBUG: end copy() @@ -144,30 +143,11 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end csr-coo -Current state: -[ - { - "size": 125, 
- "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 1, - "completed": true - }, - "csr-coo": { - "time": 1.0, - "repetitions": 1, - "completed": true - } - } - } -] DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(125,7pt) +DEBUG: end csr-coo +DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout index 3e76bc26934..b29815f6c17 100644 --- a/benchmark/test/reference/conversion.profile.stdout +++ b/benchmark/test/reference/conversion.profile.stdout @@ -1,19 +1,32 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", - "conversions": { + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, "coo-csr": { "time": 1.0, "repetitions": 1, "completed": true }, + "csr-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, "csr-coo": { "time": 1.0, "repetitions": 1, "completed": true } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr index d221ead12a4..a814dba6888 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -4,44 +4,14 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Benchmarking conversions. Running test case { "size": 100, "stencil": "7pt", - "conversions": {} + "conversion": {} } -Matrix is of size (125, 125) -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "conversions": { - "coo-csr": { - "time": 1.0, - "repetitions": 10, - "completed": true - }, - "csr-coo": { - "time": 1.0, - "repetitions": 10, - "completed": true - } - } - } -] +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout index 9ecdd46f5e1..856f1330eea 100644 --- a/benchmark/test/reference/conversion.simple.stdout +++ b/benchmark/test/reference/conversion.simple.stdout @@ -1,19 +1,32 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", - "conversions": { + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, "coo-csr": { "time": 1.0, "repetitions": 10, "completed": true }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, "csr-coo": { "time": 1.0, "repetitions": 10, "completed": true } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout index 34fdda13e55..cd3c7b8bd43 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stdout +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -52,6 +52,7 @@ "completed": true } }, - "size": 36 + "rows": 36, + "cols": 36 } ] diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index efd79f66dc5..e583a1411a8 100644 --- 
a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -5,7 +5,6 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -DEBUG: begin stencil(100,7pt,stencil) Running test case { "size": 100, @@ -213,9 +212,9 @@ DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() Matrix is of size (125, 125) -DEBUG: begin cg +DEBUG: begin stencil(100,7pt,stencil) Running solver: cg -DEBUG: begin none +DEBUG: begin cg DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 @@ -670,8 +669,8 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end none DEBUG: end cg +DEBUG: end stencil(100,7pt,stencil) DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -686,4 +685,3 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(100,7pt,stencil) diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout index c61541a5d5b..aef92652256 100644 --- a/benchmark/test/reference/distributed_solver.profile.stdout +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": { @@ -27,6 +27,8 @@ "repetitions": 1, "completed": true } - } + }, + "rows": 125, + "cols": 125 } ] diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout index 54d7233ba77..002b9d91347 100644 --- a/benchmark/test/reference/distributed_solver.simple.stdout +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": { @@ -53,6 +53,8 @@ "repetitions": 1, "completed": true } - } + }, + "rows": 125, + "cols": 125 } ] diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr index af205c778c0..7bb33842f25 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stderr +++ b/benchmark/test/reference/matrix_statistics.matrix.stderr @@ -5,4 +5,4 @@ Running test case "filename": "", "problem": {} } -Matrix is of size (36, 36) +Matrix is of size (36, 36), 208 diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout index a056241669b..ea73587fde4 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stdout +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -33,6 +33,8 @@ "hyperflatness": 6.0545648993883665 } }, - "size": 36 + "rows": 36, + "cols": 36, + "nonzeros": 208 } ] diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr index 6b853c3f4ea..75a7cca709f 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stderr +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -6,4 +6,4 @@ Running test case "stencil": "7pt", "problem": {} } -Matrix is of size (125, 125) +Matrix is of size (125, 125), 725 diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout index 4470784e7c5..13746ce8a46 100644 --- 
a/benchmark/test/reference/matrix_statistics.simple.stdout +++ b/benchmark/test/reference/matrix_statistics.simple.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "problem": { "rows": 125, @@ -33,6 +33,9 @@ "hyperskewness": -1.741577812922432, "hyperflatness": 7.762345679012379 } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr index c9ef583d79e..4088a20c925 100644 --- a/benchmark/test/reference/preconditioner.matrix.stderr +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -9,34 +9,5 @@ Running test case "filename": "", "preconditioner": {} } -Matrix is of size (36, 36) -Current state: -[ - { - "filename": "", - "preconditioner": { - "none": { - "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, - "time": 1.0, - "repetitions": 10 - }, - "apply": { - "components": { - "apply()": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "overhead": 1.0 - }, - "time": 1.0, - "repetitions": 10 - }, - "completed": true - } - }, - "size": 36 - } -] +Matrix is of size (36, 36), 208 + Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout index 77979f4c54b..0415a87ea8d 100644 --- a/benchmark/test/reference/preconditioner.matrix.stdout +++ b/benchmark/test/reference/preconditioner.matrix.stdout @@ -25,6 +25,8 @@ "completed": true } }, - "size": 36 + "rows": 36, + "cols": 36, + "nonzeros": 208 } ] diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 5b47bc9bd94..c215b22c925 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -10,7 +10,6 @@ Running test case "stencil": "7pt", "preconditioner": {} } -DEBUG: begin stencil(100,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array @@ -59,7 +58,9 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -Matrix is of size (125, 125) +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100,7pt) + Running preconditioner: none DEBUG: begin none DEBUG: begin copy() DEBUG: begin allocate @@ -78,28 +79,7 @@ DEBUG: end apply() DEBUG: begin free DEBUG: end free DEBUG: end none -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "preconditioner": { - "none": { - "generate": { - "components": {}, - "time": 1.0, - "repetitions": 1 - }, - "apply": { - "components": {}, - "time": 1.0, - "repetitions": 1 - }, - "completed": true - } - } - } -] +DEBUG: end stencil(100,7pt) DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -110,4 +90,3 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout index cc73c4c4552..f53407d818d 100644 --- a/benchmark/test/reference/preconditioner.profile.stdout +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "preconditioner": { "none": { @@ -17,6 +17,9 @@ }, "completed": true } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/preconditioner.simple.stderr 
b/benchmark/test/reference/preconditioner.simple.stderr index d480d4fedbd..07d2cca6704 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -10,34 +10,5 @@ Running test case "stencil": "7pt", "preconditioner": {} } -Matrix is of size (125, 125) -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "preconditioner": { - "none": { - "generate": { - "components": { - "generate()": 1.0, - "overhead": 1.0 - }, - "time": 1.0, - "repetitions": 10 - }, - "apply": { - "components": { - "apply()": 1.0, - "copy()": 1.0, - "dense::copy": 1.0, - "overhead": 1.0 - }, - "time": 1.0, - "repetitions": 10 - }, - "completed": true - } - } - } -] +Matrix is of size (125, 125), 725 + Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout index c47146a72e1..92bb51ddb57 100644 --- a/benchmark/test/reference/preconditioner.simple.stdout +++ b/benchmark/test/reference/preconditioner.simple.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "preconditioner": { "none": { @@ -25,6 +25,9 @@ }, "completed": true } - } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 } ] diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout index 6a1f8ceb959..56577288c2d 100644 --- a/benchmark/test/reference/solver.matrix.stdout +++ b/benchmark/test/reference/solver.matrix.stdout @@ -50,6 +50,7 @@ "completed": true } }, - "size": 36 + "rows": 36, + "cols": 36 } ] diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 65b7560d936..0c3f7060796 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -5,7 +5,6 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -DEBUG: begin stencil(100,7pt) Running test case { "size": 100, @@ -62,9 +61,9 @@ DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() Matrix is of size (125, 125) -DEBUG: begin cg +DEBUG: begin stencil(100,7pt) Running solver: cg -DEBUG: begin none +DEBUG: begin cg DEBUG: begin allocate DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch @@ -425,8 +424,8 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end none DEBUG: end cg +DEBUG: end stencil(100,7pt) DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -437,4 +436,3 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout index 128a8a1f169..0148e6ef092 100644 --- a/benchmark/test/reference/solver.profile.stdout +++ b/benchmark/test/reference/solver.profile.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "optimal": { "spmv": "csr" @@ -26,6 +26,8 @@ "repetitions": 1, "completed": true } - } + }, + "rows": 125, + "cols": 125 } ] diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout index c6055339d67..b4e7b56b2bf 100644 --- a/benchmark/test/reference/solver.simple.stdout +++ b/benchmark/test/reference/solver.simple.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "optimal": { "spmv": "csr" @@ -50,6 
+50,8 @@ "repetitions": 1, "completed": true } - } + }, + "rows": 125, + "cols": 125 } ] diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr index 5001c604e72..ff52b6a3269 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stderr +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -3,34 +3,11 @@ This is Ginkgo 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 -The operations are transposeRunning test case +The operations are transpose +Running test case { "filename": "", "sparse_blas": {} } Matrix is of size (36, 36), 208 -Current state: -[ - { - "filename": "", - "sparse_blas": { - "transpose": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "components": { - "allocate": 1.0, - "components::fill_array": 1.0, - "csr::transpose": 1.0, - "free": 1.0, - "overhead": 1.0 - }, - "completed": true - } - }, - "rows": 36, - "cols": 36, - "nonzeros": 208 - } -] + Running sparse_blas: transpose diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index d05f5117b8e..d1434dad146 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -3,7 +3,8 @@ This is Ginkgo 1.7.0 (develop) Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 -The operations are transposeRunning test case +The operations are transpose +Running test case { "size": 100, "stencil": "7pt", @@ -35,6 +36,7 @@ DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin free DEBUG: end free DEBUG: begin stencil(100,7pt) + Running sparse_blas: transpose DEBUG: begin transpose DEBUG: begin allocate DEBUG: end allocate @@ -53,25 +55,6 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end transpose -Current state: -[ - { - "size": 100, - "stencil": "7pt", - "sparse_blas": { - "transpose": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 1, - "completed": true - } - }, - "rows": 125, - "cols": 125, - "nonzeros": 725 - } -] DEBUG: end stencil(100,7pt) DEBUG: begin free DEBUG: end free diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index bf5001f67b7..452374a9268 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -3,36 +3,12 @@ This is Ginkgo 1.7.0 (develop) Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 -The operations are transposeRunning test case +The operations are transpose +Running test case { "size": 100, "stencil": "7pt", "sparse_blas": {} } Matrix is of size (125, 125), 725 -Current state: -[ - { - "size": 100, - "stencil": "7pt", - "sparse_blas": { - "transpose": { - "time": 1.0, - "flops": 1.0, - "bandwidth": 1.0, - "repetitions": 10, - "components": { - "allocate": 1.0, - "components::fill_array": 1.0, - "csr::transpose": 1.0, - "free": 1.0, - "overhead": 1.0 - }, - "completed": true - } - }, - "rows": 125, - "cols": 125, - "nonzeros": 725 - } -] + Running sparse_blas: transpose diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr index 8d942cd0de5..a618da5b321 100644 --- a/benchmark/test/reference/spmv.matrix.stderr +++ 
b/benchmark/test/reference/spmv.matrix.stderr @@ -10,22 +10,5 @@ Running test case "filename": "", "spmv": {} } -Matrix is of size (36, 36) -Current state: -[ - { - "filename": "", - "spmv": { - "coo": { - "storage": 3328, - "max_relative_norm2": 1.0, - "time": 1.0, - "repetitions": 10, - "completed": true - } - }, - "size": 36, - "nnz": 208, - "optimal": {} - } -] +Matrix is of size (36, 36), 208 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout index 47035c27549..dc30ab6b284 100644 --- a/benchmark/test/reference/spmv.matrix.stdout +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -11,8 +11,9 @@ "completed": true } }, - "size": 36, - "nnz": 208, + "rows": 36, + "cols": 36, + "nonzeros": 208, "optimal": { "spmv": "coo" } diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 961ac587990..09a10b725ea 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -11,7 +11,6 @@ Running test case "stencil": "7pt", "spmv": {} } -DEBUG: begin stencil(100,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate @@ -52,13 +51,9 @@ DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -Matrix is of size (125, 125) -DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin dense::copy -DEBUG: end dense::copy -DEBUG: end copy() +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100,7pt) + Running spmv: coo DEBUG: begin coo DEBUG: begin allocate DEBUG: end allocate @@ -87,27 +82,8 @@ DEBUG: end free DEBUG: begin free DEBUG: end free DEBUG: end coo -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "spmv": { - "coo": { - "storage": 11600, - "time": 1.0, - "repetitions": 1, - "completed": true - } - }, - "nnz": 725, - "optimal": {} - } -] -DEBUG: begin free -DEBUG: end free +DEBUG: end stencil(100,7pt) DEBUG: begin free DEBUG: end free DEBUG: begin free DEBUG: end free -DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout index dacc490ddf0..5302d54f9f0 100644 --- a/benchmark/test/reference/spmv.profile.stdout +++ b/benchmark/test/reference/spmv.profile.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "spmv": { "coo": { @@ -11,7 +11,9 @@ "completed": true } }, - "nnz": 725, + "rows": 125, + "cols": 125, + "nonzeros": 725, "optimal": { "spmv": "coo" } diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr index dc9933b40ec..a910512ff31 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -11,22 +11,5 @@ Running test case "stencil": "7pt", "spmv": {} } -Matrix is of size (125, 125) -Current state: -[ - { - "size": 125, - "stencil": "7pt", - "spmv": { - "coo": { - "storage": 11600, - "max_relative_norm2": 1.0, - "time": 1.0, - "repetitions": 10, - "completed": true - } - }, - "nnz": 725, - "optimal": {} - } -] +Matrix is of size (125, 125), 725 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout index 90f8903a452..737938d7c96 100644 --- a/benchmark/test/reference/spmv.simple.stdout +++ b/benchmark/test/reference/spmv.simple.stdout @@ -1,7 +1,7 @@ [ { - "size": 125, + "size": 100, "stencil": "7pt", "spmv": { "coo": { @@ -12,7 +12,9 @@ "completed": true } }, - "nnz": 725, + "rows": 125, + 
"cols": 125, + "nonzeros": 725, "optimal": { "spmv": "coo" } diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index b7ec0e72cf1..41acb560ba1 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -53,10 +54,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include -#include -#include #include @@ -100,10 +97,6 @@ DEFINE_string( DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); -DEFINE_bool(keep_errors, true, - "If set, writes exception messages during the execution into the " - "JSON output"); - DEFINE_bool(nested_names, false, "If set, separately logs nested operations"); DEFINE_bool(profile, false, @@ -157,27 +150,32 @@ std::unique_ptr input_stream; * @param format the format of the benchmark input data */ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, - std::string& format) + std::string& format, bool do_print = true) { - std::ostringstream doc; - doc << header << "Usage: " << (*argv)[0] << " [options]\n" - << format - << " The results are written on standard output, in the same " - "format,\n" - << " but with test cases extended to include an additional member " - "\n" - << " object for each benchmark run.\n" - << " If run with a --backup flag, an intermediate result is " - "written \n" - << " to a file in the same format. The backup file can be used as " - "\n" - << " input to this test suite, and the benchmarking will \n" - << " continue from the point where the backup file was created."; - - gflags::SetUsageMessage(doc.str()); - std::ostringstream ver; - ver << gko::version_info::get(); - gflags::SetVersionString(ver.str()); + if (do_print) { + std::ostringstream doc; + doc << header << "Usage: " << (*argv)[0] << " [options]\n" + << format + << " The results are written on standard output, in the same " + "format,\n" + << " but with test cases extended to include an additional member " + "\n" + << " object for each benchmark run.\n" + << " If run with a --backup flag, an intermediate result is " + "written \n" + << " to a file in the same format. 
The backup file can be used as " + "\n" + << " input to this test suite, and the benchmarking will \n" + << " continue from the point where the backup file was created."; + + gflags::SetUsageMessage(doc.str()); + std::ostringstream ver; + ver << gko::version_info::get(); + gflags::SetVersionString(ver.str()); + } else { + gflags::SetUsageMessage(""); + gflags::SetVersionString(""); + } gflags::ParseCommandLineFlags(argc, argv, true); if (FLAGS_profile) { FLAGS_repetitions = "1"; @@ -206,20 +204,19 @@ void print_general_information(const std::string& extra) { std::clog << gko::version_info::get() << std::endl << "Running on " << FLAGS_executor << "(" << FLAGS_device_id - << ")" << std::endl + << ")\n" << "Running with " << FLAGS_warmup << " warm iterations and "; if (FLAGS_repetitions == "auto") { std::clog << "adaptively determined repetititions with " << FLAGS_min_repetitions << " <= rep <= " << FLAGS_max_repetitions - << " and a minimal runtime of " << FLAGS_min_runtime << "s" - << std::endl; + << " and a minimal runtime of " << FLAGS_min_runtime << "s\n"; } else { - std::clog << FLAGS_repetitions << " running iterations" << std::endl; + std::clog << FLAGS_repetitions << " running iterations\n"; } std::clog << "The random seed for right hand sides is " << FLAGS_seed - << std::endl - << extra; + << '\n' + << extra << '\n'; } @@ -319,7 +316,7 @@ std::istream& get_input_stream() // backup generation -void backup_results(rapidjson::Document& results) +void backup_results(json& results) { static int next = 0; static auto filenames = []() -> std::array { @@ -576,279 +573,4 @@ gko::remove_complex compute_max_relative_norm2( } -/** - * A class for controlling the number warmup and timed iterations. - * - * The behavior is determined by the following flags - * - 'repetitions' switch between fixed and adaptive number of iterations - * - 'warmup' warmup iterations, applies in fixed and adaptive case - * - 'min_repetitions' minimal number of repetitions (adaptive case) - * - 'max_repetitions' maximal number of repetitions (adaptive case) - * - 'min_runtime' minimal total runtime (adaptive case) - * - 'repetition_growth_factor' controls the increase between two successive - * timings - * - * Usage: - * `IterationControl` exposes the member functions: - * - `warmup_run()`: controls run defined by `warmup` flag - * - `run(bool)`: controls run defined by all other flags - * - `get_timer()`: access to underlying timer - * The first two methods return an object that is to be used in a range-based - * for loop: - * ``` - * IterationControl ic(get_timer(...)); - * - * // warmup run always uses fixed number of iteration and does not issue - * // timings - * for(auto status: ic.warmup_run()){ - * // execute benchmark - * } - * // run may use adaptive number of iterations (depending on cmd line flag) - * // and issues timing (unless manage_timings is false) - * for(auto status: ic.run(manage_timings [default is true])){ - * if(! manage_timings) ic.get_timer->tic(); - * // execute benchmark - * if(! manage_timings) ic.get_timer->toc(); - * } - * - * ``` - * At the beginning of both methods, the timer is reset. - * The `status` object exposes the member - * - `cur_it`, containing the current iteration number, - * and the methods - * - `is_finished`, checks if the benchmark is finished, - */ -class IterationControl { - using IndexType = unsigned int; //!< to be compatible with GFLAGS type - - class run_control; - -public: - /** - * Creates an `IterationControl` object. 
- * - * Uses the commandline flags to setup the stopping criteria for the - * warmup and timed run. - * - * @param timer the timer that is to be used for the timings - */ - explicit IterationControl(const std::shared_ptr& timer) - { - status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup, - FLAGS_warmup, 0., 0}; - if (FLAGS_repetitions == "auto") { - status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions, - FLAGS_max_repetitions, FLAGS_min_runtime}; - } else { - const auto reps = - static_cast(std::stoi(FLAGS_repetitions)); - status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0}; - } - } - - IterationControl() = default; - IterationControl(const IterationControl&) = default; - IterationControl(IterationControl&&) = default; - - /** - * Creates iterable `run_control` object for the warmup run. - * - * This run uses always a fixed number of iterations. - */ - run_control warmup_run() - { - status_warmup_.cur_it = 0; - status_warmup_.managed_timer.clear(); - return run_control{&status_warmup_}; - } - - /** - * Creates iterable `run_control` object for the timed run. - * - * This run may be adaptive, depending on the commandline flags. - * - * @param manage_timings If true, the timer calls (`tic/toc`) are handled - * by the `run_control` object, otherwise they need to be executed outside - */ - run_control run(bool manage_timings = true) - { - status_run_.cur_it = 0; - status_run_.managed_timer.clear(); - status_run_.managed_timer.manage_timings = manage_timings; - return run_control{&status_run_}; - } - - std::shared_ptr get_timer() const - { - return status_run_.managed_timer.timer; - } - - /** - * Compute the time from the given statistical method - * - * @param method the statistical method. If the timer does not have the - * same iteration as the IterationControl, it can only use - * average from the IterationControl. - * - * @return the statistical time - */ - double compute_time(const std::string& method = "average") const - { - if (status_run_.managed_timer.timer->get_num_repetitions() == - this->get_num_repetitions()) { - return status_run_.managed_timer.compute_time(method); - } else { - assert(method == "average"); - return status_run_.managed_timer.get_total_time() / - this->get_num_repetitions(); - } - } - - IndexType get_num_repetitions() const { return status_run_.cur_it; } - -private: - struct TimerManager { - std::shared_ptr timer; - bool manage_timings = false; - - void tic() - { - if (manage_timings) { - timer->tic(); - } - } - void toc(unsigned int num = 1) - { - if (manage_timings) { - timer->toc(num); - } - } - - void clear() { timer->clear(); } - - double get_total_time() const { return timer->get_total_time(); } - - double compute_time(const std::string& method = "average") const - { - return timer->compute_time(method); - } - }; - - /** - * Stores stopping criteria of the adaptive benchmark run as well as the - * current iteration number. 
- */ - struct status { - TimerManager managed_timer{}; - - IndexType min_it = 0; - IndexType max_it = 0; - double max_runtime = 0.; - - IndexType cur_it = 0; - - /** - * checks if the adaptive run is complete - * - * the adaptive run is complete if: - * - the minimum number of iteration is reached - * - and either: - * - the maximum number of repetitions is reached - * - the total runtime is above the threshold - * - * @return completeness state of the adaptive run - */ - bool is_finished() const - { - return cur_it >= min_it && - (cur_it >= max_it || - managed_timer.get_total_time() >= max_runtime); - } - }; - - /** - * Iterable class managing the benchmark iteration. - * - * Has to be used in a range-based for loop. - */ - struct run_control { - struct iterator { - /** - * Increases the current iteration count and finishes timing if - * necessary. - * - * As `++it` is the last step of a for-loop, the managed_timer is - * stopped, if enough iterations have passed since the last timing. - * The interval between two timings is steadily increased to - * reduce the timing overhead. - */ - iterator operator++() - { - cur_info->cur_it++; - if (cur_info->cur_it >= next_timing && !stopped) { - cur_info->managed_timer.toc( - static_cast(cur_info->cur_it - start_timing)); - stopped = true; - next_timing = static_cast(std::ceil( - next_timing * FLAGS_repetition_growth_factor)); - // If repetition_growth_factor <= 1, next_timing will be - // next iteration. - if (next_timing <= cur_info->cur_it) { - next_timing = cur_info->cur_it + 1; - } - } - return *this; - } - - status operator*() const { return *cur_info; } - - /** - * Checks if the benchmark is finished and handles timing, if - * necessary. - * - * As `begin != end` is the first step in a for-loop, the - * managed_timer is started, if it was previously stopped. - * Additionally, if the benchmark is complete and the managed_timer - * is still running it is stopped. (This may occur if the maximal - * number of repetitions is surpassed) - * - * Uses only the information from the `status` object, i.e. - * the right hand side is ignored. 
- * - * @return true if benchmark is not finished, else false - */ - bool operator!=(const iterator&) - { - const bool is_finished = cur_info->is_finished(); - if (!is_finished && stopped) { - stopped = false; - cur_info->managed_timer.tic(); - start_timing = cur_info->cur_it; - } else if (is_finished && !stopped) { - cur_info->managed_timer.toc( - static_cast(cur_info->cur_it - start_timing)); - stopped = true; - } - return !is_finished; - } - - status* cur_info; - IndexType next_timing = 1; //!< next iteration to stop timing - IndexType start_timing = 0; //!< iteration for starting timing - bool stopped = true; - }; - - iterator begin() const { return iterator{info}; } - - // not used, could potentially be used in c++17 as a sentinel - iterator end() const { return iterator{}; } - - status* info; - }; - - status status_warmup_; - status status_run_; -}; - - #endif // GKO_BENCHMARK_UTILS_GENERAL_HPP_ diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index 2049dadf45f..39d8b5a8107 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -57,9 +57,9 @@ DEFINE_string(input_matrix, "", */ void initialize_argument_parsing_matrix( int* argc, char** argv[], std::string& header, std::string& format, - std::string additional_matrix_file_json = "") + std::string additional_matrix_file_json = "", bool do_print = true) { - initialize_argument_parsing(argc, argv, header, format); + initialize_argument_parsing(argc, argv, header, format, do_print); std::string input_matrix_str{FLAGS_input_matrix}; if (!input_matrix_str.empty()) { if (input_stream) { @@ -67,17 +67,13 @@ void initialize_argument_parsing_matrix( << "-input and -input_matrix cannot be used simultaneously\n"; std::exit(1); } - // create JSON for the filename via RapidJSON to ensure the string is - // correctly escaped - rapidjson::Document d; + // create JSON for the filename via nlohmann_json to ensure the string + // is correctly escaped auto json_template = R"([{"filename":"")" + additional_matrix_file_json + "}]"; - d.Parse(json_template.c_str()); - d[0]["filename"].SetString(input_matrix_str.c_str(), d.GetAllocator()); - rapidjson::StringBuffer sb; - rapidjson::PrettyWriter writer(sb); - d.Accept(writer); - input_stream = std::make_unique(sb.GetString()); + auto doc = json::parse(json_template); + doc[0]["filename"] = input_matrix_str; + input_stream = std::make_unique(doc.dump()); } } diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 076d2954980..257a2384634 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -53,28 +53,45 @@ struct DefaultSystemGenerator { using Vec = vec; static gko::matrix_data generate_matrix_data( - rapidjson::Value& config) + const json& config) { - if (config.HasMember("filename")) { - std::ifstream in(config["filename"].GetString()); + if (config.contains("filename")) { + std::ifstream in(config["filename"].get()); return gko::read_generic_raw(in); - } else if (config.HasMember("stencil")) { + } else if (config.contains("stencil")) { return generate_stencil( - config["stencil"].GetString(), config["size"].GetInt64()); + config["stencil"].get(), + config["size"].get()); } else { throw std::runtime_error( "No known way to generate matrix data found."); } } - static std::string describe_config(rapidjson::Value& config) + static std::string get_example_config() { - if (config.HasMember("filename")) { - return config["filename"].GetString(); - } else if (config.HasMember("stencil")) { + 
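As an aside, the escaping guarantee that the new nlohmann_json code in general_matrix.hpp relies on can be checked with a small standalone sketch; the path below is invented for the example and is not part of the benchmark suite:

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main()
{
    // Hypothetical -input_matrix value containing characters that must be escaped.
    std::string input_matrix_str = R"(matrices/my "7pt" stencil\test.mtx)";
    // Same pattern as initialize_argument_parsing_matrix: parse a template,
    // then assign the string through the json object so quoting is handled.
    auto doc = json::parse(R"([{"filename": ""}])");
    doc[0]["filename"] = input_matrix_str;
    std::cout << doc.dump(4) << '\n';  // valid JSON, quotes and backslash escaped
}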
return json:: + parse(R"([{"filename": "my_file.mtx"},{"filename": "my_file2.mtx"},{"size": 100, "stencil": "7pt"}])") + .dump(4); + } + + static bool validate_config(const json& test_case) + { + return ((test_case.contains("size") && test_case.contains("stencil") && + test_case["size"].is_number_integer() && + test_case["stencil"].is_string()) || + (test_case.contains("filename") && + test_case["filename"].is_string())); + } + + static std::string describe_config(const json& config) + { + if (config.contains("filename")) { + return config["filename"].get(); + } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].GetInt64() << "," - << config["stencil"].GetString() << ")"; + ss << "stencil(" << config["size"].get() << "," + << config["stencil"].get() << ")"; return ss.str(); } else { throw std::runtime_error("No known way to describe config."); @@ -82,30 +99,30 @@ struct DefaultSystemGenerator { } static std::shared_ptr generate_matrix_with_optimal_format( - std::shared_ptr exec, rapidjson::Value& config) + std::shared_ptr exec, json& config) { auto data = generate_matrix_data(config); return generate_matrix_with_format( - std::move(exec), config["optimal"]["spmv"].GetString(), data); + std::move(exec), config["optimal"]["spmv"].get(), + data); } static std::shared_ptr generate_matrix_with_format( std::shared_ptr exec, const std::string& format_name, const gko::matrix_data& data, - rapidjson::Value* spmv_case = nullptr, - rapidjson::MemoryPoolAllocator<>* allocator = nullptr) + json* spmv_case = nullptr) { auto storage_logger = std::make_shared(); - if (spmv_case && allocator) { + if (spmv_case) { exec->add_logger(storage_logger); } auto mtx = gko::share(::formats::matrix_factory(format_name, exec, data)); - if (spmv_case && allocator) { + if (spmv_case) { exec->remove_logger(storage_logger); - storage_logger->write_data(*spmv_case, *allocator); + storage_logger->write_data(*spmv_case); } return mtx; @@ -172,32 +189,51 @@ struct DistributedDefaultSystemGenerator { using Vec = dist_vec; gko::matrix_data generate_matrix_data( - rapidjson::Value& config) const + const json& config) const { - if (config.HasMember("filename")) { - std::ifstream in(config["filename"].GetString()); + if (config.contains("filename")) { + std::ifstream in(config["filename"].get()); return gko::read_generic_raw(in); - } else if (config.HasMember("stencil")) { + } else if (config.contains("stencil")) { auto local_size = static_cast( - config["size"].GetInt64() / comm.size()); + config["size"].get() / comm.size()); return generate_stencil( - config["stencil"].GetString(), comm, local_size, - config["comm_pattern"].GetString() == std::string("optimal")); + config["stencil"].get(), comm, local_size, + config["comm_pattern"].get() == + std::string("optimal")); } else { throw std::runtime_error( "No known way to generate matrix data found."); } } - std::string describe_config(rapidjson::Value& config) const + static std::string get_example_config() { - if (config.HasMember("filename")) { - return config["filename"].GetString(); - } else if (config.HasMember("stencil")) { + return json:: + parse(R"([{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, {"filename": "my_file.mtx"}])") + .dump(4); + } + + static bool validate_config(const json& test_case) + { + return ((test_case.contains("size") && test_case.contains("stencil") && + test_case.contains("comm_pattern") && + test_case["size"].is_number_integer() && + test_case["stencil"].is_string() && + 
test_case["comm_pattern"].is_string()) || + (test_case.contains("filename") && + test_case["filename"].is_string())); + } + + static std::string describe_config(const json& config) + { + if (config.contains("filename")) { + return config["filename"].get(); + } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].GetInt64() << "," - << config["stencil"].GetString() << "," - << config["comm_pattern"].GetString() << ")"; + ss << "stencil(" << config["size"].get() << "," + << config["stencil"].get() << "," + << config["comm_pattern"].get() << ")"; return ss.str(); } else { throw std::runtime_error("No known way to describe config."); @@ -205,29 +241,33 @@ struct DistributedDefaultSystemGenerator { } std::shared_ptr generate_matrix_with_optimal_format( - std::shared_ptr exec, rapidjson::Value& config) const + std::shared_ptr exec, json& config) const { auto data = generate_matrix_data(config); return generate_matrix_with_format( - std::move(exec), config["optimal"]["spmv"].GetString(), data); + std::move(exec), config["optimal"]["spmv"].get(), + data); } std::shared_ptr generate_matrix_with_format( std::shared_ptr exec, const std::string& format_name, const gko::matrix_data& data, - rapidjson::Value* spmv_case = nullptr, - rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const + json* spmv_case = nullptr) const { auto part = gko::experimental::distributed:: Partition::build_from_global_size_uniform( exec, comm.size(), static_cast(data.size[0])); auto formats = split(format_name, '-'); + if (formats.size() != 2) { + throw std::runtime_error{"Invalid distributed format specifier " + + format_name}; + } auto local_mat = formats::matrix_type_factory.at(formats[0])(exec); auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec); auto storage_logger = std::make_shared(); - if (spmv_case && allocator) { + if (spmv_case) { exec->add_logger(storage_logger); } @@ -235,9 +275,9 @@ struct DistributedDefaultSystemGenerator { exec, comm, local_mat, non_local_mat); dist_mat->read_distributed(data, part); - if (spmv_case && allocator) { + if (spmv_case) { exec->remove_logger(storage_logger); - storage_logger->write_data(comm, *spmv_case, *allocator); + storage_logger->write_data(comm, *spmv_case); } return dist_mat; diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp new file mode 100644 index 00000000000..295ae7870d6 --- /dev/null +++ b/benchmark/utils/iteration_control.hpp @@ -0,0 +1,326 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ +#define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ + + +#include + + +#include +#include +#include + + +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" +#include "core/distributed/helpers.hpp" + + +/** + * A class for controlling the number warmup and timed iterations. + * + * The behavior is determined by the following flags + * - 'repetitions' switch between fixed and adaptive number of iterations + * - 'warmup' warmup iterations, applies in fixed and adaptive case + * - 'min_repetitions' minimal number of repetitions (adaptive case) + * - 'max_repetitions' maximal number of repetitions (adaptive case) + * - 'min_runtime' minimal total runtime (adaptive case) + * - 'repetition_growth_factor' controls the increase between two successive + * timings + * + * Usage: + * `IterationControl` exposes the member functions: + * - `warmup_run()`: controls run defined by `warmup` flag + * - `run(bool)`: controls run defined by all other flags + * - `get_timer()`: access to underlying timer + * The first two methods return an object that is to be used in a range-based + * for loop: + * ``` + * IterationControl ic(get_timer(...)); + * + * // warmup run always uses fixed number of iteration and does not issue + * // timings + * for(auto status: ic.warmup_run()){ + * // execute benchmark + * } + * // run may use adaptive number of iterations (depending on cmd line flag) + * // and issues timing (unless manage_timings is false) + * for(auto status: ic.run(manage_timings [default is true])){ + * if(! manage_timings) ic.get_timer->tic(); + * // execute benchmark + * if(! manage_timings) ic.get_timer->toc(); + * } + * + * ``` + * At the beginning of both methods, the timer is reset. + * The `status` object exposes the member + * - `cur_it`, containing the current iteration number, + * and the methods + * - `is_finished`, checks if the benchmark is finished, + */ +class IterationControl { + using IndexType = unsigned int; //!< to be compatible with GFLAGS type + + class run_control; + +public: + /** + * Creates an `IterationControl` object. + * + * Uses the commandline flags to setup the stopping criteria for the + * warmup and timed run. 
+ * + * @param timer the timer that is to be used for the timings + */ + explicit IterationControl(const std::shared_ptr& timer) + { + status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup, + FLAGS_warmup, 0., 0}; + if (FLAGS_repetitions == "auto") { + status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions, + FLAGS_max_repetitions, FLAGS_min_runtime}; + } else { + const auto reps = + static_cast(std::stoi(FLAGS_repetitions)); + status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0}; + } + } + + IterationControl() = default; + IterationControl(const IterationControl&) = default; + IterationControl(IterationControl&&) = default; + + /** + * Creates iterable `run_control` object for the warmup run. + * + * This run uses always a fixed number of iterations. + */ + run_control warmup_run() + { + status_warmup_.cur_it = 0; + status_warmup_.managed_timer.clear(); + return run_control{&status_warmup_}; + } + + /** + * Creates iterable `run_control` object for the timed run. + * + * This run may be adaptive, depending on the commandline flags. + * + * @param manage_timings If true, the timer calls (`tic/toc`) are handled + * by the `run_control` object, otherwise they need to be executed outside + */ + run_control run(bool manage_timings = true) + { + status_run_.cur_it = 0; + status_run_.managed_timer.clear(); + status_run_.managed_timer.manage_timings = manage_timings; + return run_control{&status_run_}; + } + + std::shared_ptr get_timer() const + { + return status_run_.managed_timer.timer; + } + + /** + * Compute the time from the given statistical method + * + * @param method the statistical method. If the timer does not have the + * same iteration as the IterationControl, it can only use + * average from the IterationControl. + * + * @return the statistical time + */ + double compute_time(const std::string& method = "average") const + { + if (status_run_.managed_timer.timer->get_num_repetitions() == + this->get_num_repetitions()) { + return status_run_.managed_timer.compute_time(method); + } else { + assert(method == "average"); + return status_run_.managed_timer.get_total_time() / + this->get_num_repetitions(); + } + } + + IndexType get_num_repetitions() const { return status_run_.cur_it; } + +private: + struct TimerManager { + std::shared_ptr timer; + bool manage_timings = false; + + void tic() + { + if (manage_timings) { + timer->tic(); + } + } + void toc(unsigned int num = 1) + { + if (manage_timings) { + timer->toc(num); + } + } + + void clear() { timer->clear(); } + + double get_total_time() const { return timer->get_total_time(); } + + double compute_time(const std::string& method = "average") const + { + return timer->compute_time(method); + } + }; + + /** + * Stores stopping criteria of the adaptive benchmark run as well as the + * current iteration number. + */ + struct status { + TimerManager managed_timer{}; + + IndexType min_it = 0; + IndexType max_it = 0; + double max_runtime = 0.; + + IndexType cur_it = 0; + + /** + * checks if the adaptive run is complete + * + * the adaptive run is complete if: + * - the minimum number of iteration is reached + * - and either: + * - the maximum number of repetitions is reached + * - the total runtime is above the threshold + * + * @return completeness state of the adaptive run + */ + bool is_finished() const + { + return cur_it >= min_it && + (cur_it >= max_it || + managed_timer.get_total_time() >= max_runtime); + } + }; + + /** + * Iterable class managing the benchmark iteration. 
+ * + * Has to be used in a range-based for loop. + */ + struct run_control { + struct iterator { + /** + * Increases the current iteration count and finishes timing if + * necessary. + * + * As `++it` is the last step of a for-loop, the managed_timer is + * stopped, if enough iterations have passed since the last timing. + * The interval between two timings is steadily increased to + * reduce the timing overhead. + */ + iterator operator++() + { + cur_info->cur_it++; + if (cur_info->cur_it >= next_timing && !stopped) { + cur_info->managed_timer.toc( + static_cast(cur_info->cur_it - start_timing)); + stopped = true; + next_timing = static_cast(std::ceil( + next_timing * FLAGS_repetition_growth_factor)); + // If repetition_growth_factor <= 1, next_timing will be + // next iteration. + if (next_timing <= cur_info->cur_it) { + next_timing = cur_info->cur_it + 1; + } + } + return *this; + } + + status operator*() const { return *cur_info; } + + /** + * Checks if the benchmark is finished and handles timing, if + * necessary. + * + * As `begin != end` is the first step in a for-loop, the + * managed_timer is started, if it was previously stopped. + * Additionally, if the benchmark is complete and the managed_timer + * is still running it is stopped. (This may occur if the maximal + * number of repetitions is surpassed) + * + * Uses only the information from the `status` object, i.e. + * the right hand side is ignored. + * + * @return true if benchmark is not finished, else false + */ + bool operator!=(const iterator&) + { + const bool is_finished = cur_info->is_finished(); + if (!is_finished && stopped) { + stopped = false; + cur_info->managed_timer.tic(); + start_timing = cur_info->cur_it; + } else if (is_finished && !stopped) { + cur_info->managed_timer.toc( + static_cast(cur_info->cur_it - start_timing)); + stopped = true; + } + return !is_finished; + } + + status* cur_info; + IndexType next_timing = 1; //!< next iteration to stop timing + IndexType start_timing = 0; //!< iteration for starting timing + bool stopped = true; + }; + + iterator begin() const { return iterator{info}; } + + // not used, could potentially be used in c++17 as a sentinel + iterator end() const { return iterator{}; } + + status* info; + }; + + status status_warmup_; + status status_run_; +}; + + +#endif // GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ diff --git a/benchmark/utils/json.hpp b/benchmark/utils/json.hpp index b0cd384cae5..684db0229aa 100644 --- a/benchmark/utils/json.hpp +++ b/benchmark/utils/json.hpp @@ -34,69 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_BENCHMARK_UTILS_JSON_HPP_ -#include +#include -#include - - -#include -#include -#include -#include - - -// helper for setting rapidjson object members -template -std::enable_if_t< - !std::is_same::type, gko::size_type>::value, void> -add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value, - Allocator&& allocator) -{ - if (object.HasMember(name)) { - object[name] = std::forward(value); - } else { - auto n = rapidjson::Value(name, allocator); - object.AddMember(n, std::forward(value), allocator); - } -} - - -/** - @internal This is required to fix some MacOS problems (and possibly other - compilers). There is no explicit RapidJSON constructor for `std::size_t` so a - conversion to a known constructor is required to solve any ambiguity. See the - last comments of https://github.com/ginkgo-project/ginkgo/issues/270. 
- */ -template -std::enable_if_t< - std::is_same::type, gko::size_type>::value, void> -add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value, - Allocator&& allocator) -{ - if (object.HasMember(name)) { - object[name] = - std::forward(static_cast(value)); - } else { - auto n = rapidjson::Value(name, allocator); - object.AddMember( - n, std::forward(static_cast(value)), - allocator); - } -} - - -// helper for writing out rapidjson Values -inline std::ostream& operator<<(std::ostream& os, const rapidjson::Value& value) -{ - rapidjson::OStreamWrapper jos(os); - rapidjson::PrettyWriter, - rapidjson::UTF8<>, rapidjson::CrtAllocator, - rapidjson::kWriteNanAndInfFlag> - writer(jos); - value.Accept(writer); - return os; -} +using json = nlohmann::ordered_json; #endif // GKO_BENCHMARK_UTILS_JSON_HPP_ diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index e3e6228604e..1e651811f0f 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -50,10 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, gko::log::ProfilerHook::NestedSummaryWriter { - JsonSummaryWriter(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& alloc, - gko::uint32 repetitions) - : object{&object}, alloc{&alloc}, repetitions{repetitions} + JsonSummaryWriter(json& object, gko::uint32 repetitions) + : object{&object}, repetitions{repetitions} {} void write( @@ -62,13 +60,11 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, { for (const auto& entry : entries) { if (entry.name != "total") { - add_or_set_member(*object, entry.name.c_str(), - entry.exclusive.count() * 1e-9 / repetitions, - *alloc); + (*object)[entry.name] = + entry.exclusive.count() * 1e-9 / repetitions; } } - add_or_set_member(*object, "overhead", - overhead.count() * 1e-9 / repetitions, *alloc); + (*object)["overhead"] = overhead.count() * 1e-9 / repetitions; } void write_nested(const gko::log::ProfilerHook::nested_summary_entry& root, @@ -84,27 +80,24 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, visit(visit, child, new_prefix); exclusive -= child.elapsed; } - add_or_set_member(*object, (prefix + node.name).c_str(), - exclusive.count() * 1e-9 / repetitions, *alloc); + (*object)[prefix + node.name] = + exclusive.count() * 1e-9 / repetitions; }; // we don't need to annotate the total for (const auto& child : root.children) { visit(visit, child, ""); } - add_or_set_member(*object, "overhead", - overhead.count() * 1e-9 / repetitions, *alloc); + (*object)["overhead"] = overhead.count() * 1e-9 / repetitions; } - rapidjson::Value* object; - rapidjson::MemoryPoolAllocator<>* alloc; + json* object; gko::uint32 repetitions; }; inline std::shared_ptr create_operations_logger( bool gpu_timer, bool nested, std::shared_ptr exec, - rapidjson::Value& object, rapidjson::MemoryPoolAllocator<>& alloc, - gko::uint32 repetitions) + json& object, gko::uint32 repetitions) { std::shared_ptr timer; if (gpu_timer) { @@ -114,12 +107,10 @@ inline std::shared_ptr create_operations_logger( } if (nested) { return gko::log::ProfilerHook::create_nested_summary( - timer, - std::make_unique(object, alloc, repetitions)); + timer, std::make_unique(object, repetitions)); } else { return gko::log::ProfilerHook::create_summary( - timer, - std::make_unique(object, alloc, repetitions)); + timer, std::make_unique(object, repetitions)); } } @@ -140,21 +131,18 @@ struct StorageLogger : gko::log::Logger { 
storage[location] = 0; } - void write_data(rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) + void write_data(json& output) { const std::lock_guard lock(mutex); gko::size_type total{}; for (const auto& e : storage) { total += e.second; } - add_or_set_member(output, "storage", total, allocator); + output["storage"] = total; } #if GINKGO_BUILD_MPI - void write_data(gko::experimental::mpi::communicator comm, - rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) + void write_data(gko::experimental::mpi::communicator comm, json& output) { const std::lock_guard lock(mutex); gko::size_type total{}; @@ -166,7 +154,7 @@ struct StorageLogger : gko::log::Logger { ? static_cast(MPI_IN_PLACE) : &total, &total, 1, MPI_SUM, 0); - add_or_set_member(output, "storage", total, allocator); + output["storage"] = total; } #endif @@ -191,17 +179,16 @@ struct ResidualLogger : gko::log::Logger { const gko::array* status, bool all_stopped) const override { - timestamps.PushBack(std::chrono::duration( - std::chrono::steady_clock::now() - start) - .count(), - alloc); + timestamps.push_back(std::chrono::duration( + std::chrono::steady_clock::now() - start) + .count()); if (residual_norm) { - rec_res_norms.PushBack( - get_norm(gko::as>(residual_norm)), alloc); + rec_res_norms.push_back( + get_norm(gko::as>(residual_norm))); } else { gko::detail::vector_dispatch( residual, [&](const auto v_residual) { - rec_res_norms.PushBack(compute_norm2(v_residual), alloc); + rec_res_norms.push_back(compute_norm2(v_residual)); }); } if (solution) { @@ -209,32 +196,25 @@ struct ResidualLogger : gko::log::Logger { rc_vtype>(solution, [&](auto v_solution) { using concrete_type = std::remove_pointer_t>; - true_res_norms.PushBack( - compute_residual_norm(matrix, gko::as(b), - v_solution), - alloc); + true_res_norms.push_back(compute_residual_norm( + matrix, gko::as(b), v_solution)); }); } else { - true_res_norms.PushBack(-1.0, alloc); + true_res_norms.push_back(-1.0); } if (implicit_sq_residual_norm) { - implicit_res_norms.PushBack( - std::sqrt(get_norm( - gko::as>(implicit_sq_residual_norm))), - alloc); + implicit_res_norms.push_back(std::sqrt( + get_norm(gko::as>(implicit_sq_residual_norm)))); has_implicit_res_norm = true; } else { - implicit_res_norms.PushBack(-1.0, alloc); + implicit_res_norms.push_back(-1.0); } } ResidualLogger(gko::ptr_param matrix, - gko::ptr_param b, - rapidjson::Value& rec_res_norms, - rapidjson::Value& true_res_norms, - rapidjson::Value& implicit_res_norms, - rapidjson::Value& timestamps, - rapidjson::MemoryPoolAllocator<>& alloc) + gko::ptr_param b, json& rec_res_norms, + json& true_res_norms, json& implicit_res_norms, + json& timestamps) : gko::log::Logger(gko::log::Logger::iteration_complete_mask), matrix{matrix.get()}, b{b.get()}, @@ -243,8 +223,7 @@ struct ResidualLogger : gko::log::Logger { true_res_norms{true_res_norms}, has_implicit_res_norm{}, implicit_res_norms{implicit_res_norms}, - timestamps{timestamps}, - alloc{alloc} + timestamps{timestamps} {} bool has_implicit_res_norms() const { return has_implicit_res_norm; } @@ -253,12 +232,11 @@ struct ResidualLogger : gko::log::Logger { const gko::LinOp* matrix; const gko::LinOp* b; std::chrono::steady_clock::time_point start; - rapidjson::Value& rec_res_norms; - rapidjson::Value& true_res_norms; + json& rec_res_norms; + json& true_res_norms; mutable bool has_implicit_res_norm; - rapidjson::Value& implicit_res_norms; - rapidjson::Value& timestamps; - rapidjson::MemoryPoolAllocator<>& alloc; + json& 
implicit_res_norms; + json& timestamps; }; @@ -279,11 +257,7 @@ struct IterationLogger : gko::log::Logger { : gko::log::Logger(gko::log::Logger::iteration_complete_mask) {} - void write_data(rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) - { - add_or_set_member(output, "iterations", this->num_iters, allocator); - } + void write_data(json& output) { output["iterations"] = this->num_iters; } private: mutable gko::size_type num_iters{0}; diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp new file mode 100644 index 00000000000..3520f7299ee --- /dev/null +++ b/benchmark/utils/runner.hpp @@ -0,0 +1,209 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_RUNNER_HPP_ +#define GKO_BENCHMARK_UTILS_RUNNER_HPP_ + + +#include + + +#include +#include +#include + + +#include "benchmark/utils/general.hpp" + + +std::shared_ptr create_profiler_hook( + std::shared_ptr exec, bool do_print = true) +{ + using gko::log::ProfilerHook; + std::map()>> + hook_map{ + {"none", [] { return std::shared_ptr{}; }}, + {"auto", [&] { return ProfilerHook::create_for_executor(exec); }}, + {"nvtx", [] { return ProfilerHook::create_nvtx(); }}, + {"roctx", [] { return ProfilerHook::create_roctx(); }}, + {"tau", [] { return ProfilerHook::create_tau(); }}, + {"vtune", [] { return ProfilerHook::create_vtune(); }}, + {"debug", [do_print] { + return ProfilerHook::create_custom( + [do_print](const char* name, + gko::log::profile_event_category) { + if (do_print) { + std::clog << "DEBUG: begin " << name << '\n'; + } + }, + [do_print](const char* name, + gko::log::profile_event_category) { + if (do_print) { + std::clog << "DEBUG: end " << name << '\n'; + } + }); + }}}; + return hook_map.at(FLAGS_profiler_hook)(); +} + + +template +struct Benchmark { + /** The name to be used in the JSON output. 
*/ + virtual const std::string& get_name() const = 0; + + /** The operations to loop over for each test case. */ + virtual const std::vector& get_operations() const = 0; + + /** Should we write logging output? */ + virtual bool should_print() const = 0; + + /** Example JSON input */ + virtual std::string get_example_config() const = 0; + + /** Is the input test case in the correct format? */ + virtual bool validate_config(const json& value) const = 0; + + /** Textual representation of the test case for profiler annotation */ + virtual std::string describe_config(const json& test_case) const = 0; + + /** Sets up shared state and test case info */ + virtual State setup(std::shared_ptr exec, + json& test_case) const = 0; + + /** Runs a single operation of the benchmark */ + virtual void run(std::shared_ptr exec, + std::shared_ptr timer, State& state, + const std::string& operation, + json& operation_case) const = 0; + + /** Post-process test case info. */ + virtual void postprocess(json& test_case) const {} +}; + + +template +void run_test_cases(const Benchmark& benchmark, + std::shared_ptr exec, + std::shared_ptr timer, json& test_cases) +{ + if (!test_cases.is_array()) { + if (benchmark.should_print()) { + std::cerr + << "Input has to be a JSON array of benchmark configurations:\n" + << benchmark.get_example_config() << std::endl; + } + std::exit(1); + } + for (const auto& test_case : test_cases) { + if (!test_case.is_object() || !benchmark.validate_config(test_case)) { + if (benchmark.should_print()) { + std::cerr << "Invalid test case:\n" + << std::setw(4) << test_case << "\nInput format:\n" + << benchmark.get_example_config() << std::endl; + } + std::exit(2); + } + } + + auto profiler_hook = create_profiler_hook(exec, benchmark.should_print()); + if (profiler_hook) { + exec->add_logger(profiler_hook); + } + auto annotate = + [profiler_hook](const char* name) -> gko::log::profiling_scope_guard { + if (profiler_hook) { + return profiler_hook->user_range(name); + } + return {}; + }; + + for (auto& test_case : test_cases) { + try { + // set up benchmark + if (!test_case.contains(benchmark.get_name())) { + test_case[benchmark.get_name()] = json::object(); + } + if (benchmark.should_print()) { + std::clog << "Running test case\n" + << std::setw(4) << test_case << std::endl; + } + auto test_case_state = benchmark.setup(exec, test_case); + auto test_case_str = benchmark.describe_config(test_case); + auto test_case_range = annotate(test_case_str.c_str()); + auto& benchmark_case = test_case[benchmark.get_name()]; + for (const auto& operation_name : benchmark.get_operations()) { + if (benchmark_case.contains(operation_name) && + !FLAGS_overwrite) { + continue; + } + benchmark_case[operation_name] = json::object(); + if (benchmark.should_print()) { + std::clog << "\tRunning " << benchmark.get_name() << ": " + << operation_name << std::endl; + } + auto& operation_case = benchmark_case[operation_name]; + try { + auto operation_range = annotate(operation_name.c_str()); + benchmark.run(exec, timer, test_case_state, operation_name, + operation_case); + operation_case["completed"] = true; + } catch (const std::exception& e) { + operation_case["completed"] = false; + operation_case["error_type"] = + gko::name_demangling::get_dynamic_type(e); + operation_case["error"] = e.what(); + std::cerr << "Error when processing test case\n" + << std::setw(4) << test_case << "\n" + << "what(): " << e.what() << std::endl; + } + + if (benchmark.should_print()) { + backup_results(test_cases); + } + } + 
benchmark.postprocess(test_case); + } catch (const std::exception& e) { + std::cerr << "Error setting up benchmark, what(): " << e.what() + << std::endl; + test_case["error_type"] = gko::name_demangling::get_dynamic_type(e); + test_case["error"] = e.what(); + } + } + + if (profiler_hook) { + exec->remove_logger(profiler_hook); + } +} + + +#endif // GKO_BENCHMARK_UTILS_RUNNER_HPP_ diff --git a/benchmark/utils/spmv_validation.hpp b/benchmark/utils/spmv_validation.hpp deleted file mode 100644 index 83ea2085ec2..00000000000 --- a/benchmark/utils/spmv_validation.hpp +++ /dev/null @@ -1,83 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ -#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ - - -#include - - -#include -#include - - -#include - - -std::string example_config = R"( - [ - {"filename": "my_file.mtx"}, - {"filename": "my_file2.mtx"}, - {"size": 100, "stencil": "7pt"}, - ] -)"; - - -/** - * Function which outputs the input format for benchmarks similar to the spmv. - */ -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); -} - - -/** - * Validates whether the input format is correct for spmv-like benchmarks. - * - * @param value the JSON value to test. 
- */ -void validate_option_object(const rapidjson::Value& value) -{ - if (!value.IsObject() || - !((value.HasMember("size") && value.HasMember("stencil") && - value["size"].IsInt64() && value["stencil"].IsString()) || - (value.HasMember("filename") && value["filename"].IsString()))) { - print_config_error_and_exit(); - } -} - - -#endif // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index a54d4d506ee..828f95bc8ca 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -14,8 +14,8 @@ if(GINKGO_BUILD_BENCHMARKS) if (NOT gflags_FOUND) add_subdirectory(gflags) endif() - if (NOT RapidJSON_FOUND) - add_subdirectory(rapidjson) + if (NOT nlohmann_json_FOUND) + add_subdirectory(nlohmann_json) endif() endif() diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt new file mode 100644 index 00000000000..77064c66c40 --- /dev/null +++ b/third_party/nlohmann_json/CMakeLists.txt @@ -0,0 +1,9 @@ +message(STATUS "Fetching external nlohmann_json") +include(FetchContent) +FetchContent_Declare( + nlohmann_json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d +) +set(JSON_BuildTests OFF CACHE INTERNAL "") +FetchContent_MakeAvailable(nlohmann_json) diff --git a/third_party/rapidjson/CMakeLists.txt b/third_party/rapidjson/CMakeLists.txt deleted file mode 100644 index a96b90cb882..00000000000 --- a/third_party/rapidjson/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -message(STATUS "Fetching external RapidJSON") -include(FetchContent) -FetchContent_Declare( - rapidjson - GIT_REPOSITORY https://github.com/Tencent/rapidjson.git - GIT_TAG 27c3a8dc0e2c9218fe94986d249a12b5ed838f1d -) -FetchContent_GetProperties(rapidjson) -if(NOT rapidjson_POPULATED) - FetchContent_Populate(rapidjson) -endif() -set(RapidJSON_INCLUDE_DIR "${rapidjson_SOURCE_DIR}/include") -add_library(rapidjson INTERFACE) -set_target_properties(rapidjson PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${RapidJSON_INCLUDE_DIR}") From d25a7573a6fd071e74a6f4e81028615c59cb2ab1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Jul 2023 23:45:43 +0200 Subject: [PATCH 245/583] add distributed tests again This reverts commit 0dab7626e920bfdf32a2285ff5741da1e36404cb. 
Additionally replaces the JSON test case output by their description --- benchmark/test/CMakeLists.txt | 4 +- benchmark/test/input.distributed_mtx.json | 7 + benchmark/test/multi_vector_distributed.py | 38 ++ benchmark/test/reference/blas.profile.stderr | 6 +- benchmark/test/reference/blas.simple.stderr | 6 +- .../test/reference/conversion.all.stderr | 7 +- .../test/reference/conversion.profile.stderr | 7 +- .../test/reference/conversion.simple.stderr | 7 +- .../distributed_solver.profile.stderr | 11 +- .../distributed_solver.simple.stderr | 11 +- .../reference/matrix_statistics.simple.stderr | 7 +- .../multi_vector_distributed.profile.stderr | 254 ++++++++++ .../multi_vector_distributed.profile.stdout | 29 ++ .../multi_vector_distributed.simple.stderr | 10 + .../multi_vector_distributed.simple.stdout | 29 ++ .../reference/preconditioner.profile.stderr | 7 +- .../reference/preconditioner.simple.stderr | 7 +- .../test/reference/solver.profile.stderr | 10 +- benchmark/test/reference/solver.simple.stderr | 10 +- .../test/reference/sparse_blas.profile.stderr | 7 +- .../test/reference/sparse_blas.simple.stderr | 7 +- benchmark/test/reference/spmv.profile.stderr | 7 +- benchmark/test/reference/spmv.simple.stderr | 7 +- .../reference/spmv_distributed.profile.stderr | 446 ++++++++++++++++++ .../reference/spmv_distributed.profile.stdout | 22 + .../reference/spmv_distributed.simple.stderr | 10 + .../reference/spmv_distributed.simple.stdout | 23 + benchmark/test/spmv_distributed.py | 42 ++ benchmark/test/test_framework.py.in | 2 +- benchmark/utils/general.hpp | 39 -- benchmark/utils/runner.hpp | 10 +- 31 files changed, 935 insertions(+), 154 deletions(-) create mode 100644 benchmark/test/input.distributed_mtx.json create mode 100644 benchmark/test/multi_vector_distributed.py create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stderr create mode 100644 benchmark/test/reference/multi_vector_distributed.profile.stdout create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stderr create mode 100644 benchmark/test/reference/multi_vector_distributed.simple.stdout create mode 100644 benchmark/test/reference/spmv_distributed.profile.stderr create mode 100644 benchmark/test/reference/spmv_distributed.profile.stdout create mode 100644 benchmark/test/reference/spmv_distributed.simple.stderr create mode 100644 benchmark/test/reference/spmv_distributed.simple.stdout create mode 100644 benchmark/test/spmv_distributed.py diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt index e1aab6dd75d..1cd589927fa 100644 --- a/benchmark/test/CMakeLists.txt +++ b/benchmark/test/CMakeLists.txt @@ -22,5 +22,7 @@ add_benchmark_test(solver) add_benchmark_test(sparse_blas) add_benchmark_test(spmv) if (GINKGO_BUILD_MPI) + add_benchmark_test(multi_vector_distributed) + add_benchmark_test(spmv_distributed) add_benchmark_test(solver_distributed) -endif() +endif() \ No newline at end of file diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json new file mode 100644 index 00000000000..aca115179e6 --- /dev/null +++ b/benchmark/test/input.distributed_mtx.json @@ -0,0 +1,7 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil" + } +] \ No newline at end of file diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py new file mode 100644 index 00000000000..1e0c4c8adf5 --- /dev/null +++ b/benchmark/test/multi_vector_distributed.py @@ -0,0 +1,38 @@ +#!/usr/bin/env 
python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output_distributed( + ["-input", '[{"n": 100}]'], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + num_procs=3, +) + +# stdin +test_framework.compare_output_distributed( + [], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) + +# file +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) + +# profiler annotations +test_framework.compare_output_distributed( + ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"], + expected_stdout="multi_vector_distributed.profile.stdout", + expected_stderr="multi_vector_distributed.profile.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index b64f4321287..1313c85e462 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -4,11 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal -Running test case -{ - "n": 100, - "blas": {} -} +Running test case n = 100 DEBUG: begin n = 100 Running blas: copy DEBUG: begin copy diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index f41b25c6ee1..966ed597166 100644 --- a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -4,11 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal -Running test case -{ - "n": 100, - "blas": {} -} +Running test case n = 100 Running blas: copy Running blas: axpy Running blas: scal diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index 1d5df7477ba..77ff50a1b89 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -4,12 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr,ell,sellp,hybrid -Running test case -{ - "size": 100, - "stencil": "7pt", - "conversion": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 Running conversion: coo-read Running conversion: coo-csr diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 089e6be02f9..6078dd3db2f 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -4,12 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Running test case -{ - "size": 100, - "stencil": "7pt", - "conversion": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 DEBUG: begin stencil(100,7pt) Running conversion: coo-read diff --git a/benchmark/test/reference/conversion.simple.stderr 
b/benchmark/test/reference/conversion.simple.stderr index a814dba6888..9b51effac09 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -4,12 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Running test case -{ - "size": 100, - "stencil": "7pt", - "conversion": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 Running conversion: coo-read Running conversion: coo-csr diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index e583a1411a8..1daab773a38 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -5,16 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "comm_pattern": "stencil", - "optimal": { - "spmv": "csr-csr" - }, - "solver": {} -} +Running test case stencil(100,7pt,stencil) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr index 9feb7fa9522..607081a3949 100644 --- a/benchmark/test/reference/distributed_solver.simple.stderr +++ b/benchmark/test/reference/distributed_solver.simple.stderr @@ -5,15 +5,6 @@ Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "comm_pattern": "stencil", - "optimal": { - "spmv": "csr-csr" - }, - "solver": {} -} +Running test case stencil(100,7pt,stencil) Matrix is of size (125, 125) Running solver: cg diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr index 75a7cca709f..d02edbc44da 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stderr +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -1,9 +1,4 @@ This is Ginkgo 1.7.0 (develop) running with core module 1.7.0 (develop) -Running test case -{ - "size": 100, - "stencil": "7pt", - "problem": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr new file mode 100644 index 00000000000..a77484daacb --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -0,0 +1,254 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 +DEBUG: begin n = 100 + Running blas: copy +DEBUG: begin copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end 
allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end copy + Running blas: axpy +DEBUG: begin axpy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate 
+DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end axpy + Running blas: scal +DEBUG: begin scal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::scale +DEBUG: end dense::scale +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end scal +DEBUG: end n = 100 diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout new file mode 100644 index 00000000000..3a2e7e54f80 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr new file mode 100644 index 00000000000..966ed597166 --- /dev/null +++ 
b/benchmark/test/reference/multi_vector_distributed.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 + Running blas: copy + Running blas: axpy + Running blas: scal diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout new file mode 100644 index 00000000000..08e692727fe --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout @@ -0,0 +1,29 @@ + +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index c215b22c925..def3a83993d 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -4,12 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running with preconditioners: none -Running test case -{ - "size": 100, - "stencil": "7pt", - "preconditioner": {} -} +Running test case stencil(100,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr index 07d2cca6704..0090e180d2b 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -4,11 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none -Running test case -{ - "size": 100, - "stencil": "7pt", - "preconditioner": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 Running preconditioner: none diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 0c3f7060796..43ff852f68e 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -5,15 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "optimal": { - "spmv": "csr" - }, - "solver": {} -} +Running test case stencil(100,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr index c5e4267a6bd..659dd026588 100644 --- a/benchmark/test/reference/solver.simple.stderr +++ b/benchmark/test/reference/solver.simple.stderr @@ -5,14 +5,6 @@ Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case 
-{ - "size": 100, - "stencil": "7pt", - "optimal": { - "spmv": "csr" - }, - "solver": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125) Running solver: cg diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index d1434dad146..c47ce2a515b 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -4,12 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are transpose -Running test case -{ - "size": 100, - "stencil": "7pt", - "sparse_blas": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 DEBUG: begin allocate DEBUG: end allocate diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index 452374a9268..1f2bb34809f 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -4,11 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transpose -Running test case -{ - "size": 100, - "stencil": "7pt", - "sparse_blas": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 Running sparse_blas: transpose diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 09a10b725ea..4ff0125782f 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -5,12 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "spmv": {} -} +Running test case stencil(100,7pt) DEBUG: begin allocate DEBUG: end allocate DEBUG: begin allocate diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr index a910512ff31..9d5047febb6 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -5,11 +5,6 @@ Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 -Running test case -{ - "size": 100, - "stencil": "7pt", - "spmv": {} -} +Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 Running spmv: coo diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr new file mode 100644 index 00000000000..95a07c8275c --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -0,0 +1,446 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +Running test case stencil(100,7pt,stencil) +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: 
end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +Matrix is of size (81, 81), 144 +DEBUG: begin stencil(100,7pt,stencil) + Running spmv: csr-csr +DEBUG: begin csr-csr +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin 
partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate 
+DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin copy() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin apply() +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free +DEBUG: end csr-csr +DEBUG: end stencil(100,7pt,stencil) +DEBUG: begin free +DEBUG: end free +DEBUG: begin free +DEBUG: end free diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout new file mode 100644 index 00000000000..ebacddb887c --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -0,0 +1,22 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 6420, + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 81, + "cols": 81, + "nonzeros": 144, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr new file mode 100644 index 00000000000..0df742d5b9b --- /dev/null +++ 
b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +Running test case stencil(100,7pt,stencil) +Matrix is of size (81, 81), 144 + Running spmv: csr-csr diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout new file mode 100644 index 00000000000..64203476f91 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -0,0 +1,23 @@ + +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 6420, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 81, + "cols": 81, + "nonzeros": 144, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py new file mode 100644 index 00000000000..356db48459e --- /dev/null +++ b/benchmark/test/spmv_distributed.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output_distributed( + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, +) + +# stdin +test_framework.compare_output_distributed( + [], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', +) + +# input file +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, +) + +# profiler annotations +test_framework.compare_output_distributed( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="spmv_distributed.profile.stdout", + expected_stderr="spmv_distributed.profile.stderr", + num_procs=3, +) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index da1b0bfd618..faf898a21cb 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -247,7 +247,7 @@ def compare_output( def compare_output_distributed( args, expected_stdout, expected_stderr, num_procs, stdin="" ): - compare_output( + compare_output_impl( args, expected_stdout, expected_stderr, diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 41acb560ba1..1c48680f883 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -245,45 +245,6 @@ std::shared_ptr create_profiler_hook( } -struct owning_profiling_scope_guard { - std::string name; - gko::log::profiling_scope_guard guard; - - owning_profiling_scope_guard() = default; - - owning_profiling_scope_guard(std::string name_, - gko::log::ProfilerHook* profiler_hook) - : name(std::move(name_)), guard{profiler_hook->user_range(name.c_str())} - {} -}; - - -struct annotate_functor { - owning_profiling_scope_guard operator()(std::string name) const - { - if (profiler_hook) { - return 
owning_profiling_scope_guard{std::move(name), - profiler_hook.get()}; - } - return {}; - } - - gko::log::profiling_scope_guard operator()(const char* name) const - { - if (profiler_hook) { - return profiler_hook->user_range(name); - } - return {}; - } - - annotate_functor(std::shared_ptr profiler_hook) - : profiler_hook{std::move(profiler_hook)} - {} - - std::shared_ptr profiler_hook; -}; - - // Returns a random number engine std::default_random_engine& get_engine() { diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp index 3520f7299ee..661c403706f 100644 --- a/benchmark/utils/runner.hpp +++ b/benchmark/utils/runner.hpp @@ -153,13 +153,13 @@ void run_test_cases(const Benchmark& benchmark, if (!test_case.contains(benchmark.get_name())) { test_case[benchmark.get_name()] = json::object(); } + auto test_case_desc = benchmark.describe_config(test_case); if (benchmark.should_print()) { - std::clog << "Running test case\n" - << std::setw(4) << test_case << std::endl; + std::clog << "Running test case " << test_case_desc + << std::endl; } auto test_case_state = benchmark.setup(exec, test_case); - auto test_case_str = benchmark.describe_config(test_case); - auto test_case_range = annotate(test_case_str.c_str()); + auto test_case_range = annotate(test_case_desc.c_str()); auto& benchmark_case = test_case[benchmark.get_name()]; for (const auto& operation_name : benchmark.get_operations()) { if (benchmark_case.contains(operation_name) && @@ -183,7 +183,7 @@ void run_test_cases(const Benchmark& benchmark, gko::name_demangling::get_dynamic_type(e); operation_case["error"] = e.what(); std::cerr << "Error when processing test case\n" - << std::setw(4) << test_case << "\n" + << test_case_desc << "\n" << "what(): " << e.what() << std::endl; } From 11134cdb9e240fc16ffe3cd7276a35cef0652a39 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Jul 2023 23:44:32 +0200 Subject: [PATCH 246/583] handle JSON and non-JSON test output separately --- benchmark/test/reference/blas.profile.stdout | 3 +- benchmark/test/reference/blas.simple.stdout | 3 +- .../test/reference/conversion.all.stdout | 3 +- .../test/reference/conversion.profile.stdout | 3 +- .../test/reference/conversion.simple.stdout | 3 +- .../distributed_solver.profile.stdout | 3 +- .../distributed_solver.simple.stdout | 3 +- .../reference/matrix_statistics.simple.stdout | 3 +- .../multi_vector_distributed.profile.stdout | 3 +- .../multi_vector_distributed.simple.stdout | 3 +- .../reference/preconditioner.profile.stdout | 3 +- .../reference/preconditioner.simple.stdout | 3 +- .../test/reference/solver.profile.stdout | 3 +- benchmark/test/reference/solver.simple.stdout | 3 +- .../test/reference/sparse_blas.profile.stdout | 3 +- .../test/reference/sparse_blas.simple.stdout | 3 +- benchmark/test/reference/spmv.profile.stdout | 3 +- benchmark/test/reference/spmv.simple.stdout | 3 +- .../reference/spmv_distributed.profile.stdout | 3 +- .../reference/spmv_distributed.simple.stdout | 3 +- benchmark/test/test_framework.py.in | 78 ++++++------------- 21 files changed, 44 insertions(+), 94 deletions(-) diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout index 3a2e7e54f80..8998d5eaed7 100644 --- a/benchmark/test/reference/blas.profile.stdout +++ b/benchmark/test/reference/blas.profile.stdout @@ -1,4 +1,3 @@ - [ { "n": 100, @@ -26,4 +25,4 @@ } } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout index 
08e692727fe..a586a9bc57b 100644 --- a/benchmark/test/reference/blas.simple.stdout +++ b/benchmark/test/reference/blas.simple.stdout @@ -1,4 +1,3 @@ - [ { "n": 100, @@ -26,4 +25,4 @@ } } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout index c4b657a42c4..0c77d464793 100644 --- a/benchmark/test/reference/conversion.all.stdout +++ b/benchmark/test/reference/conversion.all.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -74,4 +73,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout index b29815f6c17..a9c3ea674fa 100644 --- a/benchmark/test/reference/conversion.profile.stdout +++ b/benchmark/test/reference/conversion.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -29,4 +28,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout index 856f1330eea..81c735789d1 100644 --- a/benchmark/test/reference/conversion.simple.stdout +++ b/benchmark/test/reference/conversion.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -29,4 +28,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout index aef92652256..55dfb1dc428 100644 --- a/benchmark/test/reference/distributed_solver.profile.stdout +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -31,4 +30,4 @@ "rows": 125, "cols": 125 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout index 002b9d91347..eed8d864388 100644 --- a/benchmark/test/reference/distributed_solver.simple.stdout +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -57,4 +56,4 @@ "rows": 125, "cols": 125 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout index 13746ce8a46..923bbc9f962 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stdout +++ b/benchmark/test/reference/matrix_statistics.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -38,4 +37,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout index 3a2e7e54f80..8998d5eaed7 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stdout +++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout @@ -1,4 +1,3 @@ - [ { "n": 100, @@ -26,4 +25,4 @@ } } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout index 08e692727fe..a586a9bc57b 100644 --- a/benchmark/test/reference/multi_vector_distributed.simple.stdout +++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout @@ -1,4 +1,3 @@ - [ { "n": 100, @@ -26,4 +25,4 @@ } } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout 
index f53407d818d..e33a6502eea 100644 --- a/benchmark/test/reference/preconditioner.profile.stdout +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -22,4 +21,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout index 92bb51ddb57..06291228a1c 100644 --- a/benchmark/test/reference/preconditioner.simple.stdout +++ b/benchmark/test/reference/preconditioner.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -30,4 +29,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout index 0148e6ef092..906c74de5e7 100644 --- a/benchmark/test/reference/solver.profile.stdout +++ b/benchmark/test/reference/solver.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -30,4 +29,4 @@ "rows": 125, "cols": 125 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout index b4e7b56b2bf..5d127fe4b78 100644 --- a/benchmark/test/reference/solver.simple.stdout +++ b/benchmark/test/reference/solver.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -54,4 +53,4 @@ "rows": 125, "cols": 125 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout index 848fb503ed4..e9d48fde23d 100644 --- a/benchmark/test/reference/sparse_blas.profile.stdout +++ b/benchmark/test/reference/sparse_blas.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -16,4 +15,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout index f39300ca35b..3cc5f774ebf 100644 --- a/benchmark/test/reference/sparse_blas.simple.stdout +++ b/benchmark/test/reference/sparse_blas.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -23,4 +22,4 @@ "cols": 125, "nonzeros": 725 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout index 5302d54f9f0..409a92d4e33 100644 --- a/benchmark/test/reference/spmv.profile.stdout +++ b/benchmark/test/reference/spmv.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -18,4 +17,4 @@ "spmv": "coo" } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout index 737938d7c96..9601a15b331 100644 --- a/benchmark/test/reference/spmv.simple.stdout +++ b/benchmark/test/reference/spmv.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -19,4 +18,4 @@ "spmv": "coo" } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout index ebacddb887c..8de6a68ae8a 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stdout +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -19,4 +18,4 @@ "spmv": "csr-csr" } } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout index 64203476f91..f94e4b992a1 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stdout +++ 
b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -1,4 +1,3 @@ - [ { "size": 100, @@ -20,4 +19,4 @@ "spmv": "csr-csr" } } -] +] \ No newline at end of file diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index faf898a21cb..3deb282297a 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -22,7 +22,8 @@ denumberify_paths = [ "rhs_norm", "max_relative_norm2", ] -empty_string_paths = ["error", "filename"] +detypenameify_key_starts = ["generate(", "apply(", "advanced_apply(", "copy(", "check("] +empty_string_paths = ["filename"] empty_array_paths = [ "recurrent_residuals", "true_residuals", @@ -31,6 +32,18 @@ empty_array_paths = [ ] +def sanitize_json_key(key: str): + """Applies sanitation to a single key. + + Strings that start with a name in detypenameify_key_starts will be truncated + """ + + for start in detypenameify_key_starts: + if key.startswith(start): + return start + ")" + return key + + def sanitize_json_key_value(key: str, value, sanitize_all: bool): """Applies sanitation to a single key-value pair. @@ -59,7 +72,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False): if isinstance(parsed_input, dict): return { - key: sanitize_json_key_value(key, value, sanitize_all) + sanitize_json_key(key): sanitize_json_key_value(key, value, sanitize_all) for key, value in parsed_input.items() } elif isinstance(parsed_input, list): @@ -70,40 +83,15 @@ def sanitize_json(parsed_input, sanitize_all: bool = False): return parsed_input -def sanitize_json_in_text(lines: List[str]) -> List[str]: - """Sanitizes all occurrences of JSON content inside text input. +def determinize_json_text(input: str) -> List[str]: + """Sanitizes the given input JSON string. - Takes a list of text lines and detects any pretty-printed JSON output inside - (recognized by a single [, {, } or ] in an otherwise empty line). - The JSON output will be parsed and sanitized through sanitize_json(...) + The JSON values will be parsed and sanitized through sanitize_json(...) and pretty-printed to replace the original JSON input. - The function returns the resulting output. """ - json_begins = [i for i, l in enumerate(lines) if l in ["[", "{"]] - json_ends = [i + 1 for i, l in enumerate(lines) if l in ["]", "}"]] - json_pairs = list(zip(json_begins, json_ends)) - if len(json_pairs) == 0: - return lines - assert all(begin < end for begin, end in json_pairs) - nonjson_pairs = ( - [(0, json_begins[0])] - + list(zip(json_ends[:-1], json_begins[1:])) - + [(json_ends[-1], len(lines))] - ) - combined_pairs = sorted( - [(begin, end, False) for begin, end in nonjson_pairs] - + [(begin, end, True) for begin, end in json_pairs] - ) - texts = [ - ("\n".join(lines[begin:end]), do_sanitize) - for begin, end, do_sanitize in combined_pairs - ] - reconstructed = [ - json.dumps(sanitize_json(json.loads(t)), indent=4) if do_sanitize else t - for t, do_sanitize in texts - ] - return "\n".join(reconstructed).split("\n") + result = json.dumps(sanitize_json(json.loads(input)), indent=4) + return result.splitlines() def determinize_text( @@ -116,9 +104,6 @@ def determinize_text( Every input line matching an entry from ignore_patterns will be removed. Every line matching the first string in an entry from replace_patterns will be replaced by the second string. - Finally, the text will be passed to sanitize_json_in_text, which removes - nondeterministic parts from JSON objects/arrays in the input, - if it can be parsed correctly. 
The output is guaranteed to end with an empty line. """ @@ -137,10 +122,7 @@ def determinize_text( output_lines.append(line) if len(output_lines) == 0 or output_lines[-1] != "": output_lines.append("") - try: - return sanitize_json_in_text(output_lines) - except json.decoder.JSONDecodeError: - return output_lines + return output_lines def compare_output_impl( @@ -173,13 +155,7 @@ def compare_output_impl( ] if generate: open(expected_stdout, "w").write( - "\n".join( - determinize_text( - result.stdout.decode(), - ignore_patterns=[], - replace_patterns=typename_patterns, - ) - ) + "\n".join(determinize_json_text(result.stdout.decode())) ) open(expected_stderr, "w").write( "\n".join( @@ -192,19 +168,13 @@ def compare_output_impl( ) print("GENERATED") return - result_stdout_processed = determinize_text( - result.stdout.decode(), ignore_patterns=[], replace_patterns=typename_patterns - ) + result_stdout_processed = determinize_json_text(result.stdout.decode()) result_stderr_processed = determinize_text( result.stderr.decode(), ignore_patterns=version_patterns, replace_patterns=typename_patterns, ) - expected_stdout_processed = determinize_text( - open(expected_stdout).read(), - ignore_patterns=[], - replace_patterns=typename_patterns, - ) + expected_stdout_processed = determinize_json_text(open(expected_stdout).read()) expected_stderr_processed = determinize_text( open(expected_stderr).read(), ignore_patterns=version_patterns, From 306792aa148ffa366e05369574fba70e19178f7f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 28 Jul 2023 00:15:00 +0200 Subject: [PATCH 247/583] benchmark reads on device_matrix_data --- benchmark/conversion/conversion.cpp | 11 +++-- .../test/reference/conversion.profile.stderr | 46 +++++++++++++++---- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp index b9a5d5c46d6..5f03cb2b933 100644 --- a/benchmark/conversion/conversion.cpp +++ b/benchmark/conversion/conversion.cpp @@ -60,7 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
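(Editorial aside, not part of the patch: the hunk below switches the conversion benchmark from host-side gko::matrix_data to gko::device_matrix_data, so the matrix is transferred to the executor once and every format then reads from the device-side data. A minimal stand-alone sketch of that pattern, assuming the create_from_host/read API shown in the hunk and using a hypothetical 2x2 matrix on the reference executor:

    // sketch only: assemble matrix_data on the host, move it to the executor once,
    // then let a format read from the device-side data, as the benchmark now does
    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto exec = gko::ReferenceExecutor::create();
        // small illustrative 2x2 matrix assembled on the host: {row, col, value}
        gko::matrix_data<double, int> host_data{{2, 2}, {{0, 0, 1.0}, {1, 1, 2.0}}};
        // single host-to-device transfer; subsequent reads stay on the executor
        auto device_data =
            gko::device_matrix_data<double, int>::create_from_host(exec, host_data);
        auto csr = gko::matrix::Csr<double, int>::create(exec);
        csr->read(device_data);
    }

This is why the reference stderr below gains copy events in place of the former components::aos_to_soa calls inside each *-read operation.)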
using Generator = DefaultSystemGenerator<>; -struct ConversionBenchmark : Benchmark> { +struct ConversionBenchmark : Benchmark> { std::string name; std::vector operations; @@ -112,8 +112,8 @@ struct ConversionBenchmark : Benchmark> { return Generator::describe_config(test_case); } - gko::matrix_data setup(std::shared_ptr exec, - json& test_case) const override + gko::device_matrix_data setup( + std::shared_ptr exec, json& test_case) const override { gko::matrix_data data; data = Generator::generate_matrix_data(test_case); @@ -122,12 +122,13 @@ struct ConversionBenchmark : Benchmark> { test_case["rows"] = data.size[0]; test_case["cols"] = data.size[1]; test_case["nonzeros"] = data.nonzeros.size(); - return data; + return gko::device_matrix_data::create_from_host(exec, + data); } void run(std::shared_ptr exec, std::shared_ptr timer, - gko::matrix_data& data, + gko::device_matrix_data& data, const std::string& operation_name, json& operation_case) const override { diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 6078dd3db2f..ca80375c5bf 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -6,17 +6,29 @@ The random seed for right hand sides is 42 The formats are coo,csr Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin allocate +DEBUG: end allocate +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa DEBUG: begin stencil(100,7pt) Running conversion: coo-read DEBUG: begin coo-read DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa +DEBUG: begin copy +DEBUG: end copy DEBUG: begin free DEBUG: end free DEBUG: begin free @@ -28,12 +40,16 @@ DEBUG: end coo-read DEBUG: begin coo-csr DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin components::fill_array @@ -75,12 +91,16 @@ DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free @@ -104,12 +124,16 @@ DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate -DEBUG: begin components::aos_to_soa -DEBUG: end components::aos_to_soa +DEBUG: begin copy +DEBUG: end copy DEBUG: begin allocate DEBUG: end allocate DEBUG: begin free @@ -146,3 +170,9 @@ DEBUG: begin free DEBUG: end free DEBUG: end csr-coo DEBUG: end stencil(100,7pt) +DEBUG: begin free +DEBUG: end free +DEBUG: begin 
free +DEBUG: end free +DEBUG: begin free +DEBUG: end free From 6a9e59de9c0e8726a01efc2ddc3645df5ce5c680 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Aug 2023 21:17:05 +0200 Subject: [PATCH 248/583] remove allocations from output they are sometimes implementation-dependent for libstdc++ types --- benchmark/test/reference/blas.profile.stderr | 28 -- .../test/reference/conversion.profile.stderr | 104 ------ .../distributed_solver.profile.stderr | 232 ------------- .../multi_vector_distributed.profile.stderr | 128 -------- .../reference/preconditioner.profile.stderr | 44 --- .../test/reference/solver.profile.stderr | 132 -------- .../test/reference/sparse_blas.profile.stderr | 36 -- benchmark/test/reference/spmv.profile.stderr | 48 --- .../reference/spmv_distributed.profile.stderr | 308 ------------------ benchmark/test/test_framework.py.in | 11 +- 10 files changed, 6 insertions(+), 1065 deletions(-) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index 1313c85e462..529fc16009c 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -8,27 +8,13 @@ Running test case n = 100 DEBUG: begin n = 100 Running blas: copy DEBUG: begin copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end copy Running blas: axpy DEBUG: begin axpy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill @@ -37,28 +23,14 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::add_scaled DEBUG: end dense::add_scaled -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end axpy Running blas: scal DEBUG: begin scal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::scale DEBUG: end dense::scale -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end scal DEBUG: end n = 100 diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index ca80375c5bf..a233579c721 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -6,173 +6,69 @@ The random seed for right hand sides is 42 The formats are coo,csr Running test case stencil(100,7pt) Matrix is of size (125, 125), 725 -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin stencil(100,7pt) Running conversion: coo-read DEBUG: begin coo-read -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end coo-read Running conversion: coo-csr DEBUG: begin coo-csr -DEBUG: begin 
allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end coo-csr Running conversion: csr-read DEBUG: begin csr-read -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end csr-read Running conversion: csr-coo DEBUG: begin csr-coo -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::convert_ptrs_to_idxs DEBUG: end components::convert_ptrs_to_idxs DEBUG: end copy() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end csr-coo DEBUG: end stencil(100,7pt) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 1daab773a38..4ea20730117 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -6,18 +6,8 @@ The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 Running test case stencil(100,7pt,stencil) -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin 
partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -32,18 +22,10 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin copy() @@ -56,8 +38,6 @@ DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin copy() @@ -70,135 +50,29 @@ DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: end distributed_matrix::build_local_nonlocal DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: 
end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -206,62 +80,30 @@ Matrix is of size (125, 125) DEBUG: begin stencil(100,7pt,stencil) Running solver: cg DEBUG: begin cg -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() -DEBUG: begin free -DEBUG: end free DEBUG: begin generate() DEBUG: begin generate() DEBUG: end generate() DEBUG: end generate() DEBUG: begin apply() DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize DEBUG: begin advanced_apply() @@ -276,20 +118,10 @@ DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin apply() DEBUG: begin copy() DEBUG: begin dense::copy @@ -586,25 +418,9 @@ DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm DEBUG: end check() DEBUG: end check() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end iteration DEBUG: end apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -620,59 +436,11 @@ DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_squared_norm2 DEBUG: end dense::compute_squared_norm2 DEBUG: begin dense::compute_sqrt DEBUG: end dense::compute_sqrt DEBUG: begin 
copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end cg DEBUG: end stencil(100,7pt,stencil) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr index a77484daacb..102330e38f4 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -8,18 +8,8 @@ Running test case n = 100 DEBUG: begin n = 100 Running blas: copy DEBUG: begin copy -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -34,32 +24,10 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -74,45 +42,17 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::copy DEBUG: end dense::copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end copy Running blas: axpy DEBUG: begin axpy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: 
begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -127,32 +67,10 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -167,20 +85,8 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill @@ -189,29 +95,11 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::add_scaled DEBUG: end dense::add_scaled -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end axpy Running blas: scal DEBUG: begin scal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -226,29 +114,13 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::scale DEBUG: end dense::scale -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end scal DEBUG: end n = 100 diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index def3a83993d..610dfe464ec 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -5,50 +5,20 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running with preconditioners: none Running test case stencil(100,7pt) -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate 
-DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data @@ -58,8 +28,6 @@ DEBUG: begin stencil(100,7pt) Running preconditioner: none DEBUG: begin none DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -71,17 +39,5 @@ DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() DEBUG: end apply() -DEBUG: begin free -DEBUG: end free DEBUG: end none DEBUG: end stencil(100,7pt) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 43ff852f68e..238591eb0c9 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -6,49 +6,19 @@ The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 Running test case stencil(100,7pt) -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -56,78 +26,36 @@ Matrix is of size (125, 125) DEBUG: begin stencil(100,7pt) Running solver: cg DEBUG: begin cg -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate 
DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() -DEBUG: begin free -DEBUG: end free DEBUG: begin generate() DEBUG: begin generate() DEBUG: end generate() DEBUG: end generate() DEBUG: begin apply() DEBUG: begin iteration -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin cg::initialize DEBUG: end cg::initialize DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin apply() DEBUG: begin copy() DEBUG: begin dense::copy @@ -352,25 +280,9 @@ DEBUG: begin residual_norm::residual_norm DEBUG: end residual_norm::residual_norm DEBUG: end check() DEBUG: end check() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end iteration DEBUG: end apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -378,53 +290,9 @@ DEBUG: begin advanced_apply() DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::compute_norm2_dispatch DEBUG: end dense::compute_norm2_dispatch DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end cg DEBUG: end stencil(100,7pt) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index c47ce2a515b..60cf41ccbae 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -6,54 +6,18 @@ The random seed for right hand sides is 42 The operations are transpose Running test case 
stencil(100,7pt) Matrix is of size (125, 125), 725 -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free DEBUG: begin stencil(100,7pt) Running sparse_blas: transpose DEBUG: begin transpose -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin csr::transpose DEBUG: end csr::transpose -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end transpose DEBUG: end stencil(100,7pt) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 4ff0125782f..2299614c6c4 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -6,61 +6,25 @@ The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 Running test case stencil(100,7pt) -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free Matrix is of size (125, 125), 725 DEBUG: begin stencil(100,7pt) Running spmv: coo DEBUG: begin coo -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() @@ -68,17 +32,5 @@ DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv DEBUG: end apply() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end coo DEBUG: end stencil(100,7pt) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index 
95a07c8275c..b44cef7f3f6 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -6,18 +6,8 @@ The random seed for right hand sides is 42 The formats are [csr]x[csr] The number of right hand sides is 1 Running test case stencil(100,7pt,stencil) -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -32,50 +22,16 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -90,54 +46,20 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: end copy -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free Matrix is of size (81, 81), 144 DEBUG: begin stencil(100,7pt,stencil) Running spmv: csr-csr DEBUG: begin csr-csr -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::fill_array @@ -152,18 +74,10 @@ DEBUG: begin partition::build_starting_indices DEBUG: end partition::build_starting_indices DEBUG: begin copy DEBUG: 
end copy -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin copy() @@ -176,8 +90,6 @@ DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin copy() @@ -190,219 +102,27 @@ DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: end copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::fill DEBUG: end dense::fill -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: 
begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end distributed_matrix::build_local_nonlocal DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin free -DEBUG: end free DEBUG: begin copy DEBUG: end copy -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: begin copy() -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() DEBUG: begin apply() -DEBUG: begin allocate -DEBUG: end allocate -DEBUG: begin allocate -DEBUG: end allocate DEBUG: begin dense::row_gather DEBUG: end dense::row_gather DEBUG: begin apply() @@ -414,33 +134,5 @@ DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() DEBUG: end apply() -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free DEBUG: end csr-csr DEBUG: end stencil(100,7pt,stencil) -DEBUG: begin free -DEBUG: end free -DEBUG: begin free -DEBUG: end free diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 3deb282297a..014d3cb41a5 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -146,8 +146,9 @@ def compare_output_impl( " ".join(["'{}'".format(arg) for arg in launcher_flags + args]) ) ) - version_patterns = [ - " the .* module is", + ignore_patterns = [ + " the .* module is", # version numbers + "DEBUG: (begin|end ) (allocate|free)", # allocations ] typename_patterns = [ ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), @@ -161,7 +162,7 @@ def compare_output_impl( "\n".join( determinize_text( result.stderr.decode(), - ignore_patterns=version_patterns, + ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, ) ) @@ -171,13 +172,13 @@ def compare_output_impl( result_stdout_processed = determinize_json_text(result.stdout.decode()) result_stderr_processed = determinize_text( result.stderr.decode(), - ignore_patterns=version_patterns, + ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, ) expected_stdout_processed = 
determinize_json_text(open(expected_stdout).read()) expected_stderr_processed = determinize_text( open(expected_stderr).read(), - ignore_patterns=version_patterns, + ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, ) failed = False From 3e0da3b3a580b185f1e0f335bc22db9693631620 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Aug 2023 21:32:01 +0200 Subject: [PATCH 249/583] update matrix outputs --- benchmark/test/reference/conversion.matrix.stderr | 6 +----- benchmark/test/reference/conversion.matrix.stdout | 3 +-- .../test/reference/distributed_solver.matrix.stderr | 9 +-------- .../test/reference/distributed_solver.matrix.stdout | 3 +-- benchmark/test/reference/matrix_statistics.matrix.stderr | 6 +----- benchmark/test/reference/matrix_statistics.matrix.stdout | 3 +-- benchmark/test/reference/preconditioner.matrix.stderr | 6 +----- benchmark/test/reference/preconditioner.matrix.stdout | 3 +-- benchmark/test/reference/solver.matrix.stderr | 9 +-------- benchmark/test/reference/solver.matrix.stdout | 3 +-- benchmark/test/reference/sparse_blas.matrix.stderr | 6 +----- benchmark/test/reference/sparse_blas.matrix.stdout | 3 +-- benchmark/test/reference/spmv.matrix.stderr | 6 +----- benchmark/test/reference/spmv.matrix.stdout | 3 +-- benchmark/test/test_framework.py.in | 1 + 15 files changed, 15 insertions(+), 55 deletions(-) diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr index 369a363a53e..5e7bd1cce24 100644 --- a/benchmark/test/reference/conversion.matrix.stderr +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -4,11 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Running test case -{ - "filename": "", - "conversion": {} -} +Running test case Matrix is of size (36, 36), 208 Running conversion: coo-read Running conversion: coo-csr diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout index 7e537fa4919..7f27b0c25b3 100644 --- a/benchmark/test/reference/conversion.matrix.stdout +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -28,4 +27,4 @@ "cols": 36, "nonzeros": 208 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr index 4f0c6b22edd..cd2bb49261c 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stderr +++ b/benchmark/test/reference/distributed_solver.matrix.stderr @@ -5,13 +5,6 @@ Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case -{ - "filename": "", - "optimal": { - "spmv": "csr-csr" - }, - "solver": {} -} +Running test case Matrix is of size (36, 36) Running solver: cg diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout index cd3c7b8bd43..ec1d258e2f4 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stdout +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -55,4 +54,4 @@ "rows": 36, "cols": 36 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr 
b/benchmark/test/reference/matrix_statistics.matrix.stderr index 7bb33842f25..0b31ef3a888 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stderr +++ b/benchmark/test/reference/matrix_statistics.matrix.stderr @@ -1,8 +1,4 @@ This is Ginkgo 1.7.0 (develop) running with core module 1.7.0 (develop) -Running test case -{ - "filename": "", - "problem": {} -} +Running test case Matrix is of size (36, 36), 208 diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout index ea73587fde4..a6297e89b66 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stdout +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -37,4 +36,4 @@ "cols": 36, "nonzeros": 208 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr index 4088a20c925..7452ab91b3a 100644 --- a/benchmark/test/reference/preconditioner.matrix.stderr +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -4,10 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none -Running test case -{ - "filename": "", - "preconditioner": {} -} +Running test case Matrix is of size (36, 36), 208 Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout index 0415a87ea8d..51adb7383c3 100644 --- a/benchmark/test/reference/preconditioner.matrix.stdout +++ b/benchmark/test/reference/preconditioner.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -29,4 +28,4 @@ "cols": 36, "nonzeros": 208 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr index 8a1ea117314..cd2bb49261c 100644 --- a/benchmark/test/reference/solver.matrix.stderr +++ b/benchmark/test/reference/solver.matrix.stderr @@ -5,13 +5,6 @@ Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case -{ - "filename": "", - "optimal": { - "spmv": "csr" - }, - "solver": {} -} +Running test case Matrix is of size (36, 36) Running solver: cg diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout index 56577288c2d..a87e78f7f66 100644 --- a/benchmark/test/reference/solver.matrix.stdout +++ b/benchmark/test/reference/solver.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -53,4 +52,4 @@ "rows": 36, "cols": 36 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr index ff52b6a3269..483429fd71d 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stderr +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -4,10 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transpose -Running test case -{ - "filename": "", - "sparse_blas": {} -} +Running test case Matrix is of size (36, 36), 208 Running sparse_blas: transpose diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout index 
4a64c8ea1ce..74fdbf98e7a 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stdout +++ b/benchmark/test/reference/sparse_blas.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -22,4 +21,4 @@ "cols": 36, "nonzeros": 208 } -] +] \ No newline at end of file diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr index a618da5b321..45beba6cafb 100644 --- a/benchmark/test/reference/spmv.matrix.stderr +++ b/benchmark/test/reference/spmv.matrix.stderr @@ -5,10 +5,6 @@ Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 -Running test case -{ - "filename": "", - "spmv": {} -} +Running test case Matrix is of size (36, 36), 208 Running spmv: coo diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout index dc30ab6b284..4d03ce3cd07 100644 --- a/benchmark/test/reference/spmv.matrix.stdout +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -1,4 +1,3 @@ - [ { "filename": "", @@ -18,4 +17,4 @@ "spmv": "coo" } } -] +] \ No newline at end of file diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 014d3cb41a5..6e3092bde6c 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -153,6 +153,7 @@ def compare_output_impl( typename_patterns = [ ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), ("what\\(\\): .*", "what(): "), + (re.escape(str(matrixpath)), ""), ] if generate: open(expected_stdout, "w").write( From 8ed05956a03696c59f0568944db2768d2d9998ed Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 19 Aug 2023 11:27:22 +0200 Subject: [PATCH 250/583] review updates - rename 'determinize' -> 'sanitize' - use empty struct for empty benchmark state - use version tag instead of commit ID - use std::endl where appropriate Co-authored-by: Marcel Koch --- .../matrix_statistics/matrix_statistics.cpp | 5 +++- benchmark/test/test_framework.py.in | 30 +++++++++++-------- benchmark/utils/general.hpp | 2 +- third_party/nlohmann_json/CMakeLists.txt | 2 +- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 40c505c7627..4bb63032550 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -149,7 +149,10 @@ void extract_matrix_statistics(gko::matrix_data& data, using Generator = DefaultSystemGenerator; -struct MatrixStatistics : Benchmark { +struct empty_state {}; + + +struct MatrixStatistics : Benchmark { std::string name; std::vector empty; diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 6e3092bde6c..1a07818df1f 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -22,7 +22,8 @@ denumberify_paths = [ "rhs_norm", "max_relative_norm2", ] -detypenameify_key_starts = ["generate(", "apply(", "advanced_apply(", "copy(", "check("] +detypenameify_key_starts = [ + "generate(", "apply(", "advanced_apply(", "copy(", "check("] empty_string_paths = ["filename"] empty_array_paths = [ "recurrent_residuals", @@ -44,7 +45,7 @@ def sanitize_json_key(key: str): return key -def sanitize_json_key_value(key: str, value, sanitize_all: bool): +def sanitize_json_value(key: str, value, sanitize_all: bool): """Applies sanitation to a single key-value pair. 
Strings with a key in empty_string_paths will be emptied @@ -72,7 +73,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False): if isinstance(parsed_input, dict): return { - sanitize_json_key(key): sanitize_json_key_value(key, value, sanitize_all) + sanitize_json_key(key): sanitize_json_value(key, value, sanitize_all) for key, value in parsed_input.items() } elif isinstance(parsed_input, list): @@ -83,7 +84,7 @@ def sanitize_json(parsed_input, sanitize_all: bool = False): return parsed_input -def determinize_json_text(input: str) -> List[str]: +def sanitize_json_text(input: str) -> List[str]: """Sanitizes the given input JSON string. The JSON values will be parsed and sanitized through sanitize_json(...) @@ -94,7 +95,7 @@ def determinize_json_text(input: str) -> List[str]: return result.splitlines() -def determinize_text( +def sanitize_text( input: str, ignore_patterns: List[str], replace_patterns: List[Tuple[str, str]], @@ -157,11 +158,11 @@ def compare_output_impl( ] if generate: open(expected_stdout, "w").write( - "\n".join(determinize_json_text(result.stdout.decode())) + "\n".join(sanitize_json_text(result.stdout.decode())) ) open(expected_stderr, "w").write( "\n".join( - determinize_text( + sanitize_text( result.stderr.decode(), ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, @@ -170,14 +171,15 @@ def compare_output_impl( ) print("GENERATED") return - result_stdout_processed = determinize_json_text(result.stdout.decode()) - result_stderr_processed = determinize_text( + result_stdout_processed = sanitize_json_text(result.stdout.decode()) + result_stderr_processed = sanitize_text( result.stderr.decode(), ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, ) - expected_stdout_processed = determinize_json_text(open(expected_stdout).read()) - expected_stderr_processed = determinize_text( + expected_stdout_processed = sanitize_json_text( + open(expected_stdout).read()) + expected_stderr_processed = sanitize_text( open(expected_stderr).read(), ignore_patterns=ignore_patterns, replace_patterns=typename_patterns, @@ -187,7 +189,8 @@ def compare_output_impl( print("FAIL: stdout differs") print( "\n".join( - difflib.unified_diff(expected_stdout_processed, result_stdout_processed) + difflib.unified_diff( + expected_stdout_processed, result_stdout_processed) ) ) failed = True @@ -195,7 +198,8 @@ def compare_output_impl( print("FAIL: stderr differs") print( "\n".join( - difflib.unified_diff(expected_stderr_processed, result_stderr_processed) + difflib.unified_diff( + expected_stderr_processed, result_stderr_processed) ) ) failed = True diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 1c48680f883..550f6fe2720 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -216,7 +216,7 @@ void print_general_information(const std::string& extra) } std::clog << "The random seed for right hand sides is " << FLAGS_seed << '\n' - << extra << '\n'; + << extra << std::endl; } diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt index 77064c66c40..b95cfa5606a 100644 --- a/third_party/nlohmann_json/CMakeLists.txt +++ b/third_party/nlohmann_json/CMakeLists.txt @@ -3,7 +3,7 @@ include(FetchContent) FetchContent_Declare( nlohmann_json GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d + GIT_TAG v3.9.1 ) set(JSON_BuildTests OFF CACHE INTERNAL "") FetchContent_MakeAvailable(nlohmann_json) From 
5fe683d85624fe376569fd8b98ee762eef037add Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 19 Aug 2023 12:19:00 +0200 Subject: [PATCH 251/583] annotate repetitions --- benchmark/blas/blas_common.hpp | 17 +++++++----- benchmark/conversion/conversion.cpp | 26 +++++++++++++------ .../matrix_statistics/matrix_statistics.cpp | 9 ++++--- benchmark/preconditioner/preconditioner.cpp | 13 +++++++--- benchmark/solver/solver_common.hpp | 21 +++++++++------ benchmark/sparse_blas/sparse_blas.cpp | 17 +++++++----- benchmark/spmv/spmv_common.hpp | 16 +++++++----- benchmark/utils/general.hpp | 26 +++++++++++++++++++ benchmark/utils/runner.hpp | 16 ++++-------- 9 files changed, 109 insertions(+), 52 deletions(-) diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp index 88819a043b0..1267dc57c15 100644 --- a/benchmark/blas/blas_common.hpp +++ b/benchmark/blas/blas_common.hpp @@ -489,7 +489,8 @@ struct BlasBenchmark : Benchmark { void run(std::shared_ptr exec, std::shared_ptr timer, - dimensions& dims, const std::string& operation_name, + annotate_functor annotate, dimensions& dims, + const std::string& operation_name, json& operation_case) const override { auto op = operation_map.at(operation_name)(exec, dims); @@ -497,16 +498,20 @@ struct BlasBenchmark : Benchmark { IterationControl ic(timer); // warm run - for (auto _ : ic.warmup_run()) { - op->prepare(); - exec->synchronize(); - op->run(); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + op->prepare(); + exec->synchronize(); + op->run(); + exec->synchronize(); + } } // timed run op->prepare(); for (auto _ : ic.run()) { + auto range = annotate("repetition"); op->run(); } const auto runtime = ic.compute_time(FLAGS_timer_method); diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp index 5f03cb2b933..c777db1a35a 100644 --- a/benchmark/conversion/conversion.cpp +++ b/benchmark/conversion/conversion.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/iteration_control.hpp" @@ -128,6 +129,7 @@ struct ConversionBenchmark : Benchmark> { void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, gko::device_matrix_data& data, const std::string& operation_name, json& operation_case) const override @@ -142,13 +144,17 @@ struct ConversionBenchmark : Benchmark> { IterationControl ic{timer}; if (to_name == "read") { // warm run - for (auto _ : ic.warmup_run()) { - exec->synchronize(); - readable->read(data); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + readable->read(data); + exec->synchronize(); + } } // timed run for (auto _ : ic.run()) { + auto range = annotate("repetition"); readable->read(data); } } else { @@ -156,13 +162,17 @@ struct ConversionBenchmark : Benchmark> { auto mtx_to = formats::matrix_type_factory.at(to_name)(exec); // warm run - for (auto _ : ic.warmup_run()) { - exec->synchronize(); - mtx_to->copy_from(mtx_from); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + mtx_to->copy_from(mtx_from); + exec->synchronize(); + } } // timed run for (auto _ : ic.run()) { + auto range = annotate("repetition"); mtx_to->copy_from(mtx_from); } } diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 4bb63032550..20feecf5ccf 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -182,8 +182,8 @@ struct MatrixStatistics : Benchmark { return Generator::describe_config(test_case); } - int setup(std::shared_ptr exec, - json& test_case) const override + empty_state setup(std::shared_ptr exec, + json& test_case) const override { auto data = Generator::generate_matrix_data(test_case); std::clog << "Matrix is of size (" << data.size[0] << ", " @@ -193,12 +193,13 @@ struct MatrixStatistics : Benchmark { test_case["nonzeros"] = data.nonzeros.size(); extract_matrix_statistics(data, test_case["problem"]); - return 0; + return {}; } void run(std::shared_ptr exec, std::shared_ptr timer, - int& data, const std::string& operation_name, + annotate_functor annotate, empty_state& data, + const std::string& operation_name, json& operation_case) const override {} }; diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 7c130328d34..98f116f9b12 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/iteration_control.hpp" @@ -199,7 +200,7 @@ struct PreconditionerBenchmark : Benchmark { void run(std::shared_ptr exec, std::shared_ptr timer, - preconditioner_benchmark_state& state, + annotate_functor annotate, preconditioner_benchmark_state& state, const std::string& encoded_precond_name, json& precond_case) const override { @@ -219,12 +220,17 @@ struct PreconditionerBenchmark : Benchmark { auto precond = precond_factory.at(decoded_precond_name)(exec); - for (auto _ : ic_apply.warmup_run()) { - precond->generate(state.system_matrix)->apply(state.b, x_clone); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic_apply.warmup_run()) { + precond->generate(state.system_matrix) + ->apply(state.b, x_clone); + } } std::unique_ptr precond_op; for (auto _ : ic_gen.run()) { + auto range = annotate("repetition generate"); precond_op = precond->generate(state.system_matrix); } @@ -234,6 +240,7 @@ struct PreconditionerBenchmark : Benchmark { ic_gen.get_num_repetitions(); for (auto _ : ic_apply.run()) { + auto range = annotate("repetition apply"); precond_op->apply(state.b, x_clone); } diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 4976e5759d4..597ab76729a 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -458,6 +458,7 @@ struct SolverBenchmark : Benchmark> { void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, solver_benchmark_state& state, const std::string& encoded_solver_name, json& solver_case) const override @@ -482,14 +483,17 @@ struct SolverBenchmark : Benchmark> { // warm run std::shared_ptr solver; - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(state.x); - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_warmup_max_iters) - ->generate(state.system_matrix); - solver->apply(state.b, x_clone); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(state.x); + auto precond = precond_factory.at(precond_name)(exec); + solver = generate_solver(exec, give(precond), solver_name, + FLAGS_warmup_max_iters) + ->generate(state.system_matrix); + solver->apply(state.b, x_clone); + exec->synchronize(); + } } // detail run @@ -566,6 +570,7 @@ struct SolverBenchmark : Benchmark> { auto apply_timer = ic.get_timer(); auto x_clone = clone(state.x); for (auto status : ic.run(false)) { + auto range = annotate("repetition"); x_clone = clone(state.x); exec->synchronize(); diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 21df4d9c448..5d479eb7fc0 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -128,7 +128,8 @@ struct SparseBlasBenchmark : Benchmark> { void run(std::shared_ptr exec, std::shared_ptr timer, - std::unique_ptr& mtx, const std::string& operation_name, + annotate_functor annotate, std::unique_ptr& mtx, + const std::string& operation_name, json& operation_case) const override { auto op = get_operation(operation_name, mtx.get()); @@ -136,16 +137,20 @@ struct SparseBlasBenchmark : Benchmark> { IterationControl ic(timer); // warm run - for (auto _ : ic.warmup_run()) { - op->prepare(); - exec->synchronize(); - op->run(); - 
exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + op->prepare(); + exec->synchronize(); + op->run(); + exec->synchronize(); + } } // timed run op->prepare(); for (auto _ : ic.run()) { + auto range = annotate("repetition"); op->run(); } const auto runtime = ic.compute_time(FLAGS_timer_method); diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index 4a7d014de8b..f589077834e 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -130,7 +130,7 @@ struct SpmvBenchmark : Benchmark> { } void run(std::shared_ptr exec, std::shared_ptr timer, - spmv_benchmark_state& state, + annotate_functor annotate, spmv_benchmark_state& state, const std::string& format_name, json& format_case) const override { auto system_matrix = generator.generate_matrix_with_format( @@ -149,11 +149,14 @@ struct SpmvBenchmark : Benchmark> { IterationControl ic{timer}; // warm run - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(state.x); - exec->synchronize(); - system_matrix->apply(state.b, x_clone); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(state.x); + exec->synchronize(); + system_matrix->apply(state.b, x_clone); + exec->synchronize(); + } } // tuning run @@ -192,6 +195,7 @@ struct SpmvBenchmark : Benchmark> { // timed run auto x_clone = clone(state.x); for (auto _ : ic.run()) { + auto range = annotate("repetition"); system_matrix->apply(state.b, x_clone); } format_case["time"] = ic.compute_time(FLAGS_timer_method); diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 550f6fe2720..6012cb6c77b 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -245,6 +245,32 @@ std::shared_ptr create_profiler_hook( } +struct annotate_functor { + gko::log::profiling_scope_guard operator()(const char* name) const + { + if (profiler_hook) { + return profiler_hook->user_range(name); + } + return {}; + } + + gko::log::profiling_scope_guard operator()(const char* name, + bool should_annotate) const + { + if (profiler_hook && should_annotate) { + return profiler_hook->user_range(name); + } + return {}; + } + + annotate_functor(std::shared_ptr profiler_hook) + : profiler_hook{std::move(profiler_hook)} + {} + + std::shared_ptr profiler_hook; +}; + + // Returns a random number engine std::default_random_engine& get_engine() { diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp index 661c403706f..264dc3965db 100644 --- a/benchmark/utils/runner.hpp +++ b/benchmark/utils/runner.hpp @@ -102,8 +102,8 @@ struct Benchmark { /** Runs a single operation of the benchmark */ virtual void run(std::shared_ptr exec, - std::shared_ptr timer, State& state, - const std::string& operation, + std::shared_ptr timer, annotate_functor annotate, + State& state, const std::string& operation, json& operation_case) const = 0; /** Post-process test case info. 
*/ @@ -139,13 +139,7 @@ void run_test_cases(const Benchmark& benchmark, if (profiler_hook) { exec->add_logger(profiler_hook); } - auto annotate = - [profiler_hook](const char* name) -> gko::log::profiling_scope_guard { - if (profiler_hook) { - return profiler_hook->user_range(name); - } - return {}; - }; + auto annotate = annotate_functor(profiler_hook); for (auto& test_case : test_cases) { try { @@ -174,8 +168,8 @@ void run_test_cases(const Benchmark& benchmark, auto& operation_case = benchmark_case[operation_name]; try { auto operation_range = annotate(operation_name.c_str()); - benchmark.run(exec, timer, test_case_state, operation_name, - operation_case); + benchmark.run(exec, timer, annotate, test_case_state, + operation_name, operation_case); operation_case["completed"] = true; } catch (const std::exception& e) { operation_case["completed"] = false; From 10ef14a67bb6240e2cdcc5266a4568833eff3cb8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 19 Aug 2023 12:19:10 +0200 Subject: [PATCH 252/583] update test output --- benchmark/test/reference/blas.profile.stderr | 6 ++++++ benchmark/test/reference/conversion.profile.stderr | 8 ++++++++ .../test/reference/distributed_solver.profile.stderr | 2 ++ .../reference/multi_vector_distributed.profile.stderr | 6 ++++++ benchmark/test/reference/preconditioner.profile.stderr | 4 ++++ benchmark/test/reference/solver.profile.stderr | 2 ++ benchmark/test/reference/sparse_blas.profile.stderr | 2 ++ benchmark/test/reference/spmv.profile.stderr | 2 ++ benchmark/test/reference/spmv_distributed.profile.stderr | 2 ++ 9 files changed, 34 insertions(+) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index 529fc16009c..7307fb0ad7e 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -10,8 +10,10 @@ DEBUG: begin n = 100 DEBUG: begin copy DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::copy DEBUG: end dense::copy +DEBUG: end repetition DEBUG: end copy Running blas: axpy DEBUG: begin axpy @@ -21,8 +23,10 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::add_scaled DEBUG: end dense::add_scaled +DEBUG: end repetition DEBUG: end axpy Running blas: scal DEBUG: begin scal @@ -30,7 +34,9 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::scale DEBUG: end dense::scale +DEBUG: end repetition DEBUG: end scal DEBUG: end n = 100 diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index a233579c721..3a4301b13eb 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -11,12 +11,14 @@ DEBUG: end components::aos_to_soa DEBUG: begin stencil(100,7pt) Running conversion: coo-read DEBUG: begin coo-read +DEBUG: begin repetition DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy DEBUG: end copy +DEBUG: end repetition DEBUG: end coo-read Running conversion: coo-csr DEBUG: begin coo-csr @@ -28,6 +30,7 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin repetition DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy @@ -36,11 +39,13 @@ DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs DEBUG: 
end components::convert_idxs_to_ptrs DEBUG: end copy() +DEBUG: end repetition DEBUG: end coo-csr Running conversion: csr-read DEBUG: begin csr-read DEBUG: begin components::fill_array DEBUG: end components::fill_array +DEBUG: begin repetition DEBUG: begin copy DEBUG: end copy DEBUG: begin copy @@ -49,6 +54,7 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs +DEBUG: end repetition DEBUG: end csr-read Running conversion: csr-coo DEBUG: begin csr-coo @@ -62,6 +68,7 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin repetition DEBUG: begin copy() DEBUG: begin copy DEBUG: end copy @@ -70,5 +77,6 @@ DEBUG: end copy DEBUG: begin components::convert_ptrs_to_idxs DEBUG: end components::convert_ptrs_to_idxs DEBUG: end copy() +DEBUG: end repetition DEBUG: end csr-coo DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 4ea20730117..227737e56b3 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -90,6 +90,7 @@ DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() +DEBUG: begin repetition DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy @@ -420,6 +421,7 @@ DEBUG: end check() DEBUG: end check() DEBUG: end iteration DEBUG: end apply() +DEBUG: end repetition DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr index 102330e38f4..85bd138514b 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -46,8 +46,10 @@ DEBUG: begin copy DEBUG: end copy DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::copy DEBUG: end dense::copy +DEBUG: end repetition DEBUG: end copy Running blas: axpy DEBUG: begin axpy @@ -93,8 +95,10 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::add_scaled DEBUG: end dense::add_scaled +DEBUG: end repetition DEBUG: end axpy Running blas: scal DEBUG: begin scal @@ -120,7 +124,9 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin dense::fill DEBUG: end dense::fill +DEBUG: begin repetition DEBUG: begin dense::scale DEBUG: end dense::scale +DEBUG: end repetition DEBUG: end scal DEBUG: end n = 100 diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 610dfe464ec..e2069c318d2 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -31,13 +31,17 @@ DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() +DEBUG: begin repetition generate DEBUG: begin generate() DEBUG: end generate() +DEBUG: end repetition generate +DEBUG: begin repetition apply DEBUG: begin apply() DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() DEBUG: end apply() +DEBUG: end repetition apply DEBUG: end none DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/solver.profile.stderr 
b/benchmark/test/reference/solver.profile.stderr index 238591eb0c9..5e1e2cdb312 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -34,6 +34,7 @@ DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() +DEBUG: begin repetition DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy @@ -282,6 +283,7 @@ DEBUG: end check() DEBUG: end check() DEBUG: end iteration DEBUG: end apply() +DEBUG: end repetition DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index 60cf41ccbae..fd991de7063 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -15,9 +15,11 @@ DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin stencil(100,7pt) Running sparse_blas: transpose DEBUG: begin transpose +DEBUG: begin repetition DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin csr::transpose DEBUG: end csr::transpose +DEBUG: end repetition DEBUG: end transpose DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 2299614c6c4..1cc24a5f186 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -28,9 +28,11 @@ DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() +DEBUG: begin repetition DEBUG: begin apply() DEBUG: begin coo::spmv DEBUG: end coo::spmv DEBUG: end apply() +DEBUG: end repetition DEBUG: end coo DEBUG: end stencil(100,7pt) diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index b44cef7f3f6..f0d28332ef0 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -122,6 +122,7 @@ DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() +DEBUG: begin repetition DEBUG: begin apply() DEBUG: begin dense::row_gather DEBUG: end dense::row_gather @@ -134,5 +135,6 @@ DEBUG: begin csr::advanced_spmv DEBUG: end csr::advanced_spmv DEBUG: end advanced_apply() DEBUG: end apply() +DEBUG: end repetition DEBUG: end csr-csr DEBUG: end stencil(100,7pt,stencil) From b48d0e19c59e718b89b4eb4a38e52fd57de0ea0b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 22 Aug 2023 17:52:30 +0200 Subject: [PATCH 253/583] update documentation --- ABOUT-LICENSING.md | 105 +++----------------------------------- INSTALL.md | 6 +-- benchmark/CMakeLists.txt | 4 +- dev_tools/scripts/regroup | 2 +- 4 files changed, 14 insertions(+), 103 deletions(-) diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md index df081e2211b..d6e68911d1a 100644 --- a/ABOUT-LICENSING.md +++ b/ABOUT-LICENSING.md @@ -76,7 +76,7 @@ the following license: When compiling Ginkgo with `-DGINKGO_BUILD_BENCHMARKS=ON` the build system will download, build, and link [gflags](https://github.com/gflags/gflags) and -[RapidJSON](https://github.com/Tencent/rapidjson) with the +[nlohmann-json](https://github.com/nlohmann/json) with the benchmark suites. gtest is available under the following license: > Copyright (c) 2006, Google Inc. @@ -108,110 +108,22 @@ benchmark suites. 
gtest is available under the following license: > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -RapidJSON is available under the following license (note that Ginkgo's build -system automatically removes the `bin/jsonchecker/` directory which is licensed -under the problematic JSON license): +nlohmann-json is available under the following license: -> Tencent is pleased to support the open source community by making RapidJSON -> available. -> -> Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All -> rights reserved. -> -> If you have downloaded a copy of the RapidJSON binary from Tencent, please -> note that the RapidJSON binary is licensed under the MIT License. If you have -> downloaded a copy of the RapidJSON source code from Tencent, please note that -> RapidJSON source code is licensed under the MIT License, except for the -> third-party components listed below which are subject to different license -> terms. Your integration of RapidJSON into your own projects may require -> compliance with the MIT License, as well as the other licenses applicable to -> the third-party components included within RapidJSON. To avoid the problematic -> JSON license in your own projects, it's sufficient to exclude the -> bin/jsonchecker/ directory, as it's the only code under the JSON license. A -> copy of the MIT License is included in this file. -> -> Other dependencies and licenses: -> -> Open Source Software Licensed Under the BSD License: -> -------------------------------------------------------------------- -> -> The msinttypes r29 -> -> Copyright (c) 2006-2013 Alexander Chemeris -> All rights reserved. -> -> Redistribution and use in source and binary forms, with or without -> modification, are permitted provided that the following conditions are met: -> -> * Redistributions of source code must retain the above copyright notice, this -> list of conditions and the following disclaimer. -> * Redistributions in binary form must reproduce the above copyright notice, -> this list of conditions and the following disclaimer in the documentation -> and/or other materials provided with the distribution. -> * Neither the name of copyright holder nor the names of its contributors may -> be used to endorse or promote products derived from this software without -> specific prior written permission. -> -> THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY -> EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -> WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -> DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY -> DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -> (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -> LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -> ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -> (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -> SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -> -> Open Source Software Licensed Under the JSON License: -> -------------------------------------------------------------------- -> -> json.org -> Copyright (c) 2002 -> JSON.org All Rights Reserved. -> -> JSON_checker -> Copyright (c) 2002 JSON.org -> All Rights Reserved. 
-> -> -> Terms of the JSON License: -> --------------------------------------------------- -> -> Permission is hereby granted, free of charge, to any person obtaining a copy -> of this software and associated documentation files (the "Software"), to deal -> in the Software without restriction, including without limitation the rights -> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -> copies of the Software, and to permit persons to whom the Software is -> furnished to do so, subject to the following conditions: -> -> The above copyright notice and this permission notice shall be included in all -> copies or substantial portions of the Software. -> -> The Software shall be used for Good, not Evil. -> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -> SOFTWARE. -> -> -> Terms of the MIT License: -> -------------------------------------------------------------------- -> +> MIT License +> +> Copyright (c) 2013-2022 Niels Lohmann +> > Permission is hereby granted, free of charge, to any person obtaining a copy > of this software and associated documentation files (the "Software"), to deal > in the Software without restriction, including without limitation the rights > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > copies of the Software, and to permit persons to whom the Software is > furnished to do so, subject to the following conditions: -> + > The above copyright notice and this permission notice shall be included in all > copies or substantial portions of the Software. -> +> > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -220,7 +132,6 @@ under the problematic JSON license): > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > SOFTWARE. - For generating the documentation of Ginkgo, some scripts from the deal.II library are used. You can refer to the `doc/` folder to see which files are a modified version of deal.II's documentation generation scripts. Additionally, diff --git a/INSTALL.md b/INSTALL.md index 5f788ed0e28..b29358d4eb6 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -31,7 +31,7 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests to speed them up, default is `OFF`. * `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks - (will download gflags and rapidjson), default is `ON`. + (will download gflags and nlohmann-json), default is `ON`. * `-DGINKGO_BUILD_EXAMPLES={ON, OFF}` builds Ginkgo's examples, default is `ON` * `-DGINKGO_BUILD_EXTLIB_EXAMPLE={ON, OFF}` builds the interfacing example with deal.II, default is `OFF`. @@ -205,7 +205,7 @@ packages can be turned off by disabling the relevant options. 
Test](https://github.com/google/googletest); + GINKGO_BUILD_BENCHMARKS=ON: For argument management we use [gflags](https://github.com/gflags/gflags) and for JSON parsing we use - [RapidJSON](https://github.com/Tencent/rapidjson); + [nlohmann-json](https://github.com/nlohmann/json); + GINKGO_DEVEL_TOOLS=ON: [git-cmake-format](https://github.com/gflegar/git-cmake-format) is our CMake helper for code formatting. @@ -224,7 +224,7 @@ packages can be turned off by disabling the relevant options. Ginkgo attempts to use pre-installed versions of these package if they match version requirements using `find_package`. Otherwise, the configuration step will download the files for each of the packages `GTest`, `gflags`, -`RapidJSON` and `hwloc` and build them internally. +`nlohmann-json` and `hwloc` and build them internally. Note that, if the external packages were not installed to the default location, the CMake option `-DCMAKE_PREFIX_PATH=` needs to be set to the diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index e993ee6cf0c..fd04620f595 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -46,7 +46,7 @@ endfunction() # Generates an executable for one precision. Each executable will be linked to -# `ginkgo`, `gflags` and `rapidjson`. +# `ginkgo`, `gflags` and `nlohmann-json`. # Note: This should only be used by `ginkgo_add_typed_benchmark_executables` # # \param name name for the executable to create (including type suffix) @@ -96,7 +96,7 @@ endfunction(ginkgo_add_single_benchmark_executable) # Generates an executable for each supported precision. Each executable will be -# linked to `ginkgo`, `gflags` and `rapidjson`. +# linked to `ginkgo`, `gflags` and `nlohmann-json`. # # \param name base-name for the executable to create # \param use_lib_linops Boolean indicating if linking against hipsparse/cusparse diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 85eade99289..e35bd37efee 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -1,6 +1,6 @@ IncludeBlocks: Regroup IncludeCategories: - - Regex: '^<(rapidjson|gflags|gtest|papi).*' + - Regex: '^<(nlohmann|gflags|gtest|papi).*' Priority: 3 - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi|nvToolsExt).*' Priority: 2 From e9436137ff32f8e9950a72a07834f4b166bf80ed Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 23 Aug 2023 13:18:37 +0200 Subject: [PATCH 254/583] review updates - remove unnecessary stdin in tests - simplify validate_config - consistently use pointer members instead of reference members Co-authored-by: Marcel Koch --- benchmark/solver/solver_common.hpp | 6 +--- benchmark/test/blas.py | 2 -- benchmark/test/multi_vector_distributed.py | 2 -- benchmark/utils/loggers.hpp | 34 +++++++++++----------- 4 files changed, 18 insertions(+), 26 deletions(-) diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 597ab76729a..0248ab8e757 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -414,11 +414,7 @@ struct SolverBenchmark : Benchmark> { bool validate_config(const json& value) const override { - return ((value.contains("size") && value.contains("stencil") && - value["size"].is_number_integer() && - value["stencil"].is_string()) || - (value.contains("filename") && - value["filename"].is_string())) && + return generator.validate_config(value) && (value.contains("optimal") && value["optimal"].contains("spmv") && value["optimal"]["spmv"].is_string()); diff --git 
a/benchmark/test/blas.py b/benchmark/test/blas.py index 160d5364e20..ff5bddc5d08 100755 --- a/benchmark/test/blas.py +++ b/benchmark/test/blas.py @@ -22,7 +22,6 @@ ["-input", str(test_framework.sourcepath / "input.blas.json")], expected_stdout="blas.simple.stdout", expected_stderr="blas.simple.stderr", - stdin='[{"n": 100}]', ) # profiler annotations @@ -30,5 +29,4 @@ ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"], expected_stdout="blas.profile.stdout", expected_stderr="blas.profile.stderr", - stdin='[{"n": 100}]', ) diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py index 1e0c4c8adf5..c62cb8ebd17 100644 --- a/benchmark/test/multi_vector_distributed.py +++ b/benchmark/test/multi_vector_distributed.py @@ -24,7 +24,6 @@ ["-input", str(test_framework.sourcepath / "input.blas.json")], expected_stdout="multi_vector_distributed.simple.stdout", expected_stderr="multi_vector_distributed.simple.stderr", - stdin='[{"n": 100}]', num_procs=3, ) @@ -33,6 +32,5 @@ ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"], expected_stdout="multi_vector_distributed.profile.stdout", expected_stderr="multi_vector_distributed.profile.stderr", - stdin='[{"n": 100}]', num_procs=3, ) diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index 1e651811f0f..89ea6108eda 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -179,16 +179,16 @@ struct ResidualLogger : gko::log::Logger { const gko::array* status, bool all_stopped) const override { - timestamps.push_back(std::chrono::duration( - std::chrono::steady_clock::now() - start) - .count()); + timestamps->push_back(std::chrono::duration( + std::chrono::steady_clock::now() - start) + .count()); if (residual_norm) { - rec_res_norms.push_back( + rec_res_norms->push_back( get_norm(gko::as>(residual_norm))); } else { gko::detail::vector_dispatch( residual, [&](const auto v_residual) { - rec_res_norms.push_back(compute_norm2(v_residual)); + rec_res_norms->push_back(compute_norm2(v_residual)); }); } if (solution) { @@ -196,18 +196,18 @@ struct ResidualLogger : gko::log::Logger { rc_vtype>(solution, [&](auto v_solution) { using concrete_type = std::remove_pointer_t>; - true_res_norms.push_back(compute_residual_norm( + true_res_norms->push_back(compute_residual_norm( matrix, gko::as(b), v_solution)); }); } else { - true_res_norms.push_back(-1.0); + true_res_norms->push_back(-1.0); } if (implicit_sq_residual_norm) { - implicit_res_norms.push_back(std::sqrt( + implicit_res_norms->push_back(std::sqrt( get_norm(gko::as>(implicit_sq_residual_norm)))); has_implicit_res_norm = true; } else { - implicit_res_norms.push_back(-1.0); + implicit_res_norms->push_back(-1.0); } } @@ -219,11 +219,11 @@ struct ResidualLogger : gko::log::Logger { matrix{matrix.get()}, b{b.get()}, start{std::chrono::steady_clock::now()}, - rec_res_norms{rec_res_norms}, - true_res_norms{true_res_norms}, + rec_res_norms{&rec_res_norms}, + true_res_norms{&true_res_norms}, has_implicit_res_norm{}, - implicit_res_norms{implicit_res_norms}, - timestamps{timestamps} + implicit_res_norms{&implicit_res_norms}, + timestamps{×tamps} {} bool has_implicit_res_norms() const { return has_implicit_res_norm; } @@ -232,11 +232,11 @@ struct ResidualLogger : gko::log::Logger { const gko::LinOp* matrix; const gko::LinOp* b; std::chrono::steady_clock::time_point start; - json& rec_res_norms; - json& true_res_norms; + json* rec_res_norms; + json* true_res_norms; mutable bool has_implicit_res_norm; - json& 
implicit_res_norms; - json& timestamps; + json* implicit_res_norms; + json* timestamps; }; From 310c686b8bdf4f4ef7595bac8de95131588d8d3c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 29 Aug 2023 19:24:34 +0200 Subject: [PATCH 255/583] review updates - don't install nlohmann-json - simplify code - improve config description formatting Co-authored-by: Yuhsiang M. Tsai --- benchmark/preconditioner/preconditioner.cpp | 3 +-- benchmark/spmv/spmv_common.hpp | 3 --- benchmark/test/CMakeLists.txt | 2 +- benchmark/test/reference/conversion.all.stderr | 2 +- benchmark/test/reference/conversion.profile.stderr | 6 +++--- benchmark/test/reference/conversion.simple.stderr | 2 +- benchmark/test/reference/distributed_solver.profile.stderr | 6 +++--- benchmark/test/reference/distributed_solver.simple.stderr | 2 +- benchmark/test/reference/matrix_statistics.simple.stderr | 2 +- benchmark/test/reference/preconditioner.profile.stderr | 6 +++--- benchmark/test/reference/preconditioner.simple.stderr | 2 +- benchmark/test/reference/solver.profile.stderr | 6 +++--- benchmark/test/reference/solver.simple.stderr | 2 +- benchmark/test/reference/sparse_blas.profile.stderr | 6 +++--- benchmark/test/reference/sparse_blas.simple.stderr | 2 +- benchmark/test/reference/spmv.profile.stderr | 6 +++--- benchmark/test/reference/spmv.simple.stderr | 2 +- benchmark/test/reference/spmv_distributed.profile.stderr | 6 +++--- benchmark/test/reference/spmv_distributed.simple.stderr | 2 +- benchmark/utils/generator.hpp | 6 +++--- third_party/nlohmann_json/CMakeLists.txt | 1 + 21 files changed, 36 insertions(+), 39 deletions(-) diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 98f116f9b12..074fe202e6c 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -205,9 +205,8 @@ struct PreconditionerBenchmark : Benchmark { json& precond_case) const override { auto decoded_precond_name = precond_decoder.at(encoded_precond_name); - precond_case["generate"] = json::object(); - precond_case["apply"] = json::object(); for (auto stage : {"generate", "apply"}) { + precond_case[stage] = json::object(); precond_case[stage]["components"] = json::object(); } diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index f589077834e..c85642bb5f1 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -211,9 +211,6 @@ struct SpmvBenchmark : Benchmark> { std::string best_format; // find the fastest among all formats we tested for (const auto& format : formats) { - if (!test_case[name].contains(format)) { - continue; - } auto& format_case = test_case[name][format]; if (format_case.contains("completed") && format_case["completed"].template get()) { diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt index 1cd589927fa..2f43b6eaf71 100644 --- a/benchmark/test/CMakeLists.txt +++ b/benchmark/test/CMakeLists.txt @@ -25,4 +25,4 @@ if (GINKGO_BUILD_MPI) add_benchmark_test(multi_vector_distributed) add_benchmark_test(spmv_distributed) add_benchmark_test(solver_distributed) -endif() \ No newline at end of file +endif() diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index 77ff50a1b89..f6f1002e443 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -4,7 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random 
seed for right hand sides is 42 The formats are coo,csr,ell,sellp,hybrid -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 Running conversion: coo-read Running conversion: coo-csr diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 3a4301b13eb..b25fb4d42ee 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -4,11 +4,11 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin stencil(100,7pt) +DEBUG: begin stencil(100, 7pt) Running conversion: coo-read DEBUG: begin coo-read DEBUG: begin repetition @@ -79,4 +79,4 @@ DEBUG: end components::convert_ptrs_to_idxs DEBUG: end copy() DEBUG: end repetition DEBUG: end csr-coo -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr index 9b51effac09..53777a4fc53 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -4,7 +4,7 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 Running conversion: coo-read Running conversion: coo-csr diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 227737e56b3..e8ef115f8c2 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case stencil(100,7pt,stencil) +Running test case stencil(100, 7pt, stencil) DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin components::fill_array @@ -77,7 +77,7 @@ DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() Matrix is of size (125, 125) -DEBUG: begin stencil(100,7pt,stencil) +DEBUG: begin stencil(100, 7pt, stencil) Running solver: cg DEBUG: begin cg DEBUG: begin dense::compute_squared_norm2 @@ -445,4 +445,4 @@ DEBUG: end dense::compute_sqrt DEBUG: begin copy DEBUG: end copy DEBUG: end cg -DEBUG: end stencil(100,7pt,stencil) +DEBUG: end stencil(100, 7pt, stencil) diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr index 607081a3949..bdf57c2d0e1 100644 --- a/benchmark/test/reference/distributed_solver.simple.stderr +++ b/benchmark/test/reference/distributed_solver.simple.stderr @@ -5,6 +5,6 @@ Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case stencil(100,7pt,stencil) +Running test case stencil(100, 7pt, 
stencil) Matrix is of size (125, 125) Running solver: cg diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr index d02edbc44da..bfaa411873e 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stderr +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -1,4 +1,4 @@ This is Ginkgo 1.7.0 (develop) running with core module 1.7.0 (develop) -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index e2069c318d2..328a738583c 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -4,7 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running with preconditioners: none -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::aos_to_soa @@ -24,7 +24,7 @@ DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data Matrix is of size (125, 125), 725 -DEBUG: begin stencil(100,7pt) +DEBUG: begin stencil(100, 7pt) Running preconditioner: none DEBUG: begin none DEBUG: begin copy() @@ -44,4 +44,4 @@ DEBUG: end copy() DEBUG: end apply() DEBUG: end repetition apply DEBUG: end none -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr index 0090e180d2b..a428671486f 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -4,6 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 Running preconditioner: none diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 5e1e2cdb312..a9846dff61f 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) DEBUG: begin components::fill_array DEBUG: end components::fill_array DEBUG: begin components::aos_to_soa @@ -23,7 +23,7 @@ DEBUG: begin dense::copy DEBUG: end dense::copy DEBUG: end copy() Matrix is of size (125, 125) -DEBUG: begin stencil(100,7pt) +DEBUG: begin stencil(100, 7pt) Running solver: cg DEBUG: begin cg DEBUG: begin dense::compute_norm2_dispatch @@ -297,4 +297,4 @@ DEBUG: end dense::compute_norm2_dispatch DEBUG: begin copy DEBUG: end copy DEBUG: end cg -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr index 659dd026588..d9c04b69cf5 100644 --- a/benchmark/test/reference/solver.simple.stderr +++ b/benchmark/test/reference/solver.simple.stderr @@ -5,6 +5,6 @@ 
Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 The number of right hand sides is 1 -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125) Running solver: cg diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index fd991de7063..70a9299ccae 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -4,7 +4,7 @@ Running on reference(0) Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are transpose -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 DEBUG: begin components::fill_array DEBUG: end components::fill_array @@ -12,7 +12,7 @@ DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin stencil(100,7pt) +DEBUG: begin stencil(100, 7pt) Running sparse_blas: transpose DEBUG: begin transpose DEBUG: begin repetition @@ -22,4 +22,4 @@ DEBUG: begin csr::transpose DEBUG: end csr::transpose DEBUG: end repetition DEBUG: end transpose -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index 1f2bb34809f..fe6cf23d5b7 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -4,6 +4,6 @@ Running on reference(0) Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transpose -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 Running sparse_blas: transpose diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 1cc24a5f186..3c3ec3b7cfe 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin dense::fill @@ -19,7 +19,7 @@ DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data Matrix is of size (125, 125), 725 -DEBUG: begin stencil(100,7pt) +DEBUG: begin stencil(100, 7pt) Running spmv: coo DEBUG: begin coo DEBUG: begin components::aos_to_soa @@ -35,4 +35,4 @@ DEBUG: end coo::spmv DEBUG: end apply() DEBUG: end repetition DEBUG: end coo -DEBUG: end stencil(100,7pt) +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr index 9d5047febb6..97fe670aff7 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -5,6 +5,6 @@ Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo The number of right hand sides is 1 -Running test case stencil(100,7pt) +Running test case stencil(100, 7pt) Matrix is of size (125, 125), 725 
Running spmv: coo diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index f0d28332ef0..dc3cfd377c7 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -5,7 +5,7 @@ Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are [csr]x[csr] The number of right hand sides is 1 -Running test case stencil(100,7pt,stencil) +Running test case stencil(100, 7pt, stencil) DEBUG: begin partition::build_ranges_from_global_size DEBUG: end partition::build_ranges_from_global_size DEBUG: begin components::fill_array @@ -55,7 +55,7 @@ DEBUG: end dense::fill DEBUG: begin dense::fill_in_matrix_data DEBUG: end dense::fill_in_matrix_data Matrix is of size (81, 81), 144 -DEBUG: begin stencil(100,7pt,stencil) +DEBUG: begin stencil(100, 7pt, stencil) Running spmv: csr-csr DEBUG: begin csr-csr DEBUG: begin partition::build_ranges_from_global_size @@ -137,4 +137,4 @@ DEBUG: end advanced_apply() DEBUG: end apply() DEBUG: end repetition DEBUG: end csr-csr -DEBUG: end stencil(100,7pt,stencil) +DEBUG: end stencil(100, 7pt, stencil) diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr index 0df742d5b9b..7c7f6fccf54 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stderr +++ b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -5,6 +5,6 @@ Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are [csr]x[csr] The number of right hand sides is 1 -Running test case stencil(100,7pt,stencil) +Running test case stencil(100, 7pt, stencil) Matrix is of size (81, 81), 144 Running spmv: csr-csr diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 257a2384634..3f26ed3f2fc 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -90,7 +90,7 @@ struct DefaultSystemGenerator { return config["filename"].get(); } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].get() << "," + ss << "stencil(" << config["size"].get() << ", " << config["stencil"].get() << ")"; return ss.str(); } else { @@ -231,8 +231,8 @@ struct DistributedDefaultSystemGenerator { return config["filename"].get(); } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].get() << "," - << config["stencil"].get() << "," + ss << "stencil(" << config["size"].get() << ", " + << config["stencil"].get() << ", " << config["comm_pattern"].get() << ")"; return ss.str(); } else { diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt index b95cfa5606a..6f413e458b9 100644 --- a/third_party/nlohmann_json/CMakeLists.txt +++ b/third_party/nlohmann_json/CMakeLists.txt @@ -6,4 +6,5 @@ FetchContent_Declare( GIT_TAG v3.9.1 ) set(JSON_BuildTests OFF CACHE INTERNAL "") +set(JSON_Install OFF CACHE INTERNAL "") FetchContent_MakeAvailable(nlohmann_json) From bf1ece4bcf34fdd4c534c90e0d0ab77f0572c853 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 30 Aug 2023 11:30:08 +0200 Subject: [PATCH 256/583] keep trailing EOL --- benchmark/test/reference/blas.profile.stdout | 2 +- benchmark/test/reference/blas.simple.stdout | 2 +- benchmark/test/reference/conversion.all.stdout | 2 +- benchmark/test/reference/conversion.matrix.stdout | 2 +- 
benchmark/test/reference/conversion.profile.stdout | 2 +- benchmark/test/reference/conversion.simple.stdout | 2 +- benchmark/test/reference/distributed_solver.matrix.stdout | 2 +- benchmark/test/reference/distributed_solver.profile.stdout | 2 +- benchmark/test/reference/distributed_solver.simple.stdout | 2 +- benchmark/test/reference/matrix_statistics.matrix.stdout | 2 +- benchmark/test/reference/matrix_statistics.simple.stdout | 2 +- .../test/reference/multi_vector_distributed.profile.stdout | 2 +- .../test/reference/multi_vector_distributed.simple.stdout | 2 +- benchmark/test/reference/preconditioner.matrix.stdout | 2 +- benchmark/test/reference/preconditioner.profile.stdout | 2 +- benchmark/test/reference/preconditioner.simple.stdout | 2 +- benchmark/test/reference/solver.matrix.stdout | 2 +- benchmark/test/reference/solver.profile.stdout | 2 +- benchmark/test/reference/solver.simple.stdout | 2 +- benchmark/test/reference/sparse_blas.matrix.stdout | 2 +- benchmark/test/reference/sparse_blas.profile.stdout | 2 +- benchmark/test/reference/sparse_blas.simple.stdout | 2 +- benchmark/test/reference/spmv.matrix.stdout | 2 +- benchmark/test/reference/spmv.profile.stdout | 2 +- benchmark/test/reference/spmv.simple.stdout | 2 +- benchmark/test/reference/spmv_distributed.profile.stdout | 2 +- benchmark/test/reference/spmv_distributed.simple.stdout | 2 +- benchmark/test/test_framework.py.in | 3 ++- 28 files changed, 29 insertions(+), 28 deletions(-) diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout index 8998d5eaed7..209e115b557 100644 --- a/benchmark/test/reference/blas.profile.stdout +++ b/benchmark/test/reference/blas.profile.stdout @@ -25,4 +25,4 @@ } } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout index a586a9bc57b..54745d81104 100644 --- a/benchmark/test/reference/blas.simple.stdout +++ b/benchmark/test/reference/blas.simple.stdout @@ -25,4 +25,4 @@ } } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout index 0c77d464793..e7a5b8f0f51 100644 --- a/benchmark/test/reference/conversion.all.stdout +++ b/benchmark/test/reference/conversion.all.stdout @@ -73,4 +73,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout index 7f27b0c25b3..8489e4b30b4 100644 --- a/benchmark/test/reference/conversion.matrix.stdout +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -27,4 +27,4 @@ "cols": 36, "nonzeros": 208 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout index a9c3ea674fa..907eac5b951 100644 --- a/benchmark/test/reference/conversion.profile.stdout +++ b/benchmark/test/reference/conversion.profile.stdout @@ -28,4 +28,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout index 81c735789d1..91b69b8a248 100644 --- a/benchmark/test/reference/conversion.simple.stdout +++ b/benchmark/test/reference/conversion.simple.stdout @@ -28,4 +28,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout 
b/benchmark/test/reference/distributed_solver.matrix.stdout index ec1d258e2f4..67ac333bec5 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stdout +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -54,4 +54,4 @@ "rows": 36, "cols": 36 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout index 55dfb1dc428..0a844879c4f 100644 --- a/benchmark/test/reference/distributed_solver.profile.stdout +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -30,4 +30,4 @@ "rows": 125, "cols": 125 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout index eed8d864388..458115e6ab2 100644 --- a/benchmark/test/reference/distributed_solver.simple.stdout +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -56,4 +56,4 @@ "rows": 125, "cols": 125 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout index a6297e89b66..f5eba9461f7 100644 --- a/benchmark/test/reference/matrix_statistics.matrix.stdout +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -36,4 +36,4 @@ "cols": 36, "nonzeros": 208 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout index 923bbc9f962..23124781a7d 100644 --- a/benchmark/test/reference/matrix_statistics.simple.stdout +++ b/benchmark/test/reference/matrix_statistics.simple.stdout @@ -37,4 +37,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout index 8998d5eaed7..209e115b557 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stdout +++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout @@ -25,4 +25,4 @@ } } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout index a586a9bc57b..54745d81104 100644 --- a/benchmark/test/reference/multi_vector_distributed.simple.stdout +++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout @@ -25,4 +25,4 @@ } } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout index 51adb7383c3..742ec55c41d 100644 --- a/benchmark/test/reference/preconditioner.matrix.stdout +++ b/benchmark/test/reference/preconditioner.matrix.stdout @@ -28,4 +28,4 @@ "cols": 36, "nonzeros": 208 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout index e33a6502eea..526349b55ad 100644 --- a/benchmark/test/reference/preconditioner.profile.stdout +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -21,4 +21,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout index 06291228a1c..ed567dcbb13 100644 --- a/benchmark/test/reference/preconditioner.simple.stdout +++ 
b/benchmark/test/reference/preconditioner.simple.stdout @@ -29,4 +29,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout index a87e78f7f66..594a3887921 100644 --- a/benchmark/test/reference/solver.matrix.stdout +++ b/benchmark/test/reference/solver.matrix.stdout @@ -52,4 +52,4 @@ "rows": 36, "cols": 36 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout index 906c74de5e7..c132ed1a572 100644 --- a/benchmark/test/reference/solver.profile.stdout +++ b/benchmark/test/reference/solver.profile.stdout @@ -29,4 +29,4 @@ "rows": 125, "cols": 125 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout index 5d127fe4b78..0ee0e4b9a4b 100644 --- a/benchmark/test/reference/solver.simple.stdout +++ b/benchmark/test/reference/solver.simple.stdout @@ -53,4 +53,4 @@ "rows": 125, "cols": 125 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout index 74fdbf98e7a..a50fa1159d9 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stdout +++ b/benchmark/test/reference/sparse_blas.matrix.stdout @@ -21,4 +21,4 @@ "cols": 36, "nonzeros": 208 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout index e9d48fde23d..45cb7e2638a 100644 --- a/benchmark/test/reference/sparse_blas.profile.stdout +++ b/benchmark/test/reference/sparse_blas.profile.stdout @@ -15,4 +15,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout index 3cc5f774ebf..a44f4f189b2 100644 --- a/benchmark/test/reference/sparse_blas.simple.stdout +++ b/benchmark/test/reference/sparse_blas.simple.stdout @@ -22,4 +22,4 @@ "cols": 125, "nonzeros": 725 } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout index 4d03ce3cd07..ea5927ba148 100644 --- a/benchmark/test/reference/spmv.matrix.stdout +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -17,4 +17,4 @@ "spmv": "coo" } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout index 409a92d4e33..6e4701af719 100644 --- a/benchmark/test/reference/spmv.profile.stdout +++ b/benchmark/test/reference/spmv.profile.stdout @@ -17,4 +17,4 @@ "spmv": "coo" } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout index 9601a15b331..38f2598c616 100644 --- a/benchmark/test/reference/spmv.simple.stdout +++ b/benchmark/test/reference/spmv.simple.stdout @@ -18,4 +18,4 @@ "spmv": "coo" } } -] \ No newline at end of file +] diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout index 8de6a68ae8a..bbef87d0b89 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stdout +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -18,4 +18,4 @@ "spmv": "csr-csr" } } -] \ No newline at end of file +] diff --git 
a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout index f94e4b992a1..77bdef168d3 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stdout +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -19,4 +19,4 @@ "spmv": "csr-csr" } } -] \ No newline at end of file +] diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 1a07818df1f..62c4293e7c0 100644 --- a/benchmark/test/test_framework.py.in +++ b/benchmark/test/test_framework.py.in @@ -92,7 +92,8 @@ def sanitize_json_text(input: str) -> List[str]: """ result = json.dumps(sanitize_json(json.loads(input)), indent=4) - return result.splitlines() + # json.dumps doesn't add a trailing newline + return result.splitlines() + [""] def sanitize_text( From 39d88071ff0978c7c99817a96444e03e01e4c709 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 12 Jul 2023 10:59:59 +0200 Subject: [PATCH 257/583] add resource_groups property to tests --- cmake/create_test.cmake | 265 +++++++++++++++++++++++------------ hip/test/base/CMakeLists.txt | 2 +- resources.json | 51 +++++++ 3 files changed, 230 insertions(+), 88 deletions(-) create mode 100644 resources.json diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 58a49ca066c..937beb4eb8d 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,10 +1,12 @@ -set(gko_test_single_args "MPI_SIZE") +set(gko_test_resource_args "LOCAL_CORES;PERCENT;TYPE") +set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}") set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") +set(gko_test_option_args "NO_RESOURCES") ## Replaces / by _ to create valid target names from relative paths function(ginkgo_build_test_name test_name target_name) file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) endfunction(ginkgo_build_test_name) @@ -12,8 +14,8 @@ endfunction(ginkgo_build_test_name) function(ginkgo_create_gtest_mpi_main) add_library(gtest_mpi_main "") target_sources(gtest_mpi_main - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) find_package(MPI 3.1 COMPONENTS CXX REQUIRED) target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) endfunction(ginkgo_create_gtest_mpi_main) @@ -24,33 +26,96 @@ function(ginkgo_set_test_target_properties test_target_name) cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) - endif() + endif () if (GINKGO_TEST_NONDEFAULT_STREAM) target_compile_definitions(${test_target_name} PRIVATE GKO_TEST_NONDEFAULT_STREAM) - endif() + endif () if (GINKGO_COMPILING_DPCPP_TEST AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) - endif() + endif () if (GINKGO_CHECK_CIRCULAR_DEPS) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") - endif() + endif () if (set_properties_MPI_SIZE) - if(NOT TARGET gtest_mpi_main) + if (NOT TARGET gtest_mpi_main) ginkgo_create_gtest_mpi_main() - endif() + endif () set(gtest_main gtest_mpi_main MPI::MPI_CXX) - else() + else () set(gtest_main 
GTest::Main) - endif() + endif () target_compile_features(${test_target_name} PUBLIC cxx_std_14) target_compile_options(${test_target_name} PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES}) target_link_libraries(${test_target_name} PRIVATE ginkgo ${gtest_main} GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) endfunction() +function(ginkgo_add_cpu_resource_requirement_internal test_name local_cores mpi_size) + if (mpi_size) + math(EXPR cores "${mpi_size} * ${local_cores}") + else () + set(cores ${local_cores}) + endif () + set_property(TEST ${test_name} PROPERTY + RESOURCE_GROUPS "cpus:${cores}") +endfunction() + +function(ginkgo_add_resource_requirement test_name) + cmake_parse_arguments(PARSE_ARGV 1 add_rr "${gko_test_option_args}" "${gko_test_single_args}" "") + if(add_rr_NO_RESOURCES) + return() + endif() + + if (NOT add_rr_TYPE) + message(FATAL_ERROR "Need to provide resource type used by test.") + endif () + + if(add_rr_TYPE STREQUAL "ref") + set(single_resource "cpus:1") + elseif(add_rr_TYPE STREQUAL "cpu") + if(NOT add_rr_CORES) + set(add_rr_CORES 4) # perhaps get this from environment variable? + endif() + if(NOT add_rr_CORES MATCHES "^[0-9]+") + message(FATAL_ERROR "Resource specification is invalid: CORE=${add_rr_CORES}") + endif() + + set(single_resource "cpus:${add_rr_CORES}") + elseif(add_rr_TYPE STREQUAL "gpu") + if(NOT add_rr_PERCENTAGE) + set(add_rr_PERCENTAGE 50) + endif() + if(add_rr_MPI_SIZE GREATER 1) + set(add_rr_PERCENTAGE 100) + endif() + if(NOT add_rr_PERCENTAGE MATCHES "^[0-9]([0-9][0-9]?)?" + OR add_rr_PERCENTAGE LESS 0 + OR add_rr_PERCENTAGE GREATER 100) + message(FATAL_ERROR "Resource specification is invalid: PERCENTAGE=${add_rr_PERCENTAGE}") + endif() + + set(single_resource "gpus:${add_rr_PERCENTAGE}") + else() + message(FATAL_ERROR "Unrecognized resource type ${add_rr_TYPE}, allowed are: ref, cpu, gpu.") + endif() + + if(NOT add_rr_MPI_SIZE) + set(add_rr_MPI_SIZE 1) + endif() + foreach(unused RANGE ${MPI_SIZE}) + list(APPEND resources "${single_resource}") + endforeach() + set_property(TEST ${test_name} + PROPERTY + RESOURCE_GROUPS ${resources}) +endfunction() + + ## Adds a test to the list executed by ctest and sets its output binary name ## Possible additional arguments: ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
+## - `CORES` the number of threads used by a test, default is 4 +## - `PERCENTAGE` usage percentage of a single GPU, default is 50 ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths @@ -60,36 +125,39 @@ function(ginkgo_add_test test_name test_target_name) set_target_properties(${test_target_name} PROPERTIES OUTPUT_NAME ${test_name}) if (add_test_MPI_SIZE) add_test(NAME ${REL_BINARY_DIR}/${test_name} - COMMAND - ${MPIEXEC_EXECUTABLE} - ${MPIEXEC_NUMPROC_FLAG} - ${add_test_MPI_SIZE} - "$" - WORKING_DIRECTORY "$") - else() + COMMAND + ${MPIEXEC_EXECUTABLE} + ${MPIEXEC_NUMPROC_FLAG} + ${add_test_MPI_SIZE} + "$" + WORKING_DIRECTORY "$") + else () add_test(NAME ${REL_BINARY_DIR}/${test_name} - COMMAND ${test_target_name} - WORKING_DIRECTORY "$") - endif() + COMMAND ${test_target_name} + WORKING_DIRECTORY "$") + endif () + + ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN}) + set(test_preload) if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA) set(test_preload $:${test_preload}) - endif() + endif () if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") set(test_preload $:${test_preload}) - endif() - if(test_preload) + endif () + if (test_preload) set_tests_properties(${REL_BINARY_DIR}/${test_name} PROPERTIES ENVIRONMENT LD_PRELOAD=${test_preload}) - endif() + endif () endfunction() ## Normal test function(ginkgo_create_test test_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) - target_link_libraries(${test_target_name} PRIVATE ${create_test_ADDITIONAL_LIBRARIES}) + target_link_libraries(${test_target_name}) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE ref) endfunction(ginkgo_create_test) ## Test compiled with dpcpp @@ -100,11 +168,11 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. 
if (MKL_ENV) set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") - endif() + endif () endfunction(ginkgo_create_dpcpp_test) ## Test compiled with CUDA @@ -118,23 +186,23 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) add_executable(${test_target_name} ${filename}) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) - if(MSVC) + if (MSVC) target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:--extended-lambda --expt-relaxed-constexpr>) - elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + elseif (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) - endif() + endif () # we handle CUDA architecture flags for now, disable CMake handling - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) - endif() + endif () ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) endfunction(ginkgo_create_cuda_test_internal) ## Test compiled with HIP @@ -149,71 +217,94 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add set(GINKGO_TEST_HIP_DEFINES -DGKO_COMPILING_HIP ${additional_flags}) if (GINKGO_FAST_TESTS) list(APPEND GINKGO_TEST_HIP_DEFINES -DGINKGO_FAST_TESTS) - endif() + endif () if (GINKGO_TEST_NONDEFAULT_STREAM) list(APPEND GINKGO_TEST_HIP_DEFINES -DGKO_TEST_NONDEFAULT_STREAM) - endif() + endif () # NOTE: With how HIP works, passing the flags `HIPCC_OPTIONS` etc. here # creates a redefinition of all flags. This creates some issues with `nvcc`, # but `clang` seems fine with the redefinitions. if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") hip_add_executable(${test_target_name} ${filename} - # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and - # call gets fixed, uncomment this. - HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS} - # NVCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS} - # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS} - --expt-relaxed-constexpr --expt-extended-lambda - ) - else() # hcc/clang + # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and + # call gets fixed, uncomment this. + HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS} + # NVCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS} + # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS} + --expt-relaxed-constexpr --expt-extended-lambda + ) + else () # hcc/clang hip_add_executable(${test_target_name} ${filename} - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} - ) - endif() + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} + ) + endif () # Let's use a normal compiler for linking set_target_properties(${test_target_name} PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(${test_target_name} - PRIVATE - # Only `math` requires it so far, but it's much easier - # to put these this way. 
- ${GINKGO_HIP_THRUST_PATH} - # Only `exception_helpers` requires these so far, but it's much easier - # to put these this way. - ${HIPBLAS_INCLUDE_DIRS} - ${HIPFFT_INCLUDE_DIRS} - ${hiprand_INCLUDE_DIRS} - ${HIPSPARSE_INCLUDE_DIRS} - ) + PRIVATE + # Only `math` requires it so far, but it's much easier + # to put these this way. + ${GINKGO_HIP_THRUST_PATH} + # Only `exception_helpers` requires these so far, but it's much easier + # to put these this way. + ${HIPBLAS_INCLUDE_DIRS} + ${HIPFFT_INCLUDE_DIRS} + ${hiprand_INCLUDE_DIRS} + ${HIPSPARSE_INCLUDE_DIRS} + ) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) endfunction(ginkgo_create_hip_test_internal) + +## Test compiled with OpenMP +function(ginkgo_create_omp_test test_name) + ginkgo_build_test_name(${test_name} test_target_name) + ginkgo_create_omp_test_internal(${test_name} ${test_name}.cpp ${test_target_name} "" ${ARGN}) +endfunction() + +function(ginkgo_create_omp_test_internal test_name filename test_target_name) + ginkgo_build_test_name(${test_name} test_target_name) + add_executable(${test_target_name} ${test_name}.cpp) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) + target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) + ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE cpu) +endfunction() + ## Common test compiled with the host compiler, one target for each enabled backend function(ginkgo_create_common_test test_name) - if(GINKGO_BUILD_OMP) + if (GINKGO_BUILD_OMP) ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN}) - endif() - if(GINKGO_BUILD_HIP) + endif () + if (GINKGO_BUILD_HIP) ginkgo_create_common_test_internal(${test_name} HipExecutor hip ${ARGN}) - endif() - if(GINKGO_BUILD_CUDA) + endif () + if (GINKGO_BUILD_CUDA) ginkgo_create_common_test_internal(${test_name} CudaExecutor cuda ${ARGN}) - endif() - if(GINKGO_BUILD_DPCPP) + endif () + if (GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) - endif() + endif () endfunction(ginkgo_create_common_test) function(ginkgo_create_common_test_internal test_name exec_type exec) cmake_parse_arguments(PARSE_ARGV 3 common_test "" "${gko_test_single_args}" "${gko_test_multi_args}") - if(exec IN_LIST common_test_DISABLE_EXECUTORS) + if (exec IN_LIST common_test_DISABLE_EXECUTORS) return() - endif() + endif () + if (exec STREQUAL reference) + set(test_resource_type ref) + elseif (exec STREQUAL omp) + set(test_resource_type cpu) + else () + set(test_resource_type gpu) + endif () ginkgo_build_test_name(${test_name} test_target_name) string(TOUPPER ${exec} exec_upper) # set up actual test @@ -222,39 +313,39 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES}) # use float for DPC++ if necessary - if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) + if ((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) - endif() + endif () 
ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN}) + ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} TYPE ${test_resource_type}) endfunction(ginkgo_create_common_test_internal) ## Common test compiled with the device compiler, one target for each enabled backend function(ginkgo_create_common_device_test test_name) cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) - if(GINKGO_BUILD_DPCPP) + if (GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) - endif() - if(GINKGO_BUILD_OMP) + endif () + if (GINKGO_BUILD_OMP) ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN}) target_link_libraries(${test_target_name}_omp PUBLIC OpenMP::OpenMP_CXX) - endif() - if(GINKGO_BUILD_CUDA) + endif () + if (GINKGO_BUILD_CUDA) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.cu COPYONLY) ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN}) target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda) - endif() - if(GINKGO_BUILD_HIP) + endif () + if (GINKGO_BUILD_HIP) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip "-std=c++14;-DEXEC_TYPE=HipExecutor;-DEXEC_NAMESPACE=hip" ${ARGN}) - endif() + endif () endfunction(ginkgo_create_common_device_test) ## Common test compiled with the host compiler for all enabled backends and Reference diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 486fca294c2..ed32ab5b6a7 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -15,4 +15,4 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") else() ginkgo_create_hip_test(exception_helpers) endif() -ginkgo_create_hip_test(scoped_device_id) +ginkgo_create_hip_test(scoped_device_id NO_RESOURCES) diff --git a/resources.json b/resources.json new file mode 100644 index 00000000000..9d69ada752b --- /dev/null +++ b/resources.json @@ -0,0 +1,51 @@ +{ + "version": { + "major": 1, + "minor": 0 + }, + "local": [ + { + "cpus": [ + { + "id": "0", + "slots": 32 + } + ], + + "gpus": [ + { + "id": "0", + "slots": 100 + }, + { + "id": "1", + "slots": 100 + }, + { + "id": "2", + "slots": 100 + }, + { + "id": "3", + "slots": 100 + }, + { + "id": "4", + "slots": 100 + }, + { + "id": "5", + "slots": 100 + }, + { + "id": "6", + "slots": 100 + }, + { + "id": "7", + "slots": 100 + } + ] + } + ] +} \ No newline at end of file From a6c247378bff944b92baf23de229a338af73bc76 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 1 Aug 2023 17:52:21 +0200 Subject: [PATCH 258/583] add custom gtest main files --- cmake/create_test.cmake | 57 +++++++++++-------- core/test/gtest/environments.hpp | 40 +++++++++++++ 
core/test/gtest/ginkgo_main.cpp | 14 +++++ .../ginkgo_mpi_main.cpp} | 5 ++ cuda/test/utils.hpp | 9 --- hip/test/utils.hip.hpp | 9 --- test/utils/executor.hpp | 33 ----------- 7 files changed, 93 insertions(+), 74 deletions(-) create mode 100644 core/test/gtest/environments.hpp create mode 100644 core/test/gtest/ginkgo_main.cpp rename core/test/{mpi/gtest/mpi_listener.cpp => gtest/ginkgo_mpi_main.cpp} (98%) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 937beb4eb8d..e6ebc6523a2 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -5,20 +5,28 @@ set(gko_test_option_args "NO_RESOURCES") ## Replaces / by _ to create valid target names from relative paths function(ginkgo_build_test_name test_name target_name) - file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") - set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) -endfunction(ginkgo_build_test_name) + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) +endfunction() + +function(ginkgo_create_gtest_main) + add_library(ginkgo_gtest_main "") + target_sources(ginkgo_gtest_main + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_main.cpp) + target_link_libraries(ginkgo_gtest_main PRIVATE GTest::GTest Ginkgo::ginkgo) +endfunction() function(ginkgo_create_gtest_mpi_main) - add_library(gtest_mpi_main "") - target_sources(gtest_mpi_main - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) - find_package(MPI 3.1 COMPONENTS CXX REQUIRED) - target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) -endfunction(ginkgo_create_gtest_mpi_main) + add_library(ginkgo_gtest_mpi_main "") + target_sources(ginkgo_gtest_mpi_main + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_mpi_main.cpp) + find_package(MPI 3.1 COMPONENTS CXX REQUIRED) + target_link_libraries(ginkgo_gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX Ginkgo::ginkgo) +endfunction() ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES ## `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
@@ -33,17 +41,20 @@ function(ginkgo_set_test_target_properties test_target_name) if (GINKGO_COMPILING_DPCPP_TEST AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif () - if (GINKGO_CHECK_CIRCULAR_DEPS) - target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") - endif () - if (set_properties_MPI_SIZE) - if (NOT TARGET gtest_mpi_main) - ginkgo_create_gtest_mpi_main() - endif () - set(gtest_main gtest_mpi_main MPI::MPI_CXX) - else () - set(gtest_main GTest::Main) - endif () + if(GINKGO_CHECK_CIRCULAR_DEPS) + target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + endif() + if(set_properties_MPI_SIZE) + if(NOT TARGET ginkgo_gtest_mpi_main) + ginkgo_create_gtest_mpi_main() + endif() + set(gtest_main ginkgo_gtest_mpi_main MPI::MPI_CXX) + else() + if(NOT TARGET ginkgo_gtest_main) + ginkgo_create_gtest_main() + endif() + set(gtest_main ginkgo_gtest_main) + endif() target_compile_features(${test_target_name} PUBLIC cxx_std_14) target_compile_options(${test_target_name} PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES}) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp new file mode 100644 index 00000000000..b248829bdfb --- /dev/null +++ b/core/test/gtest/environments.hpp @@ -0,0 +1,40 @@ +#ifndef GINKGO_ENVIRONMENTS_HPP +#define GINKGO_ENVIRONMENTS_HPP + + +#include + + +#ifdef GKO_COMPILING_CUDA + +#include "cuda/base/device.hpp" + +class CudaEnvironment : public ::testing::Environment { +public: + void TearDown() override { gko::kernels::cuda::reset_device(0); } +}; + +#else + +class CudaEnvironment : public ::testing::Environment {}; + +#endif + + +#ifdef GKO_COMPILING_HIP + +#include "hip/base/device.hpp" + +class HipEnvironment : public ::testing::Environment { +public: + void TearDown() override { gko::kernels::hip::reset_device(0); } +}; + +#else + +class HipEnvironment : public ::testing::Environment {}; + +#endif + + +#endif // GINKGO_ENVIRONMENTS_HPP diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp new file mode 100644 index 00000000000..c284db84794 --- /dev/null +++ b/core/test/gtest/ginkgo_main.cpp @@ -0,0 +1,14 @@ +#include + + +#include "core/test/gtest/environments.hpp" + + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + ::testing::AddGlobalTestEnvironment(new CudaEnvironment); + ::testing::AddGlobalTestEnvironment(new HipEnvironment); + int result = RUN_ALL_TESTS(); + return result; +} \ No newline at end of file diff --git a/core/test/mpi/gtest/mpi_listener.cpp b/core/test/gtest/ginkgo_mpi_main.cpp similarity index 98% rename from core/test/mpi/gtest/mpi_listener.cpp rename to core/test/gtest/ginkgo_mpi_main.cpp index 66c9e6cd319..6c9b1b248f3 100644 --- a/core/test/mpi/gtest/mpi_listener.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -51,6 +51,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/gtest/environments.hpp" + + namespace GTestMPIListener { // This class sets up the global test environment, which is needed @@ -378,6 +381,8 @@ int main(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); MPI_Init(&argc, &argv); ::testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); + ::testing::AddGlobalTestEnvironment(new CudaEnvironment); + ::testing::AddGlobalTestEnvironment(new HipEnvironment); ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); ::testing::TestEventListener* l = diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index e1156b91903..58d310024bd 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -47,15 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace { -class CudaEnvironment : public ::testing::Environment { -public: - void TearDown() override { gko::kernels::cuda::reset_device(0); } -}; - -testing::Environment* cuda_env = - testing::AddGlobalTestEnvironment(new CudaEnvironment); - - class CudaTestFixture : public ::testing::Test { protected: CudaTestFixture() diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index bf7073cf9a1..dcecc8d2522 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -47,15 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace { -class HipEnvironment : public ::testing::Environment { -public: - void TearDown() override { gko::kernels::hip::reset_device(0); } -}; - -testing::Environment* hip_env = - testing::AddGlobalTestEnvironment(new HipEnvironment); - - class HipTestFixture : public ::testing::Test { protected: HipTestFixture() diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 200f4652644..ca6ad2a75c9 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -44,39 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include - - -#ifdef GKO_COMPILING_CUDA - -#include "cuda/base/device.hpp" - -class CudaEnvironment : public ::testing::Environment { -public: - void TearDown() override { gko::kernels::cuda::reset_device(0); } -}; - -testing::Environment* cuda_env = - testing::AddGlobalTestEnvironment(new CudaEnvironment); - -#endif - - -#ifdef GKO_COMPILING_HIP - -#include "hip/base/device.hpp" - -class HipEnvironment : public ::testing::Environment { -public: - void TearDown() override { gko::kernels::hip::reset_device(0); } -}; - -testing::Environment* hip_env = - testing::AddGlobalTestEnvironment(new HipEnvironment); - -#endif - - #if GINKGO_COMMON_SINGLE_MODE #define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" #else From af2ab0cf676051fb871870ad324bb63a5cb71457 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 12 Jul 2023 17:15:45 +0200 Subject: [PATCH 259/583] use resources in tests --- core/test/gtest/environments.hpp | 109 +++++++++++++++++++++++++++- core/test/gtest/ginkgo_main.cpp | 7 ++ core/test/gtest/ginkgo_mpi_main.cpp | 23 ++++-- test/utils/executor.hpp | 16 ++-- test/utils/mpi/executor.hpp | 73 +------------------ 5 files changed, 145 insertions(+), 83 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index b248829bdfb..3f93ea95b8a 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -1,17 +1,119 @@ #ifndef GINKGO_ENVIRONMENTS_HPP #define GINKGO_ENVIRONMENTS_HPP +#include +#include + #include +std::vector split(const std::string& s, char delimiter = ',') +{ + std::istringstream iss(s); + std::vector tokens; + std::string token; + while (std::getline(iss, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + + +struct resource { + int id; + int slots; +}; + +resource parse_single_resource(const std::string& resource_string) +{ + std::regex re(R"(id\:(\d+),slots\:(\d+))"); + std::smatch match; + + if (!std::regex_match(resource_string, match, re)) { + GKO_INVALID_STATE("Can't parse resource string: " + resource_string); + } + + return resource{std::stoi(match[1]), std::stoi(match[2])}; +} + +std::vector parse_all_resources(const std::string& resource_string) +{ + auto resource_strings = split(resource_string, ';'); + + std::vector resources; + for (const auto& rs : resource_strings) { + resources.push_back(parse_single_resource(rs)); + } + return resources; +} + + +std::vector get_ctest_resources() +{ + auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); + + if (!rs_count_env) { + return {{0, 1}}; + } + + auto rs_count = std::stoi(rs_count_env); + + if (rs_count > 1) { + GKO_INVALID_STATE("Can handle only one resource group."); + } + + std::string rs_type = std::getenv("CTEST_RESOURCE_GROUP_0"); + std::transform(rs_type.begin(), rs_type.end(), rs_type.begin(), + [](auto c) { return std::toupper(c); }); + std::string rs_env = + std::getenv(std::string("CTEST_RESOURCE_GROUP_0_" + rs_type).c_str()); + std::cerr << rs_env << std::endl; + return parse_all_resources(rs_env); +} + + +class ResourceEnvironment : public ::testing::Environment { +public: + explicit ResourceEnvironment(resource rs_) : ::testing::Environment() + { + rs = rs_; + } + + static resource rs; +}; + + +#ifdef GKO_COMPILING_OMP + +#include + +class OmpEnvironment : public ::testing::Environment { +public: + void SetUp() override + { + omp_set_num_threads(ResourceEnvironment::rs.slots); + } +}; + +#else + + +class OmpEnvironment : public ::testing::Environment {}; + +#endif + + #ifdef 
GKO_COMPILING_CUDA #include "cuda/base/device.hpp" class CudaEnvironment : public ::testing::Environment { public: - void TearDown() override { gko::kernels::cuda::reset_device(0); } + void TearDown() override + { + gko::kernels::cuda::reset_device(ResourceEnvironment::rs.id); + } }; #else @@ -27,7 +129,10 @@ class CudaEnvironment : public ::testing::Environment {}; class HipEnvironment : public ::testing::Environment { public: - void TearDown() override { gko::kernels::hip::reset_device(0); } + void TearDown() override + { + gko::kernels::hip::reset_device(ResourceEnvironment::rs.id); + } }; #else diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index c284db84794..76a005a66e2 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -3,12 +3,19 @@ #include "core/test/gtest/environments.hpp" +resource ResourceEnvironment::rs = {}; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + + auto resources = get_ctest_resources(); + + ::testing::AddGlobalTestEnvironment( + new ResourceEnvironment(resources.front())); ::testing::AddGlobalTestEnvironment(new CudaEnvironment); ::testing::AddGlobalTestEnvironment(new HipEnvironment); + ::testing::AddGlobalTestEnvironment(new OmpEnvironment); int result = RUN_ALL_TESTS(); return result; } \ No newline at end of file diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index 6c9b1b248f3..934a3dcd3f5 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -45,10 +45,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include -#include +#include #include "core/test/gtest/environments.hpp" @@ -95,7 +95,6 @@ class MPIEnvironment : public ::testing::Environment { private: // Disallow copying MPIEnvironment(const MPIEnvironment& env) {} - }; // class MPIEnvironment @@ -376,19 +375,31 @@ class MPIWrapperPrinter : public ::testing::TestEventListener { } // namespace GTestMPIListener +resource ResourceEnvironment::rs = {}; + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + MPI_Init(&argc, &argv); - ::testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); + MPI_Comm comm(MPI_COMM_WORLD); + int rank; + MPI_Comm_rank(comm, &rank); + + auto resources = get_ctest_resources(); + + testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); + ::testing::AddGlobalTestEnvironment( + new ResourceEnvironment(resources[rank])); ::testing::AddGlobalTestEnvironment(new CudaEnvironment); ::testing::AddGlobalTestEnvironment(new HipEnvironment); + ::testing::AddGlobalTestEnvironment(new OmpEnvironment); + ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); ::testing::TestEventListener* l = listeners.Release(listeners.default_result_printer()); - listeners.Append( - new GTestMPIListener::MPIWrapperPrinter(l, MPI_COMM_WORLD)); + listeners.Append(new GTestMPIListener::MPIWrapperPrinter(l, comm)); int result = RUN_ALL_TESTS(); return result; } diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index ca6ad2a75c9..ad4621d5c31 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
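// [Editor's note, illustrative values] get_ctest_resources() above consumes the
// environment that CTest exports when a test carries a RESOURCE_GROUPS property
// and ctest runs with a --resource-spec-file. For a single group of type "gpus"
// the variables look roughly like this (ids and slots depend on the spec file):
//
//     CTEST_RESOURCE_GROUP_COUNT=1
//     CTEST_RESOURCE_GROUP_0=gpus
//     CTEST_RESOURCE_GROUP_0_GPUS=id:0,slots:50
//
// which is what the R"(id\:(\d+),slots\:(\d+))" regex in parse_single_resource()
// extracts; without these variables the code falls back to {id = 0, slots = 1}.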
#include +#include "core/test/gtest/environments.hpp" + + #if GINKGO_COMMON_SINGLE_MODE #define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" #else @@ -77,7 +80,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - 0, ref, std::make_shared(), stream); + ResourceEnvironment::rs.id, ref, std::make_shared(), stream); } } @@ -90,7 +93,8 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable HIP devices"}; } exec = gko::HipExecutor::create( - 0, ref, std::make_shared(), stream); + ResourceEnvironment::rs.id, ref, std::make_shared< + gko::HipAllocator>(), stream); } @@ -98,9 +102,11 @@ inline void init_executor(std::shared_ptr ref, std::shared_ptr& exec) { if (gko::DpcppExecutor::get_num_devices("gpu") > 0) { - exec = gko::DpcppExecutor::create(0, ref, "gpu"); + exec = + gko::DpcppExecutor::create(ResourceEnvironment::rs.id, ref, "gpu"); } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) { - exec = gko::DpcppExecutor::create(0, ref, "cpu"); + exec = + gko::DpcppExecutor::create(ResourceEnvironment::rs.id, ref, "cpu"); } else { throw std::runtime_error{"No suitable DPC++ devices"}; } @@ -120,7 +126,7 @@ class CommonTestFixture : public ::testing::Test { : #if defined(GKO_TEST_NONDEFAULT_STREAM) && \ (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) - stream{0}, + stream(ResourceEnvironment::rs.id), #endif ref{gko::ReferenceExecutor::create()} { diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index d8c94e01804..4eba5593c90 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -43,73 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include -#include - - -inline void init_executor(std::shared_ptr, - std::shared_ptr& exec) -{ - exec = gko::ReferenceExecutor::create(); -} - - -inline void init_executor(std::shared_ptr, - std::shared_ptr& exec) -{ - exec = gko::OmpExecutor::create(); -} - - -inline void init_executor(std::shared_ptr ref, - std::shared_ptr& exec, - CUstream_st* stream = nullptr) -{ - { - if (gko::CudaExecutor::get_num_devices() == 0) { - throw std::runtime_error{"No suitable CUDA devices"}; - } - exec = gko::CudaExecutor::create( - gko::experimental::mpi::map_rank_to_device_id( - MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()), - ref, std::make_shared(), stream); - } -} - - -inline void init_executor(std::shared_ptr ref, - std::shared_ptr& exec, - GKO_HIP_STREAM_STRUCT* stream = nullptr) -{ - if (gko::HipExecutor::get_num_devices() == 0) { - throw std::runtime_error{"No suitable HIP devices"}; - } - exec = gko::HipExecutor::create( - gko::experimental::mpi::map_rank_to_device_id( - MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()), - ref, std::make_shared(), stream); -} - - -inline void init_executor(std::shared_ptr ref, - std::shared_ptr& exec) -{ - auto num_gpu_devices = gko::DpcppExecutor::get_num_devices("gpu"); - auto num_cpu_devices = gko::DpcppExecutor::get_num_devices("cpu"); - if (num_gpu_devices > 0) { - exec = gko::DpcppExecutor::create( - gko::experimental::mpi::map_rank_to_device_id(MPI_COMM_WORLD, - num_gpu_devices), - ref, "gpu"); - } else if (num_cpu_devices > 0) { - exec = gko::DpcppExecutor::create( - gko::experimental::mpi::map_rank_to_device_id(MPI_COMM_WORLD, - num_cpu_devices), - ref, "cpu"); - } else { - throw std::runtime_error{"No suitable DPC++ devices"}; - } -} +#include "test/utils/executor.hpp" class CommonMpiTestFixture : public ::testing::Test { @@ -125,9 +60,7 @@ class CommonMpiTestFixture : public ::testing::Test { : comm(MPI_COMM_WORLD), #if defined(GKO_TEST_NONDEFAULT_STREAM) && \ (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) - - stream(gko::experimental::mpi::map_rank_to_device_id( - comm.get(), gko::EXEC_TYPE::get_num_devices())), + stream(ResourceEnvironment::rs.id), #endif ref{gko::ReferenceExecutor::create()} { From 20d32f69a28bfa92134047f80c0a2d7dbdf430f4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jul 2023 14:14:09 +0200 Subject: [PATCH 260/583] add gtest_main.cpp directly to target --- cmake/create_test.cmake | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index e6ebc6523a2..96310e7f22f 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -45,20 +45,18 @@ function(ginkgo_set_test_target_properties test_target_name) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() if(set_properties_MPI_SIZE) - if(NOT TARGET ginkgo_gtest_mpi_main) - ginkgo_create_gtest_mpi_main() - endif() - set(gtest_main ginkgo_gtest_mpi_main MPI::MPI_CXX) + target_sources(${test_target_name} + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_mpi_main.cpp) else() - if(NOT TARGET ginkgo_gtest_main) - ginkgo_create_gtest_main() - endif() - set(gtest_main ginkgo_gtest_main) + target_sources(${test_target_name} + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_main.cpp) endif() target_compile_features(${test_target_name} PUBLIC cxx_std_14) target_compile_options(${test_target_name} PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} 
${set_properties_ADDITIONAL_INCLUDES}) - target_link_libraries(${test_target_name} PRIVATE ginkgo ${gtest_main} GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) + target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) endfunction() function(ginkgo_add_cpu_resource_requirement_internal test_name local_cores mpi_size) From f94a07ed42c7e40831065a9a234265b923640968 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jul 2023 14:15:01 +0200 Subject: [PATCH 261/583] simplify resource group --- cmake/create_test.cmake | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 96310e7f22f..1d18d07b516 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -59,16 +59,6 @@ function(ginkgo_set_test_target_properties test_target_name) target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) endfunction() -function(ginkgo_add_cpu_resource_requirement_internal test_name local_cores mpi_size) - if (mpi_size) - math(EXPR cores "${mpi_size} * ${local_cores}") - else () - set(cores ${local_cores}) - endif () - set_property(TEST ${test_name} PROPERTY - RESOURCE_GROUPS "cpus:${cores}") -endfunction() - function(ginkgo_add_resource_requirement test_name) cmake_parse_arguments(PARSE_ARGV 1 add_rr "${gko_test_option_args}" "${gko_test_single_args}" "") if(add_rr_NO_RESOURCES) @@ -111,12 +101,9 @@ function(ginkgo_add_resource_requirement test_name) if(NOT add_rr_MPI_SIZE) set(add_rr_MPI_SIZE 1) endif() - foreach(unused RANGE ${MPI_SIZE}) - list(APPEND resources "${single_resource}") - endforeach() set_property(TEST ${test_name} PROPERTY - RESOURCE_GROUPS ${resources}) + RESOURCE_GROUPS "${add_rr_MPI_SIZE},${single_resource}") endfunction() From a2d0835dcf038bfce670a945a47d09a4e481b197 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jul 2023 14:37:34 +0200 Subject: [PATCH 262/583] rename cmake parameters --- cmake/create_test.cmake | 201 +++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 97 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 1d18d07b516..6ce37976f84 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,4 +1,4 @@ -set(gko_test_resource_args "LOCAL_CORES;PERCENT;TYPE") +set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_PERCENT;RESOURCE_TYPE") set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}") set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") set(gko_test_option_args "NO_RESOURCES") @@ -34,13 +34,13 @@ function(ginkgo_set_test_target_properties test_target_name) cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) - endif () + endif() if (GINKGO_TEST_NONDEFAULT_STREAM) target_compile_definitions(${test_target_name} PRIVATE GKO_TEST_NONDEFAULT_STREAM) - endif () + endif() if (GINKGO_COMPILING_DPCPP_TEST AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) - endif () + endif() if(GINKGO_CHECK_CIRCULAR_DEPS) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() @@ -65,37 +65,37 @@ function(ginkgo_add_resource_requirement test_name) return() endif() - if (NOT add_rr_TYPE) + if (NOT add_rr_RESOURCE_TYPE) 
message(FATAL_ERROR "Need to provide resource type used by test.") endif () - if(add_rr_TYPE STREQUAL "ref") + if(add_rr_RESOURCE_TYPE STREQUAL "ref") set(single_resource "cpus:1") - elseif(add_rr_TYPE STREQUAL "cpu") - if(NOT add_rr_CORES) - set(add_rr_CORES 4) # perhaps get this from environment variable? + elseif(add_rr_RESOURCE_TYPE STREQUAL "cpu") + if(NOT add_rr_RESOURCE_LOCAL_CORES) + set(add_rr_RESOURCE_LOCAL_CORES 4) # perhaps get this from environment variable? endif() - if(NOT add_rr_CORES MATCHES "^[0-9]+") - message(FATAL_ERROR "Resource specification is invalid: CORE=${add_rr_CORES}") + if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+") + message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORE=${add_rr_RESOURCE_LOCAL_CORES}") endif() - set(single_resource "cpus:${add_rr_CORES}") - elseif(add_rr_TYPE STREQUAL "gpu") - if(NOT add_rr_PERCENTAGE) - set(add_rr_PERCENTAGE 50) + set(single_resource "cpus:${add_rr_RESOURCE_LOCAL_CORES}") + elseif(add_rr_RESOURCE_TYPE STREQUAL "gpu") + if(NOT add_rr_RESOURCE_PERCENTAGE) + set(add_rr_RESOURCE_PERCENTAGE 50) endif() if(add_rr_MPI_SIZE GREATER 1) - set(add_rr_PERCENTAGE 100) + set(add_rr_RESOURCE_PERCENTAGE 100) endif() - if(NOT add_rr_PERCENTAGE MATCHES "^[0-9]([0-9][0-9]?)?" - OR add_rr_PERCENTAGE LESS 0 - OR add_rr_PERCENTAGE GREATER 100) - message(FATAL_ERROR "Resource specification is invalid: PERCENTAGE=${add_rr_PERCENTAGE}") + if(NOT add_rr_RESOURCE_PERCENTAGE MATCHES "^[0-9]([0-9][0-9]?)?" + OR add_rr_RESOURCE_PERCENTAGE LESS 0 + OR add_rr_RESOURCE_PERCENTAGE GREATER 100) + message(FATAL_ERROR "Resource specification is invalid: RESOURCE_PERCENTAGE=${add_rr_RESOURCE_PERCENTAGE}") endif() - set(single_resource "gpus:${add_rr_PERCENTAGE}") + set(single_resource "gpus:${add_rr_RESOURCE_PERCENTAGE}") else() - message(FATAL_ERROR "Unrecognized resource type ${add_rr_TYPE}, allowed are: ref, cpu, gpu.") + message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, gpu.") endif() if(NOT add_rr_MPI_SIZE) @@ -121,30 +121,30 @@ function(ginkgo_add_test test_name test_target_name) set_target_properties(${test_target_name} PROPERTIES OUTPUT_NAME ${test_name}) if (add_test_MPI_SIZE) add_test(NAME ${REL_BINARY_DIR}/${test_name} - COMMAND - ${MPIEXEC_EXECUTABLE} - ${MPIEXEC_NUMPROC_FLAG} - ${add_test_MPI_SIZE} - "$" - WORKING_DIRECTORY "$") - else () + COMMAND + ${MPIEXEC_EXECUTABLE} + ${MPIEXEC_NUMPROC_FLAG} + ${add_test_MPI_SIZE} + "$" + WORKING_DIRECTORY "$") + else() add_test(NAME ${REL_BINARY_DIR}/${test_name} - COMMAND ${test_target_name} - WORKING_DIRECTORY "$") - endif () + COMMAND ${test_target_name} + WORKING_DIRECTORY "$") + endif() ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN}) set(test_preload) if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA) set(test_preload $:${test_preload}) - endif () + endif() if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") set(test_preload $:${test_preload}) - endif () - if (test_preload) + endif() + if(test_preload) set_tests_properties(${REL_BINARY_DIR}/${test_name} PROPERTIES ENVIRONMENT LD_PRELOAD=${test_preload}) - endif () + endif() endfunction() ## Normal test @@ -153,7 +153,7 @@ function(ginkgo_create_test test_name) add_executable(${test_target_name} ${test_name}.cpp) target_link_libraries(${test_target_name}) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE ref) + 
ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE ref) endfunction(ginkgo_create_test) ## Test compiled with dpcpp @@ -164,11 +164,11 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") - endif () + endif() endfunction(ginkgo_create_dpcpp_test) ## Test compiled with CUDA @@ -182,23 +182,23 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) add_executable(${test_target_name} ${filename}) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) - if (MSVC) + if(MSVC) target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:--extended-lambda --expt-relaxed-constexpr>) - elseif (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") target_compile_options(${test_target_name} - PRIVATE + PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) - endif () + endif() # we handle CUDA architecture flags for now, disable CMake handling - if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) - endif () + endif() ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) endfunction(ginkgo_create_cuda_test_internal) ## Test compiled with HIP @@ -213,48 +213,48 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add set(GINKGO_TEST_HIP_DEFINES -DGKO_COMPILING_HIP ${additional_flags}) if (GINKGO_FAST_TESTS) list(APPEND GINKGO_TEST_HIP_DEFINES -DGINKGO_FAST_TESTS) - endif () + endif() if (GINKGO_TEST_NONDEFAULT_STREAM) list(APPEND GINKGO_TEST_HIP_DEFINES -DGKO_TEST_NONDEFAULT_STREAM) - endif () + endif() # NOTE: With how HIP works, passing the flags `HIPCC_OPTIONS` etc. here # creates a redefinition of all flags. This creates some issues with `nvcc`, # but `clang` seems fine with the redefinitions. if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") hip_add_executable(${test_target_name} ${filename} - # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and - # call gets fixed, uncomment this. - HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS} - # NVCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS} - # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS} - --expt-relaxed-constexpr --expt-extended-lambda - ) - else () # hcc/clang + # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and + # call gets fixed, uncomment this. 
+ HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS} + # NVCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS} + # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS} + --expt-relaxed-constexpr --expt-extended-lambda + ) + else() # hcc/clang hip_add_executable(${test_target_name} ${filename} - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} - ) - endif () + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} + ) + endif() # Let's use a normal compiler for linking set_target_properties(${test_target_name} PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(${test_target_name} - PRIVATE - # Only `math` requires it so far, but it's much easier - # to put these this way. - ${GINKGO_HIP_THRUST_PATH} - # Only `exception_helpers` requires these so far, but it's much easier - # to put these this way. - ${HIPBLAS_INCLUDE_DIRS} - ${HIPFFT_INCLUDE_DIRS} - ${hiprand_INCLUDE_DIRS} - ${HIPSPARSE_INCLUDE_DIRS} - ) + PRIVATE + # Only `math` requires it so far, but it's much easier + # to put these this way. + ${GINKGO_HIP_THRUST_PATH} + # Only `exception_helpers` requires these so far, but it's much easier + # to put these this way. + ${HIPBLAS_INCLUDE_DIRS} + ${HIPFFT_INCLUDE_DIRS} + ${hiprand_INCLUDE_DIRS} + ${HIPSPARSE_INCLUDE_DIRS} + ) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) endfunction(ginkgo_create_hip_test_internal) @@ -270,30 +270,30 @@ function(ginkgo_create_omp_test_internal test_name filename test_target_name) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} TYPE cpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu) endfunction() ## Common test compiled with the host compiler, one target for each enabled backend function(ginkgo_create_common_test test_name) - if (GINKGO_BUILD_OMP) + if(GINKGO_BUILD_OMP) ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN}) - endif () - if (GINKGO_BUILD_HIP) + endif() + if(GINKGO_BUILD_HIP) ginkgo_create_common_test_internal(${test_name} HipExecutor hip ${ARGN}) - endif () - if (GINKGO_BUILD_CUDA) + endif() + if(GINKGO_BUILD_CUDA) ginkgo_create_common_test_internal(${test_name} CudaExecutor cuda ${ARGN}) - endif () - if (GINKGO_BUILD_DPCPP) + endif() + if(GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) - endif () + endif() endfunction(ginkgo_create_common_test) function(ginkgo_create_common_test_internal test_name exec_type exec) cmake_parse_arguments(PARSE_ARGV 3 common_test "" "${gko_test_single_args}" "${gko_test_multi_args}") - if (exec IN_LIST common_test_DISABLE_EXECUTORS) + if(exec IN_LIST common_test_DISABLE_EXECUTORS) return() - endif () + endif() if (exec STREQUAL reference) set(test_resource_type ref) elseif (exec STREQUAL omp) @@ -303,45 +303,52 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) endif () ginkgo_build_test_name(${test_name} test_target_name) string(TOUPPER ${exec} exec_upper) + # set up 
actual test set(test_target_name ${test_target_name}_${exec}) add_executable(${test_target_name} ${test_name}.cpp) + + # also need to add runtime libraries for other backends + if (exec STREQUAL omp) + target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) + endif () + target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES}) # use float for DPC++ if necessary - if ((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) + if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) - endif () + endif() ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} TYPE ${test_resource_type}) + ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} RESOURCE_TYPE ${test_resource_type}) endfunction(ginkgo_create_common_test_internal) ## Common test compiled with the device compiler, one target for each enabled backend function(ginkgo_create_common_device_test test_name) cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) - if (GINKGO_BUILD_DPCPP) + if(GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) - endif () - if (GINKGO_BUILD_OMP) + endif() + if(GINKGO_BUILD_OMP) ginkgo_create_common_test_internal(${test_name} OmpExecutor omp ${ARGN}) target_link_libraries(${test_target_name}_omp PUBLIC OpenMP::OpenMP_CXX) - endif () - if (GINKGO_BUILD_CUDA) + endif() + if(GINKGO_BUILD_CUDA) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.cu COPYONLY) ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN}) target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda) - endif () - if (GINKGO_BUILD_HIP) + endif() + if(GINKGO_BUILD_HIP) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip "-std=c++14;-DEXEC_TYPE=HipExecutor;-DEXEC_NAMESPACE=hip" ${ARGN}) - endif () + endif() endfunction(ginkgo_create_common_device_test) ## Common test compiled with the host compiler for all enabled backends and Reference From a2af9d9a18d1372e28bb071caad195592faee4e6 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jul 2023 14:37:54 +0200 Subject: [PATCH 263/583] simplify parsing --- core/test/gtest/environments.hpp | 48 ++++++++++---------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 3f93ea95b8a..4cfb2a89959 100644 --- a/core/test/gtest/environments.hpp +++ 
b/core/test/gtest/environments.hpp @@ -8,24 +8,13 @@ #include -std::vector split(const std::string& s, char delimiter = ',') -{ - std::istringstream iss(s); - std::vector tokens; - std::string token; - while (std::getline(iss, token, delimiter)) { - tokens.push_back(token); - } - return tokens; -} - - struct resource { int id; int slots; }; -resource parse_single_resource(const std::string& resource_string) + +inline resource parse_single_resource(const std::string& resource_string) { std::regex re(R"(id\:(\d+),slots\:(\d+))"); std::smatch match; @@ -37,19 +26,8 @@ resource parse_single_resource(const std::string& resource_string) return resource{std::stoi(match[1]), std::stoi(match[2])}; } -std::vector parse_all_resources(const std::string& resource_string) -{ - auto resource_strings = split(resource_string, ';'); - - std::vector resources; - for (const auto& rs : resource_strings) { - resources.push_back(parse_single_resource(rs)); - } - return resources; -} - -std::vector get_ctest_resources() +inline std::vector get_ctest_resources() { auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); @@ -59,17 +37,19 @@ std::vector get_ctest_resources() auto rs_count = std::stoi(rs_count_env); - if (rs_count > 1) { - GKO_INVALID_STATE("Can handle only one resource group."); + std::vector resources; + + for (int i = 0; i < rs_count; ++i) { + std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + std::to_string(i); + std::string rs_type = std::getenv(rs_group_env.c_str()); + std::transform(rs_type.begin(), rs_type.end(), rs_type.begin(), + [](auto c) { return std::toupper(c); }); + std::string rs_env = + std::getenv((rs_group_env + "_" + rs_type).c_str()); + resources.push_back(parse_single_resource(rs_env)); } - std::string rs_type = std::getenv("CTEST_RESOURCE_GROUP_0"); - std::transform(rs_type.begin(), rs_type.end(), rs_type.begin(), - [](auto c) { return std::toupper(c); }); - std::string rs_env = - std::getenv(std::string("CTEST_RESOURCE_GROUP_0_" + rs_type).c_str()); - std::cerr << rs_env << std::endl; - return parse_all_resources(rs_env); + return resources; } From 4117e931459a6a593481c1768b5b430cba242640 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Jul 2023 14:38:13 +0200 Subject: [PATCH 264/583] use ginkgo_create_omp_test --- omp/test/base/CMakeLists.txt | 6 ++---- omp/test/matrix/CMakeLists.txt | 2 +- omp/test/reorder/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/omp/test/base/CMakeLists.txt b/omp/test/base/CMakeLists.txt index 4c511b6def7..cfd00fe28cf 100644 --- a/omp/test/base/CMakeLists.txt +++ b/omp/test/base/CMakeLists.txt @@ -1,4 +1,2 @@ -ginkgo_create_test(kernel_launch) -target_compile_definitions(omp_test_base_kernel_launch PRIVATE GKO_COMPILING_OMP) -target_link_libraries(omp_test_base_kernel_launch PRIVATE OpenMP::OpenMP_CXX) -ginkgo_create_test(index_set) +ginkgo_create_omp_test(kernel_launch) +ginkgo_create_omp_test(index_set) diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt index 88ab52e9c3f..398921ce75a 100644 --- a/omp/test/matrix/CMakeLists.txt +++ b/omp/test/matrix/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(fbcsr_kernels) +ginkgo_create_omp_test(fbcsr_kernels) diff --git a/omp/test/reorder/CMakeLists.txt b/omp/test/reorder/CMakeLists.txt index 8987ae28a48..089e51c67c9 100644 --- a/omp/test/reorder/CMakeLists.txt +++ b/omp/test/reorder/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(rcm_kernels) +ginkgo_create_omp_test(rcm_kernels) From 84c5b0434fc4ce9fb3e54dc612b20aad90eca94a Mon Sep 
17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 11:53:44 +0200 Subject: [PATCH 265/583] use custom stream by default otherwise, the default stream is used in some places, e.g. initializing cublas. But it is not clear with which device the default stream is associated. Thus, this now sets the device id correctly for the new stream --- test/utils/executor.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index ad4621d5c31..7afd2b0e4d9 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -124,14 +124,12 @@ class CommonTestFixture : public ::testing::Test { CommonTestFixture() : -#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ - (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) stream(ResourceEnvironment::rs.id), #endif ref{gko::ReferenceExecutor::create()} { -#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ - (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) init_executor(ref, exec, stream.get()); #else init_executor(ref, exec); @@ -145,13 +143,11 @@ class CommonTestFixture : public ::testing::Test { } } -#ifdef GKO_TEST_NONDEFAULT_STREAM #ifdef GKO_COMPILING_CUDA gko::cuda_stream stream; #endif #ifdef GKO_COMPILING_HIP gko::hip_stream stream; -#endif #endif std::shared_ptr ref; std::shared_ptr exec; From 6ec9475f4102f45f3b6d26040282d0420f4e126f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 13:33:30 +0200 Subject: [PATCH 266/583] set device-id for each test this is necessary, since some test call the kernels directly and not through the executor. In this case, the setting of the device id by the executor is skipped, which leads to these kernel not run. --- cuda/test/utils.hpp | 16 +++++++--------- hip/test/utils.hip.hpp | 16 +++++++--------- test/utils/executor.hpp | 2 ++ test/utils/mpi/executor.hpp | 11 ++++------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 58d310024bd..f35cb8d4c12 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/gtest/environments.hpp" #include "cuda/base/device.hpp" @@ -51,13 +52,11 @@ class CudaTestFixture : public ::testing::Test { protected: CudaTestFixture() : ref(gko::ReferenceExecutor::create()), -#ifdef GKO_TEST_NONDEFAULT_STREAM - stream(0), - exec(gko::CudaExecutor::create( - 0, ref, std::make_shared(), stream.get())) -#else - exec(gko::CudaExecutor::create(0, ref)) -#endif + stream(ResourceEnvironment::rs.id), + exec(gko::CudaExecutor::create(ResourceEnvironment::rs.id, ref, std::make_shared< + gko::CudaAllocator>(), + stream.get())), + guard(exec->get_scoped_device_id_guard()) {} void TearDown() @@ -68,11 +67,10 @@ class CudaTestFixture : public ::testing::Test { } } -#ifdef GKO_TEST_NONDEFAULT_STREAM gko::cuda_stream stream; -#endif std::shared_ptr ref; std::shared_ptr exec; + gko::scoped_device_id_guard guard; }; diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index dcecc8d2522..1c57467b451 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
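// [Editor's sketch of the intent, not part of the patch] Holding a
// scoped_device_id_guard as a fixture member keeps the chosen CUDA/HIP device
// active for the whole test, so kernels that are called directly (bypassing the
// executor, which would otherwise set the device itself) still run on the
// device assigned through the CTest resources. Roughly:
//
//     auto ref = gko::ReferenceExecutor::create();
//     auto exec = gko::CudaExecutor::create(device_id, ref);
//     auto guard = exec->get_scoped_device_id_guard();  // activates device_id
//     // direct kernel invocations now target device_id until guard is destroyed
//
// where device_id stands for the id parsed from the CTest resource group.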
#include +#include "core/test/gtest/environments.hpp" #include "hip/base/device.hpp" @@ -51,13 +52,11 @@ class HipTestFixture : public ::testing::Test { protected: HipTestFixture() : ref(gko::ReferenceExecutor::create()), -#ifdef GKO_TEST_NONDEFAULT_STREAM - stream(0), - exec(gko::HipExecutor::create( - 0, ref, std::make_shared(), stream.get())) -#else - exec(gko::HipExecutor::create(0, ref)) -#endif + stream(ResourceEnvironment::rs.id), + exec(gko::HipExecutor::create(ResourceEnvironment::rs.id, ref, std::make_shared< + gko::HipAllocator>(), + stream.get())), + guard(exec->get_scoped_device_id_guard()) {} void TearDown() @@ -68,11 +67,10 @@ class HipTestFixture : public ::testing::Test { } } -#ifdef GKO_TEST_NONDEFAULT_STREAM gko::hip_stream stream; -#endif std::shared_ptr ref; std::shared_ptr exec; + gko::scoped_device_id_guard guard; }; diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 7afd2b0e4d9..d52b8083ac8 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -134,6 +134,7 @@ class CommonTestFixture : public ::testing::Test { #else init_executor(ref, exec); #endif + guard = exec->get_scoped_device_id_guard(); } void TearDown() final @@ -151,6 +152,7 @@ class CommonTestFixture : public ::testing::Test { #endif std::shared_ptr ref; std::shared_ptr exec; + gko::scoped_device_id_guard guard; }; diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index 4eba5593c90..f317f60eb35 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -58,18 +58,17 @@ class CommonMpiTestFixture : public ::testing::Test { CommonMpiTestFixture() : comm(MPI_COMM_WORLD), -#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ - (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) stream(ResourceEnvironment::rs.id), #endif ref{gko::ReferenceExecutor::create()} { -#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ - (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) init_executor(ref, exec, stream.get()); #else init_executor(ref, exec); #endif + guard = exec->get_scoped_device_id_guard(); } void TearDown() final @@ -81,17 +80,15 @@ class CommonMpiTestFixture : public ::testing::Test { gko::experimental::mpi::communicator comm; -#ifdef GKO_TEST_NONDEFAULT_STREAM #ifdef GKO_COMPILING_CUDA gko::cuda_stream stream; #endif #ifdef GKO_COMPILING_HIP gko::hip_stream stream; #endif -#endif - std::shared_ptr ref; std::shared_ptr exec; + gko::scoped_device_id_guard guard; }; From 34e59dd094dc47633cb72345df393669e01f17e5 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 13:33:49 +0200 Subject: [PATCH 267/583] add ctest resource settings to logging output --- core/test/gtest/environments.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 4cfb2a89959..a7aedbc102c 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -30,6 +30,7 @@ inline resource parse_single_resource(const std::string& resource_string) inline std::vector get_ctest_resources() { auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); + std::cerr << "CTEST_RESOURCE_GROUP_COUNT=" << rs_count_env << std::endl; if (!rs_count_env) { return {{0, 1}}; @@ -42,10 +43,14 @@ inline std::vector get_ctest_resources() for (int i = 0; i < rs_count; ++i) { std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + 
std::to_string(i); std::string rs_type = std::getenv(rs_group_env.c_str()); + std::cerr << rs_group_env << "=" << rs_type << std::endl; + std::transform(rs_type.begin(), rs_type.end(), rs_type.begin(), [](auto c) { return std::toupper(c); }); - std::string rs_env = - std::getenv((rs_group_env + "_" + rs_type).c_str()); + std::string rs_current_group = rs_group_env + "_" + rs_type; + std::string rs_env = std::getenv(rs_current_group.c_str()); + std::cerr << rs_current_group << "=" << rs_env << std::endl; + resources.push_back(parse_single_resource(rs_env)); } From 7c83c349efefa9bb2348220a2fb0f2c44ae2669e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 14:37:33 +0200 Subject: [PATCH 268/583] fixes schwarz preconditioner test --- test/mpi/preconditioner/schwarz.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 8586711a114..3c9e3a8d69f 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -101,14 +101,14 @@ class SchwarzPreconditioner : public CommonMpiTestFixture { SchwarzPreconditioner() - : size{8, 8}, mat_input{size, {{0, 0, 2}, {0, 1, -1}, {1, 0, -1}, - {1, 1, 2}, {1, 2, -1}, {2, 1, -1}, - {2, 2, 2}, {2, 3, -1}, {3, 2, -1}, - {3, 3, 2}, {3, 4, -1}, {4, 3, -1}, - {4, 4, 2}, {4, 5, -1}, {5, 4, -1}, - {5, 5, 2}, {5, 6, -1}, {6, 5, -1}, - {6, 6, 2}, {6, 7, -1}, {7, 6, -1}, - {7, 7, 2}}} + : CommonMpiTestFixture(), + size{8, 8}, + mat_input{size, + {{0, 0, 2}, {0, 1, -1}, {1, 0, -1}, {1, 1, 2}, {1, 2, -1}, + {2, 1, -1}, {2, 2, 2}, {2, 3, -1}, {3, 2, -1}, {3, 3, 2}, + {3, 4, -1}, {4, 3, -1}, {4, 4, 2}, {4, 5, -1}, {5, 4, -1}, + {5, 5, 2}, {5, 6, -1}, {6, 5, -1}, {6, 6, 2}, {6, 7, -1}, + {7, 6, -1}, {7, 7, 2}}} { row_part = Partition::build_from_contiguous( exec, gko::array( From 9a5fd2c00d2ed8fb41fbdc1deb13dab6098ec2ea Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 14:38:11 +0200 Subject: [PATCH 269/583] use ginkgo_create_cuda_test consistently --- cuda/test/base/CMakeLists.txt | 6 +++--- cuda/test/base/{index_set.cpp => index_set.cu} | 0 cuda/test/base/{memory.cpp => memory.cu} | 0 cuda/test/reorder/CMakeLists.txt | 2 +- cuda/test/reorder/{rcm_kernels.cpp => rcm_kernels.cu} | 0 cuda/test/utils/CMakeLists.txt | 2 +- cuda/test/utils/{assertions_test.cpp => assertions_test.cu} | 0 7 files changed, 5 insertions(+), 5 deletions(-) rename cuda/test/base/{index_set.cpp => index_set.cu} (100%) rename cuda/test/base/{memory.cpp => memory.cu} (100%) rename cuda/test/reorder/{rcm_kernels.cpp => rcm_kernels.cu} (100%) rename cuda/test/utils/{assertions_test.cpp => assertions_test.cu} (100%) diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index a213e65277a..f78e6e653fe 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -1,13 +1,13 @@ ginkgo_create_cuda_test(array) ginkgo_create_cuda_test(cuda_executor) -ginkgo_create_test(index_set) +ginkgo_create_cuda_test(index_set) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) -endif() +endif () ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) -ginkgo_create_test(memory) +ginkgo_create_cuda_test(memory) ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/index_set.cpp b/cuda/test/base/index_set.cu similarity index 100% rename 
from cuda/test/base/index_set.cpp rename to cuda/test/base/index_set.cu diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cu similarity index 100% rename from cuda/test/base/memory.cpp rename to cuda/test/base/memory.cu diff --git a/cuda/test/reorder/CMakeLists.txt b/cuda/test/reorder/CMakeLists.txt index 108e3b57dd5..e6cd8c0f5d2 100644 --- a/cuda/test/reorder/CMakeLists.txt +++ b/cuda/test/reorder/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(rcm_kernels) \ No newline at end of file +ginkgo_create_cuda_test(rcm_kernels) diff --git a/cuda/test/reorder/rcm_kernels.cpp b/cuda/test/reorder/rcm_kernels.cu similarity index 100% rename from cuda/test/reorder/rcm_kernels.cpp rename to cuda/test/reorder/rcm_kernels.cu diff --git a/cuda/test/utils/CMakeLists.txt b/cuda/test/utils/CMakeLists.txt index 06dffda5da0..28f5770856f 100644 --- a/cuda/test/utils/CMakeLists.txt +++ b/cuda/test/utils/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(assertions_test) +ginkgo_create_cuda_test(assertions_test) diff --git a/cuda/test/utils/assertions_test.cpp b/cuda/test/utils/assertions_test.cu similarity index 100% rename from cuda/test/utils/assertions_test.cpp rename to cuda/test/utils/assertions_test.cu From 2466da34682d50e0e90568686d88a4be1f336937 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 14:43:24 +0200 Subject: [PATCH 270/583] without resources, return the default number of omp threads --- core/test/gtest/environments.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index a7aedbc102c..4434185a4e0 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -33,7 +33,17 @@ inline std::vector get_ctest_resources() std::cerr << "CTEST_RESOURCE_GROUP_COUNT=" << rs_count_env << std::endl; if (!rs_count_env) { +#ifdef GKO_COMPILING_OMP + resource rs{}; +#pragma omp parallel +#pragma omp single + { + rs = resource{0, omp_get_num_threads()}; + } + return {rs}; +#else return {{0, 1}}; +#endif } auto rs_count = std::stoi(rs_count_env); From a72f56cb7a4d751d847d0b09cc3d5b029a185fea Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 15:22:15 +0200 Subject: [PATCH 271/583] fix check for no resources --- core/test/gtest/environments.hpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 4434185a4e0..a678ce00ffd 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -5,6 +5,21 @@ #include +#ifdef GKO_COMPILING_OMP +#include +#endif + + +#ifdef GKO_COMPILING_CUDA +#include "cuda/base/device.hpp" +#endif + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/device.hpp" +#endif + + #include @@ -32,7 +47,9 @@ inline std::vector get_ctest_resources() auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); std::cerr << "CTEST_RESOURCE_GROUP_COUNT=" << rs_count_env << std::endl; - if (!rs_count_env) { + auto rs_count = rs_count_env ? 
std::stoi(rs_count_env) : 0; + + if (rs_count == 0) { #ifdef GKO_COMPILING_OMP resource rs{}; #pragma omp parallel @@ -46,8 +63,6 @@ inline std::vector get_ctest_resources() #endif } - auto rs_count = std::stoi(rs_count_env); - std::vector resources; for (int i = 0; i < rs_count; ++i) { @@ -81,8 +96,6 @@ class ResourceEnvironment : public ::testing::Environment { #ifdef GKO_COMPILING_OMP -#include - class OmpEnvironment : public ::testing::Environment { public: void SetUp() override @@ -101,8 +114,6 @@ class OmpEnvironment : public ::testing::Environment {}; #ifdef GKO_COMPILING_CUDA -#include "cuda/base/device.hpp" - class CudaEnvironment : public ::testing::Environment { public: void TearDown() override @@ -120,8 +131,6 @@ class CudaEnvironment : public ::testing::Environment {}; #ifdef GKO_COMPILING_HIP -#include "hip/base/device.hpp" - class HipEnvironment : public ::testing::Environment { public: void TearDown() override From 0aa6c718fd0cd549e004cca720799dbfdf0737f1 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 15:23:57 +0200 Subject: [PATCH 272/583] fix cmake resource parameters --- cmake/create_test.cmake | 2 +- cuda/test/base/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 6ce37976f84..34e27529e08 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,4 +1,4 @@ -set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_PERCENT;RESOURCE_TYPE") +set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_PERCENTAGE;RESOURCE_TYPE") set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}") set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") set(gko_test_option_args "NO_RESOURCES") diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index f78e6e653fe..bb99ba858a4 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -10,4 +10,4 @@ ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) ginkgo_create_cuda_test(memory) -ginkgo_create_cuda_test(scoped_device_id) +ginkgo_create_cuda_test(scoped_device_id NO_RESOURCES) From 882dfcf6068225f2b286773c787cb86564b1c306 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jul 2023 16:06:20 +0200 Subject: [PATCH 273/583] allow 4 concurrent GPU tests --- cmake/create_test.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 34e27529e08..76330a26627 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -82,7 +82,7 @@ function(ginkgo_add_resource_requirement test_name) set(single_resource "cpus:${add_rr_RESOURCE_LOCAL_CORES}") elseif(add_rr_RESOURCE_TYPE STREQUAL "gpu") if(NOT add_rr_RESOURCE_PERCENTAGE) - set(add_rr_RESOURCE_PERCENTAGE 50) + set(add_rr_RESOURCE_PERCENTAGE 25) endif() if(add_rr_MPI_SIZE GREATER 1) set(add_rr_RESOURCE_PERCENTAGE 100) From cca63a1bc8885693acc74b128adf29b64c614d2c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 31 Jul 2023 11:08:10 +0200 Subject: [PATCH 274/583] use different resource type per executor Co-authored-by: Tobias Ribizel --- cmake/create_test.cmake | 24 +++-- core/test/gtest/environments.hpp | 139 ++++++++++++++++++---------- core/test/gtest/ginkgo_main.cpp | 12 ++- core/test/gtest/ginkgo_mpi_main.cpp | 13 ++- cuda/test/utils.hpp | 8 +- hip/test/utils.hip.hpp | 4 +- test/utils/executor.hpp | 19 ++-- test/utils/mpi/executor.hpp | 7 +- 8 files changed, 
140 insertions(+), 86 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 76330a26627..1d3e041ff2a 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -70,17 +70,17 @@ function(ginkgo_add_resource_requirement test_name) endif () if(add_rr_RESOURCE_TYPE STREQUAL "ref") - set(single_resource "cpus:1") + set(single_resource "cpu:1") elseif(add_rr_RESOURCE_TYPE STREQUAL "cpu") if(NOT add_rr_RESOURCE_LOCAL_CORES) set(add_rr_RESOURCE_LOCAL_CORES 4) # perhaps get this from environment variable? endif() if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+") - message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORE=${add_rr_RESOURCE_LOCAL_CORES}") + message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORES=${add_rr_RESOURCE_LOCAL_CORES}") endif() - set(single_resource "cpus:${add_rr_RESOURCE_LOCAL_CORES}") - elseif(add_rr_RESOURCE_TYPE STREQUAL "gpu") + set(single_resource "cpu:${add_rr_RESOURCE_LOCAL_CORES}") + elseif(add_rr_RESOURCE_TYPE MATCHES "^(cuda|hip|sycl)gpu$") if(NOT add_rr_RESOURCE_PERCENTAGE) set(add_rr_RESOURCE_PERCENTAGE 25) endif() @@ -93,9 +93,9 @@ function(ginkgo_add_resource_requirement test_name) message(FATAL_ERROR "Resource specification is invalid: RESOURCE_PERCENTAGE=${add_rr_RESOURCE_PERCENTAGE}") endif() - set(single_resource "gpus:${add_rr_RESOURCE_PERCENTAGE}") + set(single_resource "${add_rr_RESOURCE_TYPE}:${add_rr_RESOURCE_PERCENTAGE}") else() - message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, gpu.") + message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, cudagpu, hipgpu, syclgpu.") endif() if(NOT add_rr_MPI_SIZE) @@ -164,7 +164,7 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE syclgpu) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. 
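# [Editor's note, illustrative only] With the per-backend resource types above,
# the requirement attached to a test is equivalent to something like
#
#   set_property(TEST <test_name> PROPERTY RESOURCE_GROUPS "2,cudagpu:100")
#
# for a CUDA test with MPI_SIZE 2: two groups, each claiming 100 slots of one
# "cudagpu" entry from the resource spec file, so concurrent tests cannot
# oversubscribe that GPU. <test_name> is a placeholder, not a real target.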
if (MKL_ENV) set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") @@ -198,7 +198,7 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) endif() ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu) endfunction(ginkgo_create_cuda_test_internal) ## Test compiled with HIP @@ -254,7 +254,7 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add ${HIPSPARSE_INCLUDE_DIRS} ) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE gpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) @@ -298,8 +298,12 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) set(test_resource_type ref) elseif (exec STREQUAL omp) set(test_resource_type cpu) + elseif (exec STREQUAL cuda) + set(test_resource_type cudagpu) + elseif (exec STREQUAL hip) + set(test_resource_type hipgpu) else () - set(test_resource_type gpu) + set(test_resource_type syclgpu) endif () ginkgo_build_test_name(${test_name} test_target_name) string(TOUPPER ${exec} exec_upper) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index a678ce00ffd..0d433f1c9d1 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -21,76 +21,113 @@ #include +#include +#include -struct resource { +struct ctest_resource { int id; int slots; }; -inline resource parse_single_resource(const std::string& resource_string) +inline char* get_ctest_group(std::string resource_type, int group_id) { - std::regex re(R"(id\:(\d+),slots\:(\d+))"); - std::smatch match; - - if (!std::regex_match(resource_string, match, re)) { - GKO_INVALID_STATE("Can't parse resource string: " + resource_string); - } - - return resource{std::stoi(match[1]), std::stoi(match[2])}; + std::transform(resource_type.begin(), resource_type.end(), + resource_type.begin(), + [](auto c) { return std::toupper(c); }); + std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + + std::to_string(group_id) + "_" + resource_type; + return std::getenv(rs_group_env.c_str()); } -inline std::vector get_ctest_resources() +inline ctest_resource parse_ctest_resources(std::string resource) { - auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); - std::cerr << "CTEST_RESOURCE_GROUP_COUNT=" << rs_count_env << std::endl; - - auto rs_count = rs_count_env ? 
std::stoi(rs_count_env) : 0; - - if (rs_count == 0) { -#ifdef GKO_COMPILING_OMP - resource rs{}; -#pragma omp parallel -#pragma omp single - { - rs = resource{0, omp_get_num_threads()}; - } - return {rs}; -#else - return {{0, 1}}; -#endif - } - - std::vector resources; - - for (int i = 0; i < rs_count; ++i) { - std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + std::to_string(i); - std::string rs_type = std::getenv(rs_group_env.c_str()); - std::cerr << rs_group_env << "=" << rs_type << std::endl; - - std::transform(rs_type.begin(), rs_type.end(), rs_type.begin(), - [](auto c) { return std::toupper(c); }); - std::string rs_current_group = rs_group_env + "_" + rs_type; - std::string rs_env = std::getenv(rs_current_group.c_str()); - std::cerr << rs_current_group << "=" << rs_env << std::endl; + std::regex re(R"(id\:(\d+),slots\:(\d+))"); + std::smatch match; - resources.push_back(parse_single_resource(rs_env)); + if (!std::regex_match(resource, match, re)) { + GKO_INVALID_STATE("Can't parse ctest_resource string: " + resource); } - return resources; + return ctest_resource{std::stoi(match[1]), std::stoi(match[2])}; } class ResourceEnvironment : public ::testing::Environment { public: - explicit ResourceEnvironment(resource rs_) : ::testing::Environment() + explicit ResourceEnvironment(int rank = 0, int size = 1) { - rs = rs_; +#if GINKGO_BUILD_MPI + if (size > 1) { + cuda_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + std::max(gko::CudaExecutor::get_num_devices(), 1)); + hip_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + std::max(gko::HipExecutor::get_num_devices(), 1)); + sycl_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + std::max(gko::DpcppExecutor::get_num_devices("gpu"), 1)); + } +#endif + + auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); + auto rs_count = rs_count_env ? std::stoi(rs_count_env) : 0; + if (rs_count == 0) { + std::cerr << "Running without CTest ctest_resource configuration" + << std::endl; + return; + } + if (rs_count != size) { + GKO_INVALID_STATE("Invalid resource group count: " + + std::to_string(rs_count)); + } + + // parse CTest ctest_resource group descriptions + if (rank == 0) { + std::cerr << "Running with CTest ctest_resource configuration:" + << std::endl; + } + // OpenMP CPU threads + if (auto rs_omp_env = get_ctest_group("cpu", rank)) { + auto resource = parse_ctest_resources(rs_omp_env); + omp_threads = resource.slots; + if (rank == 0) { + std::cerr << omp_threads << " CPU threads" << std::endl; + } + } + // CUDA GPUs + if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) { + auto resource = parse_ctest_resources(rs_cuda_env); + cuda_device_id = resource.id; + if (rank == 0) { + std::cerr << "CUDA device " << cuda_device_id << std::endl; + } + } + // HIP GPUs + if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) { + auto resource = parse_ctest_resources(rs_hip_env); + hip_device_id = resource.id; + if (rank == 0) { + std::cerr << "HIP device " << hip_device_id << std::endl; + } + } + // SYCL GPUs (no other devices!) 
+ if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { + auto resource = parse_ctest_resources(rs_sycl_env); + sycl_device_id = resource.id; + if (rank == 0) { + std::cerr << "SYCL device " << sycl_device_id << std::endl; + } + } } - static resource rs; + static int omp_threads; + static int cuda_device_id; + static int hip_device_id; + static int sycl_device_id; }; @@ -100,7 +137,9 @@ class OmpEnvironment : public ::testing::Environment { public: void SetUp() override { - omp_set_num_threads(ResourceEnvironment::rs.slots); + if (ResourceEnvironment::omp_threads > 0) { + omp_set_num_threads(ResourceEnvironment::omp_threads); + } } }; @@ -118,7 +157,7 @@ class CudaEnvironment : public ::testing::Environment { public: void TearDown() override { - gko::kernels::cuda::reset_device(ResourceEnvironment::rs.id); + gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id); } }; @@ -135,7 +174,7 @@ class HipEnvironment : public ::testing::Environment { public: void TearDown() override { - gko::kernels::hip::reset_device(ResourceEnvironment::rs.id); + gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id); } }; diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index 76a005a66e2..71117f2d73b 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -3,16 +3,18 @@ #include "core/test/gtest/environments.hpp" -resource ResourceEnvironment::rs = {}; + +int ResourceEnvironment::omp_threads = 0; +int ResourceEnvironment::cuda_device_id = 0; +int ResourceEnvironment::hip_device_id = 0; +int ResourceEnvironment::sycl_device_id = 0; + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto resources = get_ctest_resources(); - - ::testing::AddGlobalTestEnvironment( - new ResourceEnvironment(resources.front())); + ::testing::AddGlobalTestEnvironment(new ResourceEnvironment); ::testing::AddGlobalTestEnvironment(new CudaEnvironment); ::testing::AddGlobalTestEnvironment(new HipEnvironment); ::testing::AddGlobalTestEnvironment(new OmpEnvironment); diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index 934a3dcd3f5..945ec7ec7cd 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -375,7 +375,11 @@ class MPIWrapperPrinter : public ::testing::TestEventListener { } // namespace GTestMPIListener -resource ResourceEnvironment::rs = {}; +int ResourceEnvironment::omp_threads = 0; +int ResourceEnvironment::cuda_device_id = 0; +int ResourceEnvironment::hip_device_id = 0; +int ResourceEnvironment::sycl_device_id = 0; + int main(int argc, char** argv) { @@ -384,13 +388,12 @@ int main(int argc, char** argv) MPI_Init(&argc, &argv); MPI_Comm comm(MPI_COMM_WORLD); int rank; + int size; MPI_Comm_rank(comm, &rank); - - auto resources = get_ctest_resources(); + MPI_Comm_size(comm, &size); testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); - ::testing::AddGlobalTestEnvironment( - new ResourceEnvironment(resources[rank])); + ::testing::AddGlobalTestEnvironment(new ResourceEnvironment(rank, size)); ::testing::AddGlobalTestEnvironment(new CudaEnvironment); ::testing::AddGlobalTestEnvironment(new HipEnvironment); ::testing::AddGlobalTestEnvironment(new OmpEnvironment); diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index f35cb8d4c12..0410b3a6a22 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -52,10 +52,10 @@ class CudaTestFixture : public ::testing::Test { protected: CudaTestFixture() : 
ref(gko::ReferenceExecutor::create()), - stream(ResourceEnvironment::rs.id), - exec(gko::CudaExecutor::create(ResourceEnvironment::rs.id, ref, std::make_shared< - gko::CudaAllocator>(), - stream.get())), + stream(ResourceEnvironment::cuda_device_id), + exec(gko::CudaExecutor::create( + ResourceEnvironment::cuda_device_id, ref, std::make_shared< + gko::CudaAllocator>(), stream.get())), guard(exec->get_scoped_device_id_guard()) {} diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index 1c57467b451..38fc3763ece 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -52,8 +52,8 @@ class HipTestFixture : public ::testing::Test { protected: HipTestFixture() : ref(gko::ReferenceExecutor::create()), - stream(ResourceEnvironment::rs.id), - exec(gko::HipExecutor::create(ResourceEnvironment::rs.id, ref, std::make_shared< + stream(ResourceEnvironment::hip_device_id), + exec(gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref, std::make_shared< gko::HipAllocator>(), stream.get())), guard(exec->get_scoped_device_id_guard()) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index d52b8083ac8..082c3556381 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -80,7 +80,8 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - ResourceEnvironment::rs.id, ref, std::make_shared(), stream); + ResourceEnvironment::cuda_device_id, + ref, std::make_shared(), stream); } } @@ -93,7 +94,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable HIP devices"}; } exec = gko::HipExecutor::create( - ResourceEnvironment::rs.id, ref, std::make_shared< + ResourceEnvironment::hip_device_id, ref, std::make_shared< gko::HipAllocator>(), stream); } @@ -102,11 +103,10 @@ inline void init_executor(std::shared_ptr ref, std::shared_ptr& exec) { if (gko::DpcppExecutor::get_num_devices("gpu") > 0) { - exec = - gko::DpcppExecutor::create(ResourceEnvironment::rs.id, ref, "gpu"); + exec = gko::DpcppExecutor::create(ResourceEnvironment::sycl_device_id, + ref, "gpu"); } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) { - exec = - gko::DpcppExecutor::create(ResourceEnvironment::rs.id, ref, "cpu"); + exec = gko::DpcppExecutor::create(0, ref, "cpu"); } else { throw std::runtime_error{"No suitable DPC++ devices"}; } @@ -124,8 +124,11 @@ class CommonTestFixture : public ::testing::Test { CommonTestFixture() : -#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) - stream(ResourceEnvironment::rs.id), +#ifdef GKO_COMPILING_CUDA + stream(ResourceEnvironment::cuda_device_id), +#endif +#ifdef GKO_COMPILING_HIP + stream(ResourceEnvironment::hip_device_id), #endif ref{gko::ReferenceExecutor::create()} { diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index f317f60eb35..f02834a5a1f 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -58,8 +58,11 @@ class CommonMpiTestFixture : public ::testing::Test { CommonMpiTestFixture() : comm(MPI_COMM_WORLD), -#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) - stream(ResourceEnvironment::rs.id), +#ifdef GKO_COMPILING_CUDA + stream(ResourceEnvironment::cuda_device_id), +#endif +#ifdef GKO_COMPILING_HIP + stream(ResourceEnvironment::hip_device_id), #endif ref{gko::ReferenceExecutor::create()} { From 941382c94bec5a47490ad6ac90c1a0d5b310d826 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 31 Jul 2023 11:33:24 +0200 Subject: [PATCH 275/583] adds 
generator for ctest resource file Co-authored-by: Tobias Ribizel --- test/CMakeLists.txt | 1 + test/tools/CMakeLists.txt | 2 + test/tools/resource_file_generator.cpp | 72 ++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 test/tools/CMakeLists.txt create mode 100644 test/tools/resource_file_generator.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8a6eb305b6a..6e72dbdf0aa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(preconditioner) add_subdirectory(reorder) add_subdirectory(solver) add_subdirectory(stop) +add_subdirectory(tools) diff --git a/test/tools/CMakeLists.txt b/test/tools/CMakeLists.txt new file mode 100644 index 00000000000..d3aa14b8ca7 --- /dev/null +++ b/test/tools/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(resource_file_generator resource_file_generator.cpp) +target_link_libraries(resource_file_generator Ginkgo::ginkgo ) diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp new file mode 100644 index 00000000000..1070a569662 --- /dev/null +++ b/test/tools/resource_file_generator.cpp @@ -0,0 +1,72 @@ +#include + +#include +#include + + +std::vector split(const std::string& s, char delimiter = ',') +{ + std::istringstream iss(s); + std::vector tokens; + std::string token; + while (std::getline(iss, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +std::string create_json(const std::string& resources) +{ + std::string json; + json.append(R"({ + "version": { + "major": 1, + "minor": 0 + }, + "local": [ + { +)"); + for (const auto& line : split(resources, '\n')) { + json.append(R"( )"); + json.append(line); + json.append("\n"); + } + json.append(R"( } + ] +})"); + return json; +} + + +int main() +{ + auto num_cpu_threads = std::max(std::thread::hardware_concurrency(), 1u); + auto num_cuda_gpus = gko::CudaExecutor::get_num_devices(); + auto num_hip_gpus = gko::HipExecutor::get_num_devices(); + auto num_sycl_gpus = gko::DpcppExecutor::get_num_devices("gpu"); + + std::string cpus = R"("cpu": [{"id": "0", "slots": )" + + std::to_string(num_cpu_threads) + "}]"; + + std::string gpus = ""; + auto add_devices = [&](int num_devices, const std::string& name) { + if(num_devices){ + gpus.append(",\n"); + gpus += '"' + name + "\": [\n"; + } + for (int i = 0; i < num_devices; i++) { + if(i > 0){ + gpus.append(",\n"); + } + gpus+= R"( {"id": ")" + std::to_string(i) + R"(", "slots": 100})"; + } + if(num_devices){ + gpus.append("\n]"); + } + }; + add_devices(num_cuda_gpus, "cudagpu"); + add_devices(num_hip_gpus, "hipgpu"); + add_devices(num_sycl_gpus, "syclgpu"); + + std::cout << create_json(cpus + gpus) << std::endl; +} \ No newline at end of file From b5e191b3482aebd7b8d12787a78241ebe6020bd3 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 1 Aug 2023 18:06:03 +0200 Subject: [PATCH 276/583] review updates: - remove test file - small documentation - more verbose device id output Co-authored-by: Tobias Ribizel --- core/test/gtest/environments.hpp | 15 +++----- resources.json | 51 -------------------------- test/tools/CMakeLists.txt | 2 +- test/tools/resource_file_generator.cpp | 2 +- test/utils/executor.hpp | 2 + 5 files changed, 10 insertions(+), 62 deletions(-) delete mode 100644 resources.json diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 0d433f1c9d1..6276de9372a 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -102,25 +102,22 @@ class 
ResourceEnvironment : public ::testing::Environment { if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) { auto resource = parse_ctest_resources(rs_cuda_env); cuda_device_id = resource.id; - if (rank == 0) { - std::cerr << "CUDA device " << cuda_device_id << std::endl; - } + std::cerr << "Rank " << rank << ": CUDA device " << cuda_device_id + << std::endl; } // HIP GPUs if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) { auto resource = parse_ctest_resources(rs_hip_env); hip_device_id = resource.id; - if (rank == 0) { - std::cerr << "HIP device " << hip_device_id << std::endl; - } + std::cerr << "Rank " << rank << ": HIP device " << cuda_device_id + << std::endl; } // SYCL GPUs (no other devices!) if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { auto resource = parse_ctest_resources(rs_sycl_env); sycl_device_id = resource.id; - if (rank == 0) { - std::cerr << "SYCL device " << sycl_device_id << std::endl; - } + std::cerr << "Rank " << rank << ": SYCL device " << cuda_device_id + << std::endl; } } diff --git a/resources.json b/resources.json deleted file mode 100644 index 9d69ada752b..00000000000 --- a/resources.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "version": { - "major": 1, - "minor": 0 - }, - "local": [ - { - "cpus": [ - { - "id": "0", - "slots": 32 - } - ], - - "gpus": [ - { - "id": "0", - "slots": 100 - }, - { - "id": "1", - "slots": 100 - }, - { - "id": "2", - "slots": 100 - }, - { - "id": "3", - "slots": 100 - }, - { - "id": "4", - "slots": 100 - }, - { - "id": "5", - "slots": 100 - }, - { - "id": "6", - "slots": 100 - }, - { - "id": "7", - "slots": 100 - } - ] - } - ] -} \ No newline at end of file diff --git a/test/tools/CMakeLists.txt b/test/tools/CMakeLists.txt index d3aa14b8ca7..21a7a5fc695 100644 --- a/test/tools/CMakeLists.txt +++ b/test/tools/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(resource_file_generator resource_file_generator.cpp) -target_link_libraries(resource_file_generator Ginkgo::ginkgo ) +target_link_libraries(resource_file_generator Ginkgo::ginkgo) diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index 1070a569662..de9464ce82d 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -69,4 +69,4 @@ int main() add_devices(num_sycl_gpus, "syclgpu"); std::cout << create_json(cpus + gpus) << std::endl; -} \ No newline at end of file +} diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 082c3556381..836f70d2352 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -137,6 +137,8 @@ class CommonTestFixture : public ::testing::Test { #else init_executor(ref, exec); #endif + // set device-id test-wide since some test call device + // kernels directly guard = exec->get_scoped_device_id_guard(); } From 183e01fea1cf90e9f92a30c90b257eee95cd25e4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Aug 2023 14:30:21 +0200 Subject: [PATCH 277/583] fixes tests after rebase --- core/test/gtest/environments.hpp | 3 +++ test/utils/executor.hpp | 1 + 2 files changed, 4 insertions(+) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 6276de9372a..ff029995baf 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -5,6 +5,9 @@ #include +#include + + #ifdef GKO_COMPILING_OMP #include #endif diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 836f70d2352..419e089f793 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -35,6 +35,7 @@ OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include From a9879c12c1b20dfdb3273479e8e51b8a4f7149d8 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 3 Aug 2023 07:23:10 +0000 Subject: [PATCH 278/583] Format files Co-authored-by: Marcel Koch --- core/test/gtest/environments.hpp | 32 +++++++++++++++++++ core/test/gtest/ginkgo_main.cpp | 34 +++++++++++++++++++- cuda/test/utils.hpp | 4 +-- hip/test/utils.hip.hpp | 4 +-- test/tools/resource_file_generator.cpp | 43 +++++++++++++++++++++++--- test/utils/executor.hpp | 14 +++++---- test/utils/mpi/executor.hpp | 4 ++- 7 files changed, 118 insertions(+), 17 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index ff029995baf..856763d4105 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #ifndef GINKGO_ENVIRONMENTS_HPP #define GINKGO_ENVIRONMENTS_HPP diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index 71117f2d73b..4d69b421875 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -1,3 +1,35 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include @@ -20,4 +52,4 @@ int main(int argc, char** argv) ::testing::AddGlobalTestEnvironment(new OmpEnvironment); int result = RUN_ALL_TESTS(); return result; -} \ No newline at end of file +} diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 0410b3a6a22..35f382806ec 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -54,8 +54,8 @@ class CudaTestFixture : public ::testing::Test { : ref(gko::ReferenceExecutor::create()), stream(ResourceEnvironment::cuda_device_id), exec(gko::CudaExecutor::create( - ResourceEnvironment::cuda_device_id, ref, std::make_shared< - gko::CudaAllocator>(), stream.get())), + ResourceEnvironment::cuda_device_id, ref, + std::make_shared(), stream.get())), guard(exec->get_scoped_device_id_guard()) {} diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index 38fc3763ece..d67c8935ab4 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -53,8 +53,8 @@ class HipTestFixture : public ::testing::Test { HipTestFixture() : ref(gko::ReferenceExecutor::create()), stream(ResourceEnvironment::hip_device_id), - exec(gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref, std::make_shared< - gko::HipAllocator>(), + exec(gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref, + std::make_shared(), stream.get())), guard(exec->get_scoped_device_id_guard()) {} diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index de9464ce82d..7db262cf982 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -1,9 +1,42 @@ -#include +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ #include #include +#include + + std::vector split(const std::string& s, char delimiter = ',') { std::istringstream iss(s); @@ -50,17 +83,17 @@ int main() std::string gpus = ""; auto add_devices = [&](int num_devices, const std::string& name) { - if(num_devices){ + if (num_devices) { gpus.append(",\n"); gpus += '"' + name + "\": [\n"; } for (int i = 0; i < num_devices; i++) { - if(i > 0){ + if (i > 0) { gpus.append(",\n"); } - gpus+= R"( {"id": ")" + std::to_string(i) + R"(", "slots": 100})"; + gpus += R"( {"id": ")" + std::to_string(i) + R"(", "slots": 100})"; } - if(num_devices){ + if (num_devices) { gpus.append("\n]"); } }; diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 419e089f793..2a8ace8e39a 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include @@ -45,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include "core/test/gtest/environments.hpp" @@ -81,8 +83,8 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - ResourceEnvironment::cuda_device_id, - ref, std::make_shared(), stream); + ResourceEnvironment::cuda_device_id, ref, + std::make_shared(), stream); } } @@ -94,9 +96,9 @@ inline void init_executor(std::shared_ptr ref, if (gko::HipExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable HIP devices"}; } - exec = gko::HipExecutor::create( - ResourceEnvironment::hip_device_id, ref, std::make_shared< - gko::HipAllocator>(), stream); + exec = + gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref, + std::make_shared(), stream); } diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index f02834a5a1f..504fc5d761c 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include @@ -44,6 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #include "test/utils/executor.hpp" From 248eaf958ee917494a1713b341bb850a007b9fd9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 09:57:00 +0200 Subject: [PATCH 279/583] allow using ctest resources in CI runs --- .gitlab/scripts.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index b007caff35f..15a2004bde6 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -100,7 +100,7 @@ - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - - ctest -V --timeout 6000 + - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS} - ninja test_install - pushd test/test_install - ninja install @@ -152,7 +152,7 @@ - cd ${CI_JOB_NAME/test/build} - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - - ctest -V --timeout 6000 + - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS} - ninja test_install - pushd test/test_install - ninja install From 19d5b2f2478d1e75bacdc7e7590f6dbd40bc832c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 11:00:49 +0200 Subject: [PATCH 280/583] add query for the default number of omp threads --- core/device_hooks/omp_hooks.cpp | 4 +++ include/ginkgo/core/base/executor.hpp | 2 ++ omp/CMakeLists.txt | 1 + omp/base/executor.cpp | 52 +++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) create mode 100644 omp/base/executor.cpp diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp index f652a4d4582..f79ddfdeca6 100644 --- a/core/device_hooks/omp_hooks.cpp +++ b/core/device_hooks/omp_hooks.cpp @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include +#include #include #include @@ -51,6 +52,9 @@ scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec, GKO_NOT_COMPILED(omp); +int OmpExecutor::get_num_omp_threads() { return 1; } + + } // namespace gko diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 456b69d3d7e..5f0c307bc73 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1398,6 +1398,8 @@ class OmpExecutor : public detail::ExecutorBase, return this->get_exec_info().num_pu_per_cu; } + static int get_num_omp_threads(); + scoped_device_id_guard get_scoped_device_id_guard() const override; protected: diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index c689ffc42f3..7f46feff5da 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -5,6 +5,7 @@ target_sources(ginkgo_omp PRIVATE base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp + base/executor.cpp base/index_set_kernels.cpp base/scoped_device_id.cpp base/version.cpp diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp new file mode 100644 index 00000000000..3e14270ecdc --- /dev/null +++ b/omp/base/executor.cpp @@ -0,0 +1,52 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +namespace gko { + + +int OmpExecutor::get_num_omp_threads() +{ + int num_threads; +#pragma omp parallel +#pragma omp single + num_threads = omp_get_num_threads(); + return num_threads; +} + + +} // namespace gko \ No newline at end of file From a2591c59e34a03c82a6162a9dab97a68c0189432 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 12:42:59 +0200 Subject: [PATCH 281/583] print device used in test environment Co-authored-by: Tobias Ribizel --- core/test/gtest/environments.hpp | 112 ++++++++++++++++++++++------ core/test/gtest/ginkgo_main.cpp | 7 +- core/test/gtest/ginkgo_mpi_main.cpp | 7 +- cuda/base/device.cpp | 8 ++ cuda/base/device.hpp | 4 + dpcpp/base/device.hpp | 3 + dpcpp/base/executor.dp.cpp | 11 +++ hip/base/device.hip.cpp | 8 ++ hip/base/device.hpp | 4 + 9 files changed, 136 insertions(+), 28 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 856763d4105..125fa7b9b8b 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GINKGO_ENVIRONMENTS_HPP -#define GINKGO_ENVIRONMENTS_HPP +#ifndef GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ +#define GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ #include #include @@ -55,6 +55,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if GKO_COMPILING_DPCPP +#include "dpcpp/base/device.hpp" +#endif + + #include #include #include @@ -111,8 +116,11 @@ class ResourceEnvironment : public ::testing::Environment { auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); auto rs_count = rs_count_env ? 
std::stoi(rs_count_env) : 0; if (rs_count == 0) { - std::cerr << "Running without CTest ctest_resource configuration" - << std::endl; + if (rank == 0) { + std::cerr + << "Running without CTest ctest_resource configuration" + << std::endl; + } return; } if (rs_count != size) { @@ -121,38 +129,25 @@ class ResourceEnvironment : public ::testing::Environment { } // parse CTest ctest_resource group descriptions - if (rank == 0) { - std::cerr << "Running with CTest ctest_resource configuration:" - << std::endl; - } // OpenMP CPU threads if (auto rs_omp_env = get_ctest_group("cpu", rank)) { auto resource = parse_ctest_resources(rs_omp_env); omp_threads = resource.slots; - if (rank == 0) { - std::cerr << omp_threads << " CPU threads" << std::endl; - } } // CUDA GPUs if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) { auto resource = parse_ctest_resources(rs_cuda_env); cuda_device_id = resource.id; - std::cerr << "Rank " << rank << ": CUDA device " << cuda_device_id - << std::endl; } // HIP GPUs if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) { auto resource = parse_ctest_resources(rs_hip_env); hip_device_id = resource.id; - std::cerr << "Rank " << rank << ": HIP device " << cuda_device_id - << std::endl; } // SYCL GPUs (no other devices!) if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { auto resource = parse_ctest_resources(rs_sycl_env); sycl_device_id = resource.id; - std::cerr << "Rank " << rank << ": SYCL device " << cuda_device_id - << std::endl; } } @@ -167,18 +162,31 @@ class ResourceEnvironment : public ::testing::Environment { class OmpEnvironment : public ::testing::Environment { public: + explicit OmpEnvironment(int rank) : rank_(rank) {} + void SetUp() override { if (ResourceEnvironment::omp_threads > 0) { - omp_set_num_threads(ResourceEnvironment::omp_threads); + omp_set_num_threads(num_threads); } +#pragma omp parallel +#pragma single + std::cerr << "Rank " << rank_ << ": OMP threads " + << omp_get_num_threads(); + << std::endl; } + +private: + int rank_; }; #else -class OmpEnvironment : public ::testing::Environment {}; +class OmpEnvironment : public ::testing::Environment { +public: + explicit OmpEnvironment(int){}; +}; #endif @@ -187,15 +195,31 @@ class OmpEnvironment : public ::testing::Environment {}; class CudaEnvironment : public ::testing::Environment { public: + explicit CudaEnvironment(int rank) : rank_(rank) {} + + void SetUp() override + { + auto device_id = ResourceEnvironment::cuda_device_id; + std::cerr << "Rank " << rank_ << ": CUDA device " + << gko::kernels::cuda::get_device_name(device_id) << " ID " + << device_id << std::endl; + } + void TearDown() override { gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id); } + +private: + int rank_; }; #else -class CudaEnvironment : public ::testing::Environment {}; +class CudaEnvironment : public ::testing::Environment { +public: + explicit CudaEnvironment(int){}; +}; #endif @@ -204,17 +228,61 @@ class CudaEnvironment : public ::testing::Environment {}; class HipEnvironment : public ::testing::Environment { public: + explicit HipEnvironment(int rank) : rank_(rank) {} + + void SetUp() override + { + auto device_id = ResourceEnvironment::hip_device_id; + std::cerr << "Rank " << rank_ << ": HIP device " + << gko::kernels::hip::get_device_name(device_id) << " ID " + << device_id << std::endl; + } + void TearDown() override { gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id); } + +private: + int rank_; +}; + +#else + +class HipEnvironment : public ::testing::Environment { 
+public: + explicit HipEnvironment(int){}; +}; + +#endif + + +#ifdef GKO_COMPILING_DPCPP + +class SyclEnvironment : public ::testing::Environment { +public: + explicit SyclEnvironment(int rank) : rank_(rank) {} + + void SetUp() override + { + auto device_id = ResourceEnvironment::sycl_device_id; + std::cerr << "Rank " << rank_ << ": SYCL device " + << gko::kernels::dpcpp::get_device_name(device_id) << " ID " + << device_id << std::endl; + } + +private: + int rank_; }; #else -class HipEnvironment : public ::testing::Environment {}; +class SyclEnvironment : public ::testing::Environment { +public: + explicit SyclEnvironment(int){}; +}; #endif -#endif // GINKGO_ENVIRONMENTS_HPP +#endif // GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index 4d69b421875..01d1fc393c3 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -47,9 +47,10 @@ int main(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); ::testing::AddGlobalTestEnvironment(new ResourceEnvironment); - ::testing::AddGlobalTestEnvironment(new CudaEnvironment); - ::testing::AddGlobalTestEnvironment(new HipEnvironment); - ::testing::AddGlobalTestEnvironment(new OmpEnvironment); + ::testing::AddGlobalTestEnvironment(new CudaEnvironment(0)); + ::testing::AddGlobalTestEnvironment(new HipEnvironment(0)); + ::testing::AddGlobalTestEnvironment(new SyclEnvironment(0)); + ::testing::AddGlobalTestEnvironment(new OmpEnvironment(0)); int result = RUN_ALL_TESTS(); return result; } diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index 945ec7ec7cd..f7fe71981d2 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -394,9 +394,10 @@ int main(int argc, char** argv) testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); ::testing::AddGlobalTestEnvironment(new ResourceEnvironment(rank, size)); - ::testing::AddGlobalTestEnvironment(new CudaEnvironment); - ::testing::AddGlobalTestEnvironment(new HipEnvironment); - ::testing::AddGlobalTestEnvironment(new OmpEnvironment); + ::testing::AddGlobalTestEnvironment(new CudaEnvironment(rank)); + ::testing::AddGlobalTestEnvironment(new HipEnvironment(rank)); + ::testing::AddGlobalTestEnvironment(new SyclEnvironment(rank)); + ::testing::AddGlobalTestEnvironment(new OmpEnvironment(rank)); ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp index 2db0876ca95..32cf6265160 100644 --- a/cuda/base/device.cpp +++ b/cuda/base/device.cpp @@ -58,6 +58,14 @@ void destroy_event(CUevent_st* event) } +std::string get_device_name(int device_id) +{ + cudaDeviceProp prop; + GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDeviceProperties(&prop, device_id)); + return {prop.name}; +} + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/device.hpp b/cuda/base/device.hpp index 7bd9390c54e..e363f455300 100644 --- a/cuda/base/device.hpp +++ b/cuda/base/device.hpp @@ -50,6 +50,10 @@ void reset_device(int device_id); void destroy_event(CUevent_st* event); +/** returns cudaDeviceProp.name for the given device */ +std::string get_device_name(int device_id); + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/dpcpp/base/device.hpp b/dpcpp/base/device.hpp index 6047fbed615..658ccbe18f4 100644 --- a/dpcpp/base/device.hpp +++ b/dpcpp/base/device.hpp @@ -46,6 +46,9 @@ namespace dpcpp { void 
destroy_event(sycl::event* event); +std::string get_device_name(int device_id); + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 3d01e271f15..6d6bbbe0388 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -323,6 +323,17 @@ namespace dpcpp { void destroy_event(sycl::event* event) { delete event; } +std::string get_device_name(int device_id) +{ + auto devices = ::gko::detail::get_devices("gpu"); + if (devices.empty()) { + return "CPU"; + } + + return devices[device_id].get_info(); +} + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index 9a01d6aacee..d539fa69b43 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -62,6 +62,14 @@ void destroy_event(GKO_HIP_EVENT_STRUCT* event) } +std::string get_device_name(int device_id) +{ + hipDeviceProp_t prop; + GKO_ASSERT_NO_HIP_ERRORS(hipGetDeviceProperties(&prop, device_id)); + return {prop.name}; +} + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/device.hpp b/hip/base/device.hpp index dcc8c3ba0f1..fceffe4a503 100644 --- a/hip/base/device.hpp +++ b/hip/base/device.hpp @@ -49,6 +49,10 @@ void reset_device(int device_id); void destroy_event(GKO_HIP_EVENT_STRUCT* event); +/** returns hipDeviceProp.name for the given device */ +std::string get_device_name(int device_id); + + } // namespace hip } // namespace kernels } // namespace gko From f5d7209a5bbee6b35ecfe03fde71807426c5ae46 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 12:44:21 +0200 Subject: [PATCH 282/583] use ginkgo_create_dpcpp_test consistently --- dpcpp/test/matrix/CMakeLists.txt | 2 +- dpcpp/test/matrix/{fbcsr_kernels.cpp => fbcsr_kernels.dp.cpp} | 0 dpcpp/test/preconditioner/CMakeLists.txt | 2 +- .../{jacobi_kernels.cpp => jacobi_kernels.dp.cpp} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename dpcpp/test/matrix/{fbcsr_kernels.cpp => fbcsr_kernels.dp.cpp} (100%) rename dpcpp/test/preconditioner/{jacobi_kernels.cpp => jacobi_kernels.dp.cpp} (100%) diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt index 88ab52e9c3f..7ada04882da 100644 --- a/dpcpp/test/matrix/CMakeLists.txt +++ b/dpcpp/test/matrix/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(fbcsr_kernels) +ginkgo_create_dpcpp_test(fbcsr_kernels) diff --git a/dpcpp/test/matrix/fbcsr_kernels.cpp b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp similarity index 100% rename from dpcpp/test/matrix/fbcsr_kernels.cpp rename to dpcpp/test/matrix/fbcsr_kernels.dp.cpp diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt index a0ca5a2e38a..c606e12ac3e 100644 --- a/dpcpp/test/preconditioner/CMakeLists.txt +++ b/dpcpp/test/preconditioner/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_test(jacobi_kernels) +ginkgo_create_dpcpp_test(jacobi_kernels) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp similarity index 100% rename from dpcpp/test/preconditioner/jacobi_kernels.cpp rename to dpcpp/test/preconditioner/jacobi_kernels.dp.cpp From d00494b338d4f24f2c0a487ffb2aa78544bd6a21 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 14:21:18 +0200 Subject: [PATCH 283/583] fixup! 
print device used in test environment --- core/test/gtest/environments.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 125fa7b9b8b..2d0d1eac33a 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -167,13 +167,12 @@ class OmpEnvironment : public ::testing::Environment { void SetUp() override { if (ResourceEnvironment::omp_threads > 0) { - omp_set_num_threads(num_threads); + omp_set_num_threads(ResourceEnvironment::omp_threads); } #pragma omp parallel #pragma single std::cerr << "Rank " << rank_ << ": OMP threads " - << omp_get_num_threads(); - << std::endl; + << omp_get_num_threads() << std::endl; } private: From 14e84670bfab913e5bad07a1ee7e93285f765fc4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 3 Aug 2023 17:45:28 +0200 Subject: [PATCH 284/583] review updates: - cmake documentation - take omp num thread into account for resource file Co-authored-by: Yu-Hsiang M. Tsai --- cmake/create_test.cmake | 5 +++-- test/tools/resource_file_generator.cpp | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 1d3e041ff2a..2a905570a4b 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -110,8 +110,9 @@ endfunction() ## Adds a test to the list executed by ctest and sets its output binary name ## Possible additional arguments: ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. -## - `CORES` the number of threads used by a test, default is 4 -## - `PERCENTAGE` usage percentage of a single GPU, default is 50 +## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is 4 +## - `RESOURCE_PERCENTAGE` usage percentage of a single GPU, default is 25 +## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, syclgpu ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index 7db262cf982..a2b0b9bd5cd 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -48,6 +48,7 @@ std::vector split(const std::string& s, char delimiter = ',') return tokens; } + std::string create_json(const std::string& resources) { std::string json; @@ -73,7 +74,7 @@ std::string create_json(const std::string& resources) int main() { - auto num_cpu_threads = std::max(std::thread::hardware_concurrency(), 1u); + auto num_cpu_threads = gko::OmpExecutor::get_num_omp_threads(); auto num_cuda_gpus = gko::CudaExecutor::get_num_devices(); auto num_hip_gpus = gko::HipExecutor::get_num_devices(); auto num_sycl_gpus = gko::DpcppExecutor::get_num_devices("gpu"); From 70504d08ea53eaa9445d45e4a841c3a5d37575e1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 26 Aug 2023 22:50:04 +0200 Subject: [PATCH 285/583] use a single SYCL CPU as fall-back --- CMakeLists.txt | 6 ++++++ cmake/create_test.cmake | 17 ++++++++++------- include/ginkgo/core/base/executor.hpp | 2 +- .../core/base/{fwd_defs.hpp => fwd_decls.hpp} | 6 +++--- include/ginkgo/core/base/memory.hpp | 2 +- include/ginkgo/ginkgo.hpp | 2 +- test/tools/resource_file_generator.cpp | 3 ++- 7 files changed, 24 insertions(+), 14 deletions(-) rename 
include/ginkgo/core/base/{fwd_defs.hpp => fwd_decls.hpp} (94%) diff --git a/CMakeLists.txt b/CMakeLists.txt index fab64e43c76..bec31a4360c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,8 @@ option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF) +set(GINKGO_TEST_OMP_PARALLELISM "4" CACHE STRING + "The number of OpenMP threads to use for a test binary during CTest resource file-constrained test.") # load executor-specific configuration if(GINKGO_BUILD_CUDA) @@ -307,6 +309,10 @@ configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in # propagated to the other parts of Ginkgo in case of building as static libraries add_subdirectory(devices) # Basic device functionalities. Always compiled. add_subdirectory(common) # Import list of unified kernel source files +if(GINKGO_BUILD_TESTS) + # use custom target `tests` to build only test binaries + add_custom_target(tests) +endif() if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 2a905570a4b..f11657ec324 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -73,14 +73,14 @@ function(ginkgo_add_resource_requirement test_name) set(single_resource "cpu:1") elseif(add_rr_RESOURCE_TYPE STREQUAL "cpu") if(NOT add_rr_RESOURCE_LOCAL_CORES) - set(add_rr_RESOURCE_LOCAL_CORES 4) # perhaps get this from environment variable? + set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_TEST_OMP_PARALLELISM}) endif() if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+") message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORES=${add_rr_RESOURCE_LOCAL_CORES}") endif() set(single_resource "cpu:${add_rr_RESOURCE_LOCAL_CORES}") - elseif(add_rr_RESOURCE_TYPE MATCHES "^(cuda|hip|sycl)gpu$") + elseif(add_rr_RESOURCE_TYPE MATCHES "^(cudagpu|hipgpu|sycl)$") if(NOT add_rr_RESOURCE_PERCENTAGE) set(add_rr_RESOURCE_PERCENTAGE 25) endif() @@ -95,7 +95,7 @@ function(ginkgo_add_resource_requirement test_name) set(single_resource "${add_rr_RESOURCE_TYPE}:${add_rr_RESOURCE_PERCENTAGE}") else() - message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, cudagpu, hipgpu, syclgpu.") + message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, cudagpu, hipgpu, sycl.") endif() if(NOT add_rr_MPI_SIZE) @@ -110,9 +110,10 @@ endfunction() ## Adds a test to the list executed by ctest and sets its output binary name ## Possible additional arguments: ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
-## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is 4 +## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is +## $GINKGO_TEST_OMP_PARALLELISM ## - `RESOURCE_PERCENTAGE` usage percentage of a single GPU, default is 25 -## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, syclgpu +## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, sycl ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths @@ -133,6 +134,8 @@ function(ginkgo_add_test test_name test_target_name) COMMAND ${test_target_name} WORKING_DIRECTORY "$") endif() + # use custom target `tests` to build only test binaries + add_dependencies(tests ${test_target_name}) ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN}) @@ -165,7 +168,7 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE syclgpu) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE sycl) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") @@ -304,7 +307,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) elseif (exec STREQUAL hip) set(test_resource_type hipgpu) else () - set(test_resource_type syclgpu) + set(test_resource_type sycl) endif () ginkgo_build_test_name(${test_name} test_target_name) string(TOUPPER ${exec} exec_upper) diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 5f0c307bc73..c7195501178 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include #include #include #include diff --git a/include/ginkgo/core/base/fwd_defs.hpp b/include/ginkgo/core/base/fwd_decls.hpp similarity index 94% rename from include/ginkgo/core/base/fwd_defs.hpp rename to include/ginkgo/core/base/fwd_decls.hpp index 5f0cbd9d960..f99d3a0f90e 100644 --- a/include/ginkgo/core/base/fwd_defs.hpp +++ b/include/ginkgo/core/base/fwd_decls.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ -#define GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ +#ifndef GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_ +#define GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_ #include @@ -87,4 +87,4 @@ class event; #endif -#endif // GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_ diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index f421abf7da4..6997b6351e5 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ -#include +#include #include diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 594ad880b8c..bcdaa5d2d20 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include #include #include #include diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index a2b0b9bd5cd..f0dbbea0353 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -100,7 +100,8 @@ int main() }; add_devices(num_cuda_gpus, "cudagpu"); add_devices(num_hip_gpus, "hipgpu"); - add_devices(num_sycl_gpus, "syclgpu"); + // SYCL GPUs, fall back to CPU + add_devices(std::max(1, num_sycl_gpus), "sycl"); std::cout << create_json(cpus + gpus) << std::endl; } From 681b53d2fd658f3b31a5e927a189529013f6c383 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 26 Aug 2023 22:56:33 +0200 Subject: [PATCH 286/583] fix OMP environment --- core/test/gtest/environments.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 2d0d1eac33a..815902d71a8 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -170,7 +170,7 @@ class OmpEnvironment : public ::testing::Environment { omp_set_num_threads(ResourceEnvironment::omp_threads); } #pragma omp parallel -#pragma single +#pragma omp single std::cerr << "Rank " << rank_ << ": OMP threads " << omp_get_num_threads() << std::endl; } From 1c627a977a8066dc5465649cdabaaf15ef092577 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 31 Aug 2023 11:10:39 +0200 Subject: [PATCH 287/583] create separate GTest main libraries --- cmake/create_test.cmake | 41 ++--- core/test/CMakeLists.txt | 2 + core/test/gtest/CMakeLists.txt | 30 ++++ core/test/gtest/environments.hpp | 231 +++++----------------------- core/test/gtest/ginkgo_main.cpp | 5 +- core/test/gtest/ginkgo_mpi_main.cpp | 7 +- core/test/gtest/resources.cpp | 145 +++++++++++++++++ core/test/gtest/resources.hpp | 51 ++++++ cuda/test/utils.hpp | 2 +- hip/test/utils.hip.hpp | 2 +- test/utils/executor.hpp | 2 +- 11 files changed, 285 insertions(+), 233 deletions(-) create mode 100644 core/test/gtest/CMakeLists.txt create mode 100644 core/test/gtest/resources.cpp create mode 100644 core/test/gtest/resources.hpp diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index f11657ec324..baaf84f59eb 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -11,26 +11,9 @@ function(ginkgo_build_test_name test_name target_name) set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) endfunction() -function(ginkgo_create_gtest_main) - add_library(ginkgo_gtest_main "") - target_sources(ginkgo_gtest_main - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_main.cpp) - target_link_libraries(ginkgo_gtest_main PRIVATE GTest::GTest Ginkgo::ginkgo) -endfunction() - -function(ginkgo_create_gtest_mpi_main) - add_library(ginkgo_gtest_mpi_main "") - target_sources(ginkgo_gtest_mpi_main - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_mpi_main.cpp) - find_package(MPI 3.1 COMPONENTS CXX REQUIRED) - target_link_libraries(ginkgo_gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX Ginkgo::ginkgo) -endfunction() - ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES ## `MPI_SIZE size` causes the tests to be run with 
`size` MPI processes. -function(ginkgo_set_test_target_properties test_target_name) +function(ginkgo_set_test_target_properties test_target_name test_library_suffix) cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) @@ -42,16 +25,12 @@ function(ginkgo_set_test_target_properties test_target_name) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() if(GINKGO_CHECK_CIRCULAR_DEPS) - target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") + target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() if(set_properties_MPI_SIZE) - target_sources(${test_target_name} - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_mpi_main.cpp) + target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main_mpi${test_library_suffix}) else() - target_sources(${test_target_name} - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/gtest/ginkgo_main.cpp) + target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix}) endif() target_compile_features(${test_target_name} PUBLIC cxx_std_14) target_compile_options(${test_target_name} PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) @@ -156,7 +135,7 @@ function(ginkgo_create_test test_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) target_link_libraries(${test_target_name}) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE ref) endfunction(ginkgo_create_test) @@ -167,7 +146,7 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_dpcpp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE sycl) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. 
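A hedged usage sketch (test names invented; behaviour inferred from the option documentation and ginkgo_add_resource_requirement above): callers forward the resource options through these helpers, for example

    ginkgo_create_test(my_core_kernels)
    ginkgo_create_test(my_distributed_kernels MPI_SIZE 3)
    ginkgo_create_dpcpp_test(my_dpcpp_kernels RESOURCE_PERCENTAGE 50)

which, when a CTest resource spec file is in use, would request roughly the resources "cpu:1" for the reference test and "sycl:50" for the DPC++ test.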
if (MKL_ENV) @@ -201,7 +180,7 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) endif() - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu) endfunction(ginkgo_create_cuda_test_internal) @@ -257,7 +236,7 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add ${hiprand_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS} ) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) @@ -273,7 +252,7 @@ function(ginkgo_create_omp_test_internal test_name filename test_target_name) add_executable(${test_target_name} ${test_name}.cpp) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu) endfunction() @@ -328,7 +307,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_${exec}" ${ARGN}) ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} RESOURCE_TYPE ${test_resource_type}) endfunction(ginkgo_create_common_test_internal) diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index b330a493b38..776d0b72c7d 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -1,5 +1,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake) +add_subdirectory(gtest) + add_subdirectory(accessor) add_subdirectory(base) add_subdirectory(components) diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt new file mode 100644 index 00000000000..43bb6863224 --- /dev/null +++ b/core/test/gtest/CMakeLists.txt @@ -0,0 +1,30 @@ +function(add_gtest_main suffix definitions) + add_library(ginkgo_gtest_main${suffix} ginkgo_main.cpp resources.cpp) + target_link_libraries(ginkgo_gtest_main${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest) + target_compile_definitions(ginkgo_gtest_main${suffix} PRIVATE ${definitions}) + ginkgo_compile_features(ginkgo_gtest_main${suffix}) + if (GINKGO_BUILD_MPI) + add_library(ginkgo_gtest_main_mpi${suffix} ginkgo_mpi_main.cpp resources.cpp) + target_link_libraries(ginkgo_gtest_main_mpi${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest MPI::MPI_CXX) + target_compile_definitions(ginkgo_gtest_main_mpi${suffix} PRIVATE ${definitions}) + ginkgo_compile_features(ginkgo_gtest_main_mpi${suffix}) + endif() +endfunction() + +add_gtest_main("" "") +add_library(ginkgo_gtest_main_reference ALIAS ginkgo_gtest_main) +if (GINKGO_BUILD_MPI) + add_library(ginkgo_gtest_main_mpi_reference ALIAS ginkgo_gtest_main_mpi) +endif() +if (GINKGO_BUILD_OMP) + add_gtest_main("_omp" "GKO_COMPILING_OMP") +endif() +if (GINKGO_BUILD_CUDA) + 
add_gtest_main("_cuda" "GKO_COMPILING_CUDA") +endif() +if (GINKGO_BUILD_HIP) + add_gtest_main("_hip" "GKO_COMPILING_HIP") +endif() +if (GINKGO_BUILD_DPCPP) + add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP") +endif() diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 815902d71a8..89166a0594c 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -35,11 +35,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include +#include +#include +#include + + +#include "core/test/gtest/resources.hpp" + + #ifdef GKO_COMPILING_OMP #include #endif @@ -60,228 +69,68 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#include -#include -#include - - -struct ctest_resource { - int id; - int slots; -}; - - -inline char* get_ctest_group(std::string resource_type, int group_id) -{ - std::transform(resource_type.begin(), resource_type.end(), - resource_type.begin(), - [](auto c) { return std::toupper(c); }); - std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + - std::to_string(group_id) + "_" + resource_type; - return std::getenv(rs_group_env.c_str()); -} - - -inline ctest_resource parse_ctest_resources(std::string resource) -{ - std::regex re(R"(id\:(\d+),slots\:(\d+))"); - std::smatch match; - - if (!std::regex_match(resource, match, re)) { - GKO_INVALID_STATE("Can't parse ctest_resource string: " + resource); - } - - return ctest_resource{std::stoi(match[1]), std::stoi(match[2])}; -} - - -class ResourceEnvironment : public ::testing::Environment { +class DeviceEnvironment : public ::testing::Environment { public: - explicit ResourceEnvironment(int rank = 0, int size = 1) - { -#if GINKGO_BUILD_MPI - if (size > 1) { - cuda_device_id = gko::experimental::mpi::map_rank_to_device_id( - MPI_COMM_WORLD, - std::max(gko::CudaExecutor::get_num_devices(), 1)); - hip_device_id = gko::experimental::mpi::map_rank_to_device_id( - MPI_COMM_WORLD, - std::max(gko::HipExecutor::get_num_devices(), 1)); - sycl_device_id = gko::experimental::mpi::map_rank_to_device_id( - MPI_COMM_WORLD, - std::max(gko::DpcppExecutor::get_num_devices("gpu"), 1)); - } -#endif - - auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); - auto rs_count = rs_count_env ? std::stoi(rs_count_env) : 0; - if (rs_count == 0) { - if (rank == 0) { - std::cerr - << "Running without CTest ctest_resource configuration" - << std::endl; - } - return; - } - if (rs_count != size) { - GKO_INVALID_STATE("Invalid resource group count: " + - std::to_string(rs_count)); - } - - // parse CTest ctest_resource group descriptions - // OpenMP CPU threads - if (auto rs_omp_env = get_ctest_group("cpu", rank)) { - auto resource = parse_ctest_resources(rs_omp_env); - omp_threads = resource.slots; - } - // CUDA GPUs - if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) { - auto resource = parse_ctest_resources(rs_cuda_env); - cuda_device_id = resource.id; - } - // HIP GPUs - if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) { - auto resource = parse_ctest_resources(rs_hip_env); - hip_device_id = resource.id; - } - // SYCL GPUs (no other devices!) 
- if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { - auto resource = parse_ctest_resources(rs_sycl_env); - sycl_device_id = resource.id; - } - } - - static int omp_threads; - static int cuda_device_id; - static int hip_device_id; - static int sycl_device_id; -}; - + explicit DeviceEnvironment(int rank) : rank_(rank) { print_environment(); } #ifdef GKO_COMPILING_OMP - -class OmpEnvironment : public ::testing::Environment { -public: - explicit OmpEnvironment(int rank) : rank_(rank) {} - - void SetUp() override + void print_environment() const { if (ResourceEnvironment::omp_threads > 0) { omp_set_num_threads(ResourceEnvironment::omp_threads); } -#pragma omp parallel -#pragma omp single - std::cerr << "Rank " << rank_ << ": OMP threads " - << omp_get_num_threads() << std::endl; + std::stringstream ss; + ss << "Rank " << rank_ << ": OMP threads " << omp_get_max_threads() + << std::endl; + std::cerr << ss.str(); } - -private: - int rank_; -}; - -#else - - -class OmpEnvironment : public ::testing::Environment { -public: - explicit OmpEnvironment(int){}; -}; - -#endif - - -#ifdef GKO_COMPILING_CUDA - -class CudaEnvironment : public ::testing::Environment { -public: - explicit CudaEnvironment(int rank) : rank_(rank) {} - - void SetUp() override +#elif defined(GKO_COMPILING_CUDA) + void print_environment() const { auto device_id = ResourceEnvironment::cuda_device_id; - std::cerr << "Rank " << rank_ << ": CUDA device " - << gko::kernels::cuda::get_device_name(device_id) << " ID " - << device_id << std::endl; + std::stringstream ss; + ss << "Rank " << rank_ << ": CUDA device " + << gko::kernels::cuda::get_device_name(device_id) << " ID " + << device_id << std::endl; + std::cerr << ss.str(); } void TearDown() override { gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id); } - -private: - int rank_; -}; - -#else - -class CudaEnvironment : public ::testing::Environment { -public: - explicit CudaEnvironment(int){}; -}; - -#endif - - -#ifdef GKO_COMPILING_HIP - -class HipEnvironment : public ::testing::Environment { -public: - explicit HipEnvironment(int rank) : rank_(rank) {} - - void SetUp() override +#elif defined(GKO_COMPILING_HIP) + void print_environment() const { auto device_id = ResourceEnvironment::hip_device_id; - std::cerr << "Rank " << rank_ << ": HIP device " - << gko::kernels::hip::get_device_name(device_id) << " ID " - << device_id << std::endl; + std::stringstream ss; + ss << "Rank " << rank_ << ": HIP device " + << gko::kernels::hip::get_device_name(device_id) << " ID " + << device_id << std::endl; + std::cerr << ss.str(); } void TearDown() override { gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id); } - -private: - int rank_; -}; - -#else - -class HipEnvironment : public ::testing::Environment { -public: - explicit HipEnvironment(int){}; -}; - -#endif - - -#ifdef GKO_COMPILING_DPCPP - -class SyclEnvironment : public ::testing::Environment { -public: - explicit SyclEnvironment(int rank) : rank_(rank) {} - - void SetUp() override +#elif defined(GKO_COMPILING_DPCPP) + void print_environment() const { auto device_id = ResourceEnvironment::sycl_device_id; - std::cerr << "Rank " << rank_ << ": SYCL device " - << gko::kernels::dpcpp::get_device_name(device_id) << " ID " - << device_id << std::endl; + std::stringstream ss; + ss << "Rank " << rank_ << ": SYCL device " + << gko::kernels::dpcpp::get_device_name(device_id) << " ID " + << device_id << std::endl; + std::cerr << ss.str(); } +#else + void print_environment() const {} +#endif private: int 
rank_; }; -#else - -class SyclEnvironment : public ::testing::Environment { -public: - explicit SyclEnvironment(int){}; -}; - -#endif - #endif // GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index 01d1fc393c3..b8458dbc0b0 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -47,10 +47,7 @@ int main(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); ::testing::AddGlobalTestEnvironment(new ResourceEnvironment); - ::testing::AddGlobalTestEnvironment(new CudaEnvironment(0)); - ::testing::AddGlobalTestEnvironment(new HipEnvironment(0)); - ::testing::AddGlobalTestEnvironment(new SyclEnvironment(0)); - ::testing::AddGlobalTestEnvironment(new OmpEnvironment(0)); + ::testing::AddGlobalTestEnvironment(new DeviceEnvironment(0)); int result = RUN_ALL_TESTS(); return result; } diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index f7fe71981d2..c34d3c84693 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -394,10 +395,8 @@ int main(int argc, char** argv) testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); ::testing::AddGlobalTestEnvironment(new ResourceEnvironment(rank, size)); - ::testing::AddGlobalTestEnvironment(new CudaEnvironment(rank)); - ::testing::AddGlobalTestEnvironment(new HipEnvironment(rank)); - ::testing::AddGlobalTestEnvironment(new SyclEnvironment(rank)); - ::testing::AddGlobalTestEnvironment(new OmpEnvironment(rank)); + ::testing::AddGlobalTestEnvironment(new DeviceEnvironment(rank)); + MPI_Barrier(comm); ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp new file mode 100644 index 00000000000..2bd0cdc3496 --- /dev/null +++ b/core/test/gtest/resources.cpp @@ -0,0 +1,145 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/test/gtest/resources.hpp" + + +#include +#include +#include + + +#ifdef GKO_COMPILING_OMP +#include +#endif + + +#ifdef GKO_COMPILING_CUDA +#include "cuda/base/device.hpp" +#endif + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/device.hpp" +#endif + + +#if GKO_COMPILING_DPCPP +#include "dpcpp/base/device.hpp" +#endif + + +#include +#include +#include + + +struct ctest_resource { + int id; + int slots; +}; + + +char* get_ctest_group(std::string resource_type, int group_id) +{ + std::transform(resource_type.begin(), resource_type.end(), + resource_type.begin(), + [](auto c) { return std::toupper(c); }); + std::string rs_group_env = "CTEST_RESOURCE_GROUP_" + + std::to_string(group_id) + "_" + resource_type; + return std::getenv(rs_group_env.c_str()); +} + + +ctest_resource parse_ctest_resources(std::string resource) +{ + std::regex re(R"(id\:(\d+),slots\:(\d+))"); + std::smatch match; + + if (!std::regex_match(resource, match, re)) { + GKO_INVALID_STATE("Can't parse ctest_resource string: " + resource); + } + + return ctest_resource{std::stoi(match[1]), std::stoi(match[2])}; +} + + +ResourceEnvironment::ResourceEnvironment(int rank, int size) +{ +#if GINKGO_BUILD_MPI + if (size > 1) { + cuda_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, std::max(gko::CudaExecutor::get_num_devices(), 1)); + hip_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, std::max(gko::HipExecutor::get_num_devices(), 1)); + sycl_device_id = gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + std::max(gko::DpcppExecutor::get_num_devices("gpu"), 1)); + } +#endif + + auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT"); + auto rs_count = rs_count_env ? std::stoi(rs_count_env) : 0; + if (rs_count == 0) { + if (rank == 0) { + std::cerr << "Running without CTest ctest_resource configuration" + << std::endl; + } + return; + } + if (rs_count != size) { + GKO_INVALID_STATE("Invalid resource group count: " + + std::to_string(rs_count)); + } + + // parse CTest ctest_resource group descriptions + // OpenMP CPU threads + if (auto rs_omp_env = get_ctest_group("cpu", rank)) { + auto resource = parse_ctest_resources(rs_omp_env); + omp_threads = resource.slots; + } + // CUDA GPUs + if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) { + auto resource = parse_ctest_resources(rs_cuda_env); + cuda_device_id = resource.id; + } + // HIP GPUs + if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) { + auto resource = parse_ctest_resources(rs_hip_env); + hip_device_id = resource.id; + } + // SYCL GPUs (no other devices!) 
+ if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { + auto resource = parse_ctest_resources(rs_sycl_env); + sycl_device_id = resource.id; + } +} diff --git a/core/test/gtest/resources.hpp b/core/test/gtest/resources.hpp new file mode 100644 index 00000000000..a88280f29c7 --- /dev/null +++ b/core/test/gtest/resources.hpp @@ -0,0 +1,51 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_TEST_GTEST_RESOURCES_HPP_ +#define GKO_CORE_TEST_GTEST_RESOURCES_HPP_ + + +#include + + +class ResourceEnvironment : public ::testing::Environment { +public: + explicit ResourceEnvironment(int rank = 0, int size = 1); + + static int omp_threads; + static int cuda_device_id; + static int hip_device_id; + static int sycl_device_id; +}; + + +#endif // GKO_CORE_TEST_GTEST_RESOURCES_HPP_ diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 35f382806ec..6ef808aa1b3 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/gtest/environments.hpp" +#include "core/test/gtest/resources.hpp" #include "cuda/base/device.hpp" diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index d67c8935ab4..e1c9f9341fb 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/test/gtest/environments.hpp" +#include "core/test/gtest/resources.hpp" #include "hip/base/device.hpp" diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 2a8ace8e39a..e4ce56f3d7a 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/test/gtest/environments.hpp" +#include "core/test/gtest/resources.hpp" #if GINKGO_COMMON_SINGLE_MODE From 1adef1712d566b3ca313c84a9728582d6e12edbe Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 31 Aug 2023 09:20:51 +0000 Subject: [PATCH 288/583] Format files Co-authored-by: Tobias Ribizel --- core/test/gtest/environments.hpp | 1 + core/test/gtest/ginkgo_mpi_main.cpp | 4 +++- core/test/gtest/resources.cpp | 6 +++--- omp/base/executor.cpp | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 89166a0594c..78c5a40f8a5 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ #define GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_ + #include #include #include diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index c34d3c84693..12107ca55f8 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -45,10 +45,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include +#include + + #include diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp index 2bd0cdc3496..dc8ad7931a9 100644 --- a/core/test/gtest/resources.cpp +++ b/core/test/gtest/resources.cpp @@ -30,14 +30,14 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/test/gtest/resources.hpp" - - #include #include #include +#include "core/test/gtest/resources.hpp" + + #ifdef GKO_COMPILING_OMP #include #endif diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp index 3e14270ecdc..49fd1332ed5 100644 --- a/omp/base/executor.cpp +++ b/omp/base/executor.cpp @@ -49,4 +49,4 @@ int OmpExecutor::get_num_omp_threads() } -} // namespace gko \ No newline at end of file +} // namespace gko From c9fcd208f2cdf4e1cf5cc6ab7eb8cde19a2151c2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 1 Sep 2023 16:47:38 +0200 Subject: [PATCH 289/583] link against GTest main statically --- core/test/gtest/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt index 43bb6863224..c4e9cb52870 100644 --- a/core/test/gtest/CMakeLists.txt +++ b/core/test/gtest/CMakeLists.txt @@ -1,10 +1,10 @@ function(add_gtest_main suffix definitions) - add_library(ginkgo_gtest_main${suffix} ginkgo_main.cpp resources.cpp) + add_library(ginkgo_gtest_main${suffix} STATIC ginkgo_main.cpp resources.cpp) target_link_libraries(ginkgo_gtest_main${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest) target_compile_definitions(ginkgo_gtest_main${suffix} PRIVATE ${definitions}) ginkgo_compile_features(ginkgo_gtest_main${suffix}) if (GINKGO_BUILD_MPI) - add_library(ginkgo_gtest_main_mpi${suffix} ginkgo_mpi_main.cpp resources.cpp) + add_library(ginkgo_gtest_main_mpi${suffix} STATIC ginkgo_mpi_main.cpp resources.cpp) target_link_libraries(ginkgo_gtest_main_mpi${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest MPI::MPI_CXX) target_compile_definitions(ginkgo_gtest_main_mpi${suffix} PRIVATE ${definitions}) ginkgo_compile_features(ginkgo_gtest_main_mpi${suffix}) From a430a918f92e7def6a488607a1dcb00967d4060f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: 
Fri, 1 Sep 2023 16:54:33 +0200 Subject: [PATCH 290/583] use nla-gpu1 for ROCm tests as well --- .gitlab/image.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 72fb51ad372..eb1ab5128af 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -72,15 +72,13 @@ image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8 tags: - private_ci - - amdci - - gpu + - amd-gpu .use_gko-rocm502-nompi-gnu11-llvm11: image: ginkgohub/rocm:502-openmpi-gnu11-llvm11 tags: - private_ci - - amdci - - gpu + - amd-gpu .use_gko-oneapi-cpu: image: ginkgohub/oneapi:2022.1 From 1978788235e00f8b5591fb476b9048162d973fb3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Sep 2023 10:21:07 +0100 Subject: [PATCH 291/583] review updates - formatting - remove remaining occurrences of syclgpu - rename to GINKGO_CI_TEST_OMP_PARALLELISM Co-authored-by: Yuhsiang M. Tsai --- CMakeLists.txt | 2 +- cmake/create_test.cmake | 12 ++++++------ core/test/CMakeLists.txt | 1 - core/test/gtest/resources.cpp | 2 +- cuda/test/base/CMakeLists.txt | 2 +- omp/CMakeLists.txt | 2 +- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bec31a4360c..e07023bc46b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,7 @@ option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF) -set(GINKGO_TEST_OMP_PARALLELISM "4" CACHE STRING +set(GINKGO_CI_TEST_OMP_PARALLELISM "4" CACHE STRING "The number of OpenMP threads to use for a test binary during CTest resource file-constrained test.") # load executor-specific configuration diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index baaf84f59eb..50271e12c9c 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -5,10 +5,10 @@ set(gko_test_option_args "NO_RESOURCES") ## Replaces / by _ to create valid target names from relative paths function(ginkgo_build_test_name test_name target_name) - file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") - set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) + file(RELATIVE_PATH REL_BINARY_DIR + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") + set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) endfunction() ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES @@ -52,7 +52,7 @@ function(ginkgo_add_resource_requirement test_name) set(single_resource "cpu:1") elseif(add_rr_RESOURCE_TYPE STREQUAL "cpu") if(NOT add_rr_RESOURCE_LOCAL_CORES) - set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_TEST_OMP_PARALLELISM}) + set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_CI_TEST_OMP_PARALLELISM}) endif() if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+") message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORES=${add_rr_RESOURCE_LOCAL_CORES}") @@ -90,7 +90,7 @@ endfunction() ## Possible additional arguments: ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is -## $GINKGO_TEST_OMP_PARALLELISM +## $GINKGO_CI_TEST_OMP_PARALLELISM ## - `RESOURCE_PERCENTAGE` usage percentage of a single GPU, default is 25 ## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, sycl ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index 776d0b72c7d..69f7ddd749e 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -1,7 +1,6 @@ include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake) add_subdirectory(gtest) - add_subdirectory(accessor) add_subdirectory(base) add_subdirectory(components) diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp index dc8ad7931a9..0dd427b75ee 100644 --- a/core/test/gtest/resources.cpp +++ b/core/test/gtest/resources.cpp @@ -138,7 +138,7 @@ ResourceEnvironment::ResourceEnvironment(int rank, int size) hip_device_id = resource.id; } // SYCL GPUs (no other devices!) - if (auto rs_sycl_env = get_ctest_group("syclgpu", rank)) { + if (auto rs_sycl_env = get_ctest_group("sycl", rank)) { auto resource = parse_ctest_resources(rs_sycl_env); sycl_device_id = resource.id; } diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index bb99ba858a4..174f4533c52 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -4,7 +4,7 @@ ginkgo_create_cuda_test(index_set) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) -endif () +endif() ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 7f46feff5da..47259feeac0 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -5,7 +5,7 @@ target_sources(ginkgo_omp PRIVATE base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp - base/executor.cpp + base/executor.cpp base/index_set_kernels.cpp base/scoped_device_id.cpp base/version.cpp From 7b7b867be81e528938a638f2a2c2835defd7ee76 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 13 Sep 2023 10:38:15 +0200 Subject: [PATCH 292/583] use slots instead of percentages for GPUs --- cmake/create_test.cmake | 17 ++--------------- test/tools/resource_file_generator.cpp | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 50271e12c9c..375135dcb13 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,4 +1,4 @@ -set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_PERCENTAGE;RESOURCE_TYPE") +set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_TYPE") set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}") set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") set(gko_test_option_args "NO_RESOURCES") @@ -60,19 +60,7 @@ function(ginkgo_add_resource_requirement test_name) set(single_resource "cpu:${add_rr_RESOURCE_LOCAL_CORES}") elseif(add_rr_RESOURCE_TYPE MATCHES "^(cudagpu|hipgpu|sycl)$") - if(NOT add_rr_RESOURCE_PERCENTAGE) - set(add_rr_RESOURCE_PERCENTAGE 25) - endif() - if(add_rr_MPI_SIZE GREATER 1) - set(add_rr_RESOURCE_PERCENTAGE 100) - endif() - if(NOT add_rr_RESOURCE_PERCENTAGE MATCHES "^[0-9]([0-9][0-9]?)?" 
- OR add_rr_RESOURCE_PERCENTAGE LESS 0 - OR add_rr_RESOURCE_PERCENTAGE GREATER 100) - message(FATAL_ERROR "Resource specification is invalid: RESOURCE_PERCENTAGE=${add_rr_RESOURCE_PERCENTAGE}") - endif() - - set(single_resource "${add_rr_RESOURCE_TYPE}:${add_rr_RESOURCE_PERCENTAGE}") + set(single_resource "${add_rr_RESOURCE_TYPE}:1") else() message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, cudagpu, hipgpu, sycl.") endif() @@ -91,7 +79,6 @@ endfunction() ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. ## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is ## $GINKGO_CI_TEST_OMP_PARALLELISM -## - `RESOURCE_PERCENTAGE` usage percentage of a single GPU, default is 25 ## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, sycl ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index f0dbbea0353..ca7b09288e8 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -92,7 +92,7 @@ int main() if (i > 0) { gpus.append(",\n"); } - gpus += R"( {"id": ")" + std::to_string(i) + R"(", "slots": 100})"; + gpus += R"( {"id": ")" + std::to_string(i) + R"(", "slots": 1})"; } if (num_devices) { gpus.append("\n]"); From 927b443d3a06b9bc7e914caee27370cb15c38a90 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 13 Sep 2023 11:01:37 +0200 Subject: [PATCH 293/583] use non-default stream only if necessary --- test/utils/executor.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index e4ce56f3d7a..ea2aef157fd 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -127,10 +127,10 @@ class CommonTestFixture : public ::testing::Test { CommonTestFixture() : -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA) stream(ResourceEnvironment::cuda_device_id), #endif -#ifdef GKO_COMPILING_HIP +#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP) stream(ResourceEnvironment::hip_device_id), #endif ref{gko::ReferenceExecutor::create()} From 4f84679127d94408001513a2df544595d2d5bd54 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 13 Sep 2023 11:01:50 +0200 Subject: [PATCH 294/583] fix CUDA trisolve regression --- cuda/solver/common_trs_kernels.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index f42b11f510d..6ee2c7521ff 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -120,12 +120,13 @@ struct CudaSolveStruct : gko::solver::SolveStruct { const auto rows = matrix->get_size()[0]; // workaround suggested by NVIDIA engineers: for some reason // cusparse needs non-nullptr input vectors even for analysis + // also make sure they are aligned by 16 bytes auto descr_b = cusparse::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], - reinterpret_cast(0xDEAD)); + reinterpret_cast(0xDEAD0)); auto descr_c = cusparse::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], - reinterpret_cast(0xDEAF)); + reinterpret_cast(0xDEAF0)); auto work_size = cusparse::spsm_buffer_size( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, From 
f0106bce310ff9a393a6e5561fd5d6b100fec01c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 14 Sep 2023 18:13:49 +0200 Subject: [PATCH 295/583] fix nondefault stream handling --- cuda/test/utils.hpp | 2 ++ hip/test/utils.hip.hpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 6ef808aa1b3..d13e364d66a 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -52,7 +52,9 @@ class CudaTestFixture : public ::testing::Test { protected: CudaTestFixture() : ref(gko::ReferenceExecutor::create()), +#ifdef GKO_TEST_NONDEFAULT_STREAM stream(ResourceEnvironment::cuda_device_id), +#endif exec(gko::CudaExecutor::create( ResourceEnvironment::cuda_device_id, ref, std::make_shared(), stream.get())), diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index e1c9f9341fb..9fc3edc3f82 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -52,7 +52,9 @@ class HipTestFixture : public ::testing::Test { protected: HipTestFixture() : ref(gko::ReferenceExecutor::create()), +#ifdef GKO_TEST_NONDEFAULT_STREAM stream(ResourceEnvironment::hip_device_id), +#endif exec(gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref, std::make_shared(), stream.get())), From 22be0ce76aad55c9374d53da2af766077ec53dd2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 14 Sep 2023 18:14:24 +0200 Subject: [PATCH 296/583] remove `tests` target --- CMakeLists.txt | 4 ---- cmake/create_test.cmake | 2 -- 2 files changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e07023bc46b..8ac16267717 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,10 +309,6 @@ configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in # propagated to the other parts of Ginkgo in case of building as static libraries add_subdirectory(devices) # Basic device functionalities. Always compiled. 
add_subdirectory(common) # Import list of unified kernel source files -if(GINKGO_BUILD_TESTS) - # use custom target `tests` to build only test binaries - add_custom_target(tests) -endif() if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 375135dcb13..4dd6bd12125 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -100,8 +100,6 @@ function(ginkgo_add_test test_name test_target_name) COMMAND ${test_target_name} WORKING_DIRECTORY "$") endif() - # use custom target `tests` to build only test binaries - add_dependencies(tests ${test_target_name}) ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN}) From 556f6387b50684c63c15541a41766e37db804e1e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 14 Sep 2023 18:15:19 +0200 Subject: [PATCH 297/583] remove `ref` resource type This can be handled by the `-j` flag --- cmake/create_test.cmake | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 4dd6bd12125..e66bfb3178c 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -40,17 +40,11 @@ endfunction() function(ginkgo_add_resource_requirement test_name) cmake_parse_arguments(PARSE_ARGV 1 add_rr "${gko_test_option_args}" "${gko_test_single_args}" "") - if(add_rr_NO_RESOURCES) + if(add_rr_NO_RESOURCES OR (NOT add_rr_RESOURCE_TYPE)) return() - endif() - - if (NOT add_rr_RESOURCE_TYPE) - message(FATAL_ERROR "Need to provide resource type used by test.") endif () - if(add_rr_RESOURCE_TYPE STREQUAL "ref") - set(single_resource "cpu:1") - elseif(add_rr_RESOURCE_TYPE STREQUAL "cpu") + if(add_rr_RESOURCE_TYPE STREQUAL "cpu") if(NOT add_rr_RESOURCE_LOCAL_CORES) set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_CI_TEST_OMP_PARALLELISM}) endif() @@ -79,7 +73,7 @@ endfunction() ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is ## $GINKGO_CI_TEST_OMP_PARALLELISM -## - `RESOURCE_TYPE` the resource type, can be ref, cpu, cudagpu, hipgpu, sycl +## - `RESOURCE_TYPE` the resource type, can be cpu, cudagpu, hipgpu, sycl ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths @@ -121,7 +115,7 @@ function(ginkgo_create_test test_name) add_executable(${test_target_name} ${test_name}.cpp) target_link_libraries(${test_target_name}) ginkgo_set_test_target_properties(${test_target_name} "" ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE ref) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) endfunction(ginkgo_create_test) ## Test compiled with dpcpp @@ -263,7 +257,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) return() endif() if (exec STREQUAL reference) - set(test_resource_type ref) + set(test_resource_type "") elseif (exec STREQUAL omp) set(test_resource_type cpu) elseif (exec STREQUAL cuda) From dc0c474fe0f0c336a63e209210f0a1fa794f3371 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 14 Sep 2023 18:15:55 +0200 Subject: [PATCH 298/583] move more tests to host compiler --- cuda/test/base/CMakeLists.txt | 10 +++++----- cuda/test/base/{array.cu => array.cpp} | 0 cuda/test/base/{index_set.cu => index_set.cpp} | 0 cuda/test/base/{lin_op.cu => lin_op.cpp} | 0 cuda/test/base/{memory.cu => memory.cpp} | 0 hip/test/base/CMakeLists.txt | 8 ++++---- hip/test/base/{lin_op.hip.cpp => lin_op.cpp} | 0 hip/test/base/{memory.hip.cpp => memory.cpp} | 0 hip/test/matrix/CMakeLists.txt | 2 +- .../{fbcsr_kernels.hip.cpp => fbcsr_kernels.cpp} | 0 hip/test/solver/CMakeLists.txt | 4 ++-- hip/test/utils/CMakeLists.txt | 2 +- .../{assertions_test.hip.cpp => assertions_test.cpp} | 0 13 files changed, 13 insertions(+), 13 deletions(-) rename cuda/test/base/{array.cu => array.cpp} (100%) rename cuda/test/base/{index_set.cu => index_set.cpp} (100%) rename cuda/test/base/{lin_op.cu => lin_op.cpp} (100%) rename cuda/test/base/{memory.cu => memory.cpp} (100%) rename hip/test/base/{lin_op.hip.cpp => lin_op.cpp} (100%) rename hip/test/base/{memory.hip.cpp => memory.cpp} (100%) rename hip/test/matrix/{fbcsr_kernels.hip.cpp => fbcsr_kernels.cpp} (100%) rename hip/test/utils/{assertions_test.hip.cpp => assertions_test.cpp} (100%) diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index 174f4533c52..d4260c6e934 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -1,13 +1,13 @@ -ginkgo_create_cuda_test(array) +ginkgo_create_test(array RESOURCE_TYPE cudagpu) ginkgo_create_cuda_test(cuda_executor) -ginkgo_create_cuda_test(index_set) +ginkgo_create_test(index_set RESOURCE_TYPE cudagpu) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) endif() ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) -ginkgo_create_cuda_test(lin_op) +ginkgo_create_test(lin_op RESOURCE_TYPE cudagpu) ginkgo_create_cuda_test(math) -ginkgo_create_cuda_test(memory) -ginkgo_create_cuda_test(scoped_device_id NO_RESOURCES) +ginkgo_create_test(memory RESOURCE_TYPE cudagpu) +ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/array.cu b/cuda/test/base/array.cpp similarity 
index 100% rename from cuda/test/base/array.cu rename to cuda/test/base/array.cpp diff --git a/cuda/test/base/index_set.cu b/cuda/test/base/index_set.cpp similarity index 100% rename from cuda/test/base/index_set.cu rename to cuda/test/base/index_set.cpp diff --git a/cuda/test/base/lin_op.cu b/cuda/test/base/lin_op.cpp similarity index 100% rename from cuda/test/base/lin_op.cu rename to cuda/test/base/lin_op.cpp diff --git a/cuda/test/base/memory.cu b/cuda/test/base/memory.cpp similarity index 100% rename from cuda/test/base/memory.cu rename to cuda/test/base/memory.cpp diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index ed32ab5b6a7..bfe8c8be96a 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -1,18 +1,18 @@ ginkgo_create_hip_test(hip_executor) -ginkgo_create_test(index_set) +ginkgo_create_test(index_set RESOURCE_TYPE hipgpu) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_hip_test(hip_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) endif() ginkgo_create_hip_test(kernel_launch) # correct flags for kernel_launch.hpp are set in GINKGO_HIPCC_OPTIONS -ginkgo_create_hip_test(lin_op) +ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu) ginkgo_create_hip_test(math) -ginkgo_create_hip_test(memory) +ginkgo_create_test(memory RESOURCE_TYPE hipgpu) # Only hcc needs the libraries. nvcc only requires the headers. if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) else() ginkgo_create_hip_test(exception_helpers) endif() -ginkgo_create_hip_test(scoped_device_id NO_RESOURCES) +ginkgo_create_hip_test(scoped_device_id) diff --git a/hip/test/base/lin_op.hip.cpp b/hip/test/base/lin_op.cpp similarity index 100% rename from hip/test/base/lin_op.hip.cpp rename to hip/test/base/lin_op.cpp diff --git a/hip/test/base/memory.hip.cpp b/hip/test/base/memory.cpp similarity index 100% rename from hip/test/base/memory.hip.cpp rename to hip/test/base/memory.cpp diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt index 82db4b8b376..a52069daea0 100644 --- a/hip/test/matrix/CMakeLists.txt +++ b/hip/test/matrix/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_hip_test(fbcsr_kernels) +ginkgo_create_test(fbcsr_kernels RESOURCE_TYPE hipgpu) if (hipfft_FOUND) ginkgo_create_hip_test(fft_kernels) endif() diff --git a/hip/test/matrix/fbcsr_kernels.hip.cpp b/hip/test/matrix/fbcsr_kernels.cpp similarity index 100% rename from hip/test/matrix/fbcsr_kernels.hip.cpp rename to hip/test/matrix/fbcsr_kernels.cpp diff --git a/hip/test/solver/CMakeLists.txt b/hip/test/solver/CMakeLists.txt index a3b86589410..fcbb3de0c47 100644 --- a/hip/test/solver/CMakeLists.txt +++ b/hip/test/solver/CMakeLists.txt @@ -1,2 +1,2 @@ -ginkgo_create_test(lower_trs_kernels) -ginkgo_create_test(upper_trs_kernels) +ginkgo_create_test(lower_trs_kernels RESOURCE_TYPE hipgpu) +ginkgo_create_test(upper_trs_kernels RESOURCE_TYPE hipgpu) diff --git a/hip/test/utils/CMakeLists.txt b/hip/test/utils/CMakeLists.txt index a6c52f65d9c..d9ec2ff29a7 100644 --- a/hip/test/utils/CMakeLists.txt +++ b/hip/test/utils/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_hip_test(assertions_test) +ginkgo_create_test(assertions_test RESOURCE_TYPE hipgpu) diff --git a/hip/test/utils/assertions_test.hip.cpp b/hip/test/utils/assertions_test.cpp similarity index 100% rename from hip/test/utils/assertions_test.hip.cpp rename to hip/test/utils/assertions_test.cpp From 
346d6230a5e2e305c8cfea4b6c51a2b7f07b460c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 15 Sep 2023 18:03:07 +0200 Subject: [PATCH 299/583] Update cmake/create_test.cmake Co-authored-by: Yu-Hsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com> --- cmake/create_test.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index e66bfb3178c..1911276f61d 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -56,7 +56,7 @@ function(ginkgo_add_resource_requirement test_name) elseif(add_rr_RESOURCE_TYPE MATCHES "^(cudagpu|hipgpu|sycl)$") set(single_resource "${add_rr_RESOURCE_TYPE}:1") else() - message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: ref, cpu, cudagpu, hipgpu, sycl.") + message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: cpu, cudagpu, hipgpu, sycl.") endif() if(NOT add_rr_MPI_SIZE) From 78485c2bd085dbb680c3f907bf18bdd81423e847 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 21 Sep 2023 11:34:23 +0200 Subject: [PATCH 300/583] review updates - make more tests host-compiled - make GTest main library suffix more descriptive - more consistent formatting --- cmake/create_test.cmake | 2 +- core/test/gtest/CMakeLists.txt | 2 ++ core/test/gtest/ginkgo_mpi_main.cpp | 2 +- cuda/test/reorder/CMakeLists.txt | 2 +- cuda/test/reorder/{rcm_kernels.cu => rcm_kernels.cpp} | 0 omp/test/reorder/CMakeLists.txt | 2 +- 6 files changed, 6 insertions(+), 4 deletions(-) rename cuda/test/reorder/{rcm_kernels.cu => rcm_kernels.cpp} (100%) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 1911276f61d..cec47fced74 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -114,7 +114,7 @@ function(ginkgo_create_test test_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) target_link_libraries(${test_target_name}) - ginkgo_set_test_target_properties(${test_target_name} "" ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_cpu" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) endfunction(ginkgo_create_test) diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt index c4e9cb52870..6d77b663e84 100644 --- a/core/test/gtest/CMakeLists.txt +++ b/core/test/gtest/CMakeLists.txt @@ -13,8 +13,10 @@ endfunction() add_gtest_main("" "") add_library(ginkgo_gtest_main_reference ALIAS ginkgo_gtest_main) +add_library(ginkgo_gtest_main_cpu ALIAS ginkgo_gtest_main) if (GINKGO_BUILD_MPI) add_library(ginkgo_gtest_main_mpi_reference ALIAS ginkgo_gtest_main_mpi) + add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi) endif() if (GINKGO_BUILD_OMP) add_gtest_main("_omp" "GKO_COMPILING_OMP") diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index 12107ca55f8..6853a12c940 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -395,7 +395,7 @@ int main(int argc, char** argv) MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); - testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); + ::testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment); ::testing::AddGlobalTestEnvironment(new ResourceEnvironment(rank, size)); ::testing::AddGlobalTestEnvironment(new DeviceEnvironment(rank)); MPI_Barrier(comm); diff --git a/cuda/test/reorder/CMakeLists.txt b/cuda/test/reorder/CMakeLists.txt index 
e6cd8c0f5d2..79deba957b3 100644 --- a/cuda/test/reorder/CMakeLists.txt +++ b/cuda/test/reorder/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_cuda_test(rcm_kernels) +ginkgo_create_test(rcm_kernels RESOURCE_TYPE cudagpu) diff --git a/cuda/test/reorder/rcm_kernels.cu b/cuda/test/reorder/rcm_kernels.cpp similarity index 100% rename from cuda/test/reorder/rcm_kernels.cu rename to cuda/test/reorder/rcm_kernels.cpp diff --git a/omp/test/reorder/CMakeLists.txt b/omp/test/reorder/CMakeLists.txt index 089e51c67c9..65aea4a0fdb 100644 --- a/omp/test/reorder/CMakeLists.txt +++ b/omp/test/reorder/CMakeLists.txt @@ -1 +1 @@ -ginkgo_create_omp_test(rcm_kernels) +ginkgo_create_test(rcm_kernels RESOURCE_TYPE cpu) From 963a19752cbe5f209b084557ab43f18d81465041 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 23 Sep 2023 17:34:53 +0200 Subject: [PATCH 301/583] fix unused-argument warning for unsafe-atomic flag --- cmake/hip.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index e1897b42c9c..72a7a3a86d8 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -207,7 +207,7 @@ set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILIN set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) - list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics) + list(APPEND GINKGO_HIP_CLANG_OPTIONS "-munsafe-fp-atomics -Wno-unused-command-line-argument") endif() # HIP's cmake support secretly carries around global state to remember # whether we created any shared libraries, and sets PIC flags accordingly. From 985a836153949957580d9a9ffdbfec8779fe0c47 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 24 Sep 2023 21:53:59 +0200 Subject: [PATCH 302/583] use non-deprecated headers for HIP libs --- benchmark/utils/hip_linops.hip.cpp | 3 --- hip/base/exception.hip.cpp | 6 ++++++ hip/base/hipblas_bindings.hip.hpp | 5 +++++ hip/base/hiprand_bindings.hip.hpp | 5 +++++ hip/base/hipsparse_bindings.hip.hpp | 5 +++++ hip/base/hipsparse_block_bindings.hip.hpp | 5 +++++ hip/base/pointer_mode_guard.hip.hpp | 5 +++++ hip/base/roctx.hip.cpp | 4 ++++ hip/base/types.hip.hpp | 4 ++++ hip/matrix/fft_kernels.hip.cpp | 5 +++++ hip/solver/common_trs_kernels.hip.hpp | 4 ++++ hip/solver/lower_trs_kernels.hip.cpp | 4 ++++ hip/solver/upper_trs_kernels.hip.cpp | 4 ++++ hip/test/base/exception_helpers.hip.cpp | 6 ++++++ hip/test/matrix/fft_kernels.hip.cpp | 5 +++++ 15 files changed, 67 insertions(+), 3 deletions(-) diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index 627dfad980e..c8664778e02 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -36,9 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index 19a2b3739ac..7a182963f74 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -37,9 +37,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#if HIP_VERSION >= 50200000 +#include +#include +#include +#else #include #include #include +#endif #include diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index 2ff73c81e34..63751aa725a 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 900433af339..14e144f6d84 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_ +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp index 90378d3c711..322467dc2b3 100644 --- a/hip/base/hipsparse_bindings.hip.hpp +++ b/hip/base/hipsparse_bindings.hip.hpp @@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp index bc6c28394eb..49ef1e86c7d 100644 --- a/hip/base/hipsparse_block_bindings.hip.hpp +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index 681839ec9e2..11fa5afeb9e 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -38,8 +38,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if HIP_VERSION >= 50200000 +#include +#include +#else #include #include +#endif #include diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 23b07e60254..6e2d93b3a06 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -37,8 +37,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX +#if HIP_VERSION >= 50200000 +#include +#else #include #endif +#endif #include diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 93ae3646a4c..c886378ec80 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -43,7 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index 238aeddc40f..56c967d9e49 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -36,7 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index 643c875561e..6cf2ca516f2 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -39,7 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 2e9dd0d0ce3..283f5ee5284 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -37,7 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index a3c6070614c..09e71826130 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -37,7 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp index 29dea03961f..7014738bd76 100644 --- a/hip/test/base/exception_helpers.hip.cpp +++ b/hip/test/base/exception_helpers.hip.cpp @@ -34,9 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if HIP_VERSION >= 50200000 +#include +#include +#include +#else #include #include #include +#endif #include diff --git a/hip/test/matrix/fft_kernels.hip.cpp b/hip/test/matrix/fft_kernels.hip.cpp index 59c24492b5b..8c213df8ad5 100644 --- a/hip/test/matrix/fft_kernels.hip.cpp +++ b/hip/test/matrix/fft_kernels.hip.cpp @@ -33,7 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include +#if HIP_VERSION >= 50200000 +#include +#else #include +#endif #include From 089c46f577717add5e121f5fd9ff4815204e72c7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 28 Sep 2023 11:24:15 +0200 Subject: [PATCH 303/583] fix HIP Jacobi transposition compilation --- .../preconditioner/jacobi_kernels.hpp.inc | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index 8827a47620b..c63a644f87b 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -145,8 +145,8 @@ __global__ void agglomerate_supervariables_kernel( } -template +template __global__ void __launch_bounds__(warps_per_block* config::warp_size) transpose_jacobi(const ValueType* __restrict__ blocks, preconditioner::block_interleaved_storage_scheme @@ -154,8 +154,7 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size) const IndexType* __restrict__ block_ptrs, size_type num_blocks, ValueType* __restrict__ out_blocks) { - const auto block_id = - thread::get_subwarp_id(); + const auto block_id = thread::get_subwarp_id_flat(); const auto subwarp = group::tiled_partition(group::this_thread_block()); if (block_id >= num_blocks) { @@ -176,18 +175,16 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size) } -template -__global__ void -__launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi( +template +__global__ void adaptive_transpose_jacobi( const ValueType* __restrict__ blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, ValueType* __restrict__ out_blocks) { - const auto block_id = - thread::get_subwarp_id(); + const auto block_id = thread::get_subwarp_id_flat(); const auto subwarp = group::tiled_partition(group::this_thread_block()); if (block_id >= num_blocks) { @@ -197,23 +194,23 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi( const auto block_stride = storage_scheme.get_stride(); const auto rank = subwarp.thread_rank(); - if (rank < block_size) { - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - auto local_block = - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id); - auto local_out_block = - reinterpret_cast( - out_blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id); - for (IndexType i = 0; i < block_size; ++i) { - auto val = local_block[i * block_stride + rank]; - local_out_block[i + rank * block_stride] = - conjugate ? conj(val) : val; - }); - } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + auto local_block = + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + auto local_out_block = + reinterpret_cast( + out_blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + for (int i = rank; i < block_size * block_size; i += subwarp_size) { + int row = i % block_size; + int col = i / block_size; + auto val = local_block[row + col * block_stride]; + local_out_block[row * block_stride + col] = + conjugate ? 
conj(val) : val; + }); } @@ -313,18 +310,16 @@ void transpose_jacobi( constexpr int blocks_per_warp = config::warp_size / subwarp_size; const auto grid_size = ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + const auto block_size = subwarp_size * blocks_per_warp * warps_per_block; if (grid_size > 0) { if (block_precisions) { - adaptive_transpose_jacobi + adaptive_transpose_jacobi <<get_stream()>>>( as_device_type(blocks), storage_scheme, block_precisions, block_pointers, num_blocks, as_device_type(out_blocks)); } else { - transpose_jacobi + transpose_jacobi <<get_stream()>>>( as_device_type(blocks), storage_scheme, block_pointers, num_blocks, as_device_type(out_blocks)); From 2afdc4525ff0d0b998f51edbbf4a1738a29fc3db Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 28 Sep 2023 11:35:35 +0200 Subject: [PATCH 304/583] avoid changing the thread indexing scheme --- .../preconditioner/jacobi_kernels.hpp.inc | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index c63a644f87b..2a0f7bd0dd7 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -145,8 +145,8 @@ __global__ void agglomerate_supervariables_kernel( } -template +template __global__ void __launch_bounds__(warps_per_block* config::warp_size) transpose_jacobi(const ValueType* __restrict__ blocks, preconditioner::block_interleaved_storage_scheme @@ -154,7 +154,8 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size) const IndexType* __restrict__ block_ptrs, size_type num_blocks, ValueType* __restrict__ out_blocks) { - const auto block_id = thread::get_subwarp_id_flat(); + const auto block_id = + thread::get_subwarp_id(); const auto subwarp = group::tiled_partition(group::this_thread_block()); if (block_id >= num_blocks) { @@ -175,16 +176,18 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size) } -template -__global__ void adaptive_transpose_jacobi( +template +__global__ void +__launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi( const ValueType* __restrict__ blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, ValueType* __restrict__ out_blocks) { - const auto block_id = thread::get_subwarp_id_flat(); + const auto block_id = + thread::get_subwarp_id(); const auto subwarp = group::tiled_partition(group::this_thread_block()); if (block_id >= num_blocks) { @@ -310,16 +313,18 @@ void transpose_jacobi( constexpr int blocks_per_warp = config::warp_size / subwarp_size; const auto grid_size = ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const auto block_size = subwarp_size * blocks_per_warp * warps_per_block; + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); if (grid_size > 0) { if (block_precisions) { - adaptive_transpose_jacobi + adaptive_transpose_jacobi <<get_stream()>>>( as_device_type(blocks), storage_scheme, block_precisions, block_pointers, num_blocks, as_device_type(out_blocks)); } else { - transpose_jacobi + transpose_jacobi <<get_stream()>>>( as_device_type(blocks), storage_scheme, block_pointers, num_blocks, as_device_type(out_blocks)); From 6698a995327f508e853d22b31cbe2df195d7e781 Mon Sep 17 
00:00:00 2001 From: Tobias Ribizel Date: Thu, 28 Sep 2023 16:34:44 +0200 Subject: [PATCH 305/583] fix SLURM GPU allocation --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 94dedd030c6..6185608864f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -295,7 +295,7 @@ test/cuda110/nompi/clang/cuda/release/static: variables: USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}" SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" + SLURM_GRES: "gpu:4" SLURM_TIME: "01:30:00" dependencies: null needs: [ "build/cuda110/nompi/clang/cuda/release/static" ] @@ -329,7 +329,7 @@ test/cuda110/nompi/intel/cuda/debug/static: variables: USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}" SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" + SLURM_GRES: "gpu:4" SLURM_TIME: "02:00:00" dependencies: null needs: [ "build/cuda110/nompi/intel/cuda/debug/static" ] From d610c526b09b7f71b2781cfc3f00bf657cc02589 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 4 Oct 2023 13:06:56 +0200 Subject: [PATCH 306/583] fix ell accessor type --- common/cuda_hip/matrix/ell_kernels.hpp.inc | 24 +++++++------ cuda/matrix/ell_kernels.cu | 6 ++-- dpcpp/matrix/ell_kernels.dp.cpp | 40 +++++++++++++--------- hip/matrix/ell_kernels.hip.cpp | 6 ++-- 4 files changed, 45 insertions(+), 31 deletions(-) diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index 6c81fb4964c..e7bcac351cb 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -43,13 +43,14 @@ __device__ void spmv_kernel( acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. 
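// Host-side sketch of the accumulation rule introduced in this ELL SpMV patch:
// products are accumulated in the accessor's arithmetic_type (the highest
// precision among matrix, input and output value types) and cast back to
// OutputValueType only once at the end. The decltype below merely stands in
// for Ginkgo's highest_precision trait; this is an illustration, not the
// actual kernel.
template <typename MatrixValue, typename InputValue, typename OutputValue>
OutputValue mixed_precision_dot_sketch(const MatrixValue* a,
                                       const InputValue* b, int n)
{
    // accumulate in the widest of the three value types
    using arithmetic_type =
        decltype(MatrixValue{} * InputValue{} + OutputValue{});
    auto temp = arithmetic_type{};
    for (int i = 0; i < n; ++i) {
        temp += static_cast<arithmetic_type>(a[i]) *
                static_cast<arithmetic_type>(b[i]);
    }
    // a single rounding step back to the output precision
    return static_cast<OutputValue>(temp);
}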
It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -69,13 +70,13 @@ __device__ void spmv_kernel( const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; __shared__ uninitialized_array< - OutputValueType, default_block_size / num_thread_per_worker> + arithmetic_type, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = 0; } __syncthreads(); - auto temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -114,7 +115,9 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }); + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }); } @@ -128,7 +131,8 @@ __global__ __launch_bounds__(default_block_size) void spmv( const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -139,16 +143,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }); } } diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 124a4deda75..7b20236827e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -122,10 +122,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 65fad771140..4817b9a5991 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -120,16 +120,17 @@ void spmv_kernel( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op, sycl::nd_item<3> 
item_ct1, - uninitialized_array& storage) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -150,11 +151,11 @@ void spmv_kernel( const auto step_size = num_worker_per_row * num_thread_per_worker; if (runnable && idx_in_worker == 0) { - storage[item_ct1.get_local_id(2)] = 0; + storage[item_ct1.get_local_id(2)] = zero(); } item_ct1.barrier(sycl::access::fence_space::local_space); - auto temp = zero(); + auto temp = zero(); if (runnable) { for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; @@ -193,13 +194,15 @@ void spmv( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }, + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }, item_ct1, storage); } @@ -214,7 +217,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -239,10 +242,11 @@ void spmv( const size_type num_stored_elements_per_row, acc::range b, const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -253,17 +257,17 @@ void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }, item_ct1, storage); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }, item_ct1, storage); } @@ -281,7 +285,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -316,10 +320,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; 
+ gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index db9d5aa11bb..1567548463f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -125,10 +125,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - acc::reduced_row_major<2, OutputValueType, const InputValueType>; + acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); From 4bd96dd57efbd5fd177f6a500c0e577a215b3bc7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 May 2023 19:30:29 +0200 Subject: [PATCH 307/583] omit .on(exec) in factory factory parameters --- include/ginkgo/core/base/lin_op.hpp | 12 +- include/ginkgo/core/preconditioner/ic.hpp | 109 +++++++++++++-- include/ginkgo/core/preconditioner/ilu.hpp | 151 +++++++++++++++++++-- include/ginkgo/core/solver/bicg.hpp | 27 +--- include/ginkgo/core/solver/bicgstab.hpp | 26 +--- include/ginkgo/core/solver/cb_gmres.hpp | 27 +--- include/ginkgo/core/solver/cg.hpp | 27 +--- include/ginkgo/core/solver/cgs.hpp | 27 +--- include/ginkgo/core/solver/direct.hpp | 53 +++++++- include/ginkgo/core/solver/fcg.hpp | 27 +--- include/ginkgo/core/solver/gcr.hpp | 30 +--- include/ginkgo/core/solver/gmres.hpp | 35 ++--- include/ginkgo/core/solver/idr.hpp | 24 +--- include/ginkgo/core/solver/ir.hpp | 73 ++++++++-- include/ginkgo/core/solver/multigrid.hpp | 12 +- include/ginkgo/core/solver/solver_base.hpp | 149 ++++++++++++++++++++ 16 files changed, 552 insertions(+), 257 deletions(-) diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index c06c43bbb6e..20d7771822f 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -1119,8 +1119,7 @@ public: \ mutable _name{__VA_ARGS__}; \ \ template \ - auto with_##_name(Args&&... _value) \ - const->const std::decay_t& \ + auto with_##_name(Args&&... _value)->std::decay_t& \ { \ using type = decltype(this->_name); \ this->_name = type{std::forward(_value)...}; \ @@ -1170,8 +1169,7 @@ public: \ mutable _name{__VA_ARGS__}; \ \ template \ - auto with_##_name(Args&&... _value) \ - const->const std::decay_t& \ + auto with_##_name(Args&&... _value)->std::decay_t& \ { \ GKO_NOT_IMPLEMENTED; \ return *this; \ @@ -1184,8 +1182,7 @@ public: \ mutable _name{_default}; \ \ template \ - auto with_##_name(Arg&& _value) \ - const->const std::decay_t& \ + auto with_##_name(Arg&& _value)->std::decay_t& \ { \ using type = decltype(this->_name); \ this->_name = type{std::forward(_value)}; \ @@ -1199,8 +1196,7 @@ public: \ mutable _name{__VA_ARGS__}; \ \ template \ - auto with_##_name(Args&&... _value) \ - const->const std::decay_t& \ + auto with_##_name(Args&&... 
_value)->std::decay_t& \ { \ using type = decltype(this->_name); \ this->_name = type{std::forward(_value)...}; \ diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index aa19a004dc1..cb00119582a 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -119,19 +119,106 @@ class Ic : public EnableLinOp>, public Transposable { using index_type = IndexType; using transposed_type = Ic; - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { + class Factory; + + struct parameters_type + : public enable_parameters_type { /** * Factory for the L solver */ - std::shared_ptr - GKO_FACTORY_PARAMETER_SCALAR(l_solver_factory, nullptr); + std::shared_ptr + l_solver_factory{}; /** * Factory for the factorization */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - factorization_factory, nullptr); + std::shared_ptr factorization_factory{}; + + [[deprecated("use with_l_solver instead")]] parameters_type& + with_l_solver_factory( + std::shared_ptr factory) + { + return with_l_solver(std::move(factory)); + } + + parameters_type& with_l_solver( + std::shared_ptr factory) + { + this->l_solver_generator = + [factory](std::shared_ptr) + -> std::shared_ptr { + return factory; + }; + return *this; + } + + template ().on( + std::shared_ptr{}))> + parameters_type& with_l_solver(SolverParameters parameters) + { + this->l_solver_generator = + [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + return *this; + } + + [[deprecated("use with_factorization instead")]] parameters_type& + with_factorization_factory(std::shared_ptr factory) + { + return with_factorization(std::move(factory)); + } + + parameters_type& with_factorization( + std::shared_ptr factory) + { + this->factorization_generator = + [factory](std::shared_ptr) + -> std::shared_ptr { return factory; }; + return *this; + } + + template < + typename FactorizationParameters, + typename = decltype(std::declval().on( + std::shared_ptr{}))> + parameters_type& with_factorization(FactorizationParameters parameters) + { + this->factorization_generator = + [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + return *this; + } + + /** + * + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto parameters_copy = *this; + if (l_solver_generator) { + parameters_copy.l_solver_factory = l_solver_generator(exec); + } + if (factorization_generator) { + parameters_copy.factorization_factory = + factorization_generator(exec); + } + return parameters_copy + .enable_parameters_type::on(exec); + } + + private: + std::function( + std::shared_ptr)> + l_solver_generator; + + std::function( + std::shared_ptr)> + factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory); @@ -365,12 +452,10 @@ class Ic : public EnableLinOp>, public Transposable { static_cast(mtx->get_size()[0])}; return SolverType::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(default_max_iters) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(default_reduce_residual) - .on(exec)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(default_max_iters), + gko::stop::ResidualNorm::build() + .with_reduction_factor(default_reduce_residual)) .on(exec) ->generate(mtx); } diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 7db9d19c7c2..bd6d665b009 100644 --- 
a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -131,25 +131,150 @@ class Ilu : public EnableLinOp< Ilu; - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { + class Factory; + + struct parameters_type + : public enable_parameters_type { /** * Factory for the L solver */ - std::shared_ptr - GKO_FACTORY_PARAMETER_SCALAR(l_solver_factory, nullptr); + std::shared_ptr + l_solver_factory{}; /** * Factory for the U solver */ - std::shared_ptr - GKO_FACTORY_PARAMETER_SCALAR(u_solver_factory, nullptr); + std::shared_ptr + u_solver_factory{}; /** * Factory for the factorization */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - factorization_factory, nullptr); + std::shared_ptr factorization_factory{}; + + [[deprecated("use with_l_solver instead")]] parameters_type& + with_l_solver_factory( + std::shared_ptr factory) + { + return with_l_solver(std::move(factory)); + } + + parameters_type& with_l_solver( + std::shared_ptr factory) + { + this->l_solver_generator = + [factory](std::shared_ptr) + -> std::shared_ptr { + return factory; + }; + return *this; + } + + template ().on( + std::shared_ptr{}))> + parameters_type& with_l_solver(SolverParameters parameters) + { + this->l_solver_generator = + [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + return *this; + } + + [[deprecated("use with_u_solver instead")]] parameters_type& + with_u_solver_factory( + std::shared_ptr factory) + { + return with_u_solver(std::move(factory)); + } + + parameters_type& with_u_solver( + std::shared_ptr factory) + { + this->u_solver_generator = + [factory](std::shared_ptr) + -> std::shared_ptr { + return factory; + }; + return *this; + } + + template ().on( + std::shared_ptr{}))> + parameters_type& with_u_solver(SolverParameters parameters) + { + this->u_solver_generator = + [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + return *this; + } + + [[deprecated("use with_factorization instead")]] parameters_type& + with_factorization_factory(std::shared_ptr factory) + { + return with_factorization(std::move(factory)); + } + + parameters_type& with_factorization( + std::shared_ptr factory) + { + this->factorization_generator = + [factory](std::shared_ptr) + -> std::shared_ptr { return factory; }; + return *this; + } + + template < + typename FactorizationParameters, + typename = decltype(std::declval().on( + std::shared_ptr{}))> + parameters_type& with_factorization(FactorizationParameters parameters) + { + this->factorization_generator = + [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + return *this; + } + + /** + * + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto parameters_copy = *this; + if (l_solver_generator) { + parameters_copy.l_solver_factory = l_solver_generator(exec); + } + if (u_solver_generator) { + parameters_copy.u_solver_factory = u_solver_generator(exec); + } + if (factorization_generator) { + parameters_copy.factorization_factory = + factorization_generator(exec); + } + return parameters_copy + .enable_parameters_type::on(exec); + } + + private: + std::function( + std::shared_ptr)> + l_solver_generator; + + std::function( + std::shared_ptr)> + u_solver_generator; + + std::function( + std::shared_ptr)> + factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory); @@ -393,12 +518,10 @@ class Ilu : public EnableLinOp< static_cast(mtx->get_size()[0])}; return SolverType::build() - 
.with_criteria(gko::stop::Iteration::build() - .with_max_iters(default_max_iters) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(default_reduce_residual) - .on(exec)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(default_max_iters), + gko::stop::ResidualNorm::build() + .with_reduction_factor(default_reduce_residual)) .on(exec) ->generate(mtx); } diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index c7b47a0e807..205be85df6c 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -99,27 +99,12 @@ class Bicg */ bool apply_uses_initial_guess() const override { return true; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; + GKO_ENABLE_LIN_OP_FACTORY(Bicg, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index 214e669b2ff..58d76c5e0df 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -98,27 +98,11 @@ class Bicgstab */ bool apply_uses_initial_guess() const override { return true; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; + GKO_ENABLE_LIN_OP_FACTORY(Bicgstab, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index a2dbb1efce1..9bf4cf91a76 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -153,38 +153,23 @@ class CbGmres : public EnableLinOp>, return parameters_.storage_precision; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> { /** * Determines which storage type is used. */ cb_gmres::storage_precision GKO_FACTORY_PARAMETER_SCALAR( storage_precision, cb_gmres::storage_precision::reduce1); - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. 
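// Usage sketch of the interface these parameter changes enable (a minimal
// example under assumptions: `exec` and `A` are caller-provided, the numeric
// values are illustrative): nested criteria and preconditioner parameters no
// longer need their own .on(exec), only the outermost builder is bound to an
// executor.
#include <memory>

#include <ginkgo/ginkgo.hpp>

std::unique_ptr<gko::solver::Cg<double>> make_cg_sketch(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<const gko::LinOp> A)
{
    return gko::solver::Cg<double>::build()
        .with_criteria(
            gko::stop::Iteration::build().with_max_iters(100u),
            gko::stop::ResidualNorm<double>::build().with_reduction_factor(
                1e-8))
        .with_preconditioner(gko::preconditioner::Jacobi<double>::build())
        .on(exec)
        ->generate(A);
}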
- */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - /** * Krylov dimension factory. */ size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 100u); }; + GKO_ENABLE_LIN_OP_FACTORY(CbGmres, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index bc0861cf270..c0fff29fedd 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -92,27 +92,12 @@ class Cg : public EnableLinOp>, */ bool apply_uses_initial_guess() const override { return true; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; + GKO_ENABLE_LIN_OP_FACTORY(Cg, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index 22f81d8a292..57a834b0ead 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -90,27 +90,12 @@ class Cgs */ bool apply_uses_initial_guess() const override { return true; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; + GKO_ENABLE_LIN_OP_FACTORY(Cgs, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/direct.hpp b/include/ginkgo/core/solver/direct.hpp index 4a9a69731be..f66546cd2ec 100644 --- a/include/ginkgo/core/solver/direct.hpp +++ b/include/ginkgo/core/solver/direct.hpp @@ -74,8 +74,9 @@ class Direct : public EnableLinOp>, std::unique_ptr conj_transpose() const override; - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { + class Factory; + + struct parameters_type : enable_parameters_type { /** * Number of right hand sides. * @@ -86,8 +87,52 @@ class Direct : public EnableLinOp>, gko::size_type GKO_FACTORY_PARAMETER_SCALAR(num_rhs, 1u); /** The factorization factory to use for generating the factors. 
*/ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - factorization, nullptr); + std::shared_ptr factorization; + + /** + * + */ + parameters_type& with_factorization( + std::shared_ptr factorization) + { + this->factorization_generator = + [factorization](std::shared_ptr) + -> std::shared_ptr { + return factorization; + }; + return *this; + } + + template < + typename FactorizationParameters, + typename = decltype(std::declval().on( + std::shared_ptr{}))> + parameters_type& with_factorization( + FactorizationParameters factorization_parameters) + { + this->factorization_generator = + [factorization_parameters](std::shared_ptr exec) + -> std::shared_ptr { + return factorization_parameters.on(exec); + }; + return *this; + } + + /** + * + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto parameters_copy = *this; + parameters_copy.factorization = factorization_generator(exec); + return parameters_copy + .enable_parameters_type::on(exec); + } + + private: + std::function( + std::shared_ptr)> + factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Direct, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index cad7a29fc27..b6715f07512 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -98,27 +98,12 @@ class Fcg */ bool apply_uses_initial_guess() const override { return true; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; + GKO_ENABLE_LIN_OP_FACTORY(Fcg, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/gcr.hpp b/include/ginkgo/core/solver/gcr.hpp index fdc95d30c8f..8dc68e6e33d 100644 --- a/include/ginkgo/core/solver/gcr.hpp +++ b/include/ginkgo/core/solver/gcr.hpp @@ -108,30 +108,12 @@ class Gcr */ void set_krylov_dim(size_type other) { parameters_.krylov_dim = other; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - - /** - * Krylov dimension factory. - */ + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> { + /** Krylov subspace dimension/restart value. 
*/ size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 0u); }; GKO_ENABLE_LIN_OP_FACTORY(Gcr, parameters, Factory); diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index d7d0f57a8a4..0ea056c9333 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -109,35 +109,16 @@ class Gmres */ void set_krylov_dim(size_type other) { parameters_.krylov_dim = other; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - - /** - * Krylov dimension factory. - */ + + class Factory; + + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> { + /** Krylov subspace dimension/restart value. */ size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 0u); - /** - * Flexible GMRES - */ + /** Flexible GMRES */ bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false); }; GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory); diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index fc677f33171..a7b8af31bf4 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -180,27 +180,11 @@ class Idr parameters_.complex_subspace = other; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); + class Factory; + struct parameters_type + : enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> { /** * Dimension of the subspace S. Determines how many intermediate * residuals are computed in each iteration. diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index c5c69c1fb67..792a0cdcfc6 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -177,26 +177,20 @@ class Ir : public EnableLinOp>, */ Ir(Ir&&); - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); + class Factory; + struct parameters_type + : enable_iterative_solver_factory_parameters { /** * Inner solver factory. */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - solver, nullptr); + std::shared_ptr solver{}; /** * Already generated solver. If one is provided, the factory `solver` * will be ignored. 
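// Sketch of an IR smoother configured through the new deferred parameters, a
// simplified variant of the build_smoother helper touched in this patch;
// executor, inner solver choice and the numeric values are illustrative
// assumptions, not the library defaults.
#include <memory>

#include <ginkgo/ginkgo.hpp>

std::shared_ptr<gko::LinOpFactory> make_smoother_sketch(
    std::shared_ptr<const gko::Executor> exec)
{
    return gko::solver::Ir<double>::build()
        // the inner solver is passed as an un-instantiated builder and only
        // bound to the executor by the outer .on(exec)
        .with_solver(gko::preconditioner::Jacobi<double>::build()
                         .with_max_block_size(1u))
        .with_relaxation_factor(0.9)
        .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
        .on(exec);
}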
*/ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_solver, nullptr); + std::shared_ptr generated_solver{}; /** * Relaxation factor for Richardson iteration @@ -210,6 +204,57 @@ class Ir : public EnableLinOp>, */ initial_guess_mode GKO_FACTORY_PARAMETER_SCALAR( default_initial_guess, initial_guess_mode::provided); + + /** + * + */ + parameters_type& with_solver(std::shared_ptr solver) + { + this->solver_generator = [solver](std::shared_ptr) + -> std::shared_ptr { return solver; }; + return *this; + } + + template ().on( + std::shared_ptr{}))> + parameters_type& with_solver(SolverParameters solver_parameters) + { + this->solver_generator = + [solver_parameters](std::shared_ptr exec) + -> std::shared_ptr { + return solver_parameters.on(exec); + }; + return *this; + } + + /** + * + */ + parameters_type& with_generated_solver( + std::shared_ptr generated_solver) + { + this->generated_solver = std::move(generated_solver); + return *this; + } + + /** + * + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto parameters_copy = *this; + if (solver_generator) { + parameters_copy.solver = solver_generator(exec); + } + return parameters_copy.enable_iterative_solver_factory_parameters< + parameters_type, Factory>::on(exec); + } + + private: + std::function( + std::shared_ptr)> + solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -319,8 +364,7 @@ auto build_smoother(std::shared_ptr factory, return Ir::build() .with_solver(factory) .with_relaxation_factor(relaxation_factor) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(iteration).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(iteration)) .on(exec); } @@ -344,8 +388,7 @@ auto build_smoother(std::shared_ptr solver, return Ir::build() .with_generated_solver(solver) .with_relaxation_factor(relaxation_factor) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(iteration).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(iteration)) .on(exec); } diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 2d04a889445..c78e54a773d 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -215,14 +215,12 @@ class Multigrid : public EnableLinOp, */ void set_cycle(multigrid::cycle cycle) { parameters_.cycle = cycle; } - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); + class Factory; + + struct parameters_type + : public enable_iterative_solver_factory_parameters { /** * MultigridLevel Factory list */ diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 53909337554..f527978c200 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -35,11 +35,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include +#include #include #include #include @@ -859,6 +861,153 @@ class EnablePreconditionedIterativeSolver }; +struct iterative_solver_factory_parameters { + /** + * Stopping criteria to be used by the solver. 
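// Sketch of what the deferred criteria handling in this hunk makes possible:
// with_criteria can mix an already instantiated criterion factory with a
// builder that is only bound to the executor in the final .on(exec). Solver,
// criteria and values here are illustrative assumptions.
#include <chrono>
#include <memory>

#include <ginkgo/ginkgo.hpp>

std::shared_ptr<gko::LinOpFactory> make_bicgstab_sketch(
    std::shared_ptr<const gko::Executor> exec)
{
    // an already instantiated criterion factory ...
    std::shared_ptr<const gko::stop::CriterionFactory> time_limit =
        gko::stop::Time::build()
            .with_time_limit(std::chrono::seconds(10))
            .on(exec);
    // ... mixed with a builder that is instantiated only by the final .on(exec)
    return gko::solver::Bicgstab<double>::build()
        .with_criteria(time_limit,
                       gko::stop::Iteration::build().with_max_iters(1000u))
        .on(exec);
}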
+ */ + std::vector> criteria{}; +}; + + +template +struct enable_iterative_solver_factory_parameters + : enable_parameters_type, + iterative_solver_factory_parameters { + /** + * Provides stopping criteria via stop::CriterionFactory instances to be + * used by the iterative solver in a fluent interface. + */ + template + Parameters& with_criteria(Args... value) + { + this->criterion_generators = {build_generator(std::move(value))...}; + return *self(); + } + + /** + * @copydoc enable_solver_factory_parameters::on + * + * @note This variant instantiates stopping criteria that were provided + * without calling `.on(exec)` before generating the factory. + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto copy = *self(); + copy.criteria.clear(); + for (auto& generator : criterion_generators) { + copy.criteria.push_back(generator(exec)); + } + auto factory = + copy.enable_parameters_type::on(exec); + return factory; + } + +private: + GKO_ENABLE_SELF(Parameters); + + std::function( + std::shared_ptr)> + build_generator(std::shared_ptr criterion) + { + return + [criterion](std::shared_ptr) { return criterion; }; + } + + template ().on( + std::shared_ptr{}))> + std::function( + std::shared_ptr)> + build_generator(CriterionParameters criterion_parameters) + { + return [criterion_parameters](std::shared_ptr exec) { + return criterion_parameters.on(exec); + }; + } + + std::vector( + std::shared_ptr)>> + criterion_generators; +}; + + +struct preconditioned_iterative_solver_factory_parameters { + /** + * The preconditioner to be used by the iterative solver. By default, no + * preconditioner is used. + */ + std::shared_ptr preconditioner{nullptr}; + + /** + * Already generated preconditioner. If one is provided, the factory + * `preconditioner` will be ignored. 
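// Sketch contrasting the two preconditioner entry points declared here (exec
// and A are assumed to be caller-provided, values are illustrative):
// with_preconditioner takes a factory or factory parameters, while
// with_generated_preconditioner takes a preconditioner already generated for
// the system matrix, so the solver skips generation entirely.
#include <memory>

#include <ginkgo/ginkgo.hpp>

std::unique_ptr<gko::solver::Cg<double>> make_cg_with_generated_sketch(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<const gko::LinOp> A)
{
    // generate the preconditioner up front for the system matrix A ...
    std::shared_ptr<const gko::LinOp> jacobi =
        gko::preconditioner::Jacobi<double>::build().on(exec)->generate(A);
    // ... and hand the generated operator (not a factory) to the solver
    return gko::solver::Cg<double>::build()
        .with_generated_preconditioner(jacobi)
        .with_criteria(gko::stop::Iteration::build().with_max_iters(50u))
        .on(exec)
        ->generate(A);
}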
+ */ + std::shared_ptr generated_preconditioner{nullptr}; +}; + + +template +struct enable_preconditioned_iterative_solver_factory_parameters + : enable_iterative_solver_factory_parameters, + preconditioned_iterative_solver_factory_parameters { + /** + * + */ + Parameters& with_preconditioner( + std::shared_ptr preconditioner) + { + this->preconditioner_generator = + [preconditioner](std::shared_ptr) + -> std::shared_ptr { return preconditioner; }; + return *self(); + } + + template ().on( + std::shared_ptr{}))> + Parameters& with_preconditioner( + PreconditionerParameters preconditioner_parameters) + { + this->preconditioner_generator = + [preconditioner_parameters](std::shared_ptr exec) + -> std::shared_ptr { + return preconditioner_parameters.on(exec); + }; + return *self(); + } + + /** + * + */ + Parameters& with_generated_preconditioner( + std::shared_ptr generated_preconditioner) + { + this->generated_preconditioner = std::move(generated_preconditioner); + return *self(); + } + + /** + * + */ + std::unique_ptr on(std::shared_ptr exec) const + { + auto parameters_copy = *self(); + if (preconditioner_generator) { + parameters_copy.preconditioner = preconditioner_generator(exec); + } + return parameters_copy.enable_iterative_solver_factory_parameters< + Parameters, Factory>::on(exec); + } + +private: + GKO_ENABLE_SELF(Parameters); + + std::function( + std::shared_ptr)> + preconditioner_generator; +}; + + } // namespace solver } // namespace gko From dd10f5f96ad600b28bcedaef04d61bdcb57f4fbd Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 11 May 2023 12:21:55 +0200 Subject: [PATCH 308/583] simplify deferred factory creation --- include/ginkgo/core/base/abstract_factory.hpp | 54 +++++++++++ include/ginkgo/core/preconditioner/ic.hpp | 61 +++---------- include/ginkgo/core/preconditioner/ilu.hpp | 91 ++++--------------- include/ginkgo/core/solver/ir.hpp | 25 +---- include/ginkgo/core/solver/solver_base.hpp | 55 ++--------- 5 files changed, 100 insertions(+), 186 deletions(-) diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 1c5043c186f..3609a3f1205 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -274,6 +274,60 @@ class enable_parameters_type { }; +template +class deferred_factory_parameter { +public: + deferred_factory_parameter() = default; + + template >::value>* = nullptr> + deferred_factory_parameter(std::shared_ptr factory) + { + generator_ = + [factory = std::shared_ptr(std::move(factory))]( + std::shared_ptr) { return factory; }; + } + + template >::value>* = nullptr> + deferred_factory_parameter( + std::unique_ptr factory) + { + generator_ = + [factory = std::shared_ptr(std::move(factory))]( + std::shared_ptr) { return factory; }; + } + + template ().on( + std::shared_ptr{}))> + deferred_factory_parameter(ParametersType parameters) + { + generator_ = [parameters](std::shared_ptr exec) + -> std::shared_ptr { + return parameters.on(exec); + }; + } + + std::shared_ptr on( + std::shared_ptr exec) const + { + return generator_(exec); + } + + explicit operator bool() const { return bool(generator_); } + +private: + std::function( + std::shared_ptr)> + generator_; +}; + + } // namespace gko diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index cb00119582a..ed5063d403b 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -136,61 +136,29 @@ class Ic : 
public EnableLinOp>, public Transposable { [[deprecated("use with_l_solver instead")]] parameters_type& with_l_solver_factory( - std::shared_ptr factory) + deferred_factory_parameter solver) { - return with_l_solver(std::move(factory)); + return with_l_solver(std::move(solver)); } parameters_type& with_l_solver( - std::shared_ptr factory) + deferred_factory_parameter solver) { - this->l_solver_generator = - [factory](std::shared_ptr) - -> std::shared_ptr { - return factory; - }; - return *this; - } - - template ().on( - std::shared_ptr{}))> - parameters_type& with_l_solver(SolverParameters parameters) - { - this->l_solver_generator = - [parameters](std::shared_ptr exec) - -> std::shared_ptr { - return parameters.on(exec); - }; + this->l_solver_generator = std::move(solver); return *this; } [[deprecated("use with_factorization instead")]] parameters_type& - with_factorization_factory(std::shared_ptr factory) + with_factorization_factory( + deferred_factory_parameter factorization) { - return with_factorization(std::move(factory)); + return with_factorization(std::move(factorization)); } parameters_type& with_factorization( - std::shared_ptr factory) - { - this->factorization_generator = - [factory](std::shared_ptr) - -> std::shared_ptr { return factory; }; - return *this; - } - - template < - typename FactorizationParameters, - typename = decltype(std::declval().on( - std::shared_ptr{}))> - parameters_type& with_factorization(FactorizationParameters parameters) + deferred_factory_parameter factorization) { - this->factorization_generator = - [parameters](std::shared_ptr exec) - -> std::shared_ptr { - return parameters.on(exec); - }; + this->factorization_generator = std::move(factorization); return *this; } @@ -201,24 +169,21 @@ class Ic : public EnableLinOp>, public Transposable { { auto parameters_copy = *this; if (l_solver_generator) { - parameters_copy.l_solver_factory = l_solver_generator(exec); + parameters_copy.l_solver_factory = l_solver_generator.on(exec); } if (factorization_generator) { parameters_copy.factorization_factory = - factorization_generator(exec); + factorization_generator.on(exec); } return parameters_copy .enable_parameters_type::on(exec); } private: - std::function( - std::shared_ptr)> + deferred_factory_parameter l_solver_generator; - std::function( - std::shared_ptr)> - factorization_generator; + deferred_factory_parameter factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory); diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index bd6d665b009..f4f8d0abd5b 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -154,92 +154,43 @@ class Ilu : public EnableLinOp< [[deprecated("use with_l_solver instead")]] parameters_type& with_l_solver_factory( - std::shared_ptr factory) + deferred_factory_parameter solver) { - return with_l_solver(std::move(factory)); + return with_l_solver(std::move(solver)); } parameters_type& with_l_solver( - std::shared_ptr factory) + deferred_factory_parameter solver) { - this->l_solver_generator = - [factory](std::shared_ptr) - -> std::shared_ptr { - return factory; - }; - return *this; - } - - template ().on( - std::shared_ptr{}))> - parameters_type& with_l_solver(SolverParameters parameters) - { - this->l_solver_generator = - [parameters](std::shared_ptr exec) - -> std::shared_ptr { - return parameters.on(exec); - }; + this->l_solver_generator = std::move(solver); return *this; } [[deprecated("use with_u_solver 
instead")]] parameters_type& with_u_solver_factory( - std::shared_ptr factory) + deferred_factory_parameter solver) { - return with_u_solver(std::move(factory)); + return with_u_solver(std::move(solver)); } parameters_type& with_u_solver( - std::shared_ptr factory) - { - this->u_solver_generator = - [factory](std::shared_ptr) - -> std::shared_ptr { - return factory; - }; - return *this; - } - - template ().on( - std::shared_ptr{}))> - parameters_type& with_u_solver(SolverParameters parameters) + deferred_factory_parameter solver) { - this->u_solver_generator = - [parameters](std::shared_ptr exec) - -> std::shared_ptr { - return parameters.on(exec); - }; + this->u_solver_generator = std::move(solver); return *this; } [[deprecated("use with_factorization instead")]] parameters_type& - with_factorization_factory(std::shared_ptr factory) + with_factorization_factory( + deferred_factory_parameter factorization) { - return with_factorization(std::move(factory)); + return with_factorization(std::move(factorization)); } parameters_type& with_factorization( - std::shared_ptr factory) - { - this->factorization_generator = - [factory](std::shared_ptr) - -> std::shared_ptr { return factory; }; - return *this; - } - - template < - typename FactorizationParameters, - typename = decltype(std::declval().on( - std::shared_ptr{}))> - parameters_type& with_factorization(FactorizationParameters parameters) + deferred_factory_parameter factorization) { - this->factorization_generator = - [parameters](std::shared_ptr exec) - -> std::shared_ptr { - return parameters.on(exec); - }; + this->factorization_generator = std::move(factorization); return *this; } @@ -250,31 +201,27 @@ class Ilu : public EnableLinOp< { auto parameters_copy = *this; if (l_solver_generator) { - parameters_copy.l_solver_factory = l_solver_generator(exec); + parameters_copy.l_solver_factory = l_solver_generator.on(exec); } if (u_solver_generator) { - parameters_copy.u_solver_factory = u_solver_generator(exec); + parameters_copy.u_solver_factory = u_solver_generator.on(exec); } if (factorization_generator) { parameters_copy.factorization_factory = - factorization_generator(exec); + factorization_generator.on(exec); } return parameters_copy .enable_parameters_type::on(exec); } private: - std::function( - std::shared_ptr)> + deferred_factory_parameter l_solver_generator; - std::function( - std::shared_ptr)> + deferred_factory_parameter u_solver_generator; - std::function( - std::shared_ptr)> - factorization_generator; + deferred_factory_parameter factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory); diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 792a0cdcfc6..d30fd9d69bc 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -208,23 +208,10 @@ class Ir : public EnableLinOp>, /** * */ - parameters_type& with_solver(std::shared_ptr solver) + parameters_type& with_solver( + deferred_factory_parameter solver) { - this->solver_generator = [solver](std::shared_ptr) - -> std::shared_ptr { return solver; }; - return *this; - } - - template ().on( - std::shared_ptr{}))> - parameters_type& with_solver(SolverParameters solver_parameters) - { - this->solver_generator = - [solver_parameters](std::shared_ptr exec) - -> std::shared_ptr { - return solver_parameters.on(exec); - }; + this->solver_generator = std::move(solver); return *this; } @@ -245,16 +232,14 @@ class Ir : public EnableLinOp>, { auto parameters_copy = *this; if (solver_generator) { - 
parameters_copy.solver = solver_generator(exec); + parameters_copy.solver = solver_generator.on(exec); } return parameters_copy.enable_iterative_solver_factory_parameters< parameters_type, Factory>::on(exec); } private: - std::function( - std::shared_ptr)> - solver_generator; + deferred_factory_parameter solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index f527978c200..2ed7375ac76 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -880,7 +880,9 @@ struct enable_iterative_solver_factory_parameters template Parameters& with_criteria(Args... value) { - this->criterion_generators = {build_generator(std::move(value))...}; + this->criterion_generators = { + deferred_factory_parameter{ + std::move(value)}...}; return *self(); } @@ -895,7 +897,7 @@ struct enable_iterative_solver_factory_parameters auto copy = *self(); copy.criteria.clear(); for (auto& generator : criterion_generators) { - copy.criteria.push_back(generator(exec)); + copy.criteria.push_back(generator.on(exec)); } auto factory = copy.enable_parameters_type::on(exec); @@ -905,28 +907,7 @@ struct enable_iterative_solver_factory_parameters private: GKO_ENABLE_SELF(Parameters); - std::function( - std::shared_ptr)> - build_generator(std::shared_ptr criterion) - { - return - [criterion](std::shared_ptr) { return criterion; }; - } - - template ().on( - std::shared_ptr{}))> - std::function( - std::shared_ptr)> - build_generator(CriterionParameters criterion_parameters) - { - return [criterion_parameters](std::shared_ptr exec) { - return criterion_parameters.on(exec); - }; - } - - std::vector( - std::shared_ptr)>> + std::vector> criterion_generators; }; @@ -954,25 +935,9 @@ struct enable_preconditioned_iterative_solver_factory_parameters * */ Parameters& with_preconditioner( - std::shared_ptr preconditioner) - { - this->preconditioner_generator = - [preconditioner](std::shared_ptr) - -> std::shared_ptr { return preconditioner; }; - return *self(); - } - - template ().on( - std::shared_ptr{}))> - Parameters& with_preconditioner( - PreconditionerParameters preconditioner_parameters) + deferred_factory_parameter preconditioner) { - this->preconditioner_generator = - [preconditioner_parameters](std::shared_ptr exec) - -> std::shared_ptr { - return preconditioner_parameters.on(exec); - }; + this->preconditioner_generator = std::move(preconditioner); return *self(); } @@ -993,7 +958,7 @@ struct enable_preconditioned_iterative_solver_factory_parameters { auto parameters_copy = *self(); if (preconditioner_generator) { - parameters_copy.preconditioner = preconditioner_generator(exec); + parameters_copy.preconditioner = preconditioner_generator.on(exec); } return parameters_copy.enable_iterative_solver_factory_parameters< Parameters, Factory>::on(exec); @@ -1002,9 +967,7 @@ struct enable_preconditioned_iterative_solver_factory_parameters private: GKO_ENABLE_SELF(Parameters); - std::function( - std::shared_ptr)> - preconditioner_generator; + deferred_factory_parameter preconditioner_generator; }; From d6895fdc267edb92ead75b38e1cbbb9667bdb508 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 8 Aug 2023 16:43:05 +0200 Subject: [PATCH 309/583] handle Multigrid --- include/ginkgo/core/base/abstract_factory.hpp | 5 + include/ginkgo/core/solver/multigrid.hpp | 117 ++++++++++++++++-- include/ginkgo/core/solver/solver_base.hpp | 4 +- 3 
files changed, 113 insertions(+), 13 deletions(-) diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 3609a3f1205..e8ec803b480 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -279,6 +279,8 @@ class deferred_factory_parameter { public: deferred_factory_parameter() = default; + deferred_factory_parameter(std::nullptr_t) {} + template on( std::shared_ptr exec) const { + if (!(*this)) { + GKO_NOT_SUPPORTED(*this); + } return generator_(exec); } diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index c78e54a773d..5aab788f71f 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -224,8 +225,16 @@ class Multigrid : public EnableLinOp, /** * MultigridLevel Factory list */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(mg_level, nullptr); + std::vector> mg_level{nullptr}; + + template + parameters_type& with_mg_level(Args&&... level) + { + this->mg_level_generator = { + deferred_factory_parameter{ + std::forward(level)}...}; + return *this; + } /** * Custom selector size_type (size_type level, const LinOp* fine_matrix) @@ -256,6 +265,7 @@ class Multigrid : public EnableLinOp, std::function GKO_FACTORY_PARAMETER_SCALAR(level_selector, nullptr); + using smoother_list = std::vector>; /** * Pre-smooth Factory list. * Its size must be 0, 1 or be the same as mg_level's. @@ -270,17 +280,14 @@ class Multigrid : public EnableLinOp, * If any element in the vector is a `nullptr` then the smoother * application at the corresponding level is skipped. */ - using smoother_list = std::vector>; - smoother_list GKO_FACTORY_PARAMETER_VECTOR(pre_smoother, - smoother_list{}); + smoother_list pre_smoother{}; /** * Post-smooth Factory list. * It is similar to Pre-smooth Factory list. It is ignored if * the factory parameter post_uses_pre is set to true. */ - smoother_list GKO_FACTORY_PARAMETER_VECTOR(post_smoother, - smoother_list{}); + smoother_list post_smoother{}; /** * Mid-smooth Factory list. If it contains available elements, multigrid @@ -289,8 +296,34 @@ class Multigrid : public EnableLinOp, * Pre-smooth Factory list. It is ignored if the factory parameter * mid_case is not mid. */ - smoother_list GKO_FACTORY_PARAMETER_VECTOR(mid_smoother, - smoother_list{}); + smoother_list mid_smoother{}; + + template + parameters_type& with_pre_smoother(Args&&... smoother) + { + this->pre_smoother_generator = { + deferred_factory_parameter{ + std::forward(smoother)}...}; + return *this; + } + + template + parameters_type& with_post_smoother(Args&&... smoother) + { + this->post_smoother_generator = { + deferred_factory_parameter{ + std::forward(smoother)}...}; + return *this; + } + + template + parameters_type& with_mid_smoother(Args&&... smoother) + { + this->mid_smoother_generator = { + deferred_factory_parameter{ + std::forward(smoother)}...}; + return *this; + } /** * Whether post-smoothing-related calls use corresponding @@ -330,8 +363,17 @@ class Multigrid : public EnableLinOp, * If not set, then a direct LU solver will be used as solver on the * coarsest level. */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(coarsest_solver, nullptr); + std::vector> coarsest_solver{ + nullptr}; + + template + parameters_type& with_coarsest_solver(Args&&... 
solver) + { + this->coarsest_solver_generator = { + deferred_factory_parameter{ + std::forward(solver)}...}; + return *this; + } /** * Custom coarsest_solver selector @@ -403,6 +445,59 @@ class Multigrid : public EnableLinOp, */ initial_guess_mode GKO_FACTORY_PARAMETER_SCALAR( default_initial_guess, initial_guess_mode::zero); + + std::unique_ptr on(std::shared_ptr exec) const + { + auto copy = *this; + if (!copy.mg_level_generator.empty()) { + copy.mg_level.clear(); + for (auto& generator : copy.mg_level_generator) { + copy.mg_level.push_back(generator.on(exec)); + } + } + if (!copy.pre_smoother_generator.empty()) { + copy.pre_smoother.clear(); + for (auto& generator : copy.pre_smoother_generator) { + copy.pre_smoother.push_back(generator ? generator.on(exec) + : nullptr); + } + } + if (!copy.mid_smoother_generator.empty()) { + copy.mid_smoother.clear(); + for (auto& generator : copy.mid_smoother_generator) { + copy.mid_smoother.push_back(generator ? generator.on(exec) + : nullptr); + } + } + if (!copy.post_smoother_generator.empty()) { + copy.post_smoother.clear(); + for (auto& generator : copy.post_smoother_generator) { + copy.post_smoother.push_back(generator ? generator.on(exec) + : nullptr); + } + } + if (!copy.coarsest_solver_generator.empty()) { + copy.coarsest_solver.clear(); + for (auto& generator : copy.coarsest_solver_generator) { + copy.coarsest_solver.push_back( + generator ? generator.on(exec) : nullptr); + } + } + return copy.enable_iterative_solver_factory_parameters< + parameters_type, Factory>::on(exec); + } + + private: + std::vector> + mg_level_generator; + std::vector> + pre_smoother_generator; + std::vector> + mid_smoother_generator; + std::vector> + post_smoother_generator; + std::vector> + coarsest_solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Multigrid, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 2ed7375ac76..f9132426c61 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -878,11 +878,11 @@ struct enable_iterative_solver_factory_parameters * used by the iterative solver in a fluent interface. */ template - Parameters& with_criteria(Args... value) + Parameters& with_criteria(Args&&... value) { this->criterion_generators = { deferred_factory_parameter{ - std::move(value)}...}; + std::forward(value)}...}; return *self(); } From 05fad8e878439e8cb8f68d11f4715d36e75e4160 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 May 2023 19:46:12 +0200 Subject: [PATCH 310/583] remove unnecessary .on(...) 
calls --- benchmark/solver/solver_common.hpp | 9 +-- benchmark/utils/overhead_linop.hpp | 27 ++------ benchmark/utils/preconditioners.hpp | 45 +++++++------- core/distributed/preconditioner/schwarz.cpp | 4 +- core/preconditioner/isai.cpp | 10 ++- core/solver/multigrid.cpp | 14 ++--- core/test/log/convergence.cpp | 3 +- core/test/log/profiler_hook.cpp | 3 +- core/test/log/record.cpp | 9 +-- core/test/log/stream.cpp | 9 +-- .../distributed/preconditioner/schwarz.cpp | 14 ++--- core/test/preconditioner/ic.cpp | 7 +-- core/test/preconditioner/ilu.cpp | 12 ++-- core/test/solver/bicg.cpp | 5 +- core/test/solver/bicgstab.cpp | 5 +- core/test/solver/cb_gmres.cpp | 11 ++-- core/test/solver/cg.cpp | 5 +- core/test/solver/cgs.cpp | 5 +- core/test/solver/fcg.cpp | 5 +- core/test/solver/gcr.cpp | 20 +++--- core/test/solver/gmres.cpp | 11 ++-- core/test/solver/idr.cpp | 5 +- core/test/solver/ir.cpp | 14 ++--- core/test/solver/multigrid.cpp | 32 +++++----- .../adaptiveprecision-blockjacobi.cpp | 16 ++--- examples/cb-gmres/cb-gmres.cpp | 20 +++--- examples/custom-logger/custom-logger.cpp | 8 +-- .../custom-matrix-format.cpp | 10 ++- .../custom-stopping-criterion.cpp | 13 ++-- .../distributed-solver/distributed-solver.cpp | 22 +++---- .../external-lib-interfacing.cpp | 8 +-- examples/ginkgo-overhead/ginkgo-overhead.cpp | 3 +- examples/heat-equation/heat-equation.cpp | 5 +- .../ilu-preconditioned-solver.cpp | 8 +-- .../inverse-iteration/inverse-iteration.cpp | 8 +-- .../ir-ilu-preconditioned-solver.cpp | 12 ++-- .../iterative-refinement.cpp | 16 ++--- examples/kokkos_assembly/kokkos_assembly.cpp | 12 ++-- .../minimal-cuda-solver.cpp | 8 +-- .../mixed-multigrid-preconditioned-solver.cpp | 18 ++---- .../mixed-multigrid-solver.cpp | 20 +++--- .../mixed-precision-ir/mixed-precision-ir.cpp | 10 ++- ...igrid-preconditioned-solver-customized.cpp | 6 +- .../multigrid-preconditioned-solver.cpp | 15 ++--- .../nine-pt-stencil-solver.cpp | 10 ++- examples/papi-logging/papi-logging.cpp | 8 +-- .../performance-debugging.cpp | 3 +- examples/poisson-solver/poisson-solver.cpp | 12 ++-- .../preconditioned-solver.cpp | 10 ++- .../preconditioner-export.cpp | 36 +++++------ .../simple-solver-logging.cpp | 5 +- examples/simple-solver/simple-solver.cpp | 8 +-- .../three-pt-stencil-solver.cpp | 10 ++- .../distributed/preconditioner/schwarz.hpp | 22 ++++++- reference/test/preconditioner/ilu.cpp | 48 ++++++-------- .../test/preconditioner/isai_kernels.cpp | 9 +-- reference/test/reorder/scaled_reordered.cpp | 6 +- reference/test/solver/bicg_kernels.cpp | 31 ++++------ reference/test/solver/bicgstab_kernels.cpp | 31 ++++------ reference/test/solver/cb_gmres_kernels.cpp | 17 ++--- reference/test/solver/cg_kernels.cpp | 32 ++++------ reference/test/solver/cgs_kernels.cpp | 27 +++----- reference/test/solver/direct.cpp | 3 +- reference/test/solver/fcg_kernels.cpp | 31 ++++------ reference/test/solver/gcr_kernels.cpp | 33 ++++------ reference/test/solver/gmres_kernels.cpp | 22 +++---- reference/test/solver/idr_kernels.cpp | 32 ++++------ reference/test/solver/ir_kernels.cpp | 15 ++--- reference/test/solver/multigrid_kernels.cpp | 42 +++++-------- test/mpi/preconditioner/schwarz.cpp | 16 +++-- test/mpi/solver/solver.cpp | 20 ++---- test/solver/bicg_kernels.cpp | 32 ++++------ test/solver/bicgstab_kernels.cpp | 10 ++- test/solver/cg_kernels.cpp | 16 ++--- test/solver/cgs_kernels.cpp | 10 ++- test/solver/direct.cpp | 24 +++---- test/solver/fcg_kernels.cpp | 16 ++--- test/solver/gcr_kernels.cpp | 10 ++- test/solver/gmres_kernels.cpp | 10 ++- 
test/solver/idr_kernels.cpp | 18 ++---- test/solver/ir_kernels.cpp | 62 ++++++------------- test/solver/solver.cpp | 25 +++----- test/test_install/test_install.cpp | 19 +++--- 83 files changed, 517 insertions(+), 796 deletions(-) diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 0248ab8e757..784b70eca61 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -239,21 +239,18 @@ std::unique_ptr generate_solver( return gko::experimental::solver::Direct::build() .with_factorization( gko::experimental::factorization::Cholesky::build() - .on(exec)) + itype>::build()) .on(exec); } else if (description == "symm_direct") { return gko::experimental::solver::Direct::build() .with_factorization( gko::experimental::factorization::Lu::build() - .with_symmetric_sparsity(true) - .on(exec)) + .with_symmetric_sparsity(true)) .on(exec); } else if (description == "direct") { return gko::experimental::solver::Direct::build() .with_factorization( - gko::experimental::factorization::Lu::build().on( - exec)) + gko::experimental::factorization::Lu::build()) .on(exec); } else if (description == "overhead") { return add_criteria_precond_finalize>( diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp index 168e650234d..d947b8de38e 100644 --- a/benchmark/utils/overhead_linop.hpp +++ b/benchmark/utils/overhead_linop.hpp @@ -104,27 +104,12 @@ class Overhead : public EnableLinOp>, friend class EnablePolymorphicObject; public: - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. 
- */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : public gko::solver:: + enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; GKO_ENABLE_LIN_OP_FACTORY(Overhead, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp index 466d5f2d3f9..3450eb71b44 100644 --- a/benchmark/utils/preconditioners.hpp +++ b/benchmark/utils/preconditioners.hpp @@ -122,7 +122,7 @@ const std::map( .on(exec)); return gko::preconditioner::Ic, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parict", @@ -137,7 +137,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parilu", @@ -150,7 +150,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parilut", @@ -165,7 +165,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"ic", @@ -174,7 +174,7 @@ const std::map( gko::factorization::Ic::build().on(exec)); return gko::preconditioner::Ic, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"ilu", @@ -184,7 +184,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"paric-isai", @@ -201,8 +201,8 @@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"parict-isai", @@ -221,8 +221,8 @@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"parilu-isai", @@ -244,9 +244,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, {"parilut-isai", @@ -270,9 +270,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, {"ic-isai", @@ -286,8 +286,8 @@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"ilu-isai", @@ -306,9 +306,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, 
{"general-isai", @@ -326,8 +326,7 @@ const std::map( {"overhead", [](std::shared_ptr exec) { return gko::Overhead::build() .with_criteria(gko::stop::ResidualNorm::build() - .with_reduction_factor(rc_etype{}) - .on(exec)) + .with_reduction_factor(rc_etype{})) .on(exec); }}}; diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 31c57947704..0d1267bc0b4 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -102,8 +102,8 @@ template void Schwarz::generate( std::shared_ptr system_matrix) { - if (parameters_.local_solver_factory) { - this->local_solver_ = parameters_.local_solver_factory->generate( + if (parameters_.local_solver) { + this->local_solver_ = parameters_.local_solver->generate( as>( system_matrix) diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp index 52fa9140853..4e0e2ea95d8 100644 --- a/core/preconditioner/isai.cpp +++ b/core/preconditioner/isai.cpp @@ -230,17 +230,15 @@ void Isai::generate_inverse( excess_solver_factory = Gmres::build() .with_preconditioner( - Bj::build().with_max_block_size(32u).on(exec)) + Bj::build().with_max_block_size(32u)) .with_criteria( - gko::stop::Iteration::build() - .with_max_iters(excess_dim) - .on(exec), + gko::stop::Iteration::build().with_max_iters( + excess_dim), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::rhs_norm) .with_reduction_factor( remove_complex{ - excess_solver_reduction}) - .on(exec)) + excess_solver_reduction})) .on(exec); excess_solution->copy_from(excess_rhs); } else if (is_lower) { diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 303106fa4f6..84afc1666cc 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -569,21 +569,18 @@ void Multigrid::generate() using absolute_value_type = remove_complex; return solver::Gmres::build() .with_criteria( - stop::Iteration::build() - .with_max_iters(matrix->get_size()[0]) - .on(exec), + stop::Iteration::build().with_max_iters( + matrix->get_size()[0]), stop::ResidualNorm::build() .with_reduction_factor( std::numeric_limits< absolute_value_type>::epsilon() * - absolute_value_type{10}) - .on(exec)) + absolute_value_type{10})) .with_krylov_dim( std::min(size_type(100), matrix->get_size()[0])) .with_preconditioner( preconditioner::Jacobi::build() - .with_max_block_size(1u) - .on(exec)) + .with_max_block_size(1u)) .on(exec) ->generate(matrix); } else { @@ -591,8 +588,7 @@ void Multigrid::generate() int32>::build() .with_factorization( experimental::factorization::Lu::build() - .on(exec)) + int32>::build()) .on(exec) ->generate(matrix); } diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp index f6294d08cd4..746e8603865 100644 --- a/core/test/log/convergence.cpp +++ b/core/test/log/convergence.cpp @@ -68,8 +68,7 @@ class Convergence : public ::testing::Test { gko::initialize({6}, exec); std::unique_ptr system = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec) ->generate(gko::initialize(I>{{1, 2}, {0, 3}}, exec)); std::unique_ptr rhs = gko::initialize({15, 25}, exec); diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp index 281eed2d70b..cd6e1b0a3ce 100644 --- a/core/test/log/profiler_hook.cpp +++ b/core/test/log/profiler_hook.cpp @@ -202,8 +202,7 @@ TEST(ProfilerHook, LogsIteration) auto alpha = 
gko::share(gko::initialize({1.0}, exec)); auto solver = gko::solver::Ir<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec) ->generate(mtx); logger->set_object_name(solver, "solver"); diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp index 0aeca2b3df7..098f93ad523 100644 --- a/core/test/log/record.cpp +++ b/core/test/log/record.cpp @@ -440,8 +440,7 @@ TEST(Record, CatchesLinopFactoryGenerateStarted) gko::log::Logger::linop_factory_generate_started_mask); auto factory = gko::solver::Bicgstab<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto input = factory->generate(gko::matrix::Dense<>::create(exec)); @@ -462,8 +461,7 @@ TEST(Record, CatchesLinopFactoryGenerateCompleted) gko::log::Logger::linop_factory_generate_completed_mask); auto factory = gko::solver::Bicgstab<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto input = factory->generate(gko::matrix::Dense<>::create(exec)); auto output = factory->generate(gko::matrix::Dense<>::create(exec)); @@ -569,8 +567,7 @@ TEST(Record, CatchesIterations) gko::log::Record::create(gko::log::Logger::iteration_complete_mask); auto factory = gko::solver::Bicgstab<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto solver = factory->generate(gko::initialize({1.1}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 3558a7d5564..721273ca468 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -606,8 +606,7 @@ TYPED_TEST(Stream, CatchesLinopFactoryGenerateStarted) gko::log::Logger::linop_factory_generate_started_mask, out); auto factory = gko::solver::Bicgstab::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto input = factory->generate(gko::matrix::Dense::create(exec)); std::stringstream ptrstream_factory; @@ -633,8 +632,7 @@ TYPED_TEST(Stream, CatchesLinopFactoryGenerateCompleted) gko::log::Logger::linop_factory_generate_completed_mask, out); auto factory = gko::solver::Bicgstab::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto input = factory->generate(gko::matrix::Dense::create(exec)); auto output = @@ -815,8 +813,7 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) auto factory = gko::solver::Bicgstab::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); auto solver = factory->generate(gko::initialize({1.1}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index ff1cd0d45e5..e0b5749e987 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -67,7 +67,7 @@ class SchwarzFactory : public ::testing::Test { 
mtx(Mtx::create(exec, MPI_COMM_WORLD)) { schwarz = Schwarz::build() - .with_local_solver_factory(jacobi_factory) + .with_local_solver(jacobi_factory) .on(exec) ->generate(mtx); } @@ -83,8 +83,8 @@ class SchwarzFactory : public ::testing::Test { gko::ptr_param b) { ASSERT_EQ(a->get_size(), b->get_size()); - ASSERT_EQ(a->get_parameters().local_solver_factory, - b->get_parameters().local_solver_factory); + ASSERT_EQ(a->get_parameters().local_solver, + b->get_parameters().local_solver); } std::shared_ptr exec; @@ -105,7 +105,7 @@ TYPED_TEST(SchwarzFactory, KnowsItsExecutor) TYPED_TEST(SchwarzFactory, CanSetLocalFactory) { - ASSERT_EQ(this->schwarz->get_parameters().local_solver_factory, + ASSERT_EQ(this->schwarz->get_parameters().local_solver, this->jacobi_factory); } @@ -125,7 +125,7 @@ TYPED_TEST(SchwarzFactory, CanBeCopied) using Mtx = typename TestFixture::Mtx; auto bj = gko::share(Jacobi::build().on(this->exec)); auto copy = Schwarz::build() - .with_local_solver_factory(bj) + .with_local_solver(bj) .on(this->exec) ->generate(Mtx::create(this->exec, MPI_COMM_WORLD)); @@ -143,7 +143,7 @@ TYPED_TEST(SchwarzFactory, CanBeMoved) auto tmp = clone(this->schwarz); auto bj = gko::share(Jacobi::build().on(this->exec)); auto copy = Schwarz::build() - .with_local_solver_factory(bj) + .with_local_solver(bj) .on(this->exec) ->generate(Mtx::create(this->exec, MPI_COMM_WORLD)); @@ -158,7 +158,7 @@ TYPED_TEST(SchwarzFactory, CanBeCleared) this->schwarz->clear(); ASSERT_EQ(this->schwarz->get_size(), gko::dim<2>(0, 0)); - ASSERT_EQ(this->schwarz->get_parameters().local_solver_factory, nullptr); + ASSERT_EQ(this->schwarz->get_parameters().local_solver, nullptr); } diff --git a/core/test/preconditioner/ic.cpp b/core/test/preconditioner/ic.cpp index efd54ee9ebc..9e1e3f3e3c4 100644 --- a/core/test/preconditioner/ic.cpp +++ b/core/test/preconditioner/ic.cpp @@ -77,9 +77,8 @@ TEST_F(IcFactory, KnowsItsExecutor) TEST_F(IcFactory, CanSetLSolverFactory) { - auto ic_factory = ic_prec_type::build() - .with_l_solver_factory(this->l_factory) - .on(this->exec); + auto ic_factory = + ic_prec_type::build().with_l_solver(this->l_factory).on(this->exec); ASSERT_EQ(ic_factory->get_parameters().l_solver_factory, this->l_factory); } @@ -88,7 +87,7 @@ TEST_F(IcFactory, CanSetLSolverFactory) TEST_F(IcFactory, CanSetFactorizationFactory) { auto ic_factory = ic_prec_type::build() - .with_factorization_factory(this->fact_factory) + .with_factorization(this->fact_factory) .on(this->exec); ASSERT_EQ(ic_factory->get_parameters().factorization_factory, diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp index c7b72e09b09..f25a20b47e3 100644 --- a/core/test/preconditioner/ilu.cpp +++ b/core/test/preconditioner/ilu.cpp @@ -81,9 +81,8 @@ TEST_F(IluFactory, KnowsItsExecutor) TEST_F(IluFactory, CanSetLSolverFactory) { - auto ilu_factory = ilu_prec_type::build() - .with_l_solver_factory(this->l_factory) - .on(this->exec); + auto ilu_factory = + ilu_prec_type::build().with_l_solver(this->l_factory).on(this->exec); ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory); } @@ -91,9 +90,8 @@ TEST_F(IluFactory, CanSetLSolverFactory) TEST_F(IluFactory, CanSetUSolverFactory) { - auto ilu_factory = ilu_prec_type::build() - .with_u_solver_factory(this->u_factory) - .on(this->exec); + auto ilu_factory = + ilu_prec_type::build().with_u_solver(this->u_factory).on(this->exec); ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory); } @@ -102,7 +100,7 @@ TEST_F(IluFactory, 
CanSetUSolverFactory) TEST_F(IluFactory, CanSetFactorizationFactory) { auto ilu_factory = ilu_prec_type::build() - .with_factorization_factory(this->fact_factory) + .with_factorization(this->fact_factory) .on(this->exec); ASSERT_EQ(ilu_factory->get_parameters().factorization_factory, diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp index 4c7421f63e1..37ed110bdf4 100644 --- a/core/test/solver/bicg.cpp +++ b/core/test/solver/bicg.cpp @@ -66,10 +66,9 @@ class Bicg : public ::testing::Test { bicg_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(bicg_factory->generate(mtx)) {} diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index c42cd7db2af..937064da7c4 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -64,10 +64,9 @@ class Bicgstab : public ::testing::Test { bicgstab_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(bicgstab_factory->generate(mtx)) {} diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp index b81d84f8b1e..17dcf0c385f 100644 --- a/core/test/solver/cb_gmres.cpp +++ b/core/test/solver/cb_gmres.cpp @@ -72,23 +72,20 @@ class CbGmres : public ::testing::Test { Solver::build() .with_storage_precision(storage_precision) .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(nc_value_type{1e-6}) - .on(exec)) + .with_reduction_factor(nc_value_type{1e-6})) .on(exec)), solver(cb_gmres_factory->generate(mtx)), cb_gmres_big_factory( Solver::build() .with_storage_precision(storage_precision) .with_criteria( - gko::stop::Iteration::build().with_max_iters(128u).on( - exec), + gko::stop::Iteration::build().with_max_iters(128u), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(nc_value_type{1e-6}) - .on(exec)) + .with_reduction_factor(nc_value_type{1e-6})) .on(exec)), big_solver(cb_gmres_big_factory->generate(mtx)) {} diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp index 5daf43bc160..d0381a6e5ab 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ -66,10 +66,9 @@ class Cg : public ::testing::Test { cg_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(cg_factory->generate(mtx)) {} diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index c23dc7b2e3b..7509c22d76e 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -66,10 +66,9 @@ class Cgs : public ::testing::Test { cgs_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + 
gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(cgs_factory->generate(mtx)) {} diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index 59bb5e0cdee..21cc686bd01 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -63,10 +63,9 @@ class Fcg : public ::testing::Test { fcg_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(fcg_factory->generate(mtx)) {} diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index f7ba80ebba1..fec313582ed 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -67,23 +67,19 @@ class Gcr : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)), - gcr_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .on(exec)), + gcr_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) + .on(exec)), solver(gcr_factory->generate(mtx)), gcr_big_factory( Big_solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(128u).on( - exec), + gko::stop::Iteration::build().with_max_iters(128u), gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_reduction_factor(reduction_factor)) .on(exec)), big_solver(gcr_big_factory->generate(mtx)) {} diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 11cafe2c86f..8ce8135f8b2 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -70,20 +70,17 @@ class Gmres : public ::testing::Test { gmres_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_reduction_factor(reduction_factor)) .on(exec)), solver(gmres_factory->generate(mtx)), gmres_big_factory( Big_solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(128u).on( - exec), + gko::stop::Iteration::build().with_max_iters(128u), gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_reduction_factor(reduction_factor)) .on(exec)), big_solver(gmres_big_factory->generate(mtx)) {} diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index 45511be8e1b..e2657be8581 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -64,10 +64,9 @@ class Idr : public ::testing::Test { idr_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .on(exec)), solver(idr_factory->generate(mtx)) {} diff --git a/core/test/solver/ir.cpp 
b/core/test/solver/ir.cpp index 5fdcd55af14..7419f99bfd0 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -64,14 +64,12 @@ class Ir : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), - ir_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + ir_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), solver(ir_factory->generate(mtx)) {} diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 856f9651ebe..8fea85a40bb 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -153,11 +153,10 @@ class Multigrid : public ::testing::Test { multigrid_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(gko::remove_complex{1e-6}) - .on(exec)) + .with_reduction_factor(gko::remove_complex{1e-6})) .with_max_levels(2u) .with_coarsest_solver(lo_factory) .with_pre_smoother(lo_factory) @@ -427,28 +426,25 @@ TYPED_TEST(Multigrid, ThrowWhenNullMgLevel) TYPED_TEST(Multigrid, ThrowWhenMgLevelContainsNullptr) { using Solver = typename TestFixture::Solver; - auto factory = Solver::build() - .with_max_levels(1u) - .with_min_coarse_rows(2u) - .with_criteria(this->criterion) - .with_mg_level(this->rp_factory, nullptr) - .on(this->exec); + auto factory_parameters = Solver::build() + .with_max_levels(1u) + .with_min_coarse_rows(2u) + .with_criteria(this->criterion) + .with_mg_level(this->rp_factory, nullptr); - ASSERT_THROW(factory->generate(this->mtx), gko::NotSupported); + ASSERT_THROW(factory_parameters.on(this->exec), gko::NotSupported); } TYPED_TEST(Multigrid, ThrowWhenEmptyMgLevelList) { using Solver = typename TestFixture::Solver; - auto factory = - Solver::build() - .with_max_levels(1u) - .with_min_coarse_rows(2u) - .with_mg_level( - std::vector>{}) - .with_criteria(this->criterion) - .on(this->exec); + auto factory = Solver::build() + .with_max_levels(1u) + .with_min_coarse_rows(2u) + .with_mg_level() + .with_criteria(this->criterion) + .on(this->exec); ASSERT_THROW(factory->generate(this->mtx), gko::NotSupported); } diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp index 79b197aacc8..b673024c6fe 100644 --- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp +++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp @@ -110,18 +110,14 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor = 1e-7; auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(10000u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(10000u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) // Add preconditioner, these 2 lines are the only // difference from the simple solver example - .with_preconditioner(bj::build() - 
.with_max_block_size(16u) - .with_storage_optimization( - gko::precision_reduction::autodetect()) - .on(exec)) + .with_preconditioner( + bj::build().with_max_block_size(16u).with_storage_optimization( + gko::precision_reduction::autodetect())) .on(exec); // Create solver std::shared_ptr> logger = diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp index b096e48c71a..915035fd642 100644 --- a/examples/cb-gmres/cb-gmres.cpp +++ b/examples/cb-gmres/cb-gmres.cpp @@ -154,12 +154,10 @@ int main(int argc, char* argv[]) // storage type auto solver_gen_keep = cb_gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1000u).on(exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::rhs_norm) - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::rhs_norm) + .with_reduction_factor(reduction_factor)) .with_krylov_dim(100u) .with_storage_precision( gko::solver::cb_gmres::storage_precision::keep) @@ -167,12 +165,10 @@ int main(int argc, char* argv[]) auto solver_gen_reduce = cb_gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1000u).on(exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::rhs_norm) - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::rhs_norm) + .with_reduction_factor(reduction_factor)) .with_krylov_dim(100u) .with_storage_precision( gko::solver::cb_gmres::storage_precision::reduce1) diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp index 7e6cf531edd..e44303b81a2 100644 --- a/examples/custom-logger/custom-logger.cpp +++ b/examples/custom-logger/custom-logger.cpp @@ -290,11 +290,9 @@ int main(int argc, char* argv[]) // object needs to be built on. auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(20u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) .on(exec); // Instantiate a ResidualLogger logger. 
diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp index 4610413fe9c..bcaa126cdaa 100644 --- a/examples/custom-matrix-format/custom-matrix-format.cpp +++ b/examples/custom-matrix-format/custom-matrix-format.cpp @@ -291,12 +291,10 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor{1e-7}; // Generate solver and solve the system cg::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(discretization_points) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(discretization_points), + gko::stop::ResidualNorm::build().with_reduction_factor( + reduction_factor)) .on(exec) // notice how our custom StencilMatrix can be used in the same way as // any built-in type diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 800846cfbd9..e4c7d88785c 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -109,13 +109,12 @@ void run_solver(volatile bool* stop_iteration_process, auto x = gko::read(std::ifstream("data/x0.mtx"), exec); // Create solver factory and solve system - auto solver = bicg::build() - .with_criteria(ByInteraction::build() - .with_stop_iteration_process( - stop_iteration_process) - .on(exec)) - .on(exec) - ->generate(A); + auto solver = + bicg::build() + .with_criteria(ByInteraction::build().with_stop_iteration_process( + stop_iteration_process)) + .on(exec) + ->generate(A); solver->add_logger(gko::log::Stream::create( gko::log::Logger::iteration_complete_mask, std::cout, true)); solver->apply(b, x); diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp index 123f93775f5..1b758d186a4 100644 --- a/examples/distributed-solver/distributed-solver.cpp +++ b/examples/distributed-solver/distributed-solver.cpp @@ -222,19 +222,15 @@ int main(int argc, char* argv[]) const gko::remove_complex reduction_factor{1e-8}; std::shared_ptr> logger = gko::log::Convergence::create(); - auto Ainv = - solver::build() - .with_preconditioner(schwarz::build() - .with_local_solver_factory(local_solver) - .on(exec)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(num_iters).on( - exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .on(exec) - ->generate(A); + auto Ainv = solver::build() + .with_preconditioner( + schwarz::build().with_local_solver(local_solver)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(num_iters), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) + .on(exec) + ->generate(A); // Add logger to the generated solver to log the iteration count and // residual norm Ainv->add_logger(logger); diff --git a/examples/external-lib-interfacing/external-lib-interfacing.cpp b/examples/external-lib-interfacing/external-lib-interfacing.cpp index 1766af3001f..04824cb9578 100644 --- a/examples/external-lib-interfacing/external-lib-interfacing.cpp +++ b/examples/external-lib-interfacing/external-lib-interfacing.cpp @@ -880,11 +880,9 @@ void AdvectionProblem::solve() auto solver_gen = bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(1000).on(exec), - gko::stop::ResidualNorm<>::build() - 
.with_reduction_factor(1e-12) - .on(exec)) - .with_preconditioner(bj::build().on(exec)) + gko::stop::Iteration::build().with_max_iters(1000), + gko::stop::ResidualNorm<>::build().with_reduction_factor(1e-12)) + .with_preconditioner(bj::build()) .on(exec); auto solver = solver_gen->generate(gko::give(A)); diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp index 5bd90ba0bad..f3f308c495f 100644 --- a/examples/ginkgo-overhead/ginkgo-overhead.cpp +++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp @@ -72,8 +72,7 @@ int main(int argc, char* argv[]) auto cg_factory = cg::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(num_iters).on( - exec)) + gko::stop::Iteration::build().with_max_iters(num_iters)) .on(exec); auto A = gko::initialize({1.0}, exec); auto b = gko::initialize({std::nan("")}, exec); diff --git a/examples/heat-equation/heat-equation.cpp b/examples/heat-equation/heat-equation.cpp index eae87f7e64f..8e69931b250 100644 --- a/examples/heat-equation/heat-equation.cpp +++ b/examples/heat-equation/heat-equation.cpp @@ -192,11 +192,10 @@ int main(int argc, char* argv[]) // stopping at 1e-10 relative accuracy auto solver = gko::solver::Cg<>::build() - .with_preconditioner(gko::preconditioner::Ic<>::build().on(exec)) + .with_preconditioner(gko::preconditioner::Ic<>::build()) .with_criteria(gko::stop::ResidualNorm<>::build() .with_baseline(gko::stop::mode::rhs_norm) - .with_reduction_factor(1e-10) - .on(exec)) + .with_reduction_factor(1e-10)) .on(exec) ->generate(stencil_matrix); // time stamp of the last output frame (initialized to a sentinel value) diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp index 33946b7de44..acebd9d96ff 100644 --- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp +++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp @@ -114,11 +114,9 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor{1e-7}; auto ilu_gmres_factory = gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1000u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) .with_generated_preconditioner(ilu_preconditioner) .on(exec); diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp index 460370b7e00..2b584e0ca4f 100644 --- a/examples/inverse-iteration/inverse-iteration.cpp +++ b/examples/inverse-iteration/inverse-iteration.cpp @@ -118,12 +118,10 @@ int main(int argc, char* argv[]) // Generate solver operator (A - zI)^-1 auto solver = solver_type::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(system_max_iterations) - .on(exec), + .with_criteria(gko::stop::Iteration::build().with_max_iters( + system_max_iterations), gko::stop::ResidualNorm::build() - .with_reduction_factor(system_residual_goal) - .on(exec)) + .with_reduction_factor(system_residual_goal)) .on(exec) ->generate(system_matrix); diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp index 407a083e548..be7e8261f2c 100644 --- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp +++ 
b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp @@ -119,18 +119,16 @@ int main(int argc, char* argv[]) auto trisolve_factory = ir::build() .with_solver(bj_factory) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(sweeps).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(sweeps)) .on(exec); // Generate an ILU preconditioner factory by setting lower and upper // triangular solver - in this case the previously defined iterative // refinement method. - auto ilu_pre_factory = - gko::preconditioner::Ilu::build() - .with_l_solver_factory(gko::clone(trisolve_factory)) - .with_u_solver_factory(gko::clone(trisolve_factory)) - .on(exec); + auto ilu_pre_factory = gko::preconditioner::Ilu::build() + .with_l_solver(gko::clone(trisolve_factory)) + .with_u_solver(gko::clone(trisolve_factory)) + .on(exec); // Use incomplete factors to generate ILU preconditioner auto ilu_preconditioner = gko::share(ilu_pre_factory->generate(par_ilu)); diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp index 14384eaab52..711d43049a1 100644 --- a/examples/iterative-refinement/iterative-refinement.cpp +++ b/examples/iterative-refinement/iterative-refinement.cpp @@ -113,19 +113,13 @@ int main(int argc, char* argv[]) RealValueType inner_reduction_factor{1e-2}; auto solver_gen = ir::build() - .with_solver( - cg::build() - .with_criteria( - gko::stop::ResidualNorm::build() - .with_reduction_factor(inner_reduction_factor) - .on(exec)) - .on(exec)) + .with_solver(cg::build().with_criteria( + gko::stop::ResidualNorm::build() + .with_reduction_factor(inner_reduction_factor))) .with_criteria( - gko::stop::Iteration::build().with_max_iters(max_iters).on( - exec), + gko::stop::Iteration::build().with_max_iters(max_iters), gko::stop::ResidualNorm::build() - .with_reduction_factor(outer_reduction_factor) - .on(exec)) + .with_reduction_factor(outer_reduction_factor)) .on(exec); // Create solver auto solver = solver_gen->generate(A); diff --git a/examples/kokkos_assembly/kokkos_assembly.cpp b/examples/kokkos_assembly/kokkos_assembly.cpp index ba579199ee3..88ff261b759 100644 --- a/examples/kokkos_assembly/kokkos_assembly.cpp +++ b/examples/kokkos_assembly/kokkos_assembly.cpp @@ -208,13 +208,11 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor{1e-7}; // Generate solver and solve the system cg::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(discretization_points) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .with_preconditioner(bj::build().on(exec)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(discretization_points), + gko::stop::ResidualNorm::build().with_reduction_factor( + reduction_factor)) + .with_preconditioner(bj::build()) .on(exec) ->generate(A) ->apply(rhs, u); diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp index 5a7a8c086af..ccbdaadfc41 100644 --- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp +++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp @@ -44,12 +44,10 @@ int main() // Create the solver auto solver = gko::solver::Cg<>::build() - .with_preconditioner(gko::preconditioner::Jacobi<>::build().on(gpu)) + .with_preconditioner(gko::preconditioner::Jacobi<>::build()) .with_criteria( - gko::stop::Iteration::build().with_max_iters(20u).on(gpu), - gko::stop::ResidualNorm<>::build() - 
.with_reduction_factor(1e-15) - .on(gpu)) + gko::stop::Iteration::build().with_max_iters(20u), + gko::stop::ResidualNorm<>::build().with_reduction_factor(1e-15)) .on(gpu); // Solve system solver->generate(give(A))->apply(b, x); diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp index cef918983e9..3834fa7f33f 100644 --- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp +++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp @@ -139,15 +139,13 @@ int main(int argc, char* argv[]) ir::build() .with_solver(inner_solver_gen) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec)); auto smoother_gen_f = gko::share( ir_f::build() .with_solver(inner_solver_gen_f) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec)); // Create MultigridLevel factory auto mg_level_gen = @@ -159,15 +157,13 @@ int main(int argc, char* argv[]) ir::build() .with_solver(inner_solver_gen) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(exec)); auto coarsest_gen_f = gko::share( ir_f::build() .with_solver(inner_solver_gen_f) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(exec)); // Create multigrid factory std::shared_ptr multigrid_gen; @@ -192,8 +188,7 @@ int main(int argc, char* argv[]) .with_coarsest_solver(coarsest_gen_f) .with_default_initial_guess( gko::solver::initial_guess_mode::zero) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); } else { multigrid_gen = @@ -206,8 +201,7 @@ int main(int argc, char* argv[]) .with_coarsest_solver(coarsest_gen) .with_default_initial_guess( gko::solver::initial_guess_mode::zero) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); } // Create solver factory diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp index 4241a74cdf2..33684198c83 100644 --- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp +++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp @@ -125,17 +125,15 @@ int main(int argc, char* argv[]) // Create smoother factory (ir with bj) auto smoother_gen = gko::share( ir::build() - .with_solver(bj::build().with_max_block_size(1u).on(exec)) + .with_solver(bj::build().with_max_block_size(1u)) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec)); auto smoother_gen2 = gko::share( ir2::build() - .with_solver(bj2::build().with_max_block_size(1u).on(exec)) + .with_solver(bj2::build().with_max_block_size(1u)) 
.with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec)); // Create RestrictProlong factory auto mg_level_gen = @@ -145,17 +143,15 @@ int main(int argc, char* argv[]) // Create CoarsesSolver factory auto coarsest_solver_gen = gko::share( ir::build() - .with_solver(bj::build().with_max_block_size(1u).on(exec)) + .with_solver(bj::build().with_max_block_size(1u)) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(exec)); auto coarsest_solver_gen2 = gko::share( ir2::build() - .with_solver(bj2::build().with_max_block_size(1u).on(exec)) + .with_solver(bj2::build().with_max_block_size(1u)) .with_relaxation_factor(static_cast(0.9)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(exec)); // Create multigrid factory std::shared_ptr multigrid_gen; diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp index 0882d755cdc..0083ca15162 100644 --- a/examples/mixed-precision-ir/mixed-precision-ir.cpp +++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp @@ -124,12 +124,10 @@ int main(int argc, char* argv[]) // Create inner solver auto inner_solver = cg::build() - .with_criteria(gko::stop::ResidualNorm::build() - .with_reduction_factor(inner_reduction_factor) - .on(exec), - gko::stop::Iteration::build() - .with_max_iters(max_inner_iters) - .on(exec)) + .with_criteria( + gko::stop::ResidualNorm::build() + .with_reduction_factor(inner_reduction_factor), + gko::stop::Iteration::build().with_max_iters(max_inner_iters)) .on(exec) ->generate(give(solver_A)); diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp index f82a603d662..d63dedf486b 100644 --- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp +++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp @@ -130,8 +130,7 @@ int main(int argc, char* argv[]) // iterative refinement with two iterations and an Ic solver. 
auto ic_gen = gko::share( ic::build() - .with_factorization_factory( - gko::factorization::Ic::build().on(exec)) + .with_factorization(gko::factorization::Ic::build()) .on(exec)); auto smoother_gen = gko::share( gko::solver::build_smoother(ic_gen, 2u, static_cast(0.9))); @@ -159,8 +158,7 @@ int main(int argc, char* argv[]) .with_mg_level(mg_level_gen) .with_coarsest_solver(coarsest_gen) .with_default_initial_guess(gko::solver::initial_guess_mode::zero) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); // Create solver factory auto solver_gen = cg::build() diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp index b31b7906902..0bb51e6fee9 100644 --- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp +++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp @@ -108,19 +108,16 @@ int main(int argc, char* argv[]) std::shared_ptr multigrid_gen; multigrid_gen = mg::build() - .with_mg_level(pgm::build().with_deterministic(true).on(exec)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_mg_level(pgm::build().with_deterministic(true)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); const gko::remove_complex tolerance = 1e-8; auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on(exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::absolute) - .with_reduction_factor(tolerance) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::absolute) + .with_reduction_factor(tolerance)) .with_preconditioner(multigrid_gen) .on(exec); // Create solver diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp index 05ee0503a5f..be3cc958baf 100644 --- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp +++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp @@ -282,12 +282,10 @@ void solve_system(const std::string& executor_string, // Generate solver auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(dp_2).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .with_preconditioner(bj::build().on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(dp_2), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) + .with_preconditioner(bj::build()) .on(exec); auto solver = solver_gen->generate(gko::give(matrix)); diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp index 1ae2ae9ec08..0b26e56dd80 100644 --- a/examples/papi-logging/papi-logging.cpp +++ b/examples/papi-logging/papi-logging.cpp @@ -177,11 +177,9 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor{1e-7}; auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(20u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) .on(exec); auto solver = 
solver_gen->generate(A); diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index 5f036728924..cb06ac6cc86 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -416,8 +416,7 @@ int main(int argc, char* argv[]) solver::build() .with_criteria( gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec), + .with_reduction_factor(reduction_factor), gko::stop::Iteration::build().with_max_iters(max_iters).on( exec)) .with_preconditioner(preconditioner::create(exec)) diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp index e16f0b26968..eba163fb281 100644 --- a/examples/poisson-solver/poisson-solver.cpp +++ b/examples/poisson-solver/poisson-solver.cpp @@ -184,13 +184,11 @@ int main(int argc, char* argv[]) const gko::remove_complex reduction_factor = 1e-7; // Generate solver and solve the system cg::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(discretization_points) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .with_preconditioner(bj::build().on(exec)) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(discretization_points), + gko::stop::ResidualNorm::build().with_reduction_factor( + reduction_factor)) + .with_preconditioner(bj::build()) .on(exec) ->generate(clone(exec, matrix)) // copy the matrix to the executor ->apply(rhs, u); diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp index b64b588c4ef..cb3d34be8bc 100644 --- a/examples/preconditioned-solver/preconditioned-solver.cpp +++ b/examples/preconditioned-solver/preconditioned-solver.cpp @@ -95,14 +95,12 @@ int main(int argc, char* argv[]) // Create solver factory auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(20u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) // Add preconditioner, these 2 lines are the only // difference from the simple solver example - .with_preconditioner(bj::build().with_max_block_size(8u).on(exec)) + .with_preconditioner(bj::build().with_max_block_size(8u)) .on(exec); // Create solver auto solver = solver_gen->generate(A); diff --git a/examples/preconditioner-export/preconditioner-export.cpp b/examples/preconditioner-export/preconditioner-export.cpp index 81aeece1cb1..16baffc6472 100644 --- a/examples/preconditioner-export/preconditioner-export.cpp +++ b/examples/preconditioner-export/preconditioner-export.cpp @@ -193,13 +193,11 @@ int main(int argc, char* argv[]) auto factory = gko::preconditioner::Ilu, gko::preconditioner::UpperIsai<>>::build() - .with_factorization_factory(fact_factory) - .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) - .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) + .with_factorization(fact_factory) + .with_l_solver(gko::preconditioner::LowerIsai<>::build() + .with_sparsity_power(sparsity_power)) + .with_u_solver(gko::preconditioner::UpperIsai<>::build() + .with_sparsity_power(sparsity_power)) .on(exec); 
auto ilu_isai = try_generate([&] { return factory->generate(mtx); }); output(ilu_isai->get_l_solver()->get_approximate_inverse(), @@ -220,13 +218,11 @@ int main(int argc, char* argv[]) auto factory = gko::preconditioner::Ilu, gko::preconditioner::UpperIsai<>>::build() - .with_factorization_factory(fact_factory) - .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) - .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) + .with_factorization(fact_factory) + .with_l_solver(gko::preconditioner::LowerIsai<>::build() + .with_sparsity_power(sparsity_power)) + .with_u_solver(gko::preconditioner::UpperIsai<>::build() + .with_sparsity_power(sparsity_power)) .on(exec); auto ilu_isai = try_generate([&] { return factory->generate(mtx); }); output(ilu_isai->get_l_solver()->get_approximate_inverse(), @@ -250,13 +246,11 @@ int main(int argc, char* argv[]) auto factory = gko::preconditioner::Ilu, gko::preconditioner::UpperIsai<>>::build() - .with_factorization_factory(fact_factory) - .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) - .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build() - .with_sparsity_power(sparsity_power) - .on(exec)) + .with_factorization(fact_factory) + .with_l_solver(gko::preconditioner::LowerIsai<>::build() + .with_sparsity_power(sparsity_power)) + .with_u_solver(gko::preconditioner::UpperIsai<>::build() + .with_sparsity_power(sparsity_power)) .on(exec); auto ilu_isai = try_generate([&] { return factory->generate(mtx); }); output(ilu_isai->get_l_solver()->get_approximate_inverse(), diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp index 02318dd7784..2ef47524612 100644 --- a/examples/simple-solver-logging/simple-solver-logging.cpp +++ b/examples/simple-solver-logging/simple-solver-logging.cpp @@ -136,9 +136,8 @@ int main(int argc, char* argv[]) // Generate solver auto solver_gen = cg::build() - .with_criteria( - residual_criterion, - gko::stop::Iteration::build().with_max_iters(20u).on(exec)) + .with_criteria(residual_criterion, + gko::stop::Iteration::build().with_max_iters(20u)) .on(exec); auto solver = solver_gen->generate(A); diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp index 81dc9ee6d74..d80c0633ab8 100644 --- a/examples/simple-solver/simple-solver.cpp +++ b/examples/simple-solver/simple-solver.cpp @@ -130,11 +130,9 @@ int main(int argc, char* argv[]) const RealValueType reduction_factor{1e-7}; auto solver_gen = cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(20u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(20u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) .on(exec); // Generate the solver from the matrix. 
The solver factory built in the // previous step takes a "matrix"(a gko::LinOp to be more general) as an diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp index 63adfaa5571..f4af38882b0 100644 --- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp +++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp @@ -216,13 +216,11 @@ void solve_system(const std::string& executor_string, // Generate solver auto solver_gen = cg::build() - .with_criteria(gko::stop::Iteration::build() - .with_max_iters(gko::size_type(dp)) - .on(exec), + .with_criteria(gko::stop::Iteration::build().with_max_iters( + gko::size_type(dp)), gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor) - .on(exec)) - .with_preconditioner(bj::build().on(exec)) + .with_reduction_factor(reduction_factor)) + .with_preconditioner(bj::build()) .on(exec); auto solver = solver_gen->generate(gko::give(matrix)); diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index 441bc63d22c..3347828a55d 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if GINKGO_BUILD_MPI +#include #include #include #include @@ -93,8 +94,25 @@ class Schwarz /** * Local solver factory. */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - local_solver_factory, nullptr); + std::shared_ptr local_solver{}; + + parameters_type& with_local_solver( + deferred_factory_parameter solver) + { + this->local_solver_generator = std::move(solver); + return *this; + } + + std::unique_ptr on(std::shared_ptr exec) const + { + auto copy = *this; + copy.local_solver = local_solver_generator.on(exec); + return copy.enable_parameters_type::on( + exec); + } + + private: + deferred_factory_parameter local_solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index ce3ea72725f..92fe8fac8cf 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -80,37 +80,29 @@ class Ilu : public ::testing::Test { u_factor(gko::initialize( {{2., 1., 1.}, {0., 4., 1.}, {0., 0., 3.}}, exec)), l_u_composition(Composition::create(l_factor, u_factor)), - l_factory( - l_solver_type::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(10u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), - u_factory( - u_solver_type::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(10u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + l_factory(l_solver_type::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(10u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), + u_factory(u_solver_type::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(10u), + gko::stop::Time::build().with_time_limit( 
+ std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), ilu_pre_factory(ilu_prec_type::build() - .with_l_solver_factory(l_factory) - .with_u_solver_factory(u_factory) + .with_l_solver(l_factory) + .with_u_solver(u_factory) .on(exec)), ilu_rev_pre_factory(ilu_rev_prec_type::build() - .with_l_solver_factory(l_factory) - .with_u_solver_factory(u_factory) + .with_l_solver(l_factory) + .with_u_solver(u_factory) .on(exec)) {} diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index eea171d60fe..86d0f40142a 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -82,16 +82,13 @@ class Isai : public ::testing::Test { : exec{gko::ReferenceExecutor::create()}, excess_solver_factory( excess_solver_type::build() - .with_preconditioner( - bj::build().with_max_block_size(16u).on(exec)) + .with_preconditioner(bj::build().with_max_block_size(16u)) .with_criteria( - gko::stop::Iteration::build().with_max_iters(1000u).on( - exec), + gko::stop::Iteration::build().with_max_iters(1000u), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::rhs_norm) .with_reduction_factor( - gko::remove_complex{1e-6}) - .on(exec)) + gko::remove_complex{1e-6})) .on(exec)), a_dense{gko::initialize({{2, 1, 2}, {1, -2, 3}, {-1, 1, 1}}, exec)}, diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index 8789ded37ca..edadc245b33 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -110,11 +110,9 @@ class ScaledReordered : public ::testing::Test { solver_factory( Bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), tol{r::value} { diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index e317677b2de..aa27eb4afa3 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -64,17 +64,14 @@ class Bicg : public ::testing::Test { {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), stopped{}, non_stopped{}, - bicg_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + bicg_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), mtx_big(gko::initialize( {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0}, {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5}, @@ -86,20 +83,16 @@ class Bicg : public ::testing::Test { bicg_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), bicg_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + 
gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), mtx_non_symmetric(gko::initialize( {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)) diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index ec44b6b6f17..70302e95796 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -69,36 +69,29 @@ class Bicgstab : public ::testing::Test { bicgstab_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(8u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(8u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), bicgstab_factory2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(8u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(8u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), bicgstab_factory_precision( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)) { auto small_size = gko::dim<2>{2, 2}; diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index 1127d7caff7..e5b933ad82c 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -77,15 +77,12 @@ class CbGmres : public ::testing::Test { gmres_type::build() .with_storage_precision(storage_prec) .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(this->reduction_factor()) - .on(exec)) + .with_reduction_factor(this->reduction_factor())) .on(exec)), mtx_big(gko::initialize( {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5}, @@ -99,12 +96,10 @@ class CbGmres : public ::testing::Test { gmres_type::build() .with_storage_precision(storage_prec) .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(this->reduction_factor()) - .on(exec)) + .with_reduction_factor(this->reduction_factor())) .on(exec)), mtx_medium( gko::initialize({{-86.40, 153.30, -108.90, 8.60, -61.60}, diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 
76b8cf55946..c089442488f 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -64,18 +64,14 @@ class Cg : public ::testing::Test { {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), stopped{}, non_stopped{}, - cg_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(400u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + cg_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(400u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), mtx_big(gko::initialize( {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0}, {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5}, @@ -87,20 +83,16 @@ class Cg : public ::testing::Test { cg_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), cg_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)) { auto small_size = gko::dim<2>{2, 2}; diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 9c3ce2071a7..91c7c1e821b 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -65,15 +65,12 @@ class Cgs : public ::testing::Test { {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec)), stopped{}, non_stopped{}, - cgs_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(40u).on( - exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + cgs_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(40u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), mtx_big( gko::initialize({{-99.0, 87.0, -67.0, -62.0, -68.0, -19.0}, {-30.0, -17.0, -1.0, 9.0, 23.0, 77.0}, @@ -85,20 +82,16 @@ class Cgs : public ::testing::Test { cgs_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), cgs_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)) { auto small_size = gko::dim<2>{2, 2}; diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 617015bac1f..f69846b548d 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -77,8 +77,7 @@ class Direct : public ::testing::Test { .with_factorization( gko::experimental::factorization::Lu::build() - 
.with_symmetric_sparsity(true) - .on(exec)) + .with_symmetric_sparsity(true)) .on(exec); solver = factory->generate(mtx); std::normal_distribution> dist(0, 1); diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index e8163752689..3dd4149405e 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -65,17 +65,14 @@ class Fcg : public ::testing::Test { {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), stopped{}, non_stopped{}, - fcg_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + fcg_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), mtx_big(gko::initialize( {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0}, {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5}, @@ -87,20 +84,16 @@ class Fcg : public ::testing::Test { fcg_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), fcg_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)) { auto small_size = gko::dim<2>{2, 2}; diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index 888cbc3b4fe..adf5c35fd1d 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -72,18 +72,15 @@ class Gcr : public ::testing::Test { non_stopped{}, mtx(gko::initialize( {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)), - gcr_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .with_krylov_dim(3u) - .on(exec)), + gcr_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .with_krylov_dim(3u) + .on(exec)), mtx_big(gko::initialize( {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5}, {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8}, @@ -95,20 +92,16 @@ class Gcr : public ::testing::Test { gcr_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), gcr_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - 
.with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), mtx_medium( gko::initialize({{-86.40, 153.30, -108.90, 8.60, -61.60}, diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 585fec833bc..a99400e412b 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -76,13 +76,11 @@ class Gmres : public ::testing::Test { gmres_factory( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .with_krylov_dim(3u) .on(exec)), mtx_big(gko::initialize( @@ -96,20 +94,16 @@ class Gmres : public ::testing::Test { gmres_factory_big( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), gmres_factory_big2( Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - exec), + gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ImplicitResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), mtx_medium( gko::initialize({{-86.40, 153.30, -108.90, 8.60, -61.60}, diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index 3e74e0c319b..da1b73a035c 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -62,30 +62,24 @@ class Idr : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::initialize( {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec)), - idr_factory( - Solver::build() - .with_deterministic(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(8u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)), + idr_factory(Solver::build() + .with_deterministic(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(8u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)), idr_factory_precision( Solver::build() .with_deterministic(true) .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on( - exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)) {} diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index 8b4255b72ef..fc0c130aa83 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -65,15 +65,12 @@ class Ir : public ::testing::Test { // Eigenvalues of mtx are 0.9, 1.0 and 1.1 // Richardson iteration, converges since // | relaxation_factor * 
lambda - 1 | < 1 - ir_factory( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(30u).on( - exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(exec)) - .on(exec)) + ir_factory(Solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(30u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .on(exec)) {} std::shared_ptr exec; diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 3efb9d41c5e..23307d20b33 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -289,30 +289,26 @@ class Multigrid : public ::testing::Test { .on(exec)), smoother_factory(gko::give( Smoother::build() - .with_solver( - InnerSolver::build().with_max_block_size(1u).on(exec)) + .with_solver(InnerSolver::build().with_max_block_size(1u)) .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .on(exec))), coarsest_factory( CoarsestSolver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .on(exec)), coarsestnext_factory( CoarsestNextSolver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec)) + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6))) .on(exec)), rp_factory(DummyRPFactory::build().on(exec)), lo_factory(DummyFactory::build().on(exec)), @@ -357,14 +353,12 @@ class Multigrid : public ::testing::Test { .with_mid_case(gko::solver::multigrid::mid_smooth_type::both) .with_mg_level(coarse_factory) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(4u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(exec)); @@ -382,14 +376,12 @@ class Multigrid : public ::testing::Test { .with_mid_case(gko::solver::multigrid::mid_smooth_type::both) .with_mg_level(coarse_factory, coarsenext_factory) .with_criteria( - gko::stop::Iteration::build().with_max_iters(200u).on(exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(100)) - .on(exec), + gko::stop::Iteration::build().with_max_iters(200u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(100)), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(r::value) - .on(exec)) + .with_reduction_factor(r::value)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(exec)); diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 3c9e3a8d69f..8d07ba44046 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ 
b/test/mpi/preconditioner/schwarz.cpp @@ -197,7 +197,7 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) cg::build() .with_preconditioner( prec::build() - .with_local_solver_factory(this->local_solver_factory) + .with_local_solver(this->local_solver_factory) .on(this->exec)) .with_criteria(iter_stop, tol_stop) .on(this->exec); @@ -225,10 +225,9 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner) using cg = typename TestFixture::solver_type; using prec = typename TestFixture::dist_prec_type; - auto precond_factory = - prec::build() - .with_local_solver_factory(this->local_solver_factory) - .on(this->exec); + auto precond_factory = prec::build() + .with_local_solver(this->local_solver_factory) + .on(this->exec); auto local_precond = this->local_solver_factory->generate(this->non_dist_mat); auto precond = precond_factory->generate(this->dist_mat); @@ -249,10 +248,9 @@ TYPED_TEST(SchwarzPreconditioner, CanAdvancedApplyPreconditioner) using cg = typename TestFixture::solver_type; using prec = typename TestFixture::dist_prec_type; - auto precond_factory = - prec::build() - .with_local_solver_factory(this->local_solver_factory) - .on(this->exec); + auto precond_factory = prec::build() + .with_local_solver(this->local_solver_factory) + .on(this->exec); auto local_precond = this->local_solver_factory->generate(this->non_dist_mat); auto precond = precond_factory->generate(this->dist_mat); diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 59462a9be59..f53b2784124 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -107,9 +107,7 @@ struct SimpleSolverTest { std::shared_ptr exec) { return solver_type::build().with_criteria( - gko::stop::Iteration::build() - .with_max_iters(iteration_count()) - .on(exec), + gko::stop::Iteration::build().with_max_iters(iteration_count()), gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::absolute) .with_reduction_factor(reduction_factor()) @@ -164,17 +162,11 @@ struct Ir : SimpleSolverTest> { std::shared_ptr exec) { return SimpleSolverTest>::build(exec) - .with_solver( - gko::solver::Cg::build() - .with_criteria( - gko::stop::Iteration::build() - .with_max_iters(iteration_count()) - .on(exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::absolute) - .with_reduction_factor(2 * reduction_factor()) - .on(exec)) - .on(exec)) + .with_solver(gko::solver::Cg::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(iteration_count()), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::absolute) + .with_reduction_factor(2 * reduction_factor()))) .with_relaxation_factor(0.9); } }; diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp index a62ab3f6d72..d35e6de227d 100644 --- a/test/solver/bicg_kernels.cpp +++ b/test/solver/bicg_kernels.cpp @@ -239,19 +239,15 @@ TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef) auto d_b = gko::clone(exec, b); auto bicg_factory = gko::solver::Bicg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(ref), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(ref); auto d_bicg_factory = gko::solver::Bicg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + 
.with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(exec); auto solver = bicg_factory->generate(std::move(mtx)); auto d_solver = d_bicg_factory->generate(std::move(d_mtx)); @@ -271,19 +267,15 @@ TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef) auto d_b = gko::clone(exec, b); auto bicg_factory = gko::solver::Bicg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(ref), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(ref); auto d_bicg_factory = gko::solver::Bicg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(exec); auto solver = bicg_factory->generate(std::move(mtx_ani)); auto d_solver = d_bicg_factory->generate(std::move(d_mtx_ani)); diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index 15eda2a74cb..422d51c86ad 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -71,19 +71,17 @@ class Bicgstab : public CommonTestFixture { exec_bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(exec), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + .with_reduction_factor(::r::value)) .on(exec); ref_bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_reduction_factor(::r::value)) .on(ref); } diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp index a51ac48c59b..dcb4b0147f6 100644 --- a/test/solver/cg_kernels.cpp +++ b/test/solver/cg_kernels.cpp @@ -203,19 +203,15 @@ TEST_F(Cg, ApplyIsEquivalentToRef) auto d_b = gko::clone(exec, b); auto cg_factory = gko::solver::Cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(ref), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(ref); auto d_cg_factory = gko::solver::Cg::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(exec); auto solver = cg_factory->generate(std::move(mtx)); auto d_solver = d_cg_factory->generate(std::move(d_mtx)); diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp index b1b124ed420..35914d4afa6 100644 --- a/test/solver/cgs_kernels.cpp +++ b/test/solver/cgs_kernels.cpp @@ -70,18 +70,16 @@ class Cgs : public CommonTestFixture { exec_cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(exec), + 
gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + .with_reduction_factor(::r::value)) .on(exec); ref_cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_reduction_factor(::r::value)) .on(ref); } diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index 0a30f7ba67f..31b7bd976ce 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -93,22 +93,22 @@ class Direct : public CommonTestFixture { mtx = gko::read(s_mtx, ref); dmtx = gko::clone(exec, mtx); const auto num_rows = mtx->get_size()[0]; - factory = solver_type::build() - .with_factorization(factorization_type::build() - .with_symmetric_sparsity(true) - .on(ref)) - .with_num_rhs(static_cast(nrhs)) - .on(ref); + factory = + solver_type::build() + .with_factorization( + factorization_type::build().with_symmetric_sparsity(true)) + .with_num_rhs(static_cast(nrhs)) + .on(ref); alpha = gen_mtx(1, 1); beta = gen_mtx(1, 1); input = gen_mtx(num_rows, nrhs); output = gen_mtx(num_rows, nrhs); - dfactory = solver_type::build() - .with_factorization(factorization_type::build() - .with_symmetric_sparsity(true) - .on(exec)) - .with_num_rhs(static_cast(nrhs)) - .on(exec); + dfactory = + solver_type::build() + .with_factorization( + factorization_type::build().with_symmetric_sparsity(true)) + .with_num_rhs(static_cast(nrhs)) + .on(exec); dalpha = gko::clone(exec, alpha); dbeta = gko::clone(exec, beta); dinput = gko::clone(exec, input); diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp index 0d1ced86f85..d8a3a1ef9b2 100644 --- a/test/solver/fcg_kernels.cpp +++ b/test/solver/fcg_kernels.cpp @@ -212,19 +212,15 @@ TEST_F(Fcg, ApplyIsEquivalentToRef) auto d_b = gko::clone(exec, b); auto fcg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(ref), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(ref); auto d_fcg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(50u).on(exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(::r::value) - .on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(50u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(::r::value)) .on(exec); auto solver = fcg_factory->generate(std::move(mtx)); auto d_solver = d_fcg_factory->generate(std::move(d_mtx)); diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index 8f02c431f98..8db5570a6f0 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -74,19 +74,17 @@ class Gcr : public CommonTestFixture { exec_gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(exec), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(value_type{1e-15}) - .on(exec)) + .with_reduction_factor(value_type{1e-15})) .on(exec); ref_gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - 
.with_reduction_factor(value_type{1e-15}) - .on(ref)) + .with_reduction_factor(value_type{1e-15})) .on(ref); } diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index 5c2541da1a7..7752ff4dda6 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -70,19 +70,17 @@ class Gmres : public CommonTestFixture { exec_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(exec), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(value_type{1e-15}) - .on(exec)) + .with_reduction_factor(value_type{1e-15})) .on(exec); ref_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(246u).on(ref), + gko::stop::Iteration::build().with_max_iters(246u), gko::stop::ResidualNorm::build() - .with_reduction_factor(value_type{1e-15}) - .on(ref)) + .with_reduction_factor(value_type{1e-15})) .on(ref); } diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index 959c857cb71..0019c05b9d4 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -76,15 +76,13 @@ class Idr : public CommonTestFixture { exec_idr_factory = Solver::build() .with_deterministic(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); ref_idr_factory = Solver::build() .with_deterministic(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(ref); } @@ -295,15 +293,13 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) Solver::build() .with_deterministic(true) .with_complex_subspace(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); ref_idr_factory = Solver::build() .with_deterministic(true) .with_complex_subspace(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(ref); auto ref_solver = ref_idr_factory->generate(mtx); auto exec_solver = exec_idr_factory->generate(d_mtx); @@ -337,15 +333,13 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) Solver::build() .with_deterministic(true) .with_complex_subspace(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); ref_idr_factory = Solver::build() .with_deterministic(true) .with_complex_subspace(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(ref); auto exec_solver = exec_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index c21f6da3f66..81464036c69 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -105,13 +105,11 @@ TEST_F(Ir, ApplyIsEquivalentToRef) // both executors auto ir_factory = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .on(ref); auto d_ir_factory = gko::solver::Ir::build() - .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .on(exec); auto solver = ir_factory->generate(std::move(mtx)); auto d_solver = d_ir_factory->generate(std::move(d_mtx)); @@ -134,25 +132,15 @@ TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef) auto ir_factory = gko::solver::Ir::build() - .with_solver( - gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - ref)) - .on(ref)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_solver(gko::solver::Gmres::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(ref))) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .on(ref); auto d_ir_factory = gko::solver::Ir::build() - .with_solver( - gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - exec)) - .on(exec)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + .with_solver(gko::solver::Gmres::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(exec))) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .on(exec); auto solver = ir_factory->generate(std::move(mtx)); auto d_solver = d_ir_factory->generate(std::move(d_mtx)); @@ -180,14 +168,12 @@ TEST_F(Ir, RichardsonApplyIsEquivalentToRef) // both executors auto ir_factory = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(ref); auto d_ir_factory = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(exec); auto solver = ir_factory->generate(std::move(mtx)); @@ -210,26 +196,16 @@ TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef) auto d_b = clone(exec, b); auto ir_factory = gko::solver::Ir::build() - .with_solver( - gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - ref)) - .on(ref)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_solver(gko::solver::Gmres::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(ref))) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(ref); auto d_ir_factory = gko::solver::Ir::build() - .with_solver( - gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - exec)) - .on(exec)) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + .with_solver(gko::solver::Gmres::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(exec))) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(exec); auto solver = ir_factory->generate(std::move(mtx)); @@ -258,14 +234,12 @@ TEST_F(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef) auto d_x = clone(exec, x); auto ir_factory = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(ref)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_default_initial_guess(guess) .on(ref); auto d_ir_factory = gko::solver::Ir::build() - .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(2u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_default_initial_guess(guess) .on(exec); auto solver = ir_factory->generate(mtx); diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index b6f228c13f5..6bb59507f17 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -101,9 +101,7 @@ struct SimpleSolverTest { gko::size_type iteration_count, bool check_residual = true) { return solver_type::build().with_criteria( - gko::stop::Iteration::build() - .with_max_iters(iteration_count) - .on(exec), + gko::stop::Iteration::build().with_max_iters(iteration_count), check_residual ? gko::stop::ResidualNorm::build() .with_baseline(gko::stop::mode::absolute) .with_reduction_factor(1e-30) @@ -116,8 +114,7 @@ struct SimpleSolverTest { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_preconditioner(precond_type::build().with_max_block_size(1u)); } static const gko::LinOp* get_preconditioner( @@ -185,8 +182,7 @@ struct Idr : SimpleSolverTest> { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_preconditioner(precond_type::build().with_max_block_size(1u)); } }; @@ -200,8 +196,7 @@ struct Ir : SimpleSolverTest> { { return SimpleSolverTest>::build( exec, iteration_count, check_residual) - .with_solver( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_solver(precond_type::build().with_max_block_size(1u)); } static const gko::LinOp* get_preconditioner( @@ -232,8 +227,7 @@ struct CbGmres : SimpleSolverTest> { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_preconditioner(precond_type::build().with_max_block_size(1u)); } }; @@ -254,8 +248,7 @@ struct Gmres : SimpleSolverTest> { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_preconditioner(precond_type::build().with_max_block_size(1u)); } }; @@ -277,8 +270,7 @@ struct FGmres : SimpleSolverTest> { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)) + .with_preconditioner(precond_type::build().with_max_block_size(1u)) .with_flexible(true); } }; @@ -300,8 +292,7 @@ struct Gcr : SimpleSolverTest> { gko::size_type iteration_count, bool check_residual = true) { return build(exec, iteration_count, check_residual) - .with_preconditioner( - precond_type::build().with_max_block_size(1u).on(exec)); + .with_preconditioner(precond_type::build().with_max_block_size(1u)); } }; diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index d2c273b4e0f..d442647a985 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -104,11 +104,9 @@ void check_solver(std::shared_ptr exec, auto solver_gen = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(num_iters).on( - exec), - gko::stop::ResidualNorm<>::build() - 
.with_reduction_factor(reduction_factor) - .on(exec)) + gko::stop::Iteration::build().with_max_iters(num_iters), + gko::stop::ResidualNorm<>::build().with_reduction_factor( + reduction_factor)) .on(exec); #if HAS_REFERENCE A->read(A_raw); @@ -126,11 +124,9 @@ void check_solver(std::shared_ptr exec, auto solver_gen_ref = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(num_iters).on( - exec_ref), - gko::stop::ResidualNorm<>::build() - .with_reduction_factor(reduction_factor) - .on(exec_ref)) + gko::stop::Iteration::build().with_max_iters(num_iters), + gko::stop::ResidualNorm<>::build().with_reduction_factor( + reduction_factor)) .on(exec_ref); auto x_ref = gko::clone(exec_ref, x); solver_gen->generate(A_ref)->apply(b, x_ref); @@ -493,8 +489,7 @@ int main() using Solver = gko::solver::Ir<>; auto test = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(exec); } From fe24ad4f332b6188f337f87f02fefc96bbb4f42e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 14 Aug 2023 17:17:31 +0200 Subject: [PATCH 311/583] review updates - move parameter macros to abstract_factory.hpp - use macros for defining deferred parameters Co-authored-by: Yuhsiang M. Tsai --- include/ginkgo/core/base/abstract_factory.hpp | 203 ++++++++++++++++++ include/ginkgo/core/base/lin_op.hpp | 124 ----------- .../distributed/preconditioner/schwarz.hpp | 14 +- include/ginkgo/core/solver/direct.hpp | 38 +--- include/ginkgo/core/solver/ir.hpp | 32 +-- include/ginkgo/core/solver/multigrid.hpp | 89 ++------ 6 files changed, 227 insertions(+), 273 deletions(-) diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index e8ec803b480..e644bcdcd76 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -274,13 +274,26 @@ class enable_parameters_type { }; +/** + * Represents a factory parameter of factory type that can either initialized by + * a pre-existing factory or by passing in a factory_parameters object whose + * `.on(exec)` will be called to instantiate a factory. + * + * @tparam FactoryType the type of factory that can be instantiated from this + * object. + */ template class deferred_factory_parameter { public: deferred_factory_parameter() = default; + /** Creates an empty deferred factory parameter. */ deferred_factory_parameter(std::nullptr_t) {} + /** + * Creates a deferred factory parameter from a preexisting factory with + * shared ownership. + */ template ) { return factory; }; } + /** + * Creates a deferred factory parameter by taking ownership of a + * preexisting factory with unique ownership. + */ template ) { return factory; }; } + /** + * Creates a deferred factory parameter object from a + * factory_parameters-like object. To instantiate the actual factory, the + * parameter's `.on(exec)` function will be called. + */ template ().on( std::shared_ptr{}))> @@ -315,6 +337,7 @@ class deferred_factory_parameter { }; } + /** Instantiates the deferred parameter into an actual factory. */ std::shared_ptr on( std::shared_ptr exec) const { @@ -324,6 +347,7 @@ class deferred_factory_parameter { return generator_(exec); } + /** Returns true iff the parameter contains a factory. 
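 *
 * A minimal usage sketch of this class (assuming an executor `exec`):
 *
 *     deferred_factory_parameter<LinOpFactory> deferred =
 *         solver::Cg<>::build();                  // from factory parameters
 *     deferred_factory_parameter<LinOpFactory> ready =
 *         solver::Cg<>::build().on(exec);         // from a pre-built factory
 *     deferred_factory_parameter<LinOpFactory> empty = nullptr;
 *
 *     auto factory = deferred.on(exec);  // instantiates the actual factory
 *
 * Here `deferred` and `ready` contain a factory, so this operator returns
 * true for them, while it returns false for `empty`.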
*/ explicit operator bool() const { return bool(generator_); } private: @@ -333,6 +357,185 @@ class deferred_factory_parameter { }; +/** + * Defines a build method for the factory, simplifying its construction by + * removing the repetitive typing of factory's name. + * + * @param _factory_name the factory for which to define the method + * + * @ingroup LinOp + */ +#define GKO_ENABLE_BUILD_METHOD(_factory_name) \ + static auto build()->decltype(_factory_name::create()) \ + { \ + return _factory_name::create(); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + +#if !(defined(__CUDACC__) || defined(__HIPCC__)) +/** + * Creates a factory parameter in the factory parameters structure. + * + * @param _name name of the parameter + * @param __VA_ARGS__ default value of the parameter + * + * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example + * + * @deprecated Use GKO_FACTORY_PARAMETER_SCALAR or GKO_FACTORY_PARAMETER_VECTOR + * + * @ingroup LinOp + */ +#define GKO_FACTORY_PARAMETER(_name, ...) \ + mutable _name{__VA_ARGS__}; \ + \ + template \ + auto with_##_name(Args&&... _value)->std::decay_t& \ + { \ + using type = decltype(this->_name); \ + this->_name = type{std::forward(_value)...}; \ + return *this; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +/** + * Creates a scalar factory parameter in the factory parameters structure. + * + * Scalar in this context means that the constructor for this type only takes + * a single parameter. + * + * @param _name name of the parameter + * @param _default default value of the parameter + * + * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example + * + * @ingroup LinOp + */ +#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \ + GKO_FACTORY_PARAMETER(_name, _default) + +/** + * Creates a vector factory parameter in the factory parameters structure. + * + * Vector in this context means that the constructor for this type takes + * multiple parameters. + * + * @param _name name of the parameter + * @param _default default value of the parameter + * + * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example + * + * @ingroup LinOp + */ +#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) \ + GKO_FACTORY_PARAMETER(_name, __VA_ARGS__) +#else // defined(__CUDACC__) || defined(__HIPCC__) +// A workaround for the NVCC compiler - parameter pack expansion does not work +// properly, because while the assignment to a scalar value is translated by +// cudafe into a C-style cast, the parameter pack expansion is not removed and +// `Args&&... args` is still kept as a parameter pack. +#define GKO_FACTORY_PARAMETER(_name, ...) \ + mutable _name{__VA_ARGS__}; \ + \ + template \ + auto with_##_name(Args&&... _value)->std::decay_t& \ + { \ + GKO_NOT_IMPLEMENTED; \ + return *this; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \ + mutable _name{_default}; \ + \ + template \ + auto with_##_name(Arg&& _value)->std::decay_t& \ + { \ + using type = decltype(this->_name); \ + this->_name = type{std::forward(_value)}; \ + return *this; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) 
\ + mutable _name{__VA_ARGS__}; \ + \ + template \ + auto with_##_name(Args&&... _value)->std::decay_t& \ + { \ + using type = decltype(this->_name); \ + this->_name = type{std::forward(_value)...}; \ + return *this; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif // defined(__CUDACC__) || defined(__HIPCC__) + +/** + * Creates a factory parameter of factory type. The parameter can either be set + * directly, or its creation can be deferred until the executor is set in the + * `.on(exec)` function call, by using a deferred_factory_parameter. + * + * @param _name name of the parameter + * @param _type pointee type of the parameter, e.g. LinOpFactory + * + */ +#define GKO_DEFERRED_FACTORY_PARAMETER(_name, _type) \ +public: \ + std::shared_ptr _name{}; \ + parameters_type& with_##_name(deferred_factory_parameter<_type> factory) \ + { \ + this->_name##_generator_ = std::move(factory); \ + return *this; \ + } \ + \ +private: \ + deferred_factory_parameter<_type> _name##_generator_; \ + \ +public: \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +/** + * Creates a factory parameter representing a vector of factories type. The + * parameter can either be set directly, or its creation can be deferred until + * the executor is set in the + * `.on(exec)` function call, by using a vector of deferred_factory_parameters. + * + * @param _name name of the parameter + * @param _type pointee type of the vector entries, e.g. LinOpFactory + * + */ +#define GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(_name, _type) \ +public: \ + std::vector> _name{}; \ + template \ + parameters_type& with_##_name(Args&&... factories) \ + { \ + this->_name##_generator_ = {deferred_factory_parameter<_type>{ \ + std::forward(factories)}...}; \ + return *this; \ + } \ + \ +private: \ + std::vector> _name##_generator_; \ + \ +public: \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + } // namespace gko diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 20d7771822f..e2660baff2e 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -1084,130 +1084,6 @@ public: \ "semi-colon warnings") -/** - * Defines a build method for the factory, simplifying its construction by - * removing the repetitive typing of factory's name. - * - * @param _factory_name the factory for which to define the method - * - * @ingroup LinOp - */ -#define GKO_ENABLE_BUILD_METHOD(_factory_name) \ - static auto build()->decltype(_factory_name::create()) \ - { \ - return _factory_name::create(); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - - -#if !(defined(__CUDACC__) || defined(__HIPCC__)) -/** - * Creates a factory parameter in the factory parameters structure. - * - * @param _name name of the parameter - * @param __VA_ARGS__ default value of the parameter - * - * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example - * - * @deprecated Use GKO_FACTORY_PARAMETER_SCALAR or GKO_FACTORY_PARAMETER_VECTOR - * - * @ingroup LinOp - */ -#define GKO_FACTORY_PARAMETER(_name, ...) \ - mutable _name{__VA_ARGS__}; \ - \ - template \ - auto with_##_name(Args&&... 
_value)->std::decay_t& \ - { \ - using type = decltype(this->_name); \ - this->_name = type{std::forward(_value)...}; \ - return *this; \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -/** - * Creates a scalar factory parameter in the factory parameters structure. - * - * Scalar in this context means that the constructor for this type only takes - * a single parameter. - * - * @param _name name of the parameter - * @param _default default value of the parameter - * - * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example - * - * @ingroup LinOp - */ -#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \ - GKO_FACTORY_PARAMETER(_name, _default) - -/** - * Creates a vector factory parameter in the factory parameters structure. - * - * Vector in this context means that the constructor for this type takes - * multiple parameters. - * - * @param _name name of the parameter - * @param _default default value of the parameter - * - * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example - * - * @ingroup LinOp - */ -#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) \ - GKO_FACTORY_PARAMETER(_name, __VA_ARGS__) -#else // defined(__CUDACC__) || defined(__HIPCC__) -// A workaround for the NVCC compiler - parameter pack expansion does not work -// properly, because while the assignment to a scalar value is translated by -// cudafe into a C-style cast, the parameter pack expansion is not removed and -// `Args&&... args` is still kept as a parameter pack. -#define GKO_FACTORY_PARAMETER(_name, ...) \ - mutable _name{__VA_ARGS__}; \ - \ - template \ - auto with_##_name(Args&&... _value)->std::decay_t& \ - { \ - GKO_NOT_IMPLEMENTED; \ - return *this; \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \ - mutable _name{_default}; \ - \ - template \ - auto with_##_name(Arg&& _value)->std::decay_t& \ - { \ - using type = decltype(this->_name); \ - this->_name = type{std::forward(_value)}; \ - return *this; \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) \ - mutable _name{__VA_ARGS__}; \ - \ - template \ - auto with_##_name(Args&&... _value)->std::decay_t& \ - { \ - using type = decltype(this->_name); \ - this->_name = type{std::forward(_value)...}; \ - return *this; \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") -#endif // defined(__CUDACC__) || defined(__HIPCC__) - - } // namespace gko diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index 3347828a55d..fe0539570ee 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -94,25 +94,15 @@ class Schwarz /** * Local solver factory. 
*/ - std::shared_ptr local_solver{}; - - parameters_type& with_local_solver( - deferred_factory_parameter solver) - { - this->local_solver_generator = std::move(solver); - return *this; - } + GKO_DEFERRED_FACTORY_PARAMETER(local_solver, LinOpFactory); std::unique_ptr on(std::shared_ptr exec) const { auto copy = *this; - copy.local_solver = local_solver_generator.on(exec); + copy.local_solver = local_solver_generator_.on(exec); return copy.enable_parameters_type::on( exec); } - - private: - deferred_factory_parameter local_solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/direct.hpp b/include/ginkgo/core/solver/direct.hpp index f66546cd2ec..dcd6fd189a6 100644 --- a/include/ginkgo/core/solver/direct.hpp +++ b/include/ginkgo/core/solver/direct.hpp @@ -87,36 +87,7 @@ class Direct : public EnableLinOp>, gko::size_type GKO_FACTORY_PARAMETER_SCALAR(num_rhs, 1u); /** The factorization factory to use for generating the factors. */ - std::shared_ptr factorization; - - /** - * - */ - parameters_type& with_factorization( - std::shared_ptr factorization) - { - this->factorization_generator = - [factorization](std::shared_ptr) - -> std::shared_ptr { - return factorization; - }; - return *this; - } - - template < - typename FactorizationParameters, - typename = decltype(std::declval().on( - std::shared_ptr{}))> - parameters_type& with_factorization( - FactorizationParameters factorization_parameters) - { - this->factorization_generator = - [factorization_parameters](std::shared_ptr exec) - -> std::shared_ptr { - return factorization_parameters.on(exec); - }; - return *this; - } + GKO_DEFERRED_FACTORY_PARAMETER(factorization, LinOpFactory); /** * @@ -124,15 +95,10 @@ class Direct : public EnableLinOp>, std::unique_ptr on(std::shared_ptr exec) const { auto parameters_copy = *this; - parameters_copy.factorization = factorization_generator(exec); + parameters_copy.factorization = factorization_generator_.on(exec); return parameters_copy .enable_parameters_type::on(exec); } - - private: - std::function( - std::shared_ptr)> - factorization_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Direct, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index d30fd9d69bc..1f04c8b75d2 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -184,13 +184,14 @@ class Ir : public EnableLinOp>, /** * Inner solver factory. */ - std::shared_ptr solver{}; + GKO_DEFERRED_FACTORY_PARAMETER(solver, LinOpFactory); /** * Already generated solver. If one is provided, the factory `solver` * will be ignored. 
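On the caller side, a deferred parameter such as local_solver or factorization accepts either an already generated factory or a builder whose .on(exec) has not been called yet. A minimal sketch, using the Schwarz/Jacobi type aliases that the tests later in this series also use (exec is assumed to be an existing executor):

    // the inner Jacobi factory picks up its executor from the outer .on(exec)
    auto schwarz_factory =
        Schwarz::build().with_local_solver(Jacobi::build()).on(exec);

    // an explicitly pre-generated factory is still accepted as well
    auto jacobi_factory = gko::share(Jacobi::build().on(exec));
    auto schwarz_factory2 =
        Schwarz::build().with_local_solver(jacobi_factory).on(exec);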
*/ - std::shared_ptr generated_solver{}; + std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( + generated_solver, nullptr); /** * Relaxation factor for Richardson iteration @@ -205,41 +206,18 @@ class Ir : public EnableLinOp>, initial_guess_mode GKO_FACTORY_PARAMETER_SCALAR( default_initial_guess, initial_guess_mode::provided); - /** - * - */ - parameters_type& with_solver( - deferred_factory_parameter solver) - { - this->solver_generator = std::move(solver); - return *this; - } - - /** - * - */ - parameters_type& with_generated_solver( - std::shared_ptr generated_solver) - { - this->generated_solver = std::move(generated_solver); - return *this; - } - /** * */ std::unique_ptr on(std::shared_ptr exec) const { auto parameters_copy = *this; - if (solver_generator) { - parameters_copy.solver = solver_generator.on(exec); + if (solver_generator_) { + parameters_copy.solver = solver_generator_.on(exec); } return parameters_copy.enable_iterative_solver_factory_parameters< parameters_type, Factory>::on(exec); } - - private: - deferred_factory_parameter solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 5aab788f71f..0a0a6fdd191 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -225,16 +225,7 @@ class Multigrid : public EnableLinOp, /** * MultigridLevel Factory list */ - std::vector> mg_level{nullptr}; - - template - parameters_type& with_mg_level(Args&&... level) - { - this->mg_level_generator = { - deferred_factory_parameter{ - std::forward(level)}...}; - return *this; - } + GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(mg_level, LinOpFactory); /** * Custom selector size_type (size_type level, const LinOp* fine_matrix) @@ -265,7 +256,6 @@ class Multigrid : public EnableLinOp, std::function GKO_FACTORY_PARAMETER_SCALAR(level_selector, nullptr); - using smoother_list = std::vector>; /** * Pre-smooth Factory list. * Its size must be 0, 1 or be the same as mg_level's. @@ -280,14 +270,14 @@ class Multigrid : public EnableLinOp, * If any element in the vector is a `nullptr` then the smoother * application at the corresponding level is skipped. */ - smoother_list pre_smoother{}; + GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(pre_smoother, LinOpFactory); /** * Post-smooth Factory list. * It is similar to Pre-smooth Factory list. It is ignored if * the factory parameter post_uses_pre is set to true. */ - smoother_list post_smoother{}; + GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(post_smoother, LinOpFactory); /** * Mid-smooth Factory list. If it contains available elements, multigrid @@ -296,34 +286,7 @@ class Multigrid : public EnableLinOp, * Pre-smooth Factory list. It is ignored if the factory parameter * mid_case is not mid. */ - smoother_list mid_smoother{}; - - template - parameters_type& with_pre_smoother(Args&&... smoother) - { - this->pre_smoother_generator = { - deferred_factory_parameter{ - std::forward(smoother)}...}; - return *this; - } - - template - parameters_type& with_post_smoother(Args&&... smoother) - { - this->post_smoother_generator = { - deferred_factory_parameter{ - std::forward(smoother)}...}; - return *this; - } - - template - parameters_type& with_mid_smoother(Args&&... 
smoother) - { - this->mid_smoother_generator = { - deferred_factory_parameter{ - std::forward(smoother)}...}; - return *this; - } + GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(mid_smoother, LinOpFactory); /** * Whether post-smoothing-related calls use corresponding @@ -363,17 +326,7 @@ class Multigrid : public EnableLinOp, * If not set, then a direct LU solver will be used as solver on the * coarsest level. */ - std::vector> coarsest_solver{ - nullptr}; - - template - parameters_type& with_coarsest_solver(Args&&... solver) - { - this->coarsest_solver_generator = { - deferred_factory_parameter{ - std::forward(solver)}...}; - return *this; - } + GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(coarsest_solver, LinOpFactory); /** * Custom coarsest_solver selector @@ -449,36 +402,36 @@ class Multigrid : public EnableLinOp, std::unique_ptr on(std::shared_ptr exec) const { auto copy = *this; - if (!copy.mg_level_generator.empty()) { + if (!copy.mg_level_generator_.empty()) { copy.mg_level.clear(); - for (auto& generator : copy.mg_level_generator) { + for (auto& generator : copy.mg_level_generator_) { copy.mg_level.push_back(generator.on(exec)); } } - if (!copy.pre_smoother_generator.empty()) { + if (!copy.pre_smoother_generator_.empty()) { copy.pre_smoother.clear(); - for (auto& generator : copy.pre_smoother_generator) { + for (auto& generator : copy.pre_smoother_generator_) { copy.pre_smoother.push_back(generator ? generator.on(exec) : nullptr); } } - if (!copy.mid_smoother_generator.empty()) { + if (!copy.mid_smoother_generator_.empty()) { copy.mid_smoother.clear(); - for (auto& generator : copy.mid_smoother_generator) { + for (auto& generator : copy.mid_smoother_generator_) { copy.mid_smoother.push_back(generator ? generator.on(exec) : nullptr); } } - if (!copy.post_smoother_generator.empty()) { + if (!copy.post_smoother_generator_.empty()) { copy.post_smoother.clear(); - for (auto& generator : copy.post_smoother_generator) { + for (auto& generator : copy.post_smoother_generator_) { copy.post_smoother.push_back(generator ? generator.on(exec) : nullptr); } } - if (!copy.coarsest_solver_generator.empty()) { + if (!copy.coarsest_solver_generator_.empty()) { copy.coarsest_solver.clear(); - for (auto& generator : copy.coarsest_solver_generator) { + for (auto& generator : copy.coarsest_solver_generator_) { copy.coarsest_solver.push_back( generator ? 
generator.on(exec) : nullptr); } @@ -486,18 +439,6 @@ class Multigrid : public EnableLinOp, return copy.enable_iterative_solver_factory_parameters< parameters_type, Factory>::on(exec); } - - private: - std::vector> - mg_level_generator; - std::vector> - pre_smoother_generator; - std::vector> - mid_smoother_generator; - std::vector> - post_smoother_generator; - std::vector> - coarsest_solver_generator; }; GKO_ENABLE_LIN_OP_FACTORY(Multigrid, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); From d1be65246f9612661c78e30ed2940b3fc48b1402 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 21 Sep 2023 12:36:58 +0200 Subject: [PATCH 312/583] improve abstract_factory constructors - make them explicit - pass through nullptr explicitly --- include/ginkgo/core/base/abstract_factory.hpp | 16 ++++++++++------ include/ginkgo/core/solver/multigrid.hpp | 12 ++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index e644bcdcd76..ca8ab7ed2ce 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -288,7 +288,10 @@ class deferred_factory_parameter { deferred_factory_parameter() = default; /** Creates an empty deferred factory parameter. */ - deferred_factory_parameter(std::nullptr_t) {} + explicit deferred_factory_parameter(std::nullptr_t) + { + generator_ = [](std::shared_ptr) { return nullptr; }; + } /** * Creates a deferred factory parameter from a preexisting factory with @@ -298,7 +301,8 @@ class deferred_factory_parameter { std::enable_if_t>::value>* = nullptr> - deferred_factory_parameter(std::shared_ptr factory) + explicit deferred_factory_parameter( + std::shared_ptr factory) { generator_ = [factory = std::shared_ptr(std::move(factory))]( @@ -313,7 +317,7 @@ class deferred_factory_parameter { std::enable_if_t>::value>* = nullptr> - deferred_factory_parameter( + explicit deferred_factory_parameter( std::unique_ptr factory) { generator_ = @@ -329,7 +333,7 @@ class deferred_factory_parameter { template ().on( std::shared_ptr{}))> - deferred_factory_parameter(ParametersType parameters) + explicit deferred_factory_parameter(ParametersType parameters) { generator_ = [parameters](std::shared_ptr exec) -> std::shared_ptr { @@ -341,14 +345,14 @@ class deferred_factory_parameter { std::shared_ptr on( std::shared_ptr exec) const { - if (!(*this)) { + if (this->is_empty()) { GKO_NOT_SUPPORTED(*this); } return generator_(exec); } /** Returns true iff the parameter contains a factory. */ - explicit operator bool() const { return bool(generator_); } + bool is_empty() const { return bool(generator_); } private: std::function( diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 0a0a6fdd191..1256639acb4 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -411,29 +411,25 @@ class Multigrid : public EnableLinOp, if (!copy.pre_smoother_generator_.empty()) { copy.pre_smoother.clear(); for (auto& generator : copy.pre_smoother_generator_) { - copy.pre_smoother.push_back(generator ? generator.on(exec) - : nullptr); + copy.pre_smoother.push_back(generator.on(exec)); } } if (!copy.mid_smoother_generator_.empty()) { copy.mid_smoother.clear(); for (auto& generator : copy.mid_smoother_generator_) { - copy.mid_smoother.push_back(generator ? 
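The distinction this patch draws is between a parameter that was never set and one that was explicitly set to nullptr: the latter now stores a generator that simply produces a null factory, which is why the Multigrid changes in this patch can call generator.on(exec) unconditionally instead of guarding every call with a ternary. A sketch of the construction paths (exec is assumed to exist, Cg is only a stand-in for any factory type, and the explicit markers added here are relaxed again by a later patch in this series):

    using dfp = gko::deferred_factory_parameter<gko::LinOpFactory>;

    dfp unset;                 // no generator stored at all
    dfp null_param{nullptr};   // generator that yields a nullptr factory
    dfp pre_generated{gko::share(gko::solver::Cg<>::build().on(exec))};
    dfp deferred{gko::solver::Cg<>::build()};  // .on(exec) happens later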
generator.on(exec) - : nullptr); + copy.mid_smoother.push_back(generator.on(exec)); } } if (!copy.post_smoother_generator_.empty()) { copy.post_smoother.clear(); for (auto& generator : copy.post_smoother_generator_) { - copy.post_smoother.push_back(generator ? generator.on(exec) - : nullptr); + copy.post_smoother.push_back(generator.on(exec)); } } if (!copy.coarsest_solver_generator_.empty()) { copy.coarsest_solver.clear(); for (auto& generator : copy.coarsest_solver_generator_) { - copy.coarsest_solver.push_back( - generator ? generator.on(exec) : nullptr); + copy.coarsest_solver.push_back(generator.on(exec)); } } return copy.enable_iterative_solver_factory_parameters< From 2ebc888fda6c19785d16787a00be69e8631c6fe4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 21 Sep 2023 12:42:00 +0200 Subject: [PATCH 313/583] remove more instances of .on --- core/test/log/papi.cpp | 4 +-- .../distributed/preconditioner/schwarz.cpp | 6 ++-- core/test/solver/bicg.cpp | 18 ++++------- core/test/solver/bicgstab.cpp | 16 +++++----- core/test/solver/cb_gmres.cpp | 16 +++++----- core/test/solver/cg.cpp | 18 +++++------ core/test/solver/cgs.cpp | 18 +++++------ core/test/solver/fcg.cpp | 18 +++++------ core/test/solver/gcr.cpp | 22 +++++++------- core/test/solver/gmres.cpp | 22 +++++++------- core/test/solver/idr.cpp | 24 +++++++-------- core/test/solver/ir.cpp | 30 +++++++++---------- core/test/solver/multigrid.cpp | 2 +- reference/test/preconditioner/ilu.cpp | 6 ++-- reference/test/solver/ir_kernels.cpp | 5 ++-- reference/test/solver/multigrid_kernels.cpp | 8 ++--- test/solver/ir_kernels.cpp | 8 ++--- 17 files changed, 115 insertions(+), 126 deletions(-) diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index 2ed266449f6..b4e51cdc31b 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -472,7 +472,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateStarted) auto factory = gko::solver::Bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto str = this->init(gko::log::Logger::linop_factory_generate_started_mask, "linop_factory_generate_started", factory.get()); @@ -493,7 +493,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateCompleted) auto factory = gko::solver::Bicgstab::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); TypeParam dummy; auto str = diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index e0b5749e987..5c354b11748 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -123,9 +123,8 @@ TYPED_TEST(SchwarzFactory, CanBeCopied) using Jacobi = typename TestFixture::Jacobi; using Schwarz = typename TestFixture::Schwarz; using Mtx = typename TestFixture::Mtx; - auto bj = gko::share(Jacobi::build().on(this->exec)); auto copy = Schwarz::build() - .with_local_solver(bj) + .with_local_solver(Jacobi::build()) .on(this->exec) ->generate(Mtx::create(this->exec, MPI_COMM_WORLD)); @@ -141,9 +140,8 @@ TYPED_TEST(SchwarzFactory, CanBeMoved) using Schwarz = typename TestFixture::Schwarz; using Mtx = typename TestFixture::Mtx; auto tmp = clone(this->schwarz); - auto bj = gko::share(Jacobi::build().on(this->exec)); auto copy = Schwarz::build() - .with_local_solver(bj) + .with_local_solver(Jacobi::build()) 
.on(this->exec) ->generate(Mtx::create(this->exec, MPI_COMM_WORLD)); diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp index 37ed110bdf4..9e49b118484 100644 --- a/core/test/solver/bicg.cpp +++ b/core/test/solver/bicg.cpp @@ -194,15 +194,13 @@ TYPED_TEST(Bicg, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr bicg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicg_precond) .on(this->exec); auto solver = bicg_factory->generate(this->mtx); @@ -245,15 +243,13 @@ TYPED_TEST(Bicg, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr bicg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto bicg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicg_precond) .on(this->exec); @@ -278,15 +274,13 @@ TYPED_TEST(Bicg, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr bicg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = bicg_factory->generate(this->mtx); solver->set_preconditioner(bicg_precond); diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index 937064da7c4..d5b489feff9 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -160,13 +160,13 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerGenerator) auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); @@ -208,14 +208,14 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerInFactory) std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicgstab_precond) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); @@ -235,14 +235,14 @@ TYPED_TEST(Bicgstab, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicgstab_precond) .on(this->exec); @@ -268,14 +268,14 @@ TYPED_TEST(Bicgstab, CanSetPreconditioner) std::shared_ptr bicgstab_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicgstab_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); solver->set_preconditioner(bicgstab_precond); diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp index 17dcf0c385f..5f6076f248c 100644 --- a/core/test/solver/cb_gmres.cpp +++ b/core/test/solver/cb_gmres.cpp @@ -257,7 +257,7 @@ TYPED_TEST(CbGmres, CanSetKrylovDim) Solver::build() .with_krylov_dim(new_krylov_dim) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -277,7 +277,7 @@ TYPED_TEST(CbGmres, CanUseSetKrylovDim) auto cb_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -296,14 +296,14 @@ TYPED_TEST(CbGmres, CanSetPreconditionerInFactory) std::shared_ptr cb_gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cb_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cb_gmres_precond) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -323,14 +323,14 @@ TYPED_TEST(CbGmres, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr cb_gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto cb_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cb_gmres_precond) .on(this->exec); @@ -344,14 +344,14 @@ TYPED_TEST(CbGmres, CanSetPreconditioner) std::shared_ptr cb_gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cb_gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); solver->set_preconditioner(cb_gmres_precond); diff --git a/core/test/solver/cg.cpp 
b/core/test/solver/cg.cpp index d0381a6e5ab..d1d7dbee344 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ -164,17 +164,17 @@ TYPED_TEST(Cg, CanSetPreconditionerGenerator) auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor( gko::remove_complex(1e-6)) - .on(this->exec)) + ) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = cg_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -194,14 +194,14 @@ TYPED_TEST(Cg, CanSetPreconditionerInFactory) std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cg_precond) .on(this->exec); auto solver = cg_factory->generate(this->mtx); @@ -245,14 +245,14 @@ TYPED_TEST(Cg, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cg_precond) .on(this->exec); @@ -278,14 +278,14 @@ TYPED_TEST(Cg, CanSetPreconditioner) std::shared_ptr cg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cg_factory->generate(this->mtx); solver->set_preconditioner(cg_precond); diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index 7509c22d76e..705e9f850c8 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -164,17 +164,17 @@ TYPED_TEST(Cgs, CanSetPreconditionerGenerator) auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor( gko::remove_complex(1e-6)) - .on(this->exec)) + ) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -218,14 +218,14 @@ TYPED_TEST(Cgs, CanSetPreconditionerInFactory) std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + 
gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cgs_precond) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); @@ -245,14 +245,14 @@ TYPED_TEST(Cgs, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cgs_precond) .on(this->exec); @@ -278,14 +278,14 @@ TYPED_TEST(Cgs, CanSetPreconditioner) std::shared_ptr cgs_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cgs_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); solver->set_preconditioner(cgs_precond); diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index 21cc686bd01..4ba3f389ecd 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -163,17 +163,17 @@ TYPED_TEST(Fcg, CanSetPreconditionerGenerator) auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor( gko::remove_complex(1e-6)) - .on(this->exec)) + ) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = fcg_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -217,14 +217,14 @@ TYPED_TEST(Fcg, CanSetPreconditionerInFactory) std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(fcg_precond) .on(this->exec); auto solver = fcg_factory->generate(this->mtx); @@ -244,14 +244,14 @@ TYPED_TEST(Fcg, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(fcg_precond) .on(this->exec); @@ -277,14 +277,14 @@ TYPED_TEST(Fcg, CanSetPreconditioner) std::shared_ptr fcg_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto fcg_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) 
.on(this->exec); auto solver = fcg_factory->generate(this->mtx); solver->set_preconditioner(fcg_precond); diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index fec313582ed..554d5aa9526 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -194,16 +194,16 @@ TYPED_TEST(Gcr, CanSetPreconditionerGenerator) auto gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor(TestFixture::reduction_factor) - .on(this->exec)) + ) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -249,10 +249,10 @@ TYPED_TEST(Gcr, CanSetKrylovDim) Solver::build() .with_krylov_dim(4u) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(4u), gko::stop::ResidualNorm::build() .with_reduction_factor(TestFixture::reduction_factor) - .on(this->exec)) + ) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); @@ -286,14 +286,14 @@ TYPED_TEST(Gcr, CanSetPreconditionerInFactory) std::shared_ptr gcr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gcr_precond) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); @@ -313,14 +313,14 @@ TYPED_TEST(Gcr, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr gcr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gcr_precond) .on(this->exec); @@ -346,14 +346,14 @@ TYPED_TEST(Gcr, CanSetPreconditioner) std::shared_ptr gcr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gcr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); solver->set_preconditioner(gcr_precond); diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 8ce8135f8b2..c2d62b3bb45 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -180,16 +180,16 @@ TYPED_TEST(Gmres, CanSetPreconditionerGenerator) auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor(TestFixture::reduction_factor) - .on(this->exec)) + ) .with_preconditioner( Solver::build() .with_criteria( 
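All of these test changes follow one pattern: nested stopping-criterion and preconditioner builders no longer carry their own .on(this->exec), since they are now stored as deferred factory parameters and receive the executor of the enclosing .on(exec). Schematically, with Solver standing for the solver typedef of the respective test (for example gko::solver::Cg<>) and exec for its executor:

    // before: every nested builder was bound to the executor explicitly
    auto f_old =
        Solver::build()
            .with_criteria(
                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
            .with_preconditioner(Solver::build().with_criteria(
                gko::stop::Iteration::build().with_max_iters(3u).on(exec)))
            .on(exec);

    // after: the nested builders are passed as-is and are generated with
    // `exec` when the outer factory is created
    auto f_new =
        Solver::build()
            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
            .with_preconditioner(Solver::build().with_criteria(
                gko::stop::Iteration::build().with_max_iters(3u)))
            .on(exec);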
gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -236,10 +236,10 @@ TYPED_TEST(Gmres, CanSetKrylovDim) Solver::build() .with_krylov_dim(4u) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(4u), gko::stop::ResidualNorm::build() .with_reduction_factor(TestFixture::reduction_factor) - .on(this->exec)) + ) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); @@ -273,14 +273,14 @@ TYPED_TEST(Gmres, CanSetPreconditionerInFactory) std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gmres_precond) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); @@ -300,14 +300,14 @@ TYPED_TEST(Gmres, ThrowsOnWrongPreconditionerInFactory) std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gmres_precond) .on(this->exec); @@ -333,14 +333,14 @@ TYPED_TEST(Gmres, CanSetPreconditioner) std::shared_ptr gmres_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gmres_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); solver->set_preconditioner(gmres_precond); diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index e2657be8581..a93978fa335 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -162,13 +162,13 @@ TYPED_TEST(Idr, CanSetPreconditionerGenerator) auto idr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_preconditioner( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = idr_factory->generate(this->mtx); @@ -209,14 +209,14 @@ TYPED_TEST(Idr, CanSetPreconditionerInFactory) std::shared_ptr idr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto idr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(idr_precond) .on(this->exec); auto solver = idr_factory->generate(this->mtx); @@ -236,14 +236,14 @@ TYPED_TEST(Idr, 
ThrowsOnWrongPreconditionerInFactory) std::shared_ptr idr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto idr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(idr_precond) .on(this->exec); @@ -257,14 +257,14 @@ TYPED_TEST(Idr, CanSetPreconditioner) std::shared_ptr idr_precond = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto idr_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); solver->set_preconditioner(idr_precond); @@ -283,7 +283,7 @@ TYPED_TEST(Idr, CanSetSubspaceDim) Solver::build() .with_subspace_dim(8u) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto subspace_dim = solver->get_subspace_dim(); @@ -320,7 +320,7 @@ TYPED_TEST(Idr, CanSetKappa) Solver::build() .with_kappa(real_type{0.05}) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto kappa = solver->get_kappa(); @@ -359,7 +359,7 @@ TYPED_TEST(Idr, CanSetDeterministic) Solver::build() .with_deterministic(true) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto deterministic = solver->get_deterministic(); @@ -396,7 +396,7 @@ TYPED_TEST(Idr, CanSetComplexSubspace) Solver::build() .with_complex_subspace(true) .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto complex_subspace = solver->get_complex_subspace(); diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp index 7419f99bfd0..93ea3e89b10 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -163,16 +163,16 @@ TYPED_TEST(Ir, CanSetInnerSolverInFactory) auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) - .on(this->exec)) + ) .with_solver( Solver::build() .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on( this->exec)) - .on(this->exec)) + ) .on(this->exec); auto solver = ir_factory->generate(this->mtx); auto inner_solver = dynamic_cast( @@ -190,14 +190,14 @@ TYPED_TEST(Ir, CanSetGeneratedInnerSolverInFactory) std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto ir_factory = Solver::build() .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_solver(ir_solver) .on(this->exec); auto solver = ir_factory->generate(this->mtx); @@ -241,14 +241,14 @@ TYPED_TEST(Ir, ThrowsOnWrongInnerSolverInFactory) std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_solver(ir_solver) .on(this->exec); @@ -262,14 +262,14 @@ TYPED_TEST(Ir, CanSetInnerSolver) std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = ir_factory->generate(this->mtx); solver->set_solver(ir_solver); @@ -311,14 +311,14 @@ TYPED_TEST(Ir, ThrowOnWrongInnerSolverSet) std::shared_ptr ir_solver = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = ir_factory->generate(this->mtx); @@ -346,10 +346,10 @@ TYPED_TEST(Ir, DefaultRelaxationFactor) auto richardson = gko::solver::Richardson::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) - .on(this->exec)) + ) .on(this->exec) ->generate(this->mtx); @@ -365,10 +365,10 @@ TYPED_TEST(Ir, UseAsRichardson) auto richardson = gko::solver::Richardson::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), + gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) - .on(this->exec)) + ) .with_relaxation_factor(relaxation_factor) .on(this->exec) ->generate(this->mtx); diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 8fea85a40bb..e9d6b332aac 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -287,7 +287,7 @@ TYPED_TEST(Multigrid, ApplyUsesInitialGuessReturnsFalseWhenZeroGuess) auto multigrid_factory = Solver::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(3u)) .with_max_levels(2u) .with_coarsest_solver(this->lo_factory) .with_pre_smoother(this->lo_factory) diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 92fe8fac8cf..22c9929219e 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -615,8 +615,8 @@ TEST_F(DefaultIlu, CanBeUsedAsPreconditioner) auto solver = gko::solver::Bicgstab<>::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(this->exec)) - 
.with_preconditioner(default_ilu_prec_type::build().on(this->exec)) + gko::stop::Iteration::build().with_max_iters(2u)) + .with_preconditioner(default_ilu_prec_type::build()) .on(this->exec) ->generate(this->mtx); auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); @@ -636,7 +636,7 @@ TEST_F(DefaultIlu, CanBeUsedAsGeneratedPreconditioner) auto solver = gko::solver::Bicgstab<>::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(2u)) .with_generated_preconditioner(precond) .on(this->exec) ->generate(this->mtx); diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index fc0c130aa83..4fae1bfdac8 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -455,7 +455,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef) auto ref_solver = gko::solver::Ir::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .on(this->exec) ->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -464,8 +464,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef) auto solver = gko::solver::Ir::build() .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - this->exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .with_default_initial_guess(guess) .on(this->exec) ->generate(this->mtx); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 23307d20b33..86be56ce3cb 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -406,8 +406,7 @@ class Multigrid : public ::testing::Test { .with_post_uses_pre(false) .with_mid_case(mid_case) .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - this->exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(this->exec)); @@ -428,8 +427,7 @@ class Multigrid : public ::testing::Test { .with_post_uses_pre(true) .with_mid_case(mid_case) .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on( - this->exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(this->exec)); @@ -1266,7 +1264,7 @@ TYPED_TEST(Multigrid, ZeroGuessIgnoresInput) .with_max_levels(2u) .with_mg_level(this->coarse_factory) .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)) + gko::stop::Iteration::build().with_max_iters(1u)) .with_min_coarse_rows(1u); auto normal_mg = common_part .with_default_initial_guess( diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index 81464036c69..9374b7867ce 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -133,13 +133,13 @@ TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef) auto ir_factory = gko::solver::Ir::build() .with_solver(gko::solver::Gmres::build().with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(ref))) + gko::stop::Iteration::build().with_max_iters(1u))) .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .on(ref); auto d_ir_factory = gko::solver::Ir::build() .with_solver(gko::solver::Gmres::build().with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec))) + gko::stop::Iteration::build().with_max_iters(1u))) .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) 
.on(exec); auto solver = ir_factory->generate(std::move(mtx)); @@ -197,14 +197,14 @@ TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef) auto ir_factory = gko::solver::Ir::build() .with_solver(gko::solver::Gmres::build().with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(ref))) + gko::stop::Iteration::build().with_max_iters(1u))) .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(ref); auto d_ir_factory = gko::solver::Ir::build() .with_solver(gko::solver::Gmres::build().with_criteria( - gko::stop::Iteration::build().with_max_iters(1u).on(exec))) + gko::stop::Iteration::build().with_max_iters(1u))) .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_relaxation_factor(value_type{0.9}) .on(exec); From fb941081c6b1b9aaf5713b9434e0e7a5f370ef06 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 15:10:27 +0200 Subject: [PATCH 314/583] add .gitignore to build folders automatically --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac16267717..4dbce4a29c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -511,3 +511,8 @@ else() FILE(READ ${PROJECT_BINARY_DIR}/minimal.log GINKGO_LOG_SUMMARY) endif() MESSAGE(STATUS "${GINKGO_LOG_SUMMARY}") + +# make sure no build files get committed accidentally +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/.gitignore) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/.gitignore "*") +endif() From e606c36a1254418ed70f723b346dad8730d4ac31 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 17:15:46 +0200 Subject: [PATCH 315/583] handle deferred factory generation by registration Co-authored-by: Marcel Koch --- core/test/solver/multigrid.cpp | 30 ++++--- include/ginkgo/core/base/abstract_factory.hpp | 86 +++++++++++++++++-- include/ginkgo/core/base/lin_op.hpp | 20 ----- include/ginkgo/core/base/std_extensions.hpp | 10 +++ .../distributed/preconditioner/schwarz.hpp | 8 -- include/ginkgo/core/preconditioner/ic.hpp | 31 +++---- include/ginkgo/core/preconditioner/ilu.hpp | 41 ++++----- include/ginkgo/core/solver/direct.hpp | 11 --- include/ginkgo/core/solver/ir.hpp | 13 --- include/ginkgo/core/solver/multigrid.hpp | 37 -------- include/ginkgo/core/solver/solver_base.hpp | 64 +++++++------- 11 files changed, 169 insertions(+), 182 deletions(-) diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index e9d6b332aac..9f7bddb633c 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -286,8 +286,7 @@ TYPED_TEST(Multigrid, ApplyUsesInitialGuessReturnsFalseWhenZeroGuess) using Solver = typename TestFixture::Solver; auto multigrid_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_max_levels(2u) .with_coarsest_solver(this->lo_factory) .with_pre_smoother(this->lo_factory) @@ -426,25 +425,28 @@ TYPED_TEST(Multigrid, ThrowWhenNullMgLevel) TYPED_TEST(Multigrid, ThrowWhenMgLevelContainsNullptr) { using Solver = typename TestFixture::Solver; - auto factory_parameters = Solver::build() - .with_max_levels(1u) - .with_min_coarse_rows(2u) - .with_criteria(this->criterion) - .with_mg_level(this->rp_factory, nullptr); + auto factory = Solver::build() + .with_max_levels(1u) + .with_min_coarse_rows(2u) + .with_criteria(this->criterion) + .with_mg_level(this->rp_factory, nullptr) + .on(this->exec); - 
ASSERT_THROW(factory_parameters.on(this->exec), gko::NotSupported); + ASSERT_THROW(factory->generate(this->mtx), gko::NotSupported); } TYPED_TEST(Multigrid, ThrowWhenEmptyMgLevelList) { using Solver = typename TestFixture::Solver; - auto factory = Solver::build() - .with_max_levels(1u) - .with_min_coarse_rows(2u) - .with_mg_level() - .with_criteria(this->criterion) - .on(this->exec); + auto factory = + Solver::build() + .with_max_levels(1u) + .with_min_coarse_rows(2u) + .with_mg_level( + std::vector>{}) + .with_criteria(this->criterion) + .on(this->exec); ASSERT_THROW(factory->generate(this->mtx), gko::NotSupported); } diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index ca8ab7ed2ce..341340a5db2 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_BASE_ABSTRACT_FACTORY_HPP_ +#include + + #include @@ -257,7 +260,11 @@ class enable_parameters_type { */ std::unique_ptr on(std::shared_ptr exec) const { - auto factory = std::unique_ptr(new Factory(exec, *self())); + ConcreteParametersType copy = *self(); + for (const auto& item : deferred_factories) { + item.second(exec, copy); + } + auto factory = std::unique_ptr(new Factory(exec, copy)); for (auto& logger : loggers) { factory->add_logger(logger); }; @@ -271,9 +278,35 @@ class enable_parameters_type { * Loggers to be attached to the factory and generated object. */ std::vector> loggers{}; + + std::unordered_map exec, + ConcreteParametersType&)>> + deferred_factories; }; +/** + * This Macro will generate a new type containing the parameters for the factory + * `_factory_name`. For more details, see #GKO_ENABLE_LIN_OP_FACTORY(). + * It is required to use this macro **before** calling the + * macro #GKO_ENABLE_LIN_OP_FACTORY(). + * It is also required to use the same names for all parameters between both + * macros. + * + * @param _parameters_name name of the parameters member in the class + * @param _factory_name name of the generated factory type + * + * @ingroup LinOp + */ +#define GKO_CREATE_FACTORY_PARAMETERS(_parameters_name, _factory_name) \ +public: \ + class _factory_name; \ + struct _parameters_name##_type \ + : public ::gko::enable_parameters_type<_parameters_name##_type, \ + _factory_name> + + /** * Represents a factory parameter of factory type that can either initialized by * a pre-existing factory or by passing in a factory_parameters object whose @@ -288,7 +321,7 @@ class deferred_factory_parameter { deferred_factory_parameter() = default; /** Creates an empty deferred factory parameter. 
*/ - explicit deferred_factory_parameter(std::nullptr_t) + deferred_factory_parameter(std::nullptr_t) { generator_ = [](std::shared_ptr) { return nullptr; }; } @@ -301,8 +334,7 @@ class deferred_factory_parameter { std::enable_if_t>::value>* = nullptr> - explicit deferred_factory_parameter( - std::shared_ptr factory) + deferred_factory_parameter(std::shared_ptr factory) { generator_ = [factory = std::shared_ptr(std::move(factory))]( @@ -317,7 +349,7 @@ class deferred_factory_parameter { std::enable_if_t>::value>* = nullptr> - explicit deferred_factory_parameter( + deferred_factory_parameter( std::unique_ptr factory) { generator_ = @@ -333,7 +365,7 @@ class deferred_factory_parameter { template ().on( std::shared_ptr{}))> - explicit deferred_factory_parameter(ParametersType parameters) + deferred_factory_parameter(ParametersType parameters) { generator_ = [parameters](std::shared_ptr exec) -> std::shared_ptr { @@ -351,8 +383,8 @@ class deferred_factory_parameter { return generator_(exec); } - /** Returns true iff the parameter contains a factory. */ - bool is_empty() const { return bool(generator_); } + /** Returns true iff the parameter is empty. */ + bool is_empty() const { return !bool(generator_); } private: std::function( @@ -499,6 +531,12 @@ public: \ parameters_type& with_##_name(deferred_factory_parameter<_type> factory) \ { \ this->_name##_generator_ = std::move(factory); \ + this->deferred_factories[#_name] = [](const auto& exec, \ + auto& params) { \ + if (!params._name##_generator_.is_empty()) { \ + params._name = params._name##_generator_.on(exec); \ + } \ + }; \ return *this; \ } \ \ @@ -523,11 +561,41 @@ public: \ #define GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(_name, _type) \ public: \ std::vector> _name{}; \ - template \ + template >...>::value>> \ parameters_type& with_##_name(Args&&... factories) \ { \ this->_name##_generator_ = {deferred_factory_parameter<_type>{ \ std::forward(factories)}...}; \ + this->deferred_factories[#_name] = [](const auto& exec, \ + auto& params) { \ + if (!params._name##_generator_.empty()) { \ + params._name.clear(); \ + for (auto& generator : params._name##_generator_) { \ + params._name.push_back(generator.on(exec)); \ + } \ + } \ + }; \ + return *this; \ + } \ + template \ + parameters_type& with_##_name(const std::vector& factories) \ + { \ + this->_name##_generator_.clear(); \ + for (const auto& factory : factories) { \ + this->_name##_generator_.push_back(factory); \ + } \ + this->deferred_factories[#_name] = [](const auto& exec, \ + auto& params) { \ + if (!params._name##_generator_.empty()) { \ + params._name.clear(); \ + for (auto& generator : params._name##_generator_) { \ + params._name.push_back(generator.on(exec)); \ + } \ + } \ + }; \ return *this; \ } \ \ diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index e2660baff2e..407fafda0d1 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -949,26 +949,6 @@ using EnableDefaultLinOpFactory = EnableDefaultFactory; -/** - * This Macro will generate a new type containing the parameters for the factory - * `_factory_name`. For more details, see #GKO_ENABLE_LIN_OP_FACTORY(). - * It is required to use this macro **before** calling the - * macro #GKO_ENABLE_LIN_OP_FACTORY(). - * It is also required to use the same names for all parameters between both - * macros. 
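The vector variant of the macro accepts either a list of individual entries, each convertible to a deferred_factory_parameter (nullptr entries included), or a single std::vector of factories, which is what the updated Multigrid test above relies on. A sketch (exec, smoother_factory and mg_level_factory are assumed to be an existing executor and existing factory builders or generated factories):

    auto mg_factory =
        gko::solver::Multigrid::build()
            // variadic form: one deferred entry per level, nullptr allowed
            .with_pre_smoother(smoother_factory, nullptr)
            // vector form: pass an explicit (possibly empty) list
            .with_mg_level(
                std::vector<std::shared_ptr<const gko::LinOpFactory>>{
                    mg_level_factory})
            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
            .with_min_coarse_rows(1u)
            .on(exec);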
- * - * @param _parameters_name name of the parameters member in the class - * @param _factory_name name of the generated factory type - * - * @ingroup LinOp - */ -#define GKO_CREATE_FACTORY_PARAMETERS(_parameters_name, _factory_name) \ -public: \ - class _factory_name; \ - struct _parameters_name##_type \ - : public ::gko::enable_parameters_type<_parameters_name##_type, \ - _factory_name> - /** * This macro will generate a default implementation of a LinOpFactory for the diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp index 69629f98e06..1064ae464f0 100644 --- a/include/ginkgo/core/base/std_extensions.hpp +++ b/include/ginkgo/core/base/std_extensions.hpp @@ -128,6 +128,16 @@ constexpr bool less_equal(const T&& lhs, const T&& rhs) } +// available in with C++17 +template +struct conjunction : std::true_type {}; +template +struct conjunction : B1 {}; +template +struct conjunction + : std::conditional_t, B1> {}; + + } // namespace xstd } // namespace gko diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index fe0539570ee..f31bd96aa2e 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -95,14 +95,6 @@ class Schwarz * Local solver factory. */ GKO_DEFERRED_FACTORY_PARAMETER(local_solver, LinOpFactory); - - std::unique_ptr on(std::shared_ptr exec) const - { - auto copy = *this; - copy.local_solver = local_solver_generator_.on(exec); - return copy.enable_parameters_type::on( - exec); - } }; GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index ed5063d403b..97e7fe37871 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -145,6 +145,13 @@ class Ic : public EnableLinOp>, public Transposable { deferred_factory_parameter solver) { this->l_solver_generator = std::move(solver); + this->deferred_factories["l_solver"] = [](const auto& exec, + auto& params) { + if (!params.l_solver_generator.is_empty()) { + params.l_solver_factory = + params.l_solver_generator.on(exec); + } + }; return *this; } @@ -159,26 +166,16 @@ class Ic : public EnableLinOp>, public Transposable { deferred_factory_parameter factorization) { this->factorization_generator = std::move(factorization); + this->deferred_factories["factorization"] = [](const auto& exec, + auto& params) { + if (!params.factorization_generator.is_empty()) { + params.factorization_factory = + params.factorization_generator.on(exec); + } + }; return *this; } - /** - * - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto parameters_copy = *this; - if (l_solver_generator) { - parameters_copy.l_solver_factory = l_solver_generator.on(exec); - } - if (factorization_generator) { - parameters_copy.factorization_factory = - factorization_generator.on(exec); - } - return parameters_copy - .enable_parameters_type::on(exec); - } - private: deferred_factory_parameter l_solver_generator; diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index f4f8d0abd5b..d0f32c18c8c 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -163,6 +163,13 @@ class Ilu : public EnableLinOp< deferred_factory_parameter solver) { this->l_solver_generator = 
std::move(solver); + this->deferred_factories["l_solver"] = [](const auto& exec, + auto& params) { + if (!params.l_solver_generator.is_empty()) { + params.l_solver_factory = + params.l_solver_generator.on(exec); + } + }; return *this; } @@ -177,6 +184,13 @@ class Ilu : public EnableLinOp< deferred_factory_parameter solver) { this->u_solver_generator = std::move(solver); + this->deferred_factories["u_solver"] = [](const auto& exec, + auto& params) { + if (!params.u_solver_generator.is_empty()) { + params.u_solver_factory = + params.u_solver_generator.on(exec); + } + }; return *this; } @@ -191,29 +205,16 @@ class Ilu : public EnableLinOp< deferred_factory_parameter factorization) { this->factorization_generator = std::move(factorization); + this->deferred_factories["factorization"] = [](const auto& exec, + auto& params) { + if (!params.factorization_generator.is_empty()) { + params.factorization_factory = + params.factorization_generator.on(exec); + } + }; return *this; } - /** - * - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto parameters_copy = *this; - if (l_solver_generator) { - parameters_copy.l_solver_factory = l_solver_generator.on(exec); - } - if (u_solver_generator) { - parameters_copy.u_solver_factory = u_solver_generator.on(exec); - } - if (factorization_generator) { - parameters_copy.factorization_factory = - factorization_generator.on(exec); - } - return parameters_copy - .enable_parameters_type::on(exec); - } - private: deferred_factory_parameter l_solver_generator; diff --git a/include/ginkgo/core/solver/direct.hpp b/include/ginkgo/core/solver/direct.hpp index dcd6fd189a6..ee6783ff96d 100644 --- a/include/ginkgo/core/solver/direct.hpp +++ b/include/ginkgo/core/solver/direct.hpp @@ -88,17 +88,6 @@ class Direct : public EnableLinOp>, /** The factorization factory to use for generating the factors. 
*/ GKO_DEFERRED_FACTORY_PARAMETER(factorization, LinOpFactory); - - /** - * - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto parameters_copy = *this; - parameters_copy.factorization = factorization_generator_.on(exec); - return parameters_copy - .enable_parameters_type::on(exec); - } }; GKO_ENABLE_LIN_OP_FACTORY(Direct, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 1f04c8b75d2..468e539f487 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -205,19 +205,6 @@ class Ir : public EnableLinOp>, */ initial_guess_mode GKO_FACTORY_PARAMETER_SCALAR( default_initial_guess, initial_guess_mode::provided); - - /** - * - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto parameters_copy = *this; - if (solver_generator_) { - parameters_copy.solver = solver_generator_.on(exec); - } - return parameters_copy.enable_iterative_solver_factory_parameters< - parameters_type, Factory>::on(exec); - } }; GKO_ENABLE_LIN_OP_FACTORY(Ir, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 1256639acb4..21860844d3e 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -398,43 +398,6 @@ class Multigrid : public EnableLinOp, */ initial_guess_mode GKO_FACTORY_PARAMETER_SCALAR( default_initial_guess, initial_guess_mode::zero); - - std::unique_ptr on(std::shared_ptr exec) const - { - auto copy = *this; - if (!copy.mg_level_generator_.empty()) { - copy.mg_level.clear(); - for (auto& generator : copy.mg_level_generator_) { - copy.mg_level.push_back(generator.on(exec)); - } - } - if (!copy.pre_smoother_generator_.empty()) { - copy.pre_smoother.clear(); - for (auto& generator : copy.pre_smoother_generator_) { - copy.pre_smoother.push_back(generator.on(exec)); - } - } - if (!copy.mid_smoother_generator_.empty()) { - copy.mid_smoother.clear(); - for (auto& generator : copy.mid_smoother_generator_) { - copy.mid_smoother.push_back(generator.on(exec)); - } - } - if (!copy.post_smoother_generator_.empty()) { - copy.post_smoother.clear(); - for (auto& generator : copy.post_smoother_generator_) { - copy.post_smoother.push_back(generator.on(exec)); - } - } - if (!copy.coarsest_solver_generator_.empty()) { - copy.coarsest_solver.clear(); - for (auto& generator : copy.coarsest_solver_generator_) { - copy.coarsest_solver.push_back(generator.on(exec)); - } - } - return copy.enable_iterative_solver_factory_parameters< - parameters_type, Factory>::on(exec); - } }; GKO_ENABLE_LIN_OP_FACTORY(Multigrid, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index f9132426c61..3888d7fe62d 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -861,6 +861,10 @@ class EnablePreconditionedIterativeSolver }; +/** + * The parameter type shared between all iterative solvers. + * @see GKO_CREATE_FACTORY_PARAMETERS + */ struct iterative_solver_factory_parameters { /** * Stopping criteria to be used by the solver. 
@@ -883,27 +887,18 @@ struct enable_iterative_solver_factory_parameters this->criterion_generators = { deferred_factory_parameter{ std::forward(value)}...}; + this->deferred_factories["criteria"] = [](const auto& exec, + auto& params) { + if (!params.criterion_generators.empty()) { + params.criteria.clear(); + for (auto& generator : params.criterion_generators) { + params.criteria.push_back(generator.on(exec)); + } + } + }; return *self(); } - /** - * @copydoc enable_solver_factory_parameters::on - * - * @note This variant instantiates stopping criteria that were provided - * without calling `.on(exec)` before generating the factory. - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto copy = *self(); - copy.criteria.clear(); - for (auto& generator : criterion_generators) { - copy.criteria.push_back(generator.on(exec)); - } - auto factory = - copy.enable_parameters_type::on(exec); - return factory; - } - private: GKO_ENABLE_SELF(Parameters); @@ -912,6 +907,11 @@ struct enable_iterative_solver_factory_parameters }; +/** + * The parameter type shared between all preconditioned iterative solvers, + * excluding the parameters available in iterative_solver_factory_parameters. + * @see GKO_CREATE_FACTORY_PARAMETERS + */ struct preconditioned_iterative_solver_factory_parameters { /** * The preconditioner to be used by the iterative solver. By default, no @@ -932,17 +932,28 @@ struct enable_preconditioned_iterative_solver_factory_parameters : enable_iterative_solver_factory_parameters, preconditioned_iterative_solver_factory_parameters { /** - * + * Provides a preconditioner factory to be used by the iterative solver in a + * fluent interface. + * @see preconditioned_iterative_solver_factory_parameters::preconditioner */ Parameters& with_preconditioner( deferred_factory_parameter preconditioner) { this->preconditioner_generator = std::move(preconditioner); + this->deferred_factories["preconditioner"] = [](const auto& exec, + auto& params) { + if (!params.preconditioner_generator.is_empty()) { + params.preconditioner = + params.preconditioner_generator.on(exec); + } + }; return *self(); } /** - * + * Provides a concrete preconditioner to be used by the iterative solver in + * a fluent interface. 
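Taken together, these setters are what let user code drop the nested .on(exec) calls while keeping the old explicit-factory form working. A hedged usage sketch, with CG and the iteration criterion chosen only as examples mirroring the tests updated further below:

#include <ginkgo/ginkgo.h>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    using cg = gko::solver::Cg<double>;

    // Old style: every nested factory is bound to the executor explicitly.
    auto verbose =
        cg::build()
            .with_criteria(
                gko::stop::Iteration::build().with_max_iters(100u).on(exec))
            .with_preconditioner(
                cg::build()
                    .with_criteria(
                        gko::stop::Iteration::build().with_max_iters(3u).on(exec))
                    .on(exec))
            .on(exec);

    // New style: the outer .on(exec) propagates the executor to all deferred
    // criteria and preconditioner parameters.
    auto deferred =
        cg::build()
            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u))
            .with_preconditioner(cg::build().with_criteria(
                gko::stop::Iteration::build().with_max_iters(3u)))
            .on(exec);
}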
+ * @see preconditioned_iterative_solver_factory_parameters::preconditioner */ Parameters& with_generated_preconditioner( std::shared_ptr generated_preconditioner) @@ -951,19 +962,6 @@ struct enable_preconditioned_iterative_solver_factory_parameters return *self(); } - /** - * - */ - std::unique_ptr on(std::shared_ptr exec) const - { - auto parameters_copy = *self(); - if (preconditioner_generator) { - parameters_copy.preconditioner = preconditioner_generator.on(exec); - } - return parameters_copy.enable_iterative_solver_factory_parameters< - Parameters, Factory>::on(exec); - } - private: GKO_ENABLE_SELF(Parameters); From 6fb2e4ad0564161cb64d3ded7a3143e87d27e9b3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 18:06:02 +0200 Subject: [PATCH 316/583] add missing documentation Co-authored-by: Marcel Koch --- include/ginkgo/core/base/abstract_factory.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 341340a5db2..cca440afe6c 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -279,6 +279,12 @@ class enable_parameters_type { */ std::vector> loggers{}; + /** + * Deferred factory parameter initialization functions that will be called + * in on(). Their names usually correspond to the variable names in the + * parameter type. They will be provided the executor and the parameter + * object currently being initialized from the generators. + */ std::unordered_map exec, ConcreteParametersType&)>> @@ -318,9 +324,10 @@ public: \ template class deferred_factory_parameter { public: + /** Creates an empty deferred factory parameter. */ deferred_factory_parameter() = default; - /** Creates an empty deferred factory parameter. */ + /** Creates a deferred factory parameter returning a nullptr. */ deferred_factory_parameter(std::nullptr_t) { generator_ = [](std::shared_ptr) { return nullptr; }; @@ -373,7 +380,10 @@ class deferred_factory_parameter { }; } - /** Instantiates the deferred parameter into an actual factory. */ + /** + * Instantiates the deferred parameter into an actual factory. This will + * throw if the deferred factory parameter is empty. + */ std::shared_ptr on( std::shared_ptr exec) const { From d752e1b48671b6b6bcfd54ab6fd6d679cc1ca778 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 6 Oct 2023 18:07:15 +0200 Subject: [PATCH 317/583] review updates - remove additional .on(...) 
calls - add tests for old functionality - add assertions for dynamic type Co-authored-by: Pratik Nayak --- .../distributed/preconditioner/schwarz.cpp | 13 +++ core/test/preconditioner/ic.cpp | 41 +++++++ core/test/preconditioner/ilu.cpp | 46 ++++++++ core/test/solver/CMakeLists.txt | 1 + core/test/solver/bicg.cpp | 35 ++++-- core/test/solver/bicgstab.cpp | 46 ++++---- core/test/solver/cb_gmres.cpp | 59 +++++----- core/test/solver/cg.cpp | 53 +++++---- core/test/solver/cgs.cpp | 53 +++++---- core/test/solver/direct.cpp | 105 ++++++++++++++++++ core/test/solver/fcg.cpp | 53 +++++---- core/test/solver/gcr.cpp | 49 ++++---- core/test/solver/gmres.cpp | 49 ++++---- core/test/solver/idr.cpp | 58 +++++----- core/test/solver/ir.cpp | 77 ++++++------- core/test/solver/multigrid.cpp | 28 +++++ core/test/solver/workspace.cpp | 6 +- core/test/utils/assertions.hpp | 88 +++++++++++++++ .../performance-debugging.cpp | 3 +- reference/test/matrix/csr_kernels.cpp | 5 +- reference/test/solver/cb_gmres_kernels.cpp | 41 +++---- reference/test/solver/gcr_kernels.cpp | 21 ++-- reference/test/solver/gmres_kernels.cpp | 39 +++---- reference/test/solver/ir_kernels.cpp | 47 ++++---- test/matrix/matrix.cpp | 2 +- 25 files changed, 675 insertions(+), 343 deletions(-) create mode 100644 core/test/solver/direct.cpp diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index 5c354b11748..16b0af91b74 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -160,4 +160,17 @@ TYPED_TEST(SchwarzFactory, CanBeCleared) } +TYPED_TEST(SchwarzFactory, PassExplicitFactory) +{ + using Jacobi = typename TestFixture::Jacobi; + using Schwarz = typename TestFixture::Schwarz; + auto jacobi_factory = gko::share(Jacobi::build().on(this->exec)); + + auto factory = + Schwarz::build().with_local_solver(jacobi_factory).on(this->exec); + + ASSERT_EQ(factory->get_parameters().local_solver, jacobi_factory); +} + + } // namespace diff --git a/core/test/preconditioner/ic.cpp b/core/test/preconditioner/ic.cpp index 9e1e3f3e3c4..dfcb5e5af3f 100644 --- a/core/test/preconditioner/ic.cpp +++ b/core/test/preconditioner/ic.cpp @@ -33,6 +33,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +#endif +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 5211, 4973, 4974) +#endif + + #include @@ -44,6 +55,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/test/utils.hpp" + + namespace { @@ -95,4 +109,31 @@ TEST_F(IcFactory, CanSetFactorizationFactory) } +TEST_F(IcFactory, DeprecatedFactoryParameter) +{ + auto ilu_factory = ic_prec_type::build() + .with_l_solver_factory(this->l_factory) + .with_factorization_factory(this->fact_factory) + .on(this->exec); + + ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory); + ASSERT_EQ(ilu_factory->get_parameters().factorization_factory, + this->fact_factory); +} + + +TEST_F(IcFactory, DeferredFactoryParameter) +{ + auto ic_factory = ic_prec_type::build() + .with_l_solver(solver_type::build()) + .with_factorization(ic_type::build()) + .on(this->exec); + + GKO_ASSERT_DYNAMIC_TYPE(ic_factory->get_parameters().l_solver_factory, + solver_type::Factory); + GKO_ASSERT_DYNAMIC_TYPE(ic_factory->get_parameters().factorization_factory, + ic_type::Factory); +} + + } // namespace diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp index f25a20b47e3..dec3c8532d2 100644 --- a/core/test/preconditioner/ilu.cpp +++ b/core/test/preconditioner/ilu.cpp @@ -33,6 +33,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +#endif +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 5211, 4973, 4974) +#endif + + #include @@ -44,6 +55,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { @@ -108,4 +122,36 @@ TEST_F(IluFactory, CanSetFactorizationFactory) } +TEST_F(IluFactory, DeprecatedFactoryParameter) +{ + auto ilu_factory = ilu_prec_type::build() + .with_l_solver_factory(this->l_factory) + .with_u_solver_factory(this->u_factory) + .with_factorization_factory(this->fact_factory) + .on(this->exec); + + ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory); + ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory); + ASSERT_EQ(ilu_factory->get_parameters().factorization_factory, + this->fact_factory); +} + + +TEST_F(IluFactory, DeferredFactoryParameter) +{ + auto ilu_factory = ilu_prec_type::build() + .with_l_solver(l_solver_type::build()) + .with_u_solver(u_solver_type::build()) + .with_factorization(ilu_type::build()) + .on(this->exec); + + GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().l_solver_factory, + l_solver_type::Factory); + GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().u_solver_factory, + u_solver_type::Factory); + GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().factorization_factory, + ilu_type::Factory); +} + + } // namespace diff --git a/core/test/solver/CMakeLists.txt b/core/test/solver/CMakeLists.txt index 4ca8763e2ee..f4e6b2e5b7b 100644 --- a/core/test/solver/CMakeLists.txt +++ b/core/test/solver/CMakeLists.txt @@ -2,6 +2,7 @@ ginkgo_create_test(bicg) ginkgo_create_test(bicgstab) ginkgo_create_test(cg) ginkgo_create_test(cgs) +ginkgo_create_test(direct) ginkgo_create_test(fcg) ginkgo_create_test(gcr) ginkgo_create_test(gmres) diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp index 9e49b118484..c13070fad1e 100644 --- a/core/test/solver/bicg.cpp +++ b/core/test/solver/bicg.cpp @@ -164,18 +164,12 @@ TYPED_TEST(Bicg, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto bicg_factory = Solver::build() - .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor( - gko::remove_complex(1e-6)) - .on(this->exec)) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor( + gko::remove_complex(1e-6))) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = bicg_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -291,4 +285,21 @@ TYPED_TEST(Bicg, CanSetPreconditioner) } +TYPED_TEST(Bicg, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index d5b489feff9..b420ccfc49e 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -159,14 +159,9 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto bicgstab_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); @@ -207,15 +202,13 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr bicgstab_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicgstab_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicgstab_precond) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); @@ -234,15 +227,13 @@ TYPED_TEST(Bicgstab, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr bicgstab_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto bicgstab_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(bicgstab_precond) .on(this->exec); @@ -267,15 +258,13 @@ TYPED_TEST(Bicgstab, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr bicgstab_precond = Solver::build() - .with_criteria( - 
gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto bicgstab_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = bicgstab_factory->generate(this->mtx); solver->set_preconditioner(bicgstab_precond); @@ -286,4 +275,21 @@ TYPED_TEST(Bicgstab, CanSetPreconditioner) } +TYPED_TEST(Bicgstab, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp index 5f6076f248c..434544b3ca2 100644 --- a/core/test/solver/cb_gmres.cpp +++ b/core/test/solver/cb_gmres.cpp @@ -223,18 +223,12 @@ TYPED_TEST(CbGmres, CanSetPreconditionerGenerator) using Solver = typename TestFixture::Solver; auto cb_gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on(this->exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(nc_value_type{1e-6}) - .on(this->exec)) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::initial_resnorm) + .with_reduction_factor(nc_value_type{1e-6})) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); auto precond = @@ -256,8 +250,7 @@ TYPED_TEST(CbGmres, CanSetKrylovDim) auto cb_gmres_factory = Solver::build() .with_krylov_dim(new_krylov_dim) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -276,8 +269,7 @@ TYPED_TEST(CbGmres, CanUseSetKrylovDim) const gko::size_type new_krylov_dim{40u}; auto cb_gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -295,15 +287,13 @@ TYPED_TEST(CbGmres, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr cb_gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cb_gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cb_gmres_precond) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); @@ -322,15 +312,13 @@ 
TYPED_TEST(CbGmres, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr cb_gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto cb_gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cb_gmres_precond) .on(this->exec); @@ -343,15 +331,13 @@ TYPED_TEST(CbGmres, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr cb_gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cb_gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cb_gmres_factory->generate(this->mtx); solver->set_preconditioner(cb_gmres_precond); @@ -362,4 +348,21 @@ TYPED_TEST(CbGmres, CanSetPreconditioner) } +TYPED_TEST(CbGmres, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp index d1d7dbee344..f94694e775b 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ -163,18 +163,12 @@ TYPED_TEST(Cg, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto cg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor( - gko::remove_complex(1e-6)) - ) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor( + gko::remove_complex(1e-6))) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = cg_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -193,15 +187,13 @@ TYPED_TEST(Cg, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr cg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cg_precond) .on(this->exec); auto solver = cg_factory->generate(this->mtx); @@ -244,15 +236,13 @@ TYPED_TEST(Cg, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr 
cg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto cg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cg_precond) .on(this->exec); @@ -277,15 +267,13 @@ TYPED_TEST(Cg, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr cg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cg_factory->generate(this->mtx); solver->set_preconditioner(cg_precond); @@ -296,4 +284,21 @@ TYPED_TEST(Cg, CanSetPreconditioner) } +TYPED_TEST(Cg, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index 705e9f850c8..6216899d898 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -163,18 +163,12 @@ TYPED_TEST(Cgs, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto cgs_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor( - gko::remove_complex(1e-6)) - ) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor( + gko::remove_complex(1e-6))) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -217,15 +211,13 @@ TYPED_TEST(Cgs, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr cgs_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cgs_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cgs_precond) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); @@ -244,15 +236,13 @@ TYPED_TEST(Cgs, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr cgs_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) 
.on(this->exec) ->generate(wrong_sized_mtx); auto cgs_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(cgs_precond) .on(this->exec); @@ -277,15 +267,13 @@ TYPED_TEST(Cgs, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr cgs_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto cgs_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = cgs_factory->generate(this->mtx); solver->set_preconditioner(cgs_precond); @@ -296,4 +284,21 @@ TYPED_TEST(Cgs, CanSetPreconditioner) } +TYPED_TEST(Cgs, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp new file mode 100644 index 00000000000..a4110c8c18d --- /dev/null +++ b/core/test/solver/direct.cpp @@ -0,0 +1,105 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class Direct : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Solver = gko::experimental::solver::Direct; + using Lu = gko::experimental::factorization::Lu; + + Direct() + : exec(gko::ReferenceExecutor::create()), + factory(Solver::build().with_factorization(Lu::build()).on(exec)) + {} + + std::shared_ptr exec; + std::unique_ptr factory; +}; + +TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator); + + +TYPED_TEST(Direct, FactoryKnowsItsExecutor) +{ + ASSERT_EQ(this->factory->get_executor(), this->exec); +} + + +TYPED_TEST(Direct, ThrowsOnRectangularMatrixInFactory) +{ + using Mtx = gko::matrix::Csr; + std::shared_ptr rectangular_matrix = + Mtx::create(this->exec, gko::dim<2>{1, 2}, 0); + + ASSERT_THROW(this->factory->generate(rectangular_matrix), + gko::DimensionMismatch); +} + + +TYPED_TEST(Direct, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + using Lu = typename TestFixture::Lu; + auto lu_factory = gko::share(Lu::build().on(this->exec)); + + auto factory = + Solver::build().with_factorization(lu_factory).on(this->exec); + + ASSERT_EQ(factory->get_parameters().factorization, lu_factory); +} + + +} // namespace diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index 4ba3f389ecd..87f27c2bacd 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -162,18 +162,12 @@ TYPED_TEST(Fcg, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto fcg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor( - gko::remove_complex(1e-6)) - ) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor( + gko::remove_complex(1e-6))) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = fcg_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -216,15 +210,13 @@ TYPED_TEST(Fcg, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr fcg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto fcg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(fcg_precond) .on(this->exec); auto solver = fcg_factory->generate(this->mtx); @@ -243,15 +235,13 @@ TYPED_TEST(Fcg, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr fcg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto fcg_factory = Solver::build() - .with_criteria( 
- gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(fcg_precond) .on(this->exec); @@ -276,15 +266,13 @@ TYPED_TEST(Fcg, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr fcg_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto fcg_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = fcg_factory->generate(this->mtx); solver->set_preconditioner(fcg_precond); @@ -295,4 +283,21 @@ TYPED_TEST(Fcg, CanSetPreconditioner) } +TYPED_TEST(Fcg, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index 554d5aa9526..4c08863f09b 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -196,14 +196,9 @@ TYPED_TEST(Gcr, CanSetPreconditionerGenerator) .with_criteria( gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor) - ) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_reduction_factor(TestFixture::reduction_factor)) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -251,8 +246,7 @@ TYPED_TEST(Gcr, CanSetKrylovDim) .with_criteria( gko::stop::Iteration::build().with_max_iters(4u), gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor) - ) + .with_reduction_factor(TestFixture::reduction_factor)) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); @@ -285,15 +279,13 @@ TYPED_TEST(Gcr, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr gcr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gcr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gcr_precond) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); @@ -312,15 +304,13 @@ TYPED_TEST(Gcr, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr gcr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto gcr_factory = 
Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gcr_precond) .on(this->exec); @@ -345,15 +335,13 @@ TYPED_TEST(Gcr, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr gcr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gcr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = gcr_factory->generate(this->mtx); solver->set_preconditioner(gcr_precond); @@ -364,4 +352,21 @@ TYPED_TEST(Gcr, CanSetPreconditioner) } +TYPED_TEST(Gcr, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index c2d62b3bb45..2464bb7273d 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -182,14 +182,9 @@ TYPED_TEST(Gmres, CanSetPreconditionerGenerator) .with_criteria( gko::stop::Iteration::build().with_max_iters(3u), gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor) - ) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_reduction_factor(TestFixture::reduction_factor)) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); auto precond = dynamic_cast*>( @@ -238,8 +233,7 @@ TYPED_TEST(Gmres, CanSetKrylovDim) .with_criteria( gko::stop::Iteration::build().with_max_iters(4u), gko::stop::ResidualNorm::build() - .with_reduction_factor(TestFixture::reduction_factor) - ) + .with_reduction_factor(TestFixture::reduction_factor)) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); auto krylov_dim = solver->get_krylov_dim(); @@ -272,15 +266,13 @@ TYPED_TEST(Gmres, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gmres_precond) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); @@ -299,15 +291,13 @@ TYPED_TEST(Gmres, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) 
.on(this->exec) ->generate(wrong_sized_mtx); auto gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(gmres_precond) .on(this->exec); @@ -332,15 +322,13 @@ TYPED_TEST(Gmres, CanSetPreconditioner) using Solver = typename TestFixture::Solver; std::shared_ptr gmres_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto gmres_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = gmres_factory->generate(this->mtx); solver->set_preconditioner(gmres_precond); @@ -351,4 +339,21 @@ TYPED_TEST(Gmres, CanSetPreconditioner) } +TYPED_TEST(Gmres, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index a93978fa335..5552f6f1c0a 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -161,14 +161,9 @@ TYPED_TEST(Idr, CanSetPreconditionerGenerator) using value_type = typename TestFixture::value_type; auto idr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) - .with_preconditioner( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) + .with_preconditioner(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = idr_factory->generate(this->mtx); @@ -208,15 +203,13 @@ TYPED_TEST(Idr, CanSetPreconditionerInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr idr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto idr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(idr_precond) .on(this->exec); auto solver = idr_factory->generate(this->mtx); @@ -235,15 +228,13 @@ TYPED_TEST(Idr, ThrowsOnWrongPreconditionerInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr idr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto idr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_preconditioner(idr_precond) .on(this->exec); @@ -256,15 +247,13 @@ TYPED_TEST(Idr, CanSetPreconditioner) using Solver = 
typename TestFixture::Solver; std::shared_ptr idr_precond = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto idr_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); solver->set_preconditioner(idr_precond); @@ -282,8 +271,7 @@ TYPED_TEST(Idr, CanSetSubspaceDim) auto idr_factory = Solver::build() .with_subspace_dim(8u) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto subspace_dim = solver->get_subspace_dim(); @@ -319,8 +307,7 @@ TYPED_TEST(Idr, CanSetKappa) auto idr_factory = Solver::build() .with_kappa(real_type{0.05}) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto kappa = solver->get_kappa(); @@ -358,8 +345,7 @@ TYPED_TEST(Idr, CanSetDeterministic) auto idr_factory = Solver::build() .with_deterministic(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto deterministic = solver->get_deterministic(); @@ -395,8 +381,7 @@ TYPED_TEST(Idr, CanSetComplexSubspace) auto idr_factory = Solver::build() .with_complex_subspace(true) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(4u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(this->exec); auto solver = idr_factory->generate(this->mtx); auto complex_subspace = solver->get_complex_subspace(); @@ -425,4 +410,21 @@ TYPED_TEST(Idr, CanSetComplexSubspaceAgain) } +TYPED_TEST(Idr, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto precond_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_preconditioner(precond_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory); +} + + } // namespace diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp index 93ea3e89b10..171c0c92b00 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -162,17 +162,11 @@ TYPED_TEST(Ir, CanSetInnerSolverInFactory) using value_type = typename TestFixture::value_type; auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - ) - .with_solver( - Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .with_solver(Solver::build().with_criteria( + gko::stop::Iteration::build().with_max_iters(3u))) .on(this->exec); auto solver = ir_factory->generate(this->mtx); auto inner_solver = dynamic_cast( @@ -189,15 +183,13 @@ 
TYPED_TEST(Ir, CanSetGeneratedInnerSolverInFactory) using Solver = typename TestFixture::Solver; std::shared_ptr ir_solver = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_solver(ir_solver) .on(this->exec); auto solver = ir_factory->generate(this->mtx); @@ -240,15 +232,13 @@ TYPED_TEST(Ir, ThrowsOnWrongInnerSolverInFactory) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr ir_solver = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_generated_solver(ir_solver) .on(this->exec); @@ -261,15 +251,13 @@ TYPED_TEST(Ir, CanSetInnerSolver) using Solver = typename TestFixture::Solver; std::shared_ptr ir_solver = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(this->mtx); auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = ir_factory->generate(this->mtx); solver->set_solver(ir_solver); @@ -289,9 +277,7 @@ TYPED_TEST(Ir, CanSetApplyWithInitialGuessMode) initial_guess_mode::zero}) { auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u).on( - this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .with_default_initial_guess(guess) .on(this->exec); auto solver = ir_factory->generate(this->mtx); @@ -310,15 +296,13 @@ TYPED_TEST(Ir, ThrowOnWrongInnerSolverSet) Mtx::create(this->exec, gko::dim<2>{2, 2}); std::shared_ptr ir_solver = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec) ->generate(wrong_sized_mtx); auto ir_factory = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto solver = ir_factory->generate(this->mtx); @@ -345,11 +329,9 @@ TYPED_TEST(Ir, DefaultRelaxationFactor) auto richardson = gko::solver::Richardson::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .on(this->exec) ->generate(this->mtx); @@ -364,11 +346,9 @@ TYPED_TEST(Ir, UseAsRichardson) auto richardson = gko::solver::Richardson::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - ) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .with_relaxation_factor(relaxation_factor) 
.on(this->exec) ->generate(this->mtx); @@ -495,4 +475,21 @@ TYPED_TEST(Ir, RunResidualNormCheckCorrectTimes) } +TYPED_TEST(Ir, PassExplicitFactory) +{ + using Solver = typename TestFixture::Solver; + auto stop_factory = gko::share( + gko::stop::Iteration::build().with_max_iters(1u).on(this->exec)); + auto inner_solver_factory = gko::share(Solver::build().on(this->exec)); + + auto factory = Solver::build() + .with_criteria(stop_factory) + .with_solver(inner_solver_factory) + .on(this->exec); + + ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory); + ASSERT_EQ(factory->get_parameters().solver, inner_solver_factory); +} + + } // namespace diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 9f7bddb633c..bab6bcaf863 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -902,4 +902,32 @@ TYPED_TEST(Multigrid, CustomCoarsestSolverSelector) } +TYPED_TEST(Multigrid, DeferredFactoryParameter) +{ + using Solver = typename TestFixture::Solver; + using DummyRPFactory = typename TestFixture::DummyRPFactory; + using DummyFactory = typename TestFixture::DummyFactory; + + auto solver = Solver::build() + .with_mg_level(DummyRPFactory::build()) + .with_pre_smoother(DummyFactory::build()) + .with_mid_smoother(DummyFactory::build()) + .with_post_smoother(DummyFactory::build()) + .with_criteria(gko::stop::Iteration::build()) + .with_coarsest_solver(DummyFactory::build()) + .on(this->exec); + + GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().mg_level[0], + typename DummyRPFactory::Factory); + GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().pre_smoother[0], + typename DummyFactory::Factory); + GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().mid_smoother[0], + typename DummyFactory::Factory); + GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().post_smoother[0], + typename DummyFactory::Factory); + GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().coarsest_solver[0], + typename DummyFactory::Factory); +} + + } // namespace diff --git a/core/test/solver/workspace.cpp b/core/test/solver/workspace.cpp index ffbab815dc6..3dc53fb6abe 100644 --- a/core/test/solver/workspace.cpp +++ b/core/test/solver/workspace.cpp @@ -256,8 +256,8 @@ TEST_F(Workspace, CanCreateOperators) ASSERT_EQ(op2->get_size(), size2); ASSERT_EQ(op1->get_stride(), stride1); ASSERT_EQ(op2->get_stride(), stride2); - ASSERT_EQ(typeid(*op1), typeid(DummyLinOp)); - ASSERT_EQ(typeid(*op2), typeid(DummyLinOp2)); + GKO_ASSERT_DYNAMIC_TYPE(op1, DummyLinOp); + GKO_ASSERT_DYNAMIC_TYPE(op2, DummyLinOp2); ASSERT_EQ(op1, ws.get_op(1)); ASSERT_EQ(op2, ws.get_op(0)); } @@ -288,7 +288,7 @@ TEST_F(Workspace, ChecksExactOperatorType) 0, [&] { return std::make_unique(exec); }, typeid(DerivedDummyLinOp), {}, 0); - ASSERT_EQ(typeid(*op1), typeid(DerivedDummyLinOp)); + GKO_ASSERT_DYNAMIC_TYPE(op1, DerivedDummyLinOp); } diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 153907cf2cf..a0f700e629e 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -53,10 +53,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
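To illustrate the deferred factory parameters exercised by the IR and Multigrid tests above: stopping criteria, inner solvers and smoothers can now be passed as builders, and only the outermost factory needs an executor. A minimal usage sketch, not part of the patch; the solver choice, matrix A and iteration count are illustrative only:

    #include <ginkgo/ginkgo.hpp>

    // Sketch: criteria and the inner solver are deferred builders; the executor
    // attached by the outer .on(exec) is reused when the factory is generated.
    std::unique_ptr<gko::solver::Ir<double>> make_ir_solver(
        std::shared_ptr<const gko::Executor> exec,
        std::shared_ptr<gko::matrix::Csr<double>> A)
    {
        auto factory =
            gko::solver::Ir<double>::build()
                .with_criteria(gko::stop::Iteration::build().with_max_iters(10u))
                .with_solver(gko::solver::Cg<double>::build())
                .on(exec);
        return factory->generate(A);
    }

    // Pre-built factories can still be passed explicitly, as in the
    // PassExplicitFactory test above:
    //     auto stop = gko::share(
    //         gko::stop::Iteration::build().with_max_iters(1u).on(exec));
    //     auto factory =
    //         gko::solver::Ir<double>::build().with_criteria(stop).on(exec);
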
#include #include #include +#include #include "core/base/batch_utilities.hpp" #include "core/base/extended_float.hpp" +#include "ginkgo/core/base/name_demangling.hpp" namespace gko { @@ -1010,6 +1012,45 @@ ::testing::AssertionResult matrices_equal_sparsity( } +template +::testing::AssertionResult dynamic_type_eq(const std::string& expr1, + const std::string& expr2, + const Ptr1& ptr1, const Ptr2& ptr2) +{ + auto& ref1 = *ptr1; + auto& ref2 = *ptr2; + if (typeid(ref1) == typeid(ref2)) { + return ::testing::AssertionSuccess(); + } else { + return ::testing::AssertionFailure() + << "mismatching dynamic types\n" + << expr1 << " is\n\t" + << gko::name_demangling::get_type_name(typeid(ref1)) << "\n" + << expr2 << " is\n\t" + << gko::name_demangling::get_type_name(typeid(ref2)) << "\n"; + } +} + + +template +::testing::AssertionResult dynamic_type_is(const std::string& expr, + const std::string&, const Ptr& ptr, + const std::type_info& type) +{ + auto& ref = *ptr; + if (typeid(ref) == type) { + return ::testing::AssertionSuccess(); + } else { + return ::testing::AssertionFailure() + << "unexpected dynamic type\n" + << expr << " is\n\t" + << gko::name_demangling::get_type_name(typeid(ref)) << "\n" + << "but we expected\n\t" + << gko::name_demangling::get_type_name(type) << "\n"; + } +} + + namespace detail { @@ -1249,4 +1290,51 @@ T* plain_ptr(T* ptr) } +/** + * Checks if the dynamic types of the objects referenced by two pointers are + * equal. + * + * @param _ptr1 the first pointer + * @param _ptr2 the second pointer + */ +#define GKO_ASSERT_DYNAMIC_TYPE_EQ(_ptr1, _ptr2) \ + { \ + ASSERT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_eq, _ptr1, \ + _ptr2); \ + } + + +/** + * @copydoc GKO_ASSERT_DYNAMIC_TYPE_EQ + */ +#define GKO_EXPECT_DYNAMIC_TYPE_EQ(_ptr1, _ptr2) \ + { \ + EXPECT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_eq, _ptr1, \ + _ptr2); \ + } + + +/** + * Checks if the dynamic type of a pointer to an object matches a given type + * + * @param _ptr the pointer + * @param _type the expected type + */ +#define GKO_ASSERT_DYNAMIC_TYPE(_ptr, _type) \ + { \ + ASSERT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_is, _ptr, \ + typeid(_type)); \ + } + + +/** + * @copydoc GKO_ASSERT_DYNAMIC_TYPE + */ +#define GKO_EXPECT_DYNAMIC_TYPE(_ptr, _type) \ + { \ + EXPECT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_is, _ptr, \ + typeid(_type)); \ + } + + #endif // GKO_CORE_TEST_UTILS_ASSERTIONS_HPP_ diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index cb06ac6cc86..c8f741114d2 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -417,8 +417,7 @@ int main(int argc, char* argv[]) .with_criteria( gko::stop::ResidualNorm::build() .with_reduction_factor(reduction_factor), - gko::stop::Iteration::build().with_max_iters(max_iters).on( - exec)) + gko::stop::Iteration::build().with_max_iters(max_iters)) .with_preconditioner(preconditioner::create(exec)) .on(exec); diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index d56201ade02..d0265e462f2 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
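The dynamic type assertions defined above replace handwritten typeid comparisons and print demangled type names on failure. A minimal sketch of their use, assuming it is compiled inside Ginkgo's own test tree where this header is available; the test name and operands are illustrative:

    #include "core/test/utils/assertions.hpp"

    #include <gtest/gtest.h>

    #include <ginkgo/ginkgo.hpp>

    TEST(DynamicTypeAssertions, Sketch)
    {
        auto exec = gko::ReferenceExecutor::create();
        std::unique_ptr<gko::LinOp> a = gko::matrix::Dense<double>::create(exec);
        std::unique_ptr<gko::LinOp> b = gko::matrix::Dense<double>::create(exec);

        // the pointee of a has the expected dynamic type
        GKO_ASSERT_DYNAMIC_TYPE(a, gko::matrix::Dense<double>);
        // the pointees of a and b have the same dynamic type
        GKO_EXPECT_DYNAMIC_TYPE_EQ(a, b);
    }
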
#include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" namespace { @@ -810,7 +811,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual); auto first_strategy = this->mtx2->get_strategy(); auto second_strategy = res->get_strategy(); - ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy)); + GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy); } @@ -835,7 +836,7 @@ TYPED_TEST(Csr, MovesToPrecision) GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual); auto first_strategy = this->mtx2->get_strategy(); auto second_strategy = res->get_strategy(); - ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy)); + GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy); } diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index e5b933ad82c..60d2a32b9ee 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -266,16 +266,12 @@ TYPED_TEST(CbGmres, SolvesStencilSystem2) auto factory = gmres_type::build() .with_storage_precision(this->storage_prec) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::Time::build() - .with_time_limit(std::chrono::seconds(6)) - .on(this->exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(this->reduction_factor()) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::Time::build().with_time_limit( + std::chrono::seconds(6)), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::initial_resnorm) + .with_reduction_factor(this->reduction_factor())) .on(this->exec); auto solver = factory->generate(this->mtx2); auto b = gko::initialize({33.0, 20.0, 20.0}, this->exec); @@ -521,13 +517,10 @@ TYPED_TEST(CbGmres, SolvesBigDenseSystem1WithRestart) gmres_type::build() .with_krylov_dim(4u) .with_storage_precision(this->storage_prec) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(200u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(this->reduction_factor()) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(200u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::initial_resnorm) + .with_reduction_factor(this->reduction_factor())) .on(this->exec); auto solver = cb_gmres_factory_restart->generate(this->mtx_medium); auto b = gko::initialize( @@ -549,17 +542,13 @@ TYPED_TEST(CbGmres, SolvesWithPreconditioner) auto cb_gmres_factory_preconditioner = gmres_type::build() .with_storage_precision(this->storage_prec) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_baseline(gko::stop::mode::initial_resnorm) - .with_reduction_factor(this->reduction_factor()) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::initial_resnorm) + .with_reduction_factor(this->reduction_factor())) .with_preconditioner( gko::preconditioner::Jacobi::build() - .with_max_block_size(3u) - .on(this->exec)) + .with_max_block_size(3u)) .on(this->exec); auto solver = cb_gmres_factory_preconditioner->generate(this->mtx_big); auto b = gko::initialize( diff --git 
a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index adf5c35fd1d..8943a131d2b 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -574,12 +574,9 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart) auto gcr_factory_restart = Solver::build() .with_krylov_dim(4u) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(200u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(200u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .on(this->exec); auto solver = gcr_factory_restart->generate(this->mtx_medium); auto b = gko::initialize( @@ -600,16 +597,12 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner) using value_type = typename TestFixture::value_type; auto gcr_factory_preconditioner = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .with_preconditioner( gko::preconditioner::Jacobi::build() - .with_max_block_size(3u) - .on(this->exec)) + .with_max_block_size(3u)) .on(this->exec); auto solver = gcr_factory_preconditioner->generate(this->mtx_big); auto b = gko::initialize( diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index a99400e412b..c718d60343b 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -373,15 +373,15 @@ TYPED_TEST(Gmres, KernelMultiAxpy) this->small_final_iter_nums.get_data()[1] = restart; this->small_krylov_bases = gko::initialize( // restart+1 x rows x #rhs { - I{1, 10}, // 0, 0, x - I{2, 11}, // 0, 1, x - I{3, 12}, // 0, 2, x - I{4, 13}, // 1, 0, x - I{5, 14}, // 1, 1, x - I{6, 15}, // 1, 2, x - I{nan, nan}, // 2, 0, x - I{nan, nan}, // 2, 1, x - I{nan, nan}, // 2, 2, x + I{1, 10}, // 0, 0, x + I{2, 11}, // 0, 1, x + I{3, 12}, // 0, 2, x + I{4, 13}, // 1, 0, x + I{5, 14}, // 1, 1, x + I{6, 15}, // 1, 2, x + I{nan, nan}, // 2, 0, x + I{nan, nan}, // 2, 1, x + I{nan, nan}, // 2, 2, x }, this->exec); this->small_stop.get_data()[0].stop(7, false); @@ -718,12 +718,9 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) auto gmres_factory_restart = Solver::build() .with_krylov_dim(4u) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(200u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(200u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .on(this->exec); auto solver = gmres_factory_restart->generate(this->mtx_medium); auto b = gko::initialize( @@ -744,16 +741,12 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner) using value_type = typename TestFixture::value_type; auto gmres_factory_preconditioner = Solver::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) .with_preconditioner( gko::preconditioner::Jacobi::build() - .with_max_block_size(3u) - 
.on(this->exec)) + .with_max_block_size(3u)) .on(this->exec); auto solver = gmres_factory_preconditioner->generate(this->mtx_big); auto b = gko::initialize( diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index 4fae1bfdac8..fc6154f3366 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -184,11 +184,9 @@ TYPED_TEST(Ir, SolvesTriangularSystemWithIterativeInnerSolver) auto solver_factory = gko::solver::Ir::build() - .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( - this->exec), + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u), gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_reduction_factor(r::value)) .with_solver(inner_solver_factory) .on(this->exec); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -351,16 +349,15 @@ TYPED_TEST(Ir, RichardsonSolvesTriangularSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; - auto solver = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) - .with_relaxation_factor(value_type{0.9}) - .on(this->exec) - ->generate(this->mtx); + auto solver = + gko::solver::Ir::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value) + .on(this->exec)) + .with_relaxation_factor(value_type{0.9}) + .on(this->exec) + ->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -383,12 +380,10 @@ TYPED_TEST(Ir, RichardsonSolvesTriangularSystemWithIterativeInnerSolver) .on(this->exec)); auto solver_factory = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(100u).on( - this->exec), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value) - .on(this->exec)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value) + .on(this->exec)) .with_relaxation_factor(value_type{0.9}) .with_solver(inner_solver_factory) .on(this->exec); @@ -407,8 +402,7 @@ TYPED_TEST(Ir, RichardsonTransposedSolvesTriangularSystem) using value_type = typename TestFixture::value_type; auto solver = gko::solver::Ir::build() - .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( - this->exec), + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u), gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) .on(this->exec)) @@ -430,8 +424,7 @@ TYPED_TEST(Ir, RichardsonConjTransposedSolvesTriangularSystem) using value_type = typename TestFixture::value_type; auto solver = gko::solver::Ir::build() - .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on( - this->exec), + .with_criteria(gko::stop::Iteration::build().with_max_iters(30u), gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) .on(this->exec)) @@ -454,8 +447,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef) using initial_guess_mode = gko::solver::initial_guess_mode; auto ref_solver = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .on(this->exec) ->generate(this->mtx); auto b = 
gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -463,8 +455,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef) initial_guess_mode::zero}) { auto solver = gko::solver::Ir::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .with_default_initial_guess(guess) .on(this->exec) ->generate(this->mtx); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 9192b2eeebe..9b78ae21d6c 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -155,7 +155,7 @@ struct CsrWithDefaultStrategy : CsrBase { CsrBase::assert_empty_state(mtx); auto first_strategy = mtx->create_default()->get_strategy(); auto second_strategy = mtx->get_strategy(); - ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy)); + GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy); } }; From a2649159787e6e8814f097120cd203b61da3fbec Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Fri, 6 Oct 2023 16:10:45 +0000 Subject: [PATCH 318/583] Format files Co-authored-by: Tobias Ribizel --- core/test/log/papi.cpp | 6 ++---- core/test/utils/assertions.hpp | 4 ++-- reference/test/preconditioner/ilu.cpp | 6 ++---- reference/test/solver/gmres_kernels.cpp | 18 +++++++++--------- reference/test/solver/multigrid_kernels.cpp | 9 +++------ 5 files changed, 18 insertions(+), 25 deletions(-) diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index b4e51cdc31b..0928f35d1ba 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -471,8 +471,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateStarted) { auto factory = gko::solver::Bicgstab::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); auto str = this->init(gko::log::Logger::linop_factory_generate_started_mask, "linop_factory_generate_started", factory.get()); @@ -492,8 +491,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateCompleted) { auto factory = gko::solver::Bicgstab::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(3u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(this->exec); TypeParam dummy; auto str = diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index a0f700e629e..d723d5a8964 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -52,13 +53,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
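The same deferral applies to nested preconditioner factories, as in the CB-GMRES, GCR and GMRES tests above: the Jacobi builder no longer carries its own .on(exec). A short sketch, again illustrative rather than part of the patch; exec is an executor created elsewhere:

    auto gmres_factory =
        gko::solver::Gmres<double>::build()
            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
                           gko::stop::ResidualNorm<double>::build()
                               .with_reduction_factor(1e-12))
            .with_preconditioner(gko::preconditioner::Jacobi<double>::build()
                                     .with_max_block_size(3u))
            .on(exec);
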
#include #include #include +#include #include -#include #include "core/base/batch_utilities.hpp" #include "core/base/extended_float.hpp" -#include "ginkgo/core/base/name_demangling.hpp" namespace gko { diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 22c9929219e..5150626c898 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -614,8 +614,7 @@ TEST_F(DefaultIlu, CanBeUsedAsPreconditioner) { auto solver = gko::solver::Bicgstab<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_preconditioner(default_ilu_prec_type::build()) .on(this->exec) ->generate(this->mtx); @@ -635,8 +634,7 @@ TEST_F(DefaultIlu, CanBeUsedAsGeneratedPreconditioner) default_ilu_prec_type::build().on(this->exec)->generate(this->mtx); auto solver = gko::solver::Bicgstab<>::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters(2u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(2u)) .with_generated_preconditioner(precond) .on(this->exec) ->generate(this->mtx); diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index c718d60343b..4c651e7917b 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -373,15 +373,15 @@ TYPED_TEST(Gmres, KernelMultiAxpy) this->small_final_iter_nums.get_data()[1] = restart; this->small_krylov_bases = gko::initialize( // restart+1 x rows x #rhs { - I{1, 10}, // 0, 0, x - I{2, 11}, // 0, 1, x - I{3, 12}, // 0, 2, x - I{4, 13}, // 1, 0, x - I{5, 14}, // 1, 1, x - I{6, 15}, // 1, 2, x - I{nan, nan}, // 2, 0, x - I{nan, nan}, // 2, 1, x - I{nan, nan}, // 2, 2, x + I{1, 10}, // 0, 0, x + I{2, 11}, // 0, 1, x + I{3, 12}, // 0, 2, x + I{4, 13}, // 1, 0, x + I{5, 14}, // 1, 1, x + I{6, 15}, // 1, 2, x + I{nan, nan}, // 2, 0, x + I{nan, nan}, // 2, 1, x + I{nan, nan}, // 2, 2, x }, this->exec); this->small_stop.get_data()[0].stop(7, false); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 86be56ce3cb..c35db0b1427 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -405,8 +405,7 @@ class Multigrid : public ::testing::Test { gko::matrix::IdentityFactory::create(exec)) .with_post_uses_pre(false) .with_mid_case(mid_case) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(this->exec)); @@ -426,8 +425,7 @@ class Multigrid : public ::testing::Test { .with_coarsest_solver(this->lo_factory) .with_post_uses_pre(true) .with_mid_case(mid_case) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .with_cycle(cycle) .with_min_coarse_rows(1u) .on(this->exec)); @@ -1263,8 +1261,7 @@ TYPED_TEST(Multigrid, ZeroGuessIgnoresInput) .with_coarsest_solver(this->coarsest_factory) .with_max_levels(2u) .with_mg_level(this->coarse_factory) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(1u)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) .with_min_coarse_rows(1u); auto normal_mg = common_part .with_default_initial_guess( From 01c196f2528fa69dbb65be7ceea3cfbae1cdd13b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 12 Jul 2023 16:28:48 +0000 
Subject: [PATCH 319/583] use advanced memory ordering instructions in CUDA --- .../{volatile.hpp.inc => memory.hpp.inc} | 0 common/cuda_hip/components/syncfree.hpp.inc | 43 +- cuda/components/memory.cuh | 789 ++++++++++++++++++ cuda/components/syncfree.cuh | 2 +- cuda/components/volatile.cuh | 58 -- cuda/solver/common_trs_kernels.cuh | 33 +- dev_tools/scripts/generate_cuda_memory_ptx.py | 192 +++++ .../{volatile.hip.hpp => memory.hip.hpp} | 70 +- hip/components/syncfree.hip.hpp | 2 +- 9 files changed, 1079 insertions(+), 110 deletions(-) rename common/cuda_hip/components/{volatile.hpp.inc => memory.hpp.inc} (100%) create mode 100644 cuda/components/memory.cuh delete mode 100644 cuda/components/volatile.cuh create mode 100755 dev_tools/scripts/generate_cuda_memory_ptx.py rename hip/components/{volatile.hip.hpp => memory.hip.hpp} (55%) diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/memory.hpp.inc similarity index 100% rename from common/cuda_hip/components/volatile.hpp.inc rename to common/cuda_hip/components/memory.hpp.inc diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc index 6b6dcc70f24..113c66d91ec 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp.inc @@ -93,48 +93,31 @@ public: const auto dep_block = dependency / (block_size / subwarp_size); const auto dep_local = dependency % (block_size / subwarp_size); // assert(dependency < work_id); - if (dep_block == block_id) { - // wait for a local dependency - while (!load(local.status, dep_local)) { - __threadfence(); - } - } else { - // wait for a global dependency - while (!load(global.status, dependency)) { - __threadfence(); + if (get_lane() == 0) { + if (dep_block == block_id) { + // wait for a local dependency + while (!load_acquire_shared(local.status + dep_local)) { + } + } else { + // wait for a global dependency + while (!load_acquire(global.status + dependency)) { + } } } - __threadfence(); + group::tiled_partition(group::this_thread_block()).sync(); } - __device__ __forceinline__ bool peek(IndexType dependency) - { - const auto dep_block = dependency / (block_size / subwarp_size); - const auto dep_local = dependency % (block_size / subwarp_size); - // assert(dependency < work_id); - if (dep_block == block_id) { - // peek at a local dependency - auto finished = load(local.status, dep_local) != 0; - __threadfence(); - return finished; - } else { - // peek at a global dependency - auto finished = load(global.status, dependency); - __threadfence(); - return finished; - } - } + __device__ __forceinline__ bool peek(IndexType dependency) { return false; } __device__ __forceinline__ void mark_ready() { group::tiled_partition(group::this_thread_block()).sync(); - __threadfence(); if (get_lane() == 0) { const auto sh_id = get_work_id() % (block_size / subwarp_size); // notify local warps - store(local.status, sh_id, 1); + store_release_shared(local.status + sh_id, 1); // notify other blocks - store(global.status, get_work_id(), 1); + store_release(global.status + get_work_id(), 1); } } diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh new file mode 100644 index 00000000000..578f7c8309f --- /dev/null +++ b/cuda/components/memory.cuh @@ -0,0 +1,789 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. 
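The syncfree scheduler change above replaces volatile polling plus __threadfence() with a single lane spinning on an acquire load and publishing completion with a release store. A reduced sketch of that handshake using the new wrappers, assuming compilation inside Ginkgo's CUDA backend; the kernel and variable names are illustrative:

    #include "cuda/components/memory.cuh"

    // One flag per work item, zero-initialized. The consumer spins until its
    // dependency is published; the producer publishes after finishing its work.
    __global__ void handshake_sketch(gko::int32* status, int my_work,
                                     int dependency)
    {
        using namespace gko::kernels::cuda;
        // consumer: the acquire load orders all following reads after the
        // producer's writes
        while (!load_acquire(status + dependency)) {
        }
        // ... consume results written by the dependency ...

        // producer: the release store makes preceding writes visible before
        // the flag can be observed
        store_release(status + my_work, 1);
    }
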
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_ +#define GKO_CUDA_COMPONENTS_MEMORY_CUH_ + + +#include + + +#include + + +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) +{ +// see +// https://github.com/NVIDIA/cutlass/blob/ +// 6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/ +// include/cutlass/arch/memory_sm75.h#L90 +// for reasoning behind this implementation +#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11) + return static_cast(__cvta_generic_to_shared(ptr)); +#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \ + __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); +#else + uint32 smem_ptr; + asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 " + "%0, smem_ptr; }}" + : "=r"(smem_ptr) + : "l"(ptr)); + return smem_ptr; +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.gl;" ::: "memory"); +#else + asm volatile("fence.acq_rel.gpu;" ::: "memory"); +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel_shared() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.cta;" ::: "memory"); +#else + asm volatile("fence.acq_rel.cta;" ::: "memory"); +#endif +} + + +#include "common/cuda_hip/components/memory.hpp.inc" + + +__device__ __forceinline__ int32 load_relaxed_shared(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#else + asm 
volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_relaxed_shared(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_relaxed_shared(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_relaxed_shared(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_acquire_shared(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_acquire_shared(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.b64 %0, [%1];" + : 
"=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_acquire_shared(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(float* ptr, float result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_acquire_shared(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(double* ptr, double result) +{ + membar_acq_rel_shared(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#else + asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_relaxed(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_relaxed(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.b64 [%0], 
%1;" ::"l"(ptr), "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_relaxed(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(float* ptr, float result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_relaxed(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#endif + + return result; +} + + +__device__ __forceinline__ void store_relaxed(double* ptr, double result) +{ +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int32 load_acquire(int32* ptr) +{ + int32 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.b32 %0, [%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(int32* ptr, int32 result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#else + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ int64 load_acquire(int64* ptr) +{ + int64 result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.b64 %0, [%1];" + : "=l"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(int64* ptr, int64 result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#else + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ float load_acquire(float* ptr) +{ + float result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.f32 %0, [%1];" + : "=f"(result) + : "l"(ptr) + : "memory"); +#endif + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(float* ptr, float result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#else + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ double load_acquire(double* ptr) +{ + double result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.acquire.gpu.f64 %0, [%1];" + : "=d"(result) + : "l"(ptr) + : "memory"); +#endif + 
membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(double* ptr, double result) +{ + membar_acq_rel(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#else + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed_shared( + thrust::complex* ptr) +{ + float real_result; + float imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex* ptr, thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed_shared( + thrust::complex* ptr) +{ + double real_result; + double imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#else + asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex* ptr, thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(real_result), "d"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( + convert_generic_ptr_to_smem_ptr(ptr)), + "d"(real_result), "d"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed( + thrust::complex* ptr) +{ + float real_result; + float imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" + : "=f"(real_result), "=f"(imag_result) + : "l"(ptr) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex* ptr, + thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +#endif +} + + +__device__ __forceinline__ thrust::complex load_relaxed( + 
thrust::complex* ptr) +{ + double real_result; + double imag_result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "l"(ptr) + : "memory"); +#else + asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" + : "=d"(real_result), "=d"(imag_result) + : "l"(ptr) + : "memory"); +#endif + return thrust::complex{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex* ptr, + thrust::complex result) +{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + "d"(real_result), "d"(imag_result) + : "memory"); +#endif +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh index 625f1bd8359..d00064b06b7 100644 --- a/cuda/components/syncfree.cuh +++ b/cuda/components/syncfree.cuh @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/volatile.cuh" +#include "cuda/components/memory.cuh" namespace gko { diff --git a/cuda/components/volatile.cuh b/cuda/components/volatile.cuh deleted file mode 100644 index 96cb869c57e..00000000000 --- a/cuda/components/volatile.cuh +++ /dev/null @@ -1,58 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
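In C++ memory model terms, the relaxed, acquire and release wrappers above correspond to atomic accesses with the matching std::memory_order. A host-side analogy, purely illustrative and not part of the patch:

    #include <atomic>

    std::atomic<int> flag{0};
    int payload = 0;

    void producer()
    {
        payload = 42;                              // plain write
        flag.store(1, std::memory_order_release);  // ~ store_release
    }

    void consumer()
    {
        while (!flag.load(std::memory_order_acquire)) {  // ~ load_acquire
        }
        // the acquire/release pair guarantees payload reads 42 here
    }
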
-*************************************************************/ - -#ifndef GKO_CUDA_COMPONENTS_VOLATILE_CUH_ -#define GKO_CUDA_COMPONENTS_VOLATILE_CUH_ - - -#include - - -#include - - -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/volatile.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - -#endif // GKO_CUDA_COMPONENTS_VOLATILE_CUH_ diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 6ee2c7521ff..546b366c6a2 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -55,9 +55,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" +#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/volatile.cuh" namespace gko { @@ -426,30 +426,31 @@ __global__ void sptrsv_naive_caching_kernel( : dependency * nrhs + rhs; const bool shmem_possible = (dependency_gid / default_block_size) == self_shmem_id; + ValueType val{}; if (shmem_possible) { const auto dependency_shid = dependency_gid % default_block_size; - x_p = &x_s[dependency_shid]; - } - - ValueType x = *x_p; - while (is_nan(x)) { - x = load(x_p, 0); + while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) { + } + } else { + while ( + is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) { + } } - sum += x * vals[i]; + sum += val * vals[i]; } // The first entry past the triangular part will be the diagonal const auto diag = unit_diag ? one() : vals[i]; const auto r = (b[row * b_stride + rhs] - sum) / diag; - store(x_s, self_shid, r); - x[row * x_stride + rhs] = r; + store_relaxed_shared(x_s + self_shid, r); + store_relaxed(x + row * x_stride + rhs, r); // This check to ensure no infinite loops happen. if (is_nan(r)) { - store(x_s, self_shid, zero()); - x[row * x_stride + rhs] = zero(); + store_relaxed(x_s + self_shid, zero()); + store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } } @@ -488,12 +489,12 @@ __global__ void sptrsv_naive_legacy_kernel( auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { - auto x_val = load(x, col * x_stride + rhs); + auto x_val = load_relaxed(x + col * x_stride + rhs); while (!is_nan(x_val)) { sum += vals[j] * x_val; j += row_step; col = colidxs[j]; - x_val = load(x, col * x_stride + rhs); + x_val = load_relaxed(x + col * x_stride + rhs); } // to avoid the kernel hanging on matrices without diagonal, // we bail out if we are past the triangle, even if it's not @@ -503,12 +504,12 @@ __global__ void sptrsv_naive_legacy_kernel( // assert(row == col); auto diag = unit_diag ? 
one() : vals[j]; const auto r = (b[row * b_stride + rhs] - sum) / diag; - store(x, row * x_stride + rhs, r); + store_relaxed(x + row * x_stride + rhs, r); // after we encountered the diagonal, we are done // this also skips entries outside the triangle j = row_end; if (is_nan(r)) { - store(x, row * x_stride + rhs, zero()); + store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py new file mode 100755 index 00000000000..a03cb47f4e7 --- /dev/null +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +import os +memory_spaces = [(".shared", ".cta", "_shared", "convert_generic_ptr_to_smem_ptr(ptr)", "r"), ("", ".gpu", "", "ptr", "l")] +memory_orderings = [ + (".relaxed", "_relaxed", ".relaxed", "_relaxed", True), + (".acquire", "_acquire", ".release", "_release", False) + ] +sizes=[(".b32", "r", "int32", 4), (".b64", "l", "int64", 8), (".f32", "f", "float", 4), (".f64", "d", "double", 8)] +# header +print("""/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_ +#define GKO_CUDA_COMPONENTS_MEMORY_CUH_ + + +#include + + +#include + + +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) +{ +// see +// https://github.com/NVIDIA/cutlass/blob/ +// 6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/ +// include/cutlass/arch/memory_sm75.h#L90 +// for reasoning behind this implementation +#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11) + return static_cast(__cvta_generic_to_shared(ptr)); +#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \ + __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); +#else + uint32 smem_ptr; + asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 " + "%0, smem_ptr; }}" + : "=r"(smem_ptr) + : "l"(ptr)); + return smem_ptr; +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.gl;" ::: "memory"); +#else + asm volatile("fence.acq_rel.gpu;" ::: "memory"); +#endif +} + + +__device__ __forceinline__ uint32 membar_acq_rel_shared() +{ +#if __CUDA_ARCH__ < 700 + asm volatile("membar.cta;" ::: "memory"); +#else + asm volatile("fence.acq_rel.cta;" ::: "memory"); +#endif +} + + +#include "common/cuda_hip/components/memory.hpp.inc" +""") + +# relaxed +for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: + for volta_load_ordering_suffix, load_function_ordering_suffix, volta_store_ordering_suffix, store_function_ordering_suffix, is_relaxed in memory_orderings: + for size_suffix, constraint, typename, size in sizes: + membar_expression = "" if is_relaxed else f"membar_acq_rel{function_memory_space_suffix}();" + print(f""" +__device__ __forceinline__ {typename} load{load_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr) +{{ + {typename} result; +#if __CUDA_ARCH__ < 700 + asm volatile("ld.volatile{memory_space_suffix}{size_suffix} %0, [%1];" + : "={constraint}"(result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#else + asm volatile("ld{volta_load_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} %0, [%1];" + : "={constraint}"(result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#endif + {membar_expression} + return result; +}} + + +__device__ __forceinline__ void store{store_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr, {typename} result) +{{ + {membar_expression} +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile{memory_space_suffix}{size_suffix} [%0], %1;" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + : "memory"); +#else + asm volatile("st{volta_store_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} [%0], %1;" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + : "memory"); +#endif +}} +""") + +# vectorized relaxed loads for thrust::complex +sizes=[(".f32", "f", "float", 4), (".f64", "d", "double", 8)] +for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: + for size_suffix, constraint, typename, size in sizes: + print(f""" +__device__ __forceinline__ thrust::complex<{typename}> load_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr) +{{ + {typename} real_result; + {typename} imag_result; +#if __CUDA_ARCH__ < 700 + asm 
volatile("ld.volatile{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" + : "={constraint}"(real_result), "={constraint}"(imag_result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#else + asm volatile("ld.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" + : "={constraint}"(real_result), "={constraint}"(imag_result) + : "{ptr_constraint}"({ptr_name}) + : "memory"); +#endif + return thrust::complex<{typename}>{{real_result, imag_result}}; +}} + + +__device__ __forceinline__ void store_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr, thrust::complex<{typename}> result) +{{ + auto real_result = result.real(); + auto imag_result = result.imag(); +#if __CUDA_ARCH__ < 700 + asm volatile("st.volatile{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + : "memory"); +#else + asm volatile("st.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" + :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + : "memory"); +#endif +}} +""") + +print(""" +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ +""") \ No newline at end of file diff --git a/hip/components/volatile.hip.hpp b/hip/components/memory.hip.hpp similarity index 55% rename from hip/components/volatile.hip.hpp rename to hip/components/memory.hip.hpp index de0202d8391..b424c8bbc06 100644 --- a/hip/components/volatile.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ -#define GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ +#ifndef GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ +#define GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ #include @@ -48,11 +48,73 @@ namespace kernels { namespace hip { -#include "common/cuda_hip/components/volatile.hpp.inc" +#include "common/cuda_hip/components/memory.hpp.inc" + + +template +__device__ __forceinline__ ValueType load_relaxed(ValueType* ptr) +{ + return load(ptr, 0); +} + + +template +__device__ __forceinline__ ValueType load_acquire(ValueType* ptr) +{ + auto result = load(ptr, 0); + __threadfence(); + return result; +} + +template +__device__ __forceinline__ void store_relaxed(ValueType* ptr, ValueType value) +{ + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ void store_release(ValueType* ptr, ValueType value) +{ + __threadfence(); + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ ValueType load_relaxed_shared(ValueType* ptr) +{ + return load(ptr, 0); +} + + +template +__device__ __forceinline__ ValueType load_acquire_shared(ValueType* ptr) +{ + auto result = load(ptr, 0); + __threadfence(); + return result; +} + +template +__device__ __forceinline__ void store_relaxed_shared(ValueType* ptr, + ValueType value) +{ + store(ptr, 0, value); +} + + +template +__device__ __forceinline__ void store_release_shared(ValueType* ptr, + ValueType value) +{ + __threadfence(); + store(ptr, 0, value); +} } // namespace hip } // namespace kernels } // namespace gko -#endif // GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_ +#endif // GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_ diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp index 232ff059585..528a9200d08 100644 --- 
a/hip/components/syncfree.hip.hpp +++ b/hip/components/syncfree.hip.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" #include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/volatile.hip.hpp" +#include "hip/components/memory.hip.hpp" namespace gko { From 533ba1c3f1196d9c37f466e28038821d15b44b03 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Sep 2023 22:36:29 +0200 Subject: [PATCH 320/583] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - const-correctness - add doc to generic-to-shared ptr conversion - improve generation script readability Co-authored-by: Marcel Koch Co-authored-by: Thomas Grützmacher --- cuda/components/memory.cuh | 207 +++++++++--------- dev_tools/scripts/generate_cuda_memory_ptx.py | 127 +++++++---- hip/components/memory.hip.hpp | 8 +- 3 files changed, 196 insertions(+), 146 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 578f7c8309f..15f2541bddf 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -48,6 +48,13 @@ namespace kernels { namespace cuda { +/** + * Transforms a generic CUDA pointer pointing to shared memory to a + * shared memory pointer for use in PTX assembly. + * CUDA PTX assembly uses 32bit pointers for shared memory addressing. + * The result is undefined for a generic pointer pointing to anything but + * shared memory. + */ __device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) { // see @@ -94,18 +101,18 @@ __device__ __forceinline__ uint32 membar_acq_rel_shared() #include "common/cuda_hip/components/memory.hpp.inc" -__device__ __forceinline__ int32 load_relaxed_shared(int32* ptr) +__device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -117,30 +124,30 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_relaxed_shared(int64* ptr) +__device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -152,30 +159,30 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - 
convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_relaxed_shared(float* ptr) +__device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -187,30 +194,30 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_relaxed_shared(double* ptr) +__device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif @@ -222,30 +229,30 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_acquire_shared(int32* ptr) +__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -258,30 +265,30 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_acquire_shared(int64* ptr) +__device__ __forceinline__ int64 
load_acquire_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -294,30 +301,30 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_acquire_shared(float* ptr) +__device__ __forceinline__ float load_acquire_shared(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -330,30 +337,30 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_acquire_shared(double* ptr) +__device__ __forceinline__ double load_acquire_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif membar_acq_rel_shared(); @@ -366,30 +373,30 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_relaxed(int32* ptr) +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -400,27 +407,27 @@ __device__ 
__forceinline__ int32 load_relaxed(int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_relaxed(int64* ptr) +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -431,27 +438,27 @@ __device__ __forceinline__ int64 load_relaxed(int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_relaxed(float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -462,27 +469,27 @@ __device__ __forceinline__ float load_relaxed(float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_relaxed(double* ptr) +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif @@ -493,27 +500,27 @@ __device__ __forceinline__ double load_relaxed(double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #endif } -__device__ __forceinline__ int32 load_acquire(int32* ptr) +__device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif 
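    // annotation (not part of the original patch): on the pre-Volta path the
    // volatile load above carries no ordering guarantees of its own; the
    // membar_acq_rel() call that follows is what gives load_acquire its
    // acquire semantics there.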
membar_acq_rel(); @@ -525,27 +532,27 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) : "memory"); #endif } -__device__ __forceinline__ int64 load_acquire(int64* ptr) +__device__ __forceinline__ int64 load_acquire(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.b64 %0, [%1];" : "=l"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -557,27 +564,27 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) : "memory"); #endif } -__device__ __forceinline__ float load_acquire(float* ptr) +__device__ __forceinline__ float load_acquire(const float* ptr) { float result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.f32 %0, [%1];" : "=f"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -589,27 +596,27 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) : "memory"); #endif } -__device__ __forceinline__ double load_acquire(double* ptr) +__device__ __forceinline__ double load_acquire(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.acquire.gpu.f64 %0, [%1];" : "=d"(result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif membar_acq_rel(); @@ -621,29 +628,29 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) : "memory"); #endif } __device__ __forceinline__ thrust::complex load_relaxed_shared( - thrust::complex* ptr) + const thrust::complex* ptr) { float real_result; float imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" 
: "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -657,12 +664,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -670,19 +677,19 @@ __device__ __forceinline__ void store_relaxed_shared( __device__ __forceinline__ thrust::complex load_relaxed_shared( - thrust::complex* ptr) + const thrust::complex* ptr) { double real_result; double imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr(ptr)) + : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -696,12 +703,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr(ptr)), + convert_generic_ptr_to_smem_ptr((void*)ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -709,19 +716,19 @@ __device__ __forceinline__ void store_relaxed_shared( __device__ __forceinline__ thrust::complex load_relaxed( - thrust::complex* ptr) + const thrust::complex* ptr) { float real_result; float imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -734,11 +741,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -746,19 +753,19 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, __device__ __forceinline__ thrust::complex load_relaxed( - thrust::complex* ptr) + const thrust::complex* ptr) { double real_result; double imag_result; #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), 
"=d"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"(ptr) + : "l"((void*)ptr) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -771,11 +778,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index a03cb47f4e7..dd5d682a9b8 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -1,11 +1,46 @@ #!/usr/bin/env python3 -import os -memory_spaces = [(".shared", ".cta", "_shared", "convert_generic_ptr_to_smem_ptr(ptr)", "r"), ("", ".gpu", "", "ptr", "l")] +import dataclasses + + +@dataclasses.dataclass +class space: + ptx_space_suffix: str + ptx_scope_suffix: str + fn_suffix: str + ptr_expr: str + ptr_constraint: str + + +@dataclasses.dataclass +class ordering: + ptx_load_suffix: str + fn_load_suffix: str + ptx_store_suffix: str + fn_store_suffix: str + is_relaxed: bool + + +@dataclasses.dataclass +class type_desc: + ptx_type_suffix: str + val_constraint: str + name: str + + +memory_spaces = [ + space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", + ptr_expr="convert_generic_ptr_to_smem_ptr((void*)ptr)", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="(void*)ptr", ptr_constraint="l")] memory_orderings = [ - (".relaxed", "_relaxed", ".relaxed", "_relaxed", True), - (".acquire", "_acquire", ".release", "_release", False) - ] -sizes=[(".b32", "r", "int32", 4), (".b64", "l", "int64", 8), (".f32", "f", "float", 4), (".f64", "d", "double", 8)] + ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", + ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) +] +types = [type_desc(ptx_type_suffix=".b32", val_constraint="r", name="int32"), + type_desc(ptx_type_suffix=".b64", val_constraint="l", name="int64"), + type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] # header print("""/************************************************************* Copyright (c) 2017-2023, the Ginkgo authors @@ -57,6 +92,13 @@ namespace cuda { +/** + * Transforms a generic CUDA pointer pointing to shared memory to a + * shared memory pointer for use in PTX assembly. + * CUDA PTX assembly uses 32bit pointers for shared memory addressing. + * The result is undefined for a generic pointer pointing to anything but + * shared memory. 
+ */ __device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr) { // see @@ -104,23 +146,23 @@ """) # relaxed -for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: - for volta_load_ordering_suffix, load_function_ordering_suffix, volta_store_ordering_suffix, store_function_ordering_suffix, is_relaxed in memory_orderings: - for size_suffix, constraint, typename, size in sizes: - membar_expression = "" if is_relaxed else f"membar_acq_rel{function_memory_space_suffix}();" +for s in memory_spaces: + for o in memory_orderings: + for t in types: + membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" print(f""" -__device__ __forceinline__ {typename} load{load_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr) +__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ - {typename} result; + {t.name} result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile{memory_space_suffix}{size_suffix} %0, [%1];" - : "={constraint}"(result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #else - asm volatile("ld{volta_load_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} %0, [%1];" - : "={constraint}"(result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #endif {membar_expression} @@ -128,56 +170,57 @@ }} -__device__ __forceinline__ void store{store_function_ordering_suffix}{function_memory_space_suffix}({typename}* ptr, {typename} result) +__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{memory_space_suffix}{size_suffix} [%0], %1;" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else - asm volatile("st{volta_store_ordering_suffix}{scope_suffix}{memory_space_suffix}{size_suffix} [%0], %1;" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(result) + asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} """) # vectorized relaxed loads for thrust::complex -sizes=[(".f32", "f", "float", 4), (".f64", "d", "double", 8)] -for memory_space_suffix, scope_suffix, function_memory_space_suffix, ptr_name, ptr_constraint in memory_spaces: - for size_suffix, constraint, typename, size in sizes: +types = [type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] +for s in memory_spaces: + for t in types: print(f""" -__device__ __forceinline__ thrust::complex<{typename}> load_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr) +__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ - {typename} real_result; - {typename} imag_result; + {t.name} real_result; + {t.name} imag_result; #if __CUDA_ARCH__ < 700 - asm 
volatile("ld.volatile{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" - : "={constraint}"(real_result), "={constraint}"(imag_result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #else - asm volatile("ld.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} {{%0, %1}}, [%2];" - : "={constraint}"(real_result), "={constraint}"(imag_result) - : "{ptr_constraint}"({ptr_name}) + asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({s.ptr_expr}) : "memory"); #endif - return thrust::complex<{typename}>{{real_result, imag_result}}; + return thrust::complex<{t.name}>{{real_result, imag_result}}; }} -__device__ __forceinline__ void store_relaxed{function_memory_space_suffix}(thrust::complex<{typename}>* ptr, thrust::complex<{typename}> result) +__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result) {{ auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else - asm volatile("st.relaxed{scope_suffix}{memory_space_suffix}.v2{size_suffix} [%0], {{%1, %2}};" - :: "{ptr_constraint}"({ptr_name}), "{constraint}"(real_result), "{constraint}"(imag_result) + asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" + :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} @@ -189,4 +232,4 @@ } // namespace gko #endif // GKO_CUDA_COMPONENTS_MEMORY_CUH_ -""") \ No newline at end of file +""") diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index b424c8bbc06..485f67343e0 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -52,14 +52,14 @@ namespace hip { template -__device__ __forceinline__ ValueType load_relaxed(ValueType* ptr) +__device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) { return load(ptr, 0); } template -__device__ __forceinline__ ValueType load_acquire(ValueType* ptr) +__device__ __forceinline__ ValueType load_acquire(const ValueType* ptr) { auto result = load(ptr, 0); __threadfence(); @@ -82,14 +82,14 @@ __device__ __forceinline__ void store_release(ValueType* ptr, ValueType value) template -__device__ __forceinline__ ValueType load_relaxed_shared(ValueType* ptr) +__device__ __forceinline__ ValueType load_relaxed_shared(const ValueType* ptr) { return load(ptr, 0); } template -__device__ __forceinline__ ValueType load_acquire_shared(ValueType* ptr) +__device__ __forceinline__ ValueType load_acquire_shared(const ValueType* ptr) { auto result = load(ptr, 0); __threadfence(); From 77b80ed243449dc8bd497076cb84aea6c07eabb2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Sep 2023 22:40:40 +0200 Subject: [PATCH 321/583] restore peek functionality 
--- common/cuda_hip/components/syncfree.hpp.inc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc index 113c66d91ec..a8fa767e4dd 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp.inc @@ -107,7 +107,19 @@ public: group::tiled_partition(group::this_thread_block()).sync(); } - __device__ __forceinline__ bool peek(IndexType dependency) { return false; } + __device__ __forceinline__ bool peek(IndexType dependency) + { + const auto dep_block = dependency / (block_size / subwarp_size); + const auto dep_local = dependency % (block_size / subwarp_size); + // assert(dependency < work_id); + if (dep_block == block_id) { + // peek at a local dependency + return load_acquire_shared(local.status + dep_local); + } else { + // peek at a global dependency + return load_acquire(global.status + dependency); + } + } __device__ __forceinline__ void mark_ready() { From f0257851c1f58fc2afc91450712c8ceded28947c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 14:44:39 +0200 Subject: [PATCH 322/583] use const_cast for CUDA atomic load/store wrappers --- cuda/components/memory.cuh | 216 ++++++++++-------- dev_tools/scripts/generate_cuda_memory_ptx.py | 22 +- 2 files changed, 136 insertions(+), 102 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 15f2541bddf..844fca6adf4 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -107,12 +107,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #endif @@ -142,12 +142,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + 
convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #endif @@ -177,12 +177,12 @@ __device__ __forceinline__ float load_relaxed_shared(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif @@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #endif @@ -210,15 +210,17 @@ __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #else - asm volatile("ld.relaxed.cta.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.relaxed.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #endif return result; @@ -229,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #endif @@ -247,12 +249,12 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -265,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "r"(result) : "memory"); #endif @@ -283,12 +285,12 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.b64 %0, [%1];" : 
"=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -301,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "l"(result) : "memory"); #endif @@ -319,12 +321,12 @@ __device__ __forceinline__ float load_acquire_shared(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else asm volatile("ld.acquire.cta.shared.f32 %0, [%1];" : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #endif membar_acq_rel_shared(); @@ -337,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "f"(result) : "memory"); #endif @@ -353,15 +355,17 @@ __device__ __forceinline__ double load_acquire_shared(const double* ptr) { double result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.volatile.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #else - asm volatile("ld.acquire.cta.shared.f64 %0, [%1];" - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) - : "memory"); + asm volatile( + "ld.acquire.cta.shared.f64 %0, [%1];" + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); #endif membar_acq_rel_shared(); return result; @@ -373,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr(const_cast(ptr))), "d"(result) : "memory"); #endif @@ -391,12 +395,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : 
"l"(const_cast(ptr)) : "memory"); #endif @@ -407,10 +411,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #endif } @@ -422,12 +428,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -438,10 +444,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #endif } @@ -453,12 +461,12 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -469,10 +477,12 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #endif } @@ -484,12 +494,12 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif @@ -500,10 +510,12 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #endif } @@ -515,12 +527,12 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b32 %0, [%1];" : "=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.b32 %0, [%1];" : 
"=r"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -532,10 +544,12 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"((void*)ptr), "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), + "r"(result) : "memory"); #endif } @@ -547,12 +561,12 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.b64 %0, [%1];" : "=l"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -564,10 +578,12 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"((void*)ptr), "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), + "l"(result) : "memory"); #endif } @@ -579,12 +595,12 @@ __device__ __forceinline__ float load_acquire(const float* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.f32 %0, [%1];" : "=f"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -596,10 +612,12 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"((void*)ptr), "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), + "f"(result) : "memory"); #endif } @@ -611,12 +629,12 @@ __device__ __forceinline__ double load_acquire(const double* ptr) #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #else asm volatile("ld.acquire.gpu.f64 %0, [%1];" : "=d"(result) - : "l"((void*)ptr) + : "l"(const_cast(ptr)) : "memory"); #endif membar_acq_rel(); @@ -628,10 +646,12 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"((void*)ptr), "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), + "d"(result) : "memory"); #endif } @@ -645,12 +665,14 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #else asm 
volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -664,12 +686,14 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -684,12 +708,14 @@ __device__ __forceinline__ thrust::complex load_relaxed_shared( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #else asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "r"(convert_generic_ptr_to_smem_ptr((void*)ptr)) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -703,12 +729,14 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr((void*)ptr)), + convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -723,12 +751,12 @@ __device__ __forceinline__ thrust::complex load_relaxed( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];" : "=f"(real_result), "=f"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #endif return thrust::complex{real_result, imag_result}; @@ -741,11 +769,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -760,12 +790,12 @@ __device__ __forceinline__ thrust::complex load_relaxed( #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : "memory"); #else asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];" : "=d"(real_result), "=d"(imag_result) - : "l"((void*)ptr) + : "l"(const_cast*>(ptr)) : 
"memory"); #endif return thrust::complex{real_result, imag_result}; @@ -778,11 +808,13 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"((void*)ptr), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"( + const_cast*>(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index dd5d682a9b8..dae5f6c3a59 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -29,8 +29,8 @@ class type_desc: memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", - ptr_expr="convert_generic_ptr_to_smem_ptr((void*)ptr)", ptr_constraint="r"), - space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="(void*)ptr", ptr_constraint="l")] + ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), @@ -150,6 +150,7 @@ class type_desc: for o in memory_orderings: for t in types: membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" + ptr_expr = s.ptr_expr.format(typename=t.name) print(f""" __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ @@ -157,12 +158,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #else asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #endif {membar_expression} @@ -175,11 +176,11 @@ class type_desc: {membar_expression} #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} @@ -190,6 +191,7 @@ class type_desc: type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] for s in memory_spaces: for t in types: + ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>") print(f""" __device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ @@ -198,12 +200,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : 
"={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #else asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({s.ptr_expr}) + : "{s.ptr_constraint}"({ptr_expr}) : "memory"); #endif return thrust::complex<{t.name}>{{real_result, imag_result}}; @@ -216,11 +218,11 @@ class type_desc: auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({s.ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} From fc4a4e6fe90a47be37e8293b8ffb6380dd2867b3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 25 Sep 2023 15:59:26 +0200 Subject: [PATCH 323/583] remove unnecessary const casts --- cuda/components/memory.cuh | 104 +++++++----------- dev_tools/scripts/generate_cuda_memory_ptx.py | 28 +++-- 2 files changed, 56 insertions(+), 76 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 844fca6adf4..af3a0e838ea 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -124,12 +124,12 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #endif @@ -159,12 +159,12 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #endif @@ -194,12 +194,12 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #endif @@ -231,12 +231,12 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) { #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), 
"d"(result) : "memory"); #else asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #endif @@ -267,12 +267,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #endif @@ -303,12 +303,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #endif @@ -339,12 +339,12 @@ __device__ __forceinline__ void store_release_shared(float* ptr, float result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #else asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(result) : "memory"); #endif @@ -377,12 +377,12 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #else asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"( - convert_generic_ptr_to_smem_ptr(const_cast(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(result) : "memory"); #endif @@ -411,12 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -444,12 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -477,12 +473,10 @@ __device__ __forceinline__ float load_relaxed(const float* ptr) __device__ __forceinline__ void store_relaxed(float* ptr, float result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" 
::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #endif } @@ -510,12 +504,10 @@ __device__ __forceinline__ double load_relaxed(const double* ptr) __device__ __forceinline__ void store_relaxed(double* ptr, double result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #endif } @@ -544,12 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(const_cast(ptr)), - "r"(result) + asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -578,12 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(const_cast(ptr)), - "l"(result) + asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -612,12 +600,10 @@ __device__ __forceinline__ void store_release(float* ptr, float result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #else - asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(const_cast(ptr)), - "f"(result) + asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result) : "memory"); #endif } @@ -646,12 +632,10 @@ __device__ __forceinline__ void store_release(double* ptr, double result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #else - asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(const_cast(ptr)), - "d"(result) + asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result) : "memory"); #endif } @@ -686,14 +670,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -729,14 +711,12 @@ __device__ __forceinline__ void store_relaxed_shared( auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"( - 
convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #else asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"( - convert_generic_ptr_to_smem_ptr( - const_cast*>(ptr))), + convert_generic_ptr_to_smem_ptr(ptr)), "d"(real_result), "d"(imag_result) : "memory"); #endif @@ -769,13 +749,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr), "f"(real_result), "f"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr), "f"(real_result), "f"(imag_result) : "memory"); #endif @@ -808,13 +786,11 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, auto real_result = result.real(); auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr), "d"(real_result), "d"(imag_result) : "memory"); #else - asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"( - const_cast*>(ptr)), + asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr), "d"(real_result), "d"(imag_result) : "memory"); #endif diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index dae5f6c3a59..4cbe05361c1 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -29,8 +29,8 @@ class type_desc: memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", - ptr_expr="convert_generic_ptr_to_smem_ptr(const_cast<{typename}*>(ptr))", ptr_constraint="r"), - space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="const_cast<{typename}*>(ptr)", ptr_constraint="l")] + ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), + space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), @@ -150,7 +150,9 @@ class type_desc: for o in memory_orderings: for t in types: membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" - ptr_expr = s.ptr_expr.format(typename=t.name) + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast<{t.name}*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") print(f""" __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ @@ -158,12 +160,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #else asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];" : "={t.val_constraint}"(result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #endif {membar_expression} @@ -176,11 +178,11 @@ class type_desc: {membar_expression} #if __CUDA_ARCH__ < 700 asm 
volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); #else asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); #endif }} @@ -191,7 +193,9 @@ class type_desc: type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] for s in memory_spaces: for t in types: - ptr_expr = s.ptr_expr.format(typename=f"thrust::complex<{t.name}>") + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") print(f""" __device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) {{ @@ -200,12 +204,12 @@ class type_desc: #if __CUDA_ARCH__ < 700 asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #else asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) - : "{s.ptr_constraint}"({ptr_expr}) + : "{s.ptr_constraint}"({const_ptr_expr}) : "memory"); #endif return thrust::complex<{t.name}>{{real_result, imag_result}}; @@ -218,11 +222,11 @@ class type_desc: auto imag_result = result.imag(); #if __CUDA_ARCH__ < 700 asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #else asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};" - :: "{s.ptr_constraint}"({ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) : "memory"); #endif }} From 9ab9633165b4154a8edf30ef9e4e370eb23e39de Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 9 Oct 2023 11:29:35 +0200 Subject: [PATCH 324/583] review updates - update asm type annotations - fix incorrect store Co-authored-by: Yuhsiang M. 
Tsai --- cuda/components/memory.cuh | 64 +++++++++---------- cuda/solver/common_trs_kernels.cuh | 2 +- dev_tools/scripts/generate_cuda_memory_ptx.py | 4 +- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index af3a0e838ea..4d814c7f513 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -105,12 +105,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b32 %0, [%1];" + asm volatile("ld.volatile.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.relaxed.cta.shared.b32 %0, [%1];" + asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -123,12 +123,12 @@ __device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr) __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else - asm volatile("st.relaxed.cta.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); @@ -140,12 +140,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b64 %0, [%1];" + asm volatile("ld.volatile.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.relaxed.cta.shared.b64 %0, [%1];" + asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -158,12 +158,12 @@ __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else - asm volatile("st.relaxed.cta.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); @@ -247,12 +247,12 @@ __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b32 %0, [%1];" + asm volatile("ld.volatile.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.acquire.cta.shared.b32 %0, [%1];" + asm volatile("ld.acquire.cta.shared.s32 %0, [%1];" : "=r"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -266,12 +266,12 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) { membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) : "memory"); #else - asm volatile("st.release.cta.shared.b32 [%0], %1;" ::"r"( + asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), 
"r"(result) : "memory"); @@ -283,12 +283,12 @@ __device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.shared.b64 %0, [%1];" + asm volatile("ld.volatile.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); #else - asm volatile("ld.acquire.cta.shared.b64 %0, [%1];" + asm volatile("ld.acquire.cta.shared.s64 %0, [%1];" : "=l"(result) : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); @@ -302,12 +302,12 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) { membar_acq_rel_shared(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); #else - asm volatile("st.release.cta.shared.b64 [%0], %1;" ::"r"( + asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"( convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) : "memory"); @@ -393,12 +393,12 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b32 %0, [%1];" + asm volatile("ld.volatile.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.relaxed.gpu.b32 %0, [%1];" + asm volatile("ld.relaxed.gpu.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); @@ -411,10 +411,10 @@ __device__ __forceinline__ int32 load_relaxed(const int32* ptr) __device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -424,12 +424,12 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b64 %0, [%1];" + asm volatile("ld.volatile.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.relaxed.gpu.b64 %0, [%1];" + asm volatile("ld.relaxed.gpu.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); @@ -442,10 +442,10 @@ __device__ __forceinline__ int64 load_relaxed(const int64* ptr) __device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.relaxed.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } @@ -517,12 +517,12 @@ __device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b32 %0, [%1];" + asm volatile("ld.volatile.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.acquire.gpu.b32 %0, [%1];" + asm volatile("ld.acquire.gpu.s32 %0, [%1];" : "=r"(result) : "l"(const_cast(ptr)) : "memory"); @@ -536,10 +536,10 @@ __device__ __forceinline__ void store_release(int32* ptr, int32 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), 
"r"(result) : "memory"); #else - asm volatile("st.release.gpu.b32 [%0], %1;" ::"l"(ptr), "r"(result) + asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result) : "memory"); #endif } @@ -549,12 +549,12 @@ __device__ __forceinline__ int64 load_acquire(const int64* ptr) { int64 result; #if __CUDA_ARCH__ < 700 - asm volatile("ld.volatile.b64 %0, [%1];" + asm volatile("ld.volatile.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); #else - asm volatile("ld.acquire.gpu.b64 %0, [%1];" + asm volatile("ld.acquire.gpu.s64 %0, [%1];" : "=l"(result) : "l"(const_cast(ptr)) : "memory"); @@ -568,10 +568,10 @@ __device__ __forceinline__ void store_release(int64* ptr, int64 result) { membar_acq_rel(); #if __CUDA_ARCH__ < 700 - asm volatile("st.volatile.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #else - asm volatile("st.release.gpu.b64 [%0], %1;" ::"l"(ptr), "l"(result) + asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result) : "memory"); #endif } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 546b366c6a2..6dbd65968d0 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -449,7 +449,7 @@ __global__ void sptrsv_naive_caching_kernel( // This check to ensure no infinite loops happen. if (is_nan(r)) { - store_relaxed(x_s + self_shid, zero()); + store_relaxed_shared(x_s + self_shid, zero()); store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 4cbe05361c1..d75a9f908b8 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -37,8 +37,8 @@ class type_desc: ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) ] -types = [type_desc(ptx_type_suffix=".b32", val_constraint="r", name="int32"), - type_desc(ptx_type_suffix=".b64", val_constraint="l", name="int64"), +types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), + type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] # header From b4c1699abe8c4e13f58ea7ad8947e75fbf6445ba Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 10 Oct 2023 10:50:50 +0200 Subject: [PATCH 325/583] add note to generated file --- cuda/components/memory.cuh | 3 +++ dev_tools/scripts/generate_cuda_memory_ptx.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 4d814c7f513..a1a53284e3f 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -43,6 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/types.hpp" +// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py + + namespace gko { namespace kernels { namespace cuda { diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index d75a9f908b8..42bef50f9a2 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -87,6 +87,9 @@ class type_desc: #include "cuda/base/types.hpp" +// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py + + namespace gko { namespace kernels { namespace cuda { From 03d696ce7ced9c69303f4895617285b3f5076536 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 8 Oct 2023 21:50:29 +0200 Subject: [PATCH 326/583] add reorderings to benchmarks --- benchmark/conversion/conversion.cpp | 2 + .../matrix_statistics/matrix_statistics.cpp | 1 + benchmark/preconditioner/preconditioner.cpp | 1 + benchmark/solver/solver_common.hpp | 12 +++- benchmark/sparse_blas/sparse_blas.cpp | 2 +- benchmark/spmv/spmv_common.hpp | 2 + benchmark/test/preconditioner.py | 8 +++ .../reference/preconditioner.reordered.stderr | 9 +++ .../reference/preconditioner.reordered.stdout | 33 +++++++++ .../test/reference/solver.reordered.stderr | 10 +++ .../test/reference/solver.reordered.stdout | 57 ++++++++++++++++ .../reference/sparse_blas.reordered.stderr | 9 +++ .../reference/sparse_blas.reordered.stdout | 32 +++++++++ .../test/reference/spmv.reordered.stderr | 10 +++ .../test/reference/spmv.reordered.stdout | 22 ++++++ benchmark/test/solver.py | 8 +++ benchmark/test/sparse_blas.py | 11 ++- benchmark/test/spmv.py | 8 +++ benchmark/utils/general_matrix.hpp | 68 +++++++++++++++++++ benchmark/utils/generator.hpp | 27 ++++---- 20 files changed, 315 insertions(+), 17 deletions(-) create mode 100644 benchmark/test/reference/preconditioner.reordered.stderr create mode 100644 benchmark/test/reference/preconditioner.reordered.stdout create mode 100644 benchmark/test/reference/solver.reordered.stderr create mode 100644 benchmark/test/reference/solver.reordered.stdout create mode 100644 benchmark/test/reference/sparse_blas.reordered.stderr create mode 100644 benchmark/test/reference/sparse_blas.reordered.stdout create mode 100644 benchmark/test/reference/spmv.reordered.stderr create mode 100644 benchmark/test/reference/spmv.reordered.stdout diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp index c777db1a35a..e45046329d7 100644 --- a/benchmark/conversion/conversion.cpp +++ b/benchmark/conversion/conversion.cpp @@ -118,6 +118,8 @@ struct ConversionBenchmark : Benchmark> { { gko::matrix_data data; data = Generator::generate_matrix_data(test_case); + // no reordering here, as it doesn't impact conversions beyond + // dense-sparse conversions std::clog << "Matrix is of size (" << data.size[0] << ", " << data.size[1] << "), " << data.nonzeros.size() << std::endl; test_case["rows"] = data.size[0]; diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 20feecf5ccf..576d6fa7d52 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -186,6 +186,7 @@ struct MatrixStatistics : Benchmark { json& test_case) const override { auto data = Generator::generate_matrix_data(test_case); + // no reordering here, as it doesn't change statistics std::clog << "Matrix is of size (" << data.size[0] << ", " << data.size[1] << "), " << 
data.nonzeros.size() << std::endl; test_case["rows"] = data.size[0]; diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 074fe202e6c..d81dfaa4d5d 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -183,6 +183,7 @@ struct PreconditionerBenchmark : Benchmark { { preconditioner_benchmark_state state; auto data = Generator::generate_matrix_data(test_case); + reorder(data, test_case); state.system_matrix = formats::matrix_factory(FLAGS_formats, exec, data); diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 784b70eca61..46b7a231e9a 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" @@ -433,10 +434,17 @@ struct SolverBenchmark : Benchmark> { {std::numeric_limits::quiet_NaN()}, exec); state.x = generator.initialize({0.0}, exec); } else { - state.system_matrix = - generator.generate_matrix_with_optimal_format(exec, test_case); + auto data = generator.generate_matrix_data(test_case); + auto permutation = + reorder(data, test_case, generator.is_distributed()); + + state.system_matrix = generator.generate_matrix_with_format( + exec, test_case["optimal"]["spmv"].get(), data); state.b = generator.generate_rhs(exec, state.system_matrix.get(), test_case); + if (permutation) { + permute(state.b, permutation.get()); + } state.x = generator.generate_initial_guess( exec, state.system_matrix.get(), state.b.get()); } diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 5d479eb7fc0..5385de4264c 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -114,7 +114,7 @@ struct SparseBlasBenchmark : Benchmark> { json& test_case) const override { auto data = Generator::generate_matrix_data(test_case); - data.ensure_row_major_order(); + reorder(data, test_case); std::clog << "Matrix is of size (" << data.size[0] << ", " << data.size[1] << "), " << data.nonzeros.size() << std::endl; test_case["rows"] = data.size[0]; diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index c85642bb5f1..4d1ab17ccf4 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/runner.hpp" @@ -104,6 +105,7 @@ struct SpmvBenchmark : Benchmark> { { spmv_benchmark_state state; state.data = generator.generate_matrix_data(test_case); + reorder(state.data, test_case, generator.is_distributed()); auto nrhs = FLAGS_nrhs; state.b = generator.create_multi_vector_random( diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py index e05e5b780ac..7226964dd05 100755 --- a/benchmark/test/preconditioner.py +++ b/benchmark/test/preconditioner.py @@ -43,3 +43,11 @@ expected_stdout="preconditioner.profile.stdout", expected_stderr="preconditioner.profile.stderr", ) + +# stdin +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="preconditioner.reordered.stdout", + expected_stderr="preconditioner.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/test/reference/preconditioner.reordered.stderr b/benchmark/test/reference/preconditioner.reordered.stderr new file mode 100644 index 00000000000..a428671486f --- /dev/null +++ b/benchmark/test/reference/preconditioner.reordered.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.reordered.stdout b/benchmark/test/reference/preconditioner.reordered.stdout new file mode 100644 index 00000000000..51adfb3b58b --- /dev/null +++ b/benchmark/test/reference/preconditioner.reordered.stdout @@ -0,0 +1,33 @@ +[ + { + "size": 100, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/solver.reordered.stderr b/benchmark/test/reference/solver.reordered.stderr new file mode 100644 index 00000000000..d9c04b69cf5 --- /dev/null +++ b/benchmark/test/reference/solver.reordered.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +Matrix is of size (125, 125) + Running solver: cg diff --git a/benchmark/test/reference/solver.reordered.stdout b/benchmark/test/reference/solver.reordered.stdout new file mode 100644 index 00000000000..c1b826ae3fc --- /dev/null +++ b/benchmark/test/reference/solver.reordered.stdout @@ -0,0 +1,57 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + 
"components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/sparse_blas.reordered.stderr b/benchmark/test/reference/sparse_blas.reordered.stderr new file mode 100644 index 00000000000..497d5a72bbf --- /dev/null +++ b/benchmark/test/reference/sparse_blas.reordered.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are symbolic_cholesky +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running sparse_blas: symbolic_cholesky diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout new file mode 100644 index 00000000000..b5fc8998be0 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -0,0 +1,32 @@ +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "symbolic_cholesky": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "compute_elim_forest": 1.0, + "allocate": 1.0, + "free": 1.0, + "components::fill_array": 1.0, + "cholesky::symbolic_count": 1.0, + "components::prefix_sum_nonnegative": 1.0, + "copy": 1.0, + "cholesky::symbolic_factorize": 1.0, + "csr::sort_by_column_index": 1.0, + "overhead": 1.0 + }, + "factor_nonzeros": 1324, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/spmv.reordered.stderr b/benchmark/test/reference/spmv.reordered.stderr new file mode 100644 index 00000000000..97fe670aff7 --- /dev/null +++ b/benchmark/test/reference/spmv.reordered.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (develop) + running with core module 1.7.0 (develop) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.reordered.stdout b/benchmark/test/reference/spmv.reordered.stdout new file mode 100644 index 00000000000..5404235cdf7 --- /dev/null +++ b/benchmark/test/reference/spmv.reordered.stdout @@ -0,0 +1,22 @@ +[ + { + "size": 100, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py index 025ee92707c..5dd1d840a4e 100755 --- a/benchmark/test/solver.py +++ b/benchmark/test/solver.py @@ -43,3 +43,11 
@@ expected_stdout="solver.profile.stdout", expected_stderr="solver.profile.stderr", ) + +# reordering +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="solver.reordered.stdout", + expected_stderr="solver.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', +) diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py index 724cdb866f0..8e6cda3c9bd 100755 --- a/benchmark/test/sparse_blas.py +++ b/benchmark/test/sparse_blas.py @@ -4,7 +4,8 @@ # check that all input modes work: # parameter test_framework.compare_output( - ["-operations", "transpose", "-input", '[{"size": 100, "stencil": "7pt"}]'], + ["-operations", "transpose", "-input", + '[{"size": 100, "stencil": "7pt"}]'], expected_stdout="sparse_blas.simple.stdout", expected_stderr="sparse_blas.simple.stderr", ) @@ -55,3 +56,11 @@ expected_stdout="sparse_blas.profile.stdout", expected_stderr="sparse_blas.profile.stderr", ) + +# reordering +test_framework.compare_output( + ["-operations", "symbolic_cholesky", "-reorder", "amd"], + expected_stdout="sparse_blas.reordered.stdout", + expected_stderr="sparse_blas.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py index 865f74bb6d0..f6f4a4b5c39 100755 --- a/benchmark/test/spmv.py +++ b/benchmark/test/spmv.py @@ -43,3 +43,11 @@ expected_stdout="spmv.profile.stdout", expected_stderr="spmv.profile.stderr", ) + +# stdin +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="spmv.reordered.stdout", + expected_stderr="spmv.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index 39d8b5a8107..41b3459bc5a 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -41,12 +41,80 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" + + +std::string reordering_algorithm_desc = + "Reordering algorithm to apply to the input matrices:\n" + " none - no reordering\n" + " amd - Approximate Minimum Degree reordering algorithm\n" +#if GKO_HAVE_METIS + " nd - Nested Dissection reordering algorithm\n" +#endif + " rcm - Reverse Cuthill-McKee reordering algorithm"; DEFINE_string(input_matrix, "", "Filename of a matrix to be used as the single input. 
Overwrites " "the value of the -input flag"); +DEFINE_string(reorder, "none", reordering_algorithm_desc.c_str()); + + +template +std::unique_ptr> reorder( + gko::matrix_data& data, json& test_case, + bool is_distributed = false) +{ + if (FLAGS_reorder == "none" || is_distributed) { + return nullptr; + } + using Csr = gko::matrix::Csr; + auto ref = gko::ReferenceExecutor::create(); + auto mtx = gko::share(Csr::create(ref)); + mtx->read(data); + std::unique_ptr> perm; + if (FLAGS_reorder == "amd") { + perm = gko::experimental::reorder::Amd::build() + .on(ref) + ->generate(mtx); +#if GKO_HAVE_METIS + } else if (FLAGS_reorder == "nd") { + perm = gko::experimental::reorder::NestedDissection::build() + .on(ref) + ->generate(mtx); +#endif + } else if (FLAGS_reorder == "rcm") { + perm = gko::reorder::Rcm::build() + .on(ref) + ->generate(mtx) + ->get_permutation() + ->clone(); + } else { + throw std::runtime_error{"Unknown reordering algorithm " + + FLAGS_reorder}; + } + mtx->permute(perm)->write(data); + test_case["reordered"] = FLAGS_reorder; + return perm; +} + + +template +void permute(std::unique_ptr>& vec, + const gko::matrix::Permutation* perm) +{ + vec = vec->permute(perm, gko::matrix::permute_mode::rows); +} + + +template +void permute( + std::unique_ptr>& vec, + const gko::matrix::Permutation* perm) +{} + /** * @copydoc initialize_argument_parsing diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 3f26ed3f2fc..c280cb1ac72 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -52,20 +52,25 @@ struct DefaultSystemGenerator { using value_type = ValueType; using Vec = vec; + static bool is_distributed() { return false; } + static gko::matrix_data generate_matrix_data( const json& config) { + gko::matrix_data data; if (config.contains("filename")) { std::ifstream in(config["filename"].get()); - return gko::read_generic_raw(in); + data = gko::read_generic_raw(in); } else if (config.contains("stencil")) { - return generate_stencil( + data = generate_stencil( config["stencil"].get(), config["size"].get()); } else { throw std::runtime_error( "No known way to generate matrix data found."); } + data.ensure_row_major_order(); + return data; } static std::string get_example_config() @@ -188,16 +193,19 @@ struct DistributedDefaultSystemGenerator { using Mtx = dist_mtx; using Vec = dist_vec; + static bool is_distributed() { return true; } + gko::matrix_data generate_matrix_data( const json& config) const { + gko::matrix_data data; if (config.contains("filename")) { std::ifstream in(config["filename"].get()); - return gko::read_generic_raw(in); + data = gko::read_generic_raw(in); } else if (config.contains("stencil")) { auto local_size = static_cast( config["size"].get() / comm.size()); - return generate_stencil( + data = generate_stencil( config["stencil"].get(), comm, local_size, config["comm_pattern"].get() == std::string("optimal")); @@ -205,6 +213,8 @@ struct DistributedDefaultSystemGenerator { throw std::runtime_error( "No known way to generate matrix data found."); } + data.ensure_row_major_order(); + return data; } static std::string get_example_config() @@ -240,15 +250,6 @@ struct DistributedDefaultSystemGenerator { } } - std::shared_ptr generate_matrix_with_optimal_format( - std::shared_ptr exec, json& config) const - { - auto data = generate_matrix_data(config); - return generate_matrix_with_format( - std::move(exec), config["optimal"]["spmv"].get(), - data); - } - std::shared_ptr generate_matrix_with_format( std::shared_ptr exec, const 
std::string& format_name, const gko::matrix_data& data, From 99f1eb870d45407ca6bc82a4dbf20c279b431368 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 10 Oct 2023 10:18:00 +0200 Subject: [PATCH 327/583] revert to old permutation interface for now --- benchmark/utils/general_matrix.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index 41b3459bc5a..3d063e91981 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -95,7 +95,9 @@ std::unique_ptr> reorder( throw std::runtime_error{"Unknown reordering algorithm " + FLAGS_reorder}; } - mtx->permute(perm)->write(data); + auto perm_arr = + gko::array::view(ref, data.size[0], perm->get_permutation()); + gko::as(mtx->permute(&perm_arr))->write(data); test_case["reordered"] = FLAGS_reorder; return perm; } @@ -103,16 +105,18 @@ std::unique_ptr> reorder( template void permute(std::unique_ptr>& vec, - const gko::matrix::Permutation* perm) + gko::matrix::Permutation* perm) { - vec = vec->permute(perm, gko::matrix::permute_mode::rows); + auto perm_arr = gko::array::view( + perm->get_executor(), perm->get_size()[0], perm->get_permutation()); + vec = gko::as>(vec->row_permute(&perm_arr)); } template void permute( std::unique_ptr>& vec, - const gko::matrix::Permutation* perm) + gko::matrix::Permutation* perm) {} From 21948574b68f6d9b5d216b1b4bbc1e7a42a80056 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 10 Oct 2023 14:19:57 +0200 Subject: [PATCH 328/583] disable reordering flag for distributed benchmarks Co-authored-by: Yuhsiang M. Tsai --- benchmark/blas/distributed/multi_vector.cpp | 3 +++ benchmark/solver/distributed/solver.cpp | 3 +++ benchmark/solver/solver_common.hpp | 3 +-- benchmark/spmv/distributed/spmv.cpp | 3 +++ benchmark/spmv/spmv_common.hpp | 2 +- benchmark/utils/general_matrix.hpp | 17 +++++++++++++---- benchmark/utils/generator.hpp | 4 ---- 7 files changed, 24 insertions(+), 11 deletions(-) diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index d95e5fb38ac..fe5eea5a38c 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/blas/blas_common.hpp" #include "benchmark/utils/general.hpp" #include "benchmark/utils/generator.hpp" diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index d691309ab6a..6577c12e52e 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/solver/solver_common.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 46b7a231e9a..b19d00cd519 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -435,8 +435,7 @@ struct SolverBenchmark : Benchmark> { state.x = generator.initialize({0.0}, exec); } else { auto data = generator.generate_matrix_data(test_case); - auto permutation = - reorder(data, test_case, generator.is_distributed()); + auto permutation = reorder(data, test_case); state.system_matrix = generator.generate_matrix_with_format( exec, test_case["optimal"]["spmv"].get(), data); diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 202aad15c7e..d3925dabcf2 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -43,6 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/spmv/spmv_common.hpp" #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index 4d1ab17ccf4..1d43e3ed327 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -105,7 +105,7 @@ struct SpmvBenchmark : Benchmark> { { spmv_benchmark_state state; state.data = generator.generate_matrix_data(test_case); - reorder(state.data, test_case, generator.is_distributed()); + reorder(state.data, test_case); auto nrhs = FLAGS_nrhs; state.b = generator.create_multi_vector_random( diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index 3d063e91981..2efbec77f99 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -51,22 +51,27 @@ std::string reordering_algorithm_desc = #if GKO_HAVE_METIS " nd - Nested Dissection reordering algorithm\n" #endif - " rcm - Reverse Cuthill-McKee reordering algorithm"; + " rcm - Reverse Cuthill-McKee reordering algorithm\n" + "This is a preprocessing step whose runtime will not be included\n" + "in the measurements."; DEFINE_string(input_matrix, "", "Filename of a matrix to be used as the single input. 
Overwrites " "the value of the -input flag"); + +#ifndef GKO_BENCHMARK_DISTRIBUTED DEFINE_string(reorder, "none", reordering_algorithm_desc.c_str()); +#endif template std::unique_ptr> reorder( - gko::matrix_data& data, json& test_case, - bool is_distributed = false) + gko::matrix_data& data, json& test_case) { - if (FLAGS_reorder == "none" || is_distributed) { +#ifndef GKO_BENCHMARK_DISTRIBUTED + if (FLAGS_reorder == "none") { return nullptr; } using Csr = gko::matrix::Csr; @@ -100,6 +105,10 @@ std::unique_ptr> reorder( gko::as(mtx->permute(&perm_arr))->write(data); test_case["reordered"] = FLAGS_reorder; return perm; +#else + // no reordering for distributed benchmarks + return nullptr; +#endif } diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index c280cb1ac72..3491fb0fc2c 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -52,8 +52,6 @@ struct DefaultSystemGenerator { using value_type = ValueType; using Vec = vec; - static bool is_distributed() { return false; } - static gko::matrix_data generate_matrix_data( const json& config) { @@ -193,8 +191,6 @@ struct DistributedDefaultSystemGenerator { using Mtx = dist_mtx; using Vec = dist_vec; - static bool is_distributed() { return true; } - gko::matrix_data generate_matrix_data( const json& config) const { From 11001e6a2c6c93ade942a68f34a1634c16bb3947 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 4 Aug 2023 10:57:22 +0200 Subject: [PATCH 329/583] Add a batch::BatchLinOp hierarchy and core tests Co-authored-by: Aditya Kashi --- core/test/base/CMakeLists.txt | 1 + core/test/base/batch_lin_op.cpp | 405 ++++++++++++++++ include/ginkgo/core/base/batch_lin_op.hpp | 439 ++++++++++++++++++ .../ginkgo/core/base/batch_lin_op_helpers.hpp | 202 ++++++++ .../ginkgo/core/base/exception_helpers.hpp | 179 +++++++ include/ginkgo/core/log/logger.hpp | 118 ++++- include/ginkgo/ginkgo.hpp | 3 + 7 files changed, 1339 insertions(+), 8 deletions(-) create mode 100644 core/test/base/batch_lin_op.cpp create mode 100644 include/ginkgo/core/base/batch_lin_op.hpp create mode 100644 include/ginkgo/core/base/batch_lin_op_helpers.hpp diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index 36bad656b07..4fa00e12922 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -2,6 +2,7 @@ ginkgo_create_test(abstract_factory) ginkgo_create_test(allocator) ginkgo_create_test(array) ginkgo_create_test(batch_dim) +ginkgo_create_test(batch_lin_op) ginkgo_create_test(batch_multi_vector) ginkgo_create_test(dense_cache) ginkgo_create_test(combination) diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp new file mode 100644 index 00000000000..1fe1765987f --- /dev/null +++ b/core/test/base/batch_lin_op.cpp @@ -0,0 +1,405 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include + + +namespace { + + +class DummyBatchLinOp : public gko::batch::EnableBatchLinOp, + public gko::EnableCreateMethod { +public: + DummyBatchLinOp(std::shared_ptr exec, + gko::batch_dim<2> size = gko::batch_dim<2>{}) + : gko::batch::EnableBatchLinOp(exec, size) + {} + + void access() const { last_access = this->get_executor(); } + + mutable std::shared_ptr last_access; + mutable std::shared_ptr last_b_access; + mutable std::shared_ptr last_x_access; + mutable std::shared_ptr last_alpha_access; + mutable std::shared_ptr last_beta_access; + +protected: + void apply_impl(const gko::batch::BatchLinOp* b, + gko::batch::BatchLinOp* x) const override + { + this->access(); + static_cast(b)->access(); + static_cast(x)->access(); + last_b_access = b->get_executor(); + last_x_access = x->get_executor(); + } + + void apply_impl(const gko::batch::BatchLinOp* alpha, + const gko::batch::BatchLinOp* b, + const gko::batch::BatchLinOp* beta, + gko::batch::BatchLinOp* x) const override + { + this->access(); + static_cast(alpha)->access(); + static_cast(b)->access(); + static_cast(beta)->access(); + static_cast(x)->access(); + last_alpha_access = alpha->get_executor(); + last_b_access = b->get_executor(); + last_beta_access = beta->get_executor(); + last_x_access = x->get_executor(); + } +}; + + +class EnableBatchLinOp : public ::testing::Test { +protected: + EnableBatchLinOp() + : ref{gko::ReferenceExecutor::create()}, + ref2{gko::ReferenceExecutor::create()}, + op{DummyBatchLinOp::create(ref2, + gko::batch_dim<2>(1, gko::dim<2>{3, 5}))}, + op2{DummyBatchLinOp::create(ref2, + gko::batch_dim<2>(2, gko::dim<2>{3, 5}))}, + alpha{DummyBatchLinOp::create( + ref, gko::batch_dim<2>(1, gko::dim<2>{1, 1}))}, + alpha2{DummyBatchLinOp::create( + ref, gko::batch_dim<2>(2, gko::dim<2>{1, 1}))}, + beta{DummyBatchLinOp::create( + ref, gko::batch_dim<2>(1, gko::dim<2>{1, 1}))}, + beta2{DummyBatchLinOp::create( + ref, gko::batch_dim<2>(2, gko::dim<2>{1, 1}))}, + b{DummyBatchLinOp::create(ref, + gko::batch_dim<2>(1, gko::dim<2>{5, 4}))}, + b2{DummyBatchLinOp::create(ref, + gko::batch_dim<2>(2, gko::dim<2>{5, 4}))}, + x{DummyBatchLinOp::create(ref, + gko::batch_dim<2>(1, gko::dim<2>{3, 4}))}, + x2{DummyBatchLinOp::create(ref, + gko::batch_dim<2>(2, gko::dim<2>{3, 4}))} + {} + + std::shared_ptr ref; + std::shared_ptr ref2; + std::unique_ptr op; + std::unique_ptr op2; + std::unique_ptr alpha; + std::unique_ptr alpha2; + std::unique_ptr 
beta; + std::unique_ptr beta2; + std::unique_ptr b; + std::unique_ptr b2; + std::unique_ptr x; + std::unique_ptr x2; +}; + + +TEST_F(EnableBatchLinOp, KnowsNumBatchItems) +{ + ASSERT_EQ(op->get_num_batch_items(), 1); + ASSERT_EQ(op2->get_num_batch_items(), 2); +} + + +TEST_F(EnableBatchLinOp, KnowsItsSizes) +{ + auto op1_sizes = gko::batch_dim<2>(1, gko::dim<2>{3, 5}); + auto op2_sizes = gko::batch_dim<2>(2, gko::dim<2>{3, 5}); + ASSERT_EQ(op->get_size(), op1_sizes); + ASSERT_EQ(op2->get_size(), op2_sizes); +} + + +TEST_F(EnableBatchLinOp, CallsApplyImpl) +{ + op->apply(b, x); + + ASSERT_EQ(op->last_access, ref2); +} + + +TEST_F(EnableBatchLinOp, CallsApplyImplForBatch) +{ + op2->apply(b2, x2); + + ASSERT_EQ(op2->last_access, ref2); +} + + +TEST_F(EnableBatchLinOp, CallsExtendedApplyImpl) +{ + op->apply(alpha, b, beta, x); + + ASSERT_EQ(op->last_access, ref2); +} + + +TEST_F(EnableBatchLinOp, CallsExtendedApplyImplBatch) +{ + op2->apply(alpha2, b2, beta2, x2); + + ASSERT_EQ(op2->last_access, ref2); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnWrongBatchSize) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); + + ASSERT_THROW(op->apply(wrong, x), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnWrongNumBatchItems) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); + + ASSERT_THROW(op2->apply(wrong, x2), gko::ValueMismatch); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnWrongSolutionRows) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{5, 4})); + + ASSERT_THROW(op->apply(b, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnOneBatchItemWrongSolutionRows) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(2, gko::dim<2>{5, 4})); + + ASSERT_THROW(op2->apply(b2, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnWrongSolutionColumns) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})); + + ASSERT_THROW(op->apply(b, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ApplyFailsOnOneBatchItemWrongSolutionColumn) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(2, gko::dim<2>{3, 5})); + + ASSERT_THROW(op2->apply(b2, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongBatchSize) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); + + ASSERT_THROW(op->apply(alpha, wrong, beta, x), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongSolutionRows) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{5, 4})); + + ASSERT_THROW(op->apply(alpha, b, beta, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongSolutionColumns) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})); + + ASSERT_THROW(op->apply(alpha, b, beta, wrong), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongAlphaDimension) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{2, 5})); + + ASSERT_THROW(op->apply(wrong, b, beta, x), gko::DimensionMismatch); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongBetaDimension) +{ + auto wrong = + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{2, 5})); + + ASSERT_THROW(op->apply(alpha, b, wrong, x), gko::DimensionMismatch); +} + + 
+TEST_F(EnableBatchLinOp, ApplyDoesNotCopyBetweenSameMemory) +{ + op->apply(b, x); + + ASSERT_EQ(op->last_b_access, ref); + ASSERT_EQ(op->last_x_access, ref); +} + + +TEST_F(EnableBatchLinOp, ApplyNoCopyBackBetweenSameMemory) +{ + op->apply(b, x); + + ASSERT_EQ(b->last_access, ref); + ASSERT_EQ(x->last_access, ref); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyDoesNotCopyBetweenSameMemory) +{ + op->apply(alpha, b, beta, x); + + ASSERT_EQ(op->last_alpha_access, ref); + ASSERT_EQ(op->last_b_access, ref); + ASSERT_EQ(op->last_beta_access, ref); + ASSERT_EQ(op->last_x_access, ref); +} + + +TEST_F(EnableBatchLinOp, ExtendedApplyNoCopyBackBetweenSameMemory) +{ + op->apply(alpha, b, beta, x); + + ASSERT_EQ(alpha->last_access, ref); + ASSERT_EQ(b->last_access, ref); + ASSERT_EQ(beta->last_access, ref); + ASSERT_EQ(x->last_access, ref); +} + + +template +class DummyBatchLinOpWithFactory + : public gko::batch::EnableBatchLinOp> { +public: + DummyBatchLinOpWithFactory(std::shared_ptr exec) + : gko::batch::EnableBatchLinOp(exec) + {} + + GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) + { + T GKO_FACTORY_PARAMETER_SCALAR(value, T{5}); + }; + GKO_ENABLE_BATCH_LIN_OP_FACTORY(DummyBatchLinOpWithFactory, parameters, + Factory); + GKO_ENABLE_BUILD_METHOD(Factory); + + DummyBatchLinOpWithFactory(const Factory* factory, + std::shared_ptr op) + : gko::batch::EnableBatchLinOp( + factory->get_executor()), + parameters_{factory->get_parameters()}, + op_{op} + {} + + std::shared_ptr op_; + +protected: + void apply_impl(const gko::batch::BatchLinOp* b, + gko::batch::BatchLinOp* x) const override + {} + + void apply_impl(const gko::batch::BatchLinOp* alpha, + const gko::batch::BatchLinOp* b, + const gko::batch::BatchLinOp* beta, + gko::batch::BatchLinOp* x) const override + {} +}; + + +class EnableBatchLinOpFactory : public ::testing::Test { +protected: + EnableBatchLinOpFactory() : ref{gko::ReferenceExecutor::create()} {} + + std::shared_ptr ref; +}; + + +TEST_F(EnableBatchLinOpFactory, CreatesDefaultFactory) +{ + auto factory = DummyBatchLinOpWithFactory<>::build().on(ref); + + ASSERT_EQ(factory->get_parameters().value, 5); + ASSERT_EQ(factory->get_executor(), ref); +} + + +TEST_F(EnableBatchLinOpFactory, CreatesFactoryWithParameters) +{ + auto factory = DummyBatchLinOpWithFactory<>::build().with_value(7).on(ref); + + ASSERT_EQ(factory->get_parameters().value, 7); + ASSERT_EQ(factory->get_executor(), ref); +} + + +TEST_F(EnableBatchLinOpFactory, PassesParametersToBatchLinOp) +{ + auto dummy = gko::share( + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5}))); + auto factory = DummyBatchLinOpWithFactory<>::build().with_value(6).on(ref); + + auto op = factory->generate(dummy); + + ASSERT_EQ(op->get_executor(), ref); + ASSERT_EQ(op->get_parameters().value, 6); + ASSERT_EQ(op->op_.get(), dummy.get()); +} + + +} // namespace diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp new file mode 100644 index 00000000000..68a88027904 --- /dev/null +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -0,0 +1,439 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_ +#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_ + + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { + + +/** + * @addtogroup BatchLinOp + * + * @section batch_linop_concept Batched Linear operator as a concept + * + * A batch linear operator (BatchLinOp) forms the base class for all batched + * linear algebra objects. In general, it follows the same structure as the + * LinOp class, but has some crucial differences which make it not strictly + * representable through or with the LinOp class. + * + * A batched operator is defined as a set of independent linear operators which + * have no communication/information exchange between them. Therefore, any + * collective operations between the batches is not possible and not + * implemented. This allows for each batch to be computed and operated on in an + * embarrasingly parallel fashion. + * + * Similar to the LinOp class, the BatchLinOp also implements + * BatchLinOp::apply() methods which call the internal apply_impl() methods + * which the concrete BatchLinOp's have to implement. + * + * A key difference between the LinOp and the BatchLinOp classes is the storing + * of dimensions. BatchLinOp allows for storing non-equal objects in the + * batches and hence stores a batch_dim object instead of a dim object. The + * batch_dim object is optimized to store less amount of data when storing + * uniform batches. + * + * All size validation functions again verify first that the number of batches + * are conformant and that the dimensions in the corresponding batches + * themselves are also valid/conformant. Here too, optimizations for uniform + * batches have been added. + * + * @ref BatchLinOp + */ +class BatchLinOp : public EnableAbstractPolymorphicObject { +public: + /** + * Applies a batch linear operator to a batch vector (or a sequence of batch + * of vectors). + * + * Performs the operation x = op(b), where op is this batch linear operator. 
+ * + * @param b the input batch vector(s) on which the batch operator is + * applied + * @param x the output batch vector(s) where the result is stored + * + * @return this + */ + BatchLinOp* apply(ptr_param b, ptr_param x) + { + this->template log( + this, b.get(), x.get()); + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + this->template log( + this, b.get(), x.get()); + return this; + } + + /** + * @copydoc apply(const BatchLinOp *, BatchLinOp *) + */ + const BatchLinOp* apply(ptr_param b, + ptr_param x) const + { + this->template log( + this, b.get(), x.get()); + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + this->template log( + this, b.get(), x.get()); + return this; + } + + /** + * Performs the operation x = alpha * op(b) + beta * x. + * + * @param alpha scaling of the result of op(b) + * @param b vector(s) on which the operator is applied + * @param beta scaling of the input x + * @param x output vector(s) + * + * @return this + */ + BatchLinOp* apply(ptr_param alpha, + ptr_param b, + ptr_param beta, ptr_param x) + { + this->template log( + this, alpha.get(), b.get(), beta.get(), x.get()); + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); + this->template log( + this, alpha.get(), b.get(), beta.get(), x.get()); + return this; + } + + /** + * @copydoc apply(const BatchLinOp *, const BatchLinOp *, const BatchLinOp + * *, BatchLinOp *) + */ + const BatchLinOp* apply(ptr_param alpha, + ptr_param b, + ptr_param beta, + ptr_param x) const + { + this->template log( + this, alpha.get(), b.get(), beta.get(), x.get()); + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); + this->template log( + this, alpha.get(), b.get(), beta.get(), x.get()); + return this; + } + + /** + * Returns the number of batches in the batch operator. + * + * @return number of batches in the batch operator + */ + size_type get_num_batch_items() const noexcept + { + return size_.get_num_batch_items(); + } + + /** + * Sets the size of the batch operator. + * + * @param size to be set + */ + void set_size(const batch_dim<2>& size) { size_ = size; } + + /** + * Returns the size of the batch operator. + * + * @return size of the batch operator + */ + const batch_dim<2>& get_size() const noexcept { return size_; } + +protected: + /** + * Creates a batch operator with uniform batches. + * + * @param exec the executor where all the operations are performed + * @param num_batch_items the number of batches to be stored in the + * operator + * @param size the size of on of the operator in the batched operator + */ + explicit BatchLinOp(std::shared_ptr exec, + const size_type num_batch_items = 0, + const dim<2>& common_size = dim<2>{}) + : EnableAbstractPolymorphicObject(exec), + size_{num_batch_items > 0 ? 
batch_dim<2>(num_batch_items, common_size) + : batch_dim<2>{}} + {} + + /** + * Creates a batch operator. + * + * @param exec the executor where all the operations are performed + * @param batch_size the sizes of the batch operator stored as a batch_dim + */ + explicit BatchLinOp(std::shared_ptr exec, + const batch_dim<2>& batch_size) + : EnableAbstractPolymorphicObject(exec), size_{batch_size} + {} + + /** + * Implementers of BatchLinOp should override this function instead + * of apply(const BatchLinOp *, BatchLinOp *). + * + * Performs the operation x = op(b), where op is this linear operator. + * + * @param b the input batch vector(s) on which the operator is applied + * @param x the output batch vector(s) where the result is stored + */ + virtual void apply_impl(const BatchLinOp* b, BatchLinOp* x) const = 0; + + /** + * Implementers of BatchLinOp should override this function instead + * of apply(const BatchLinOp *, const BatchLinOp *, const BatchLinOp *, + * BatchLinOp *). + * + * @param alpha scaling of the result of op(b) + * @param b vector(s) on which the operator is applied + * @param beta scaling of the input x + * @param x output vector(s) + */ + virtual void apply_impl(const BatchLinOp* alpha, const BatchLinOp* b, + const BatchLinOp* beta, BatchLinOp* x) const = 0; + + /** + * Throws a DimensionMismatch exception if the parameters to `apply` are of + * the wrong size. + * + * @param b batch vector(s) on which the operator is applied + * @param x output batch vector(s) + */ + void validate_application_parameters(const BatchLinOp* b, + const BatchLinOp* x) const + { + GKO_ASSERT_BATCH_CONFORMANT(this, b); + GKO_ASSERT_BATCH_EQUAL_ROWS(this, x); + GKO_ASSERT_BATCH_EQUAL_COLS(b, x); + } + + /** + * Throws a DimensionMismatch exception if the parameters to `apply` are of + * the wrong size. + * + * @param alpha scaling of the result of op(b) + * @param b batch vector(s) on which the operator is applied + * @param beta scaling of the input x + * @param x output batch vector(s) + */ + void validate_application_parameters(const BatchLinOp* alpha, + const BatchLinOp* b, + const BatchLinOp* beta, + const BatchLinOp* x) const + { + this->validate_application_parameters(b, x); + GKO_ASSERT_BATCH_EQUAL_ROWS( + alpha, batch_dim<2>(b->get_num_batch_items(), dim<2>(1, 1))); + GKO_ASSERT_BATCH_EQUAL_ROWS( + beta, batch_dim<2>(b->get_num_batch_items(), dim<2>(1, 1))); + } + +private: + batch_dim<2> size_{}; +}; + + +/** + * A BatchLinOpFactory represents a higher order mapping which transforms one + * batch linear operator into another. + * + * In a similar fashion to LinOps, BatchLinOps are also "generated" from the + * BatchLinOpFactory. A function of this class is to provide a generate method, + * which internally cals the generate_impl(), which the concrete BatchLinOps + * have to implement. 
+ * + * Example: using BatchCG in Ginkgo + * --------------------------- + * + * ```c++ + * // Suppose A is a batch matrix, batch_b a batch rhs vector, and batch_x an + * // initial guess + * // Create a BatchCG which runs for at most 1000 iterations, and stops after + * // reducing the residual norm by 6 orders of magnitude + * auto batch_cg_factory = solver::BatchCg<>::build() + * .with_max_iters(1000) + * .with_rel_residual_goal(1e-6) + * .on(cuda); + * // create a batch linear operator which represents the solver + * auto batch_cg = batch_cg_factory->generate(A); + * // solve the system + * batch_cg->apply(gko::lend(batch_b), gko::lend(batch_x)); + * ``` + * + * @ingroup BatchLinOp + */ +class BatchLinOpFactory + : public AbstractFactory> { +public: + using AbstractFactory>::AbstractFactory; + + std::unique_ptr generate( + std::shared_ptr input) const + { + this->template log( + this, input.get()); + const auto exec = this->get_executor(); + std::unique_ptr generated; + if (input->get_executor() == exec) { + generated = this->AbstractFactory::generate(input); + } else { + generated = + this->AbstractFactory::generate(gko::clone(exec, input)); + } + this->template log( + this, input.get(), generated.get()); + return generated; + } +}; + + +/** + * The EnableBatchLinOp mixin can be used to provide sensible default + * implementations of the majority of the BatchLinOp and PolymorphicObject + * interface. + * + * The goal of the mixin is to facilitate the development of new BatchLinOp, by + * enabling the implementers to focus on the important parts of their operator, + * while the library takes care of generating the trivial utility functions. + * The mixin will provide default implementations for the entire + * PolymorphicObject interface, including a default implementation of + * `copy_from` between objects of the new BatchLinOp type. It will also hide the + * default BatchLinOp::apply() methods with versions that preserve the static + * type of the object. + * + * Implementers of new BatchLinOps are required to specify only the following + * aspects: + * + * 1. Creation of the BatchLinOp: This can be facilitated via either + * EnableCreateMethod mixin (used mostly for matrix formats), + * or GKO_ENABLE_BATCH_LIN_OP_FACTORY macro (used for operators created from + * other operators, like preconditioners and solvers). + * 2. Application of the BatchLinOp: Implementers have to override the two + * overloads of the BatchLinOp::apply_impl() virtual methods. 
+ * + * @tparam ConcreteBatchLinOp the concrete BatchLinOp which is being + * implemented [CRTP parameter] + * @tparam PolymorphicBase parent of ConcreteBatchLinOp in the polymorphic + * hierarchy, has to be a subclass of BatchLinOp + * + * @ingroup BatchLinOp + */ +template +class EnableBatchLinOp + : public EnablePolymorphicObject, + public EnablePolymorphicAssignment { +public: + using EnablePolymorphicObject::EnablePolymorphicObject; + + const ConcreteBatchLinOp* apply(ptr_param b, + ptr_param x) const + { + PolymorphicBase::apply(b, x); + return self(); + } + + ConcreteBatchLinOp* apply(ptr_param b, + ptr_param x) + { + PolymorphicBase::apply(b, x); + return self(); + } + + const ConcreteBatchLinOp* apply(ptr_param alpha, + ptr_param b, + ptr_param beta, + ptr_param x) const + { + PolymorphicBase::apply(alpha, b, beta, x); + return self(); + } + + ConcreteBatchLinOp* apply(ptr_param alpha, + ptr_param b, + ptr_param beta, + ptr_param x) + { + PolymorphicBase::apply(alpha, b, beta, x); + return self(); + } + +protected: + GKO_ENABLE_SELF(ConcreteBatchLinOp); +}; + + +} // namespace batch +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_ diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp new file mode 100644 index 00000000000..579411e9af0 --- /dev/null +++ b/include/ginkgo/core/base/batch_lin_op_helpers.hpp @@ -0,0 +1,202 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ +#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ + + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { + + +/** + * This is an alias for the EnableDefaultFactory mixin, which correctly sets the + * template parameters to enable a subclass of BatchLinOpFactory. 
+ * + * @tparam ConcreteFactory the concrete factory which is being implemented + * [CRTP parmeter] + * @tparam ConcreteLinOp the concrete BatchLinOp type which this factory + * produces, needs to have a constructor which takes a const ConcreteFactory *, + * and an std::shared_ptr as parameters. + * @tparam ParametersType a subclass of enable_parameters_type template which + * defines all of the parameters of the factory + * @tparam PolymorphicBase parent of ConcreteFactory in the polymorphic + * hierarchy, has to be a subclass of LinOpFactory + * + * @ingroup BatchLinOp + */ +template +using EnableDefaultBatchLinOpFactory = + EnableDefaultFactory; + + +/** + * This macro will generate a default implementation of a BatchLinOpFactory for + * the BatchLinOp subclass it is defined in. + * + * It is required to first call the macro #GKO_CREATE_FACTORY_PARAMETERS() + * before this one in order to instantiate the parameters type first. + * + * The list of parameters for the factory should be defined in a code block + * after the macro definition, and should contain a list of + * GKO_FACTORY_PARAMETER_* declarations. The class should provide a constructor + * with signature + * _batch_lin_op(const _factory_name *, std::shared_ptr) + * which the factory will use a callback to construct the object. + * + * A minimal example of a batch linear operator is the following: + * + * ```c++ + * struct MyBatchLinOp : public EnableBatchLinOp { + * GKO_ENABLE_BATCH_LIN_OP_FACTORY(MyBatchLinOp, my_parameters, Factory) { + * // a factory parameter named "my_value", of type int and default + * // value of 5 + * int GKO_FACTORY_PARAMETER_SCALAR(my_value, 5); + * // a factory parameter named `my_pair` of type `std::pair` + * // and default value {5, 5} + * std::pair GKO_FACTORY_PARAMETER_VECTOR(my_pair, 5, 5); + * }; + * // constructor needed by EnableBatchLinOp + * explicit MyBatchLinOp(std::shared_ptr exec) { + * : EnableBatchLinOp(exec) {} + * // constructor needed by the factory + * explicit MyBatchLinOp(const Factory *factory, + * std::shared_ptr matrix) + * : EnableBatchLinOp(factory->get_executor()), + * matrix->get_size()), + * // store factory's parameters locally + * my_parameters_{factory->get_parameters()} + * { + * int value = my_parameters_.my_value; + * // do something with value + * } + * ``` + * + * MyBatchLinOp can then be created as follows: + * + * ```c++ + * auto exec = gko::ReferenceExecutor::create(); + * // create a factory with default `my_value` parameter + * auto fact = MyBatchLinOp::build().on(exec); + * // create a operator using the factory: + * auto my_op = fact->generate(gko::matrix::BatchIdentity::create(exec, 2)); + * std::cout << my_op->get_my_parameters().my_value; // prints 5 + * + * // create a factory with custom `my_value` parameter + * auto fact = MyLinOp::build().with_my_value(0).on(exec); + * // create a operator using the factory: + * auto my_op = fact->generate(gko::matrix::BatchIdentity::create(exec, 2)); + * std::cout << my_op->get_my_parameters().my_value; // prints 0 + * ``` + * + * @note It is possible to combine both the #GKO_CREATE_FACTORY_PARAMETER_*() + * macros with this one in a unique macro for class __templates__ (not with + * regular classes). Splitting this into two distinct macros allows to use them + * in all contexts. See for more + * details. 
+ * + * @param _lin_op concrete operator for which the factory is to be created + * [CRTP parameter] + * @param _parameters_name name of the parameters member in the class + * (its type is `<_parameters_name>_type`, the + * protected member's name is `<_parameters_name>_`, + * and the public getter's name is + * `get_<_parameters_name>()`) + * @param _factory_name name of the generated factory type + * + * @ingroup BatchLinOp + */ +#define GKO_ENABLE_BATCH_LIN_OP_FACTORY(_batch_lin_op, _parameters_name, \ + _factory_name) \ +public: \ + const _parameters_name##_type& get_##_parameters_name() const \ + { \ + return _parameters_name##_; \ + } \ + \ + class _factory_name \ + : public ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type> { \ + friend class ::gko::EnablePolymorphicObject< \ + _factory_name, ::gko::batch::BatchLinOpFactory>; \ + friend class ::gko::enable_parameters_type<_parameters_name##_type, \ + _factory_name>; \ + explicit _factory_name(std::shared_ptr exec) \ + : ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>( \ + std::move(exec)) \ + {} \ + explicit _factory_name(std::shared_ptr exec, \ + const _parameters_name##_type& parameters) \ + : ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>( \ + std::move(exec), parameters) \ + {} \ + }; \ + friend ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>; \ + \ + \ +private: \ + _parameters_name##_type _parameters_name##_; \ + \ +public: \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + +} // namespace batch +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index 4a12865f374..a45e19612b2 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -147,6 +148,22 @@ inline dim<2> get_size(const T& op) inline dim<2> get_size(const dim<2>& size) { return size; } +template +inline batch_dim<2> get_batch_size(const T& op) +{ + return op->get_size(); +} + +inline batch_dim<2> get_batch_size(const batch_dim<2>& size) { return size; } + + +template +inline size_type get_num_batch_items(const T& obj) +{ + return obj.get_num_batch_items(); +} + + } // namespace detail @@ -298,6 +315,168 @@ inline dim<2> get_size(const dim<2>& size) { return size; } } +/** + * Asserts that _op1 can be applied to _op2. + * + * @throw DimensionMismatch if _op1 cannot be applied to _op2. 
+ */ +#define GKO_ASSERT_BATCH_CONFORMANT(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + auto equal_inner_size = \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1] == \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0]; \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } else if (!equal_inner_size) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching inner dimensions among all batch items"); \ + } \ + } + + +/** + * Asserts that _op1 can be applied to _op2 from the right. + * + * @throw DimensionMismatch if _op1 cannot be applied to _op2 from the right. + */ +#define GKO_ASSERT_BATCH_REVERSE_CONFORMANT(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + auto equal_outer_size = \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0] == \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1]; \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } else if (!equal_outer_size) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching outer dimensions among all batch items"); \ + } \ + } + + +/** + * Asserts that `_op1` and `_op2` have the same number of rows. 
+ * + * @throw DimensionMismatch if `_op1` and `_op2` differ in the number of rows + */ +#define GKO_ASSERT_BATCH_EQUAL_ROWS(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + auto equal_rows = \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0] == \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0]; \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } else if (!equal_rows) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching number of rows among all batch items"); \ + } \ + } + + +/** + * Asserts that `_op1` and `_op2` have the same number of columns. + * + * @throw DimensionMismatch if `_op1` and `_op2` differ in the number of + * columns + */ +#define GKO_ASSERT_BATCH_EQUAL_COLS(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + auto equal_cols = \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1] == \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1]; \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } else if (!equal_cols) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching number of cols among all batch items"); \ + } \ + } + + +/** + * Asserts that `_op1` and `_op2` have the same number of rows and columns. 
+ * + * @throw DimensionMismatch if `_op1` and `_op2` differ in the number of + * rows or columns + */ +#define GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + auto equal_size = \ + ::gko::detail::get_batch_size(_op1).get_common_size() == \ + ::gko::detail::get_batch_size(_op2).get_common_size(); \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } else if (!equal_size) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching size among all batch items"); \ + } \ + } + + /** * Instantiates a MpiError. * diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index b700e1e703a..a6fade087b0 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -58,6 +58,13 @@ class PolymorphicObject; class Operation; class stopping_status; + +namespace batch { +class BatchLinOp; +class BatchLinOpFactory; +} // namespace batch + + /** * @brief The Stopping criterion namespace. * @ref stop @@ -448,9 +455,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [[deprecated( - "Please use the version with the additional stopping " - "information.")]] virtual void + [ + [deprecated("Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x = nullptr, const LinOp* tau = nullptr) const @@ -469,9 +476,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [[deprecated( - "Please use the version with the additional stopping " - "information.")]] virtual void + [ + [deprecated("Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x, const LinOp* tau, const LinOp* implicit_tau_sq) const @@ -563,6 +570,86 @@ public: \ const PolymorphicObject* input, const PolymorphicObject* output) + /** + * BatchLinOp's apply started event. + * + * @param A the system matrix + * @param b the input vector(s) + * @param x the output vector(s) + */ + GKO_LOGGER_REGISTER_EVENT(24, batch_linop_apply_started, + const batch::BatchLinOp* A, + const batch::BatchLinOp* b, + const batch::BatchLinOp* x) + + /** + * BatchLinOp's apply completed event. + * + * @param A the system matrix + * @param b the input vector(s) + * @param x the output vector(s) + */ + GKO_LOGGER_REGISTER_EVENT(25, batch_linop_apply_completed, + const batch::BatchLinOp* A, + const batch::BatchLinOp* b, + const batch::BatchLinOp* x) + + /** + * BatchLinOp's advanced apply started event. 
+ * + * @param A the system matrix + * @param alpha scaling of the result of op(b) + * @param b the input vector(s) + * @param beta scaling of the input x + * @param x the output vector(s) + */ + GKO_LOGGER_REGISTER_EVENT(26, batch_linop_advanced_apply_started, + const batch::BatchLinOp* A, + const batch::BatchLinOp* alpha, + const batch::BatchLinOp* b, + const batch::BatchLinOp* beta, + const batch::BatchLinOp* x) + + /** + * BatchLinOp's advanced apply completed event. + * + * @param A the system matrix + * @param alpha scaling of the result of op(b) + * @param b the input vector(s) + * @param beta scaling of the input x + * @param x the output vector(s) + */ + GKO_LOGGER_REGISTER_EVENT(27, batch_linop_advanced_apply_completed, + const batch::BatchLinOp* A, + const batch::BatchLinOp* alpha, + const batch::BatchLinOp* b, + const batch::BatchLinOp* beta, + const batch::BatchLinOp* x) + + /** + * BatchLinOp Factory's generate started event. + * + * @param factory the factory used + * @param input the BatchLinOp object used as input for the generation + * (usually a system matrix) + */ + GKO_LOGGER_REGISTER_EVENT(28, batch_linop_factory_generate_started, + const batch::BatchLinOpFactory* factory, + const batch::BatchLinOp* input) + + /** + * BatchLinOp Factory's generate completed event. + * + * @param factory the factory used + * @param input the BatchLinOp object used as input for the generation + * (usually a system matrix) + * @param output the generated BatchLinOp object + */ + GKO_LOGGER_REGISTER_EVENT(29, batch_linop_factory_generate_completed, + const batch::BatchLinOpFactory* factory, + const batch::BatchLinOp* input, + const batch::BatchLinOp* output) + #undef GKO_LOGGER_REGISTER_EVENT /** @@ -605,6 +692,21 @@ public: \ linop_factory_generate_started_mask | linop_factory_generate_completed_mask; + /** + * Bitset Mask which activates all batch linop events + */ + static constexpr mask_type batch_linop_events_mask = + batch_linop_apply_started_mask | batch_linop_apply_completed_mask | + batch_linop_advanced_apply_started_mask | + batch_linop_advanced_apply_completed_mask; + + /** + * Bitset Mask which activates all batch linop factory events + */ + static constexpr mask_type batch_linop_factory_events_mask = + batch_linop_factory_generate_started_mask | + batch_linop_factory_generate_completed_mask; + /** * Bitset Mask which activates all criterion events */ @@ -772,8 +874,8 @@ class EnableLogging : public PolymorphicBase { template struct propagate_log_helper< Event, ConcreteLoggableT, - xstd::void_t< - decltype(std::declval().get_executor())>> { + xstd::void_t().get_executor())>> { template static void propagate_log(const ConcreteLoggableT* loggable, Args&&... args) diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index bcdaa5d2d20..186a5fce061 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -40,6 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include #include @@ -54,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include From 0da356008556c8afd3f20e20dc641d67a9ba9695 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 27 Sep 2023 11:22:07 +0200 Subject: [PATCH 330/583] Review updates. 
Co-authored-by: Marcel Koch --- core/test/base/batch_lin_op.cpp | 72 ++---------- include/ginkgo/core/base/batch_lin_op.hpp | 14 +-- .../ginkgo/core/base/exception_helpers.hpp | 108 +++++++----------- 3 files changed, 58 insertions(+), 136 deletions(-) diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp index 1fe1765987f..b656c2bf3fb 100644 --- a/core/test/base/batch_lin_op.cpp +++ b/core/test/base/batch_lin_op.cpp @@ -57,23 +57,13 @@ class DummyBatchLinOp : public gko::batch::EnableBatchLinOp, : gko::batch::EnableBatchLinOp(exec, size) {} - void access() const { last_access = this->get_executor(); } - - mutable std::shared_ptr last_access; - mutable std::shared_ptr last_b_access; - mutable std::shared_ptr last_x_access; - mutable std::shared_ptr last_alpha_access; - mutable std::shared_ptr last_beta_access; + int called = 0; protected: void apply_impl(const gko::batch::BatchLinOp* b, gko::batch::BatchLinOp* x) const override { - this->access(); - static_cast(b)->access(); - static_cast(x)->access(); - last_b_access = b->get_executor(); - last_x_access = x->get_executor(); + this->called = 1; } void apply_impl(const gko::batch::BatchLinOp* alpha, @@ -81,15 +71,7 @@ class DummyBatchLinOp : public gko::batch::EnableBatchLinOp, const gko::batch::BatchLinOp* beta, gko::batch::BatchLinOp* x) const override { - this->access(); - static_cast(alpha)->access(); - static_cast(b)->access(); - static_cast(beta)->access(); - static_cast(x)->access(); - last_alpha_access = alpha->get_executor(); - last_b_access = b->get_executor(); - last_beta_access = beta->get_executor(); - last_x_access = x->get_executor(); + this->called = 2; } }; @@ -156,7 +138,7 @@ TEST_F(EnableBatchLinOp, CallsApplyImpl) { op->apply(b, x); - ASSERT_EQ(op->last_access, ref2); + ASSERT_EQ(op->called, 1); } @@ -164,7 +146,7 @@ TEST_F(EnableBatchLinOp, CallsApplyImplForBatch) { op2->apply(b2, x2); - ASSERT_EQ(op2->last_access, ref2); + ASSERT_EQ(op2->called, 1); } @@ -172,7 +154,7 @@ TEST_F(EnableBatchLinOp, CallsExtendedApplyImpl) { op->apply(alpha, b, beta, x); - ASSERT_EQ(op->last_access, ref2); + ASSERT_EQ(op->called, 2); } @@ -180,7 +162,7 @@ TEST_F(EnableBatchLinOp, CallsExtendedApplyImplBatch) { op2->apply(alpha2, b2, beta2, x2); - ASSERT_EQ(op2->last_access, ref2); + ASSERT_EQ(op2->called, 2); } @@ -283,46 +265,6 @@ TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongBetaDimension) } -TEST_F(EnableBatchLinOp, ApplyDoesNotCopyBetweenSameMemory) -{ - op->apply(b, x); - - ASSERT_EQ(op->last_b_access, ref); - ASSERT_EQ(op->last_x_access, ref); -} - - -TEST_F(EnableBatchLinOp, ApplyNoCopyBackBetweenSameMemory) -{ - op->apply(b, x); - - ASSERT_EQ(b->last_access, ref); - ASSERT_EQ(x->last_access, ref); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyDoesNotCopyBetweenSameMemory) -{ - op->apply(alpha, b, beta, x); - - ASSERT_EQ(op->last_alpha_access, ref); - ASSERT_EQ(op->last_b_access, ref); - ASSERT_EQ(op->last_beta_access, ref); - ASSERT_EQ(op->last_x_access, ref); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyNoCopyBackBetweenSameMemory) -{ - op->apply(alpha, b, beta, x); - - ASSERT_EQ(alpha->last_access, ref); - ASSERT_EQ(b->last_access, ref); - ASSERT_EQ(beta->last_access, ref); - ASSERT_EQ(x->last_access, ref); -} - - template class DummyBatchLinOpWithFactory : public gko::batch::EnableBatchLinOp> { diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 68a88027904..a04ae3e79ce 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ 
b/include/ginkgo/core/base/batch_lin_op.hpp @@ -194,13 +194,6 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { return size_.get_num_batch_items(); } - /** - * Sets the size of the batch operator. - * - * @param size to be set - */ - void set_size(const batch_dim<2>& size) { size_ = size; } - /** * Returns the size of the batch operator. * @@ -209,6 +202,13 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { const batch_dim<2>& get_size() const noexcept { return size_; } protected: + /** + * Sets the size of the batch operator. + * + * @param size to be set + */ + void set_size(const batch_dim<2>& size) { size_ = size; } + /** * Creates a batch operator with uniform batches. * diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index a45e19612b2..cb5a8b10263 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -315,6 +315,26 @@ inline size_type get_num_batch_items(const T& obj) } +/** + * Asserts that _op1 and _op2 have equal number of items in the batch + * + * @throw ValueMismatch if _op1 and _op2 do not have equal number of items + */ +#define GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2) \ + { \ + auto equal_num_items = \ + ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + if (!equal_num_items) { \ + throw ::gko::ValueMismatch( \ + __FILE__, __LINE__, __func__, \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ + "expected equal number of batch items"); \ + } \ + } + + /** * Asserts that _op1 can be applied to _op2. * @@ -322,19 +342,11 @@ inline size_type get_num_batch_items(const T& obj) */ #define GKO_ASSERT_BATCH_CONFORMANT(_op1, _op2) \ { \ - auto equal_num_items = \ - ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2); \ auto equal_inner_size = \ ::gko::detail::get_batch_size(_op1).get_common_size()[1] == \ ::gko::detail::get_batch_size(_op2).get_common_size()[0]; \ - if (!equal_num_items) { \ - throw ::gko::ValueMismatch( \ - __FILE__, __LINE__, __func__, \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - "expected equal number of batch items"); \ - } else if (!equal_inner_size) { \ + if (!equal_inner_size) { \ throw ::gko::DimensionMismatch( \ __FILE__, __LINE__, __func__, #_op1, \ ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ @@ -354,19 +366,11 @@ inline size_type get_num_batch_items(const T& obj) */ #define GKO_ASSERT_BATCH_REVERSE_CONFORMANT(_op1, _op2) \ { \ - auto equal_num_items = \ - ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2); \ auto equal_outer_size = \ ::gko::detail::get_batch_size(_op1).get_common_size()[0] == \ ::gko::detail::get_batch_size(_op2).get_common_size()[1]; \ - if (!equal_num_items) { \ - throw ::gko::ValueMismatch( \ - __FILE__, __LINE__, __func__, \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - "expected equal number of batch items"); \ - } else if (!equal_outer_size) { \ + if (!equal_outer_size) { \ throw ::gko::DimensionMismatch( \ __FILE__, 
__LINE__, __func__, #_op1, \ ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ @@ -386,19 +390,11 @@ inline size_type get_num_batch_items(const T& obj) */ #define GKO_ASSERT_BATCH_EQUAL_ROWS(_op1, _op2) \ { \ - auto equal_num_items = \ - ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2); \ auto equal_rows = \ ::gko::detail::get_batch_size(_op1).get_common_size()[0] == \ ::gko::detail::get_batch_size(_op2).get_common_size()[0]; \ - if (!equal_num_items) { \ - throw ::gko::ValueMismatch( \ - __FILE__, __LINE__, __func__, \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - "expected equal number of batch items"); \ - } else if (!equal_rows) { \ + if (!equal_rows) { \ throw ::gko::DimensionMismatch( \ __FILE__, __LINE__, __func__, #_op1, \ ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ @@ -419,19 +415,11 @@ inline size_type get_num_batch_items(const T& obj) */ #define GKO_ASSERT_BATCH_EQUAL_COLS(_op1, _op2) \ { \ - auto equal_num_items = \ - ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ + GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2); \ auto equal_cols = \ ::gko::detail::get_batch_size(_op1).get_common_size()[1] == \ ::gko::detail::get_batch_size(_op2).get_common_size()[1]; \ - if (!equal_num_items) { \ - throw ::gko::ValueMismatch( \ - __FILE__, __LINE__, __func__, \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - "expected equal number of batch items"); \ - } else if (!equal_cols) { \ + if (!equal_cols) { \ throw ::gko::DimensionMismatch( \ __FILE__, __LINE__, __func__, #_op1, \ ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ @@ -450,30 +438,22 @@ inline size_type get_num_batch_items(const T& obj) * @throw DimensionMismatch if `_op1` and `_op2` differ in the number of * rows or columns */ -#define GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(_op1, _op2) \ - { \ - auto equal_num_items = \ - ::gko::detail::get_batch_size(_op1).get_num_batch_items() == \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(); \ - auto equal_size = \ - ::gko::detail::get_batch_size(_op1).get_common_size() == \ - ::gko::detail::get_batch_size(_op2).get_common_size(); \ - if (!equal_num_items) { \ - throw ::gko::ValueMismatch( \ - __FILE__, __LINE__, __func__, \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \ - "expected equal number of batch items"); \ - } else if (!equal_size) { \ - throw ::gko::DimensionMismatch( \ - __FILE__, __LINE__, __func__, #_op1, \ - ::gko::detail::get_batch_size(_op1).get_common_size()[0], \ - ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ - #_op2, \ - ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ - ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ - "expected matching size among all batch items"); \ - } \ +#define GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(_op1, _op2) \ + { \ + GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2); \ + auto equal_size = \ + ::gko::detail::get_batch_size(_op1).get_common_size() == \ + ::gko::detail::get_batch_size(_op2).get_common_size(); \ + if (!equal_size) { \ + throw ::gko::DimensionMismatch( \ + __FILE__, __LINE__, __func__, #_op1, \ + 
::gko::detail::get_batch_size(_op1).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op1).get_common_size()[1], \ + #_op2, \ + ::gko::detail::get_batch_size(_op2).get_common_size()[0], \ + ::gko::detail::get_batch_size(_op2).get_common_size()[1], \ + "expected matching size among all batch items"); \ + } \ } From c339832ffb4ef1170879fb80648e355e839871e1 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 1 Oct 2023 14:50:07 +0200 Subject: [PATCH 331/583] Remove apply functionality from BatchLinOp --- core/test/base/batch_lin_op.cpp | 185 +--------------------- include/ginkgo/core/base/batch_lin_op.hpp | 185 ---------------------- 2 files changed, 1 insertion(+), 369 deletions(-) diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp index b656c2bf3fb..2e0bf0fae0e 100644 --- a/core/test/base/batch_lin_op.cpp +++ b/core/test/base/batch_lin_op.cpp @@ -56,23 +56,6 @@ class DummyBatchLinOp : public gko::batch::EnableBatchLinOp, gko::batch_dim<2> size = gko::batch_dim<2>{}) : gko::batch::EnableBatchLinOp(exec, size) {} - - int called = 0; - -protected: - void apply_impl(const gko::batch::BatchLinOp* b, - gko::batch::BatchLinOp* x) const override - { - this->called = 1; - } - - void apply_impl(const gko::batch::BatchLinOp* alpha, - const gko::batch::BatchLinOp* b, - const gko::batch::BatchLinOp* beta, - gko::batch::BatchLinOp* x) const override - { - this->called = 2; - } }; @@ -84,37 +67,13 @@ class EnableBatchLinOp : public ::testing::Test { op{DummyBatchLinOp::create(ref2, gko::batch_dim<2>(1, gko::dim<2>{3, 5}))}, op2{DummyBatchLinOp::create(ref2, - gko::batch_dim<2>(2, gko::dim<2>{3, 5}))}, - alpha{DummyBatchLinOp::create( - ref, gko::batch_dim<2>(1, gko::dim<2>{1, 1}))}, - alpha2{DummyBatchLinOp::create( - ref, gko::batch_dim<2>(2, gko::dim<2>{1, 1}))}, - beta{DummyBatchLinOp::create( - ref, gko::batch_dim<2>(1, gko::dim<2>{1, 1}))}, - beta2{DummyBatchLinOp::create( - ref, gko::batch_dim<2>(2, gko::dim<2>{1, 1}))}, - b{DummyBatchLinOp::create(ref, - gko::batch_dim<2>(1, gko::dim<2>{5, 4}))}, - b2{DummyBatchLinOp::create(ref, - gko::batch_dim<2>(2, gko::dim<2>{5, 4}))}, - x{DummyBatchLinOp::create(ref, - gko::batch_dim<2>(1, gko::dim<2>{3, 4}))}, - x2{DummyBatchLinOp::create(ref, - gko::batch_dim<2>(2, gko::dim<2>{3, 4}))} + gko::batch_dim<2>(2, gko::dim<2>{3, 5}))} {} std::shared_ptr ref; std::shared_ptr ref2; std::unique_ptr op; std::unique_ptr op2; - std::unique_ptr alpha; - std::unique_ptr alpha2; - std::unique_ptr beta; - std::unique_ptr beta2; - std::unique_ptr b; - std::unique_ptr b2; - std::unique_ptr x; - std::unique_ptr x2; }; @@ -134,137 +93,6 @@ TEST_F(EnableBatchLinOp, KnowsItsSizes) } -TEST_F(EnableBatchLinOp, CallsApplyImpl) -{ - op->apply(b, x); - - ASSERT_EQ(op->called, 1); -} - - -TEST_F(EnableBatchLinOp, CallsApplyImplForBatch) -{ - op2->apply(b2, x2); - - ASSERT_EQ(op2->called, 1); -} - - -TEST_F(EnableBatchLinOp, CallsExtendedApplyImpl) -{ - op->apply(alpha, b, beta, x); - - ASSERT_EQ(op->called, 2); -} - - -TEST_F(EnableBatchLinOp, CallsExtendedApplyImplBatch) -{ - op2->apply(alpha2, b2, beta2, x2); - - ASSERT_EQ(op2->called, 2); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnWrongBatchSize) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); - - ASSERT_THROW(op->apply(wrong, x), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnWrongNumBatchItems) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); - - ASSERT_THROW(op2->apply(wrong, x2), 
gko::ValueMismatch); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnWrongSolutionRows) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{5, 4})); - - ASSERT_THROW(op->apply(b, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnOneBatchItemWrongSolutionRows) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(2, gko::dim<2>{5, 4})); - - ASSERT_THROW(op2->apply(b2, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnWrongSolutionColumns) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})); - - ASSERT_THROW(op->apply(b, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ApplyFailsOnOneBatchItemWrongSolutionColumn) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(2, gko::dim<2>{3, 5})); - - ASSERT_THROW(op2->apply(b2, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongBatchSize) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 4})); - - ASSERT_THROW(op->apply(alpha, wrong, beta, x), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongSolutionRows) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{5, 4})); - - ASSERT_THROW(op->apply(alpha, b, beta, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongSolutionColumns) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})); - - ASSERT_THROW(op->apply(alpha, b, beta, wrong), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongAlphaDimension) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{2, 5})); - - ASSERT_THROW(op->apply(wrong, b, beta, x), gko::DimensionMismatch); -} - - -TEST_F(EnableBatchLinOp, ExtendedApplyFailsOnWrongBetaDimension) -{ - auto wrong = - DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{2, 5})); - - ASSERT_THROW(op->apply(alpha, b, wrong, x), gko::DimensionMismatch); -} - - template class DummyBatchLinOpWithFactory : public gko::batch::EnableBatchLinOp> { @@ -290,17 +118,6 @@ class DummyBatchLinOpWithFactory {} std::shared_ptr op_; - -protected: - void apply_impl(const gko::batch::BatchLinOp* b, - gko::batch::BatchLinOp* x) const override - {} - - void apply_impl(const gko::batch::BatchLinOp* alpha, - const gko::batch::BatchLinOp* b, - const gko::batch::BatchLinOp* beta, - gko::batch::BatchLinOp* x) const override - {} }; diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index a04ae3e79ce..ac632c715e8 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -91,99 +91,6 @@ namespace batch { */ class BatchLinOp : public EnableAbstractPolymorphicObject { public: - /** - * Applies a batch linear operator to a batch vector (or a sequence of batch - * of vectors). - * - * Performs the operation x = op(b), where op is this batch linear operator. 
- * - * @param b the input batch vector(s) on which the batch operator is - * applied - * @param x the output batch vector(s) where the result is stored - * - * @return this - */ - BatchLinOp* apply(ptr_param b, ptr_param x) - { - this->template log( - this, b.get(), x.get()); - this->validate_application_parameters(b.get(), x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, x).get()); - this->template log( - this, b.get(), x.get()); - return this; - } - - /** - * @copydoc apply(const BatchLinOp *, BatchLinOp *) - */ - const BatchLinOp* apply(ptr_param b, - ptr_param x) const - { - this->template log( - this, b.get(), x.get()); - this->validate_application_parameters(b.get(), x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, x).get()); - this->template log( - this, b.get(), x.get()); - return this; - } - - /** - * Performs the operation x = alpha * op(b) + beta * x. - * - * @param alpha scaling of the result of op(b) - * @param b vector(s) on which the operator is applied - * @param beta scaling of the input x - * @param x output vector(s) - * - * @return this - */ - BatchLinOp* apply(ptr_param alpha, - ptr_param b, - ptr_param beta, ptr_param x) - { - this->template log( - this, alpha.get(), b.get(), beta.get(), x.get()); - this->validate_application_parameters(alpha.get(), b.get(), beta.get(), - x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, beta).get(), - make_temporary_clone(exec, x).get()); - this->template log( - this, alpha.get(), b.get(), beta.get(), x.get()); - return this; - } - - /** - * @copydoc apply(const BatchLinOp *, const BatchLinOp *, const BatchLinOp - * *, BatchLinOp *) - */ - const BatchLinOp* apply(ptr_param alpha, - ptr_param b, - ptr_param beta, - ptr_param x) const - { - this->template log( - this, alpha.get(), b.get(), beta.get(), x.get()); - this->validate_application_parameters(alpha.get(), b.get(), beta.get(), - x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, beta).get(), - make_temporary_clone(exec, x).get()); - this->template log( - this, alpha.get(), b.get(), beta.get(), x.get()); - return this; - } - /** * Returns the number of batches in the batch operator. * @@ -236,66 +143,6 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { : EnableAbstractPolymorphicObject(exec), size_{batch_size} {} - /** - * Implementers of BatchLinOp should override this function instead - * of apply(const BatchLinOp *, BatchLinOp *). - * - * Performs the operation x = op(b), where op is this linear operator. - * - * @param b the input batch vector(s) on which the operator is applied - * @param x the output batch vector(s) where the result is stored - */ - virtual void apply_impl(const BatchLinOp* b, BatchLinOp* x) const = 0; - - /** - * Implementers of BatchLinOp should override this function instead - * of apply(const BatchLinOp *, const BatchLinOp *, const BatchLinOp *, - * BatchLinOp *). 
- * - * @param alpha scaling of the result of op(b) - * @param b vector(s) on which the operator is applied - * @param beta scaling of the input x - * @param x output vector(s) - */ - virtual void apply_impl(const BatchLinOp* alpha, const BatchLinOp* b, - const BatchLinOp* beta, BatchLinOp* x) const = 0; - - /** - * Throws a DimensionMismatch exception if the parameters to `apply` are of - * the wrong size. - * - * @param b batch vector(s) on which the operator is applied - * @param x output batch vector(s) - */ - void validate_application_parameters(const BatchLinOp* b, - const BatchLinOp* x) const - { - GKO_ASSERT_BATCH_CONFORMANT(this, b); - GKO_ASSERT_BATCH_EQUAL_ROWS(this, x); - GKO_ASSERT_BATCH_EQUAL_COLS(b, x); - } - - /** - * Throws a DimensionMismatch exception if the parameters to `apply` are of - * the wrong size. - * - * @param alpha scaling of the result of op(b) - * @param b batch vector(s) on which the operator is applied - * @param beta scaling of the input x - * @param x output batch vector(s) - */ - void validate_application_parameters(const BatchLinOp* alpha, - const BatchLinOp* b, - const BatchLinOp* beta, - const BatchLinOp* x) const - { - this->validate_application_parameters(b, x); - GKO_ASSERT_BATCH_EQUAL_ROWS( - alpha, batch_dim<2>(b->get_num_batch_items(), dim<2>(1, 1))); - GKO_ASSERT_BATCH_EQUAL_ROWS( - beta, batch_dim<2>(b->get_num_batch_items(), dim<2>(1, 1))); - } - private: batch_dim<2> size_{}; }; @@ -395,38 +242,6 @@ class EnableBatchLinOp using EnablePolymorphicObject::EnablePolymorphicObject; - const ConcreteBatchLinOp* apply(ptr_param b, - ptr_param x) const - { - PolymorphicBase::apply(b, x); - return self(); - } - - ConcreteBatchLinOp* apply(ptr_param b, - ptr_param x) - { - PolymorphicBase::apply(b, x); - return self(); - } - - const ConcreteBatchLinOp* apply(ptr_param alpha, - ptr_param b, - ptr_param beta, - ptr_param x) const - { - PolymorphicBase::apply(alpha, b, beta, x); - return self(); - } - - ConcreteBatchLinOp* apply(ptr_param alpha, - ptr_param b, - ptr_param beta, - ptr_param x) - { - PolymorphicBase::apply(alpha, b, beta, x); - return self(); - } - protected: GKO_ENABLE_SELF(ConcreteBatchLinOp); }; From e1be54f118120313a5597b85047792edf94b9fa3 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 09:38:11 +0200 Subject: [PATCH 332/583] Update docs --- include/ginkgo/core/base/batch_lin_op.hpp | 45 +++++++------------ .../ginkgo/core/base/batch_lin_op_helpers.hpp | 6 +-- 2 files changed, 19 insertions(+), 32 deletions(-) diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index ac632c715e8..54a1ead1a3d 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -70,31 +70,22 @@ namespace batch { * have no communication/information exchange between them. Therefore, any * collective operations between the batches is not possible and not * implemented. This allows for each batch to be computed and operated on in an - * embarrasingly parallel fashion. + * embarrassingly parallel fashion. * - * Similar to the LinOp class, the BatchLinOp also implements - * BatchLinOp::apply() methods which call the internal apply_impl() methods - * which the concrete BatchLinOp's have to implement. - * - * A key difference between the LinOp and the BatchLinOp classes is the storing - * of dimensions. BatchLinOp allows for storing non-equal objects in the - * batches and hence stores a batch_dim object instead of a dim object. 
The - * batch_dim object is optimized to store less amount of data when storing - * uniform batches. - * - * All size validation functions again verify first that the number of batches - * are conformant and that the dimensions in the corresponding batches - * themselves are also valid/conformant. Here too, optimizations for uniform - * batches have been added. + * A key difference between the LinOp and the BatchLinOp class is that the apply + * between BatchLinOps is no longer supported. The user can apply a BatchLinOp + * to a batch::MultiVector but not to any general BatchLinOp. Therefore, the + * BatchLinOp serves only as a base class providing necessary core functionality + * from Polymorphic object and store the dimensions of the batched object. * * @ref BatchLinOp */ class BatchLinOp : public EnableAbstractPolymorphicObject { public: /** - * Returns the number of batches in the batch operator. + * Returns the number of items in the batch operator. * - * @return number of batches in the batch operator + * @return number of items in the batch operator */ size_type get_num_batch_items() const noexcept { @@ -104,7 +95,7 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { /** * Returns the size of the batch operator. * - * @return size of the batch operator + * @return size of the batch operator, a batch_dim object */ const batch_dim<2>& get_size() const noexcept { return size_; } @@ -117,12 +108,12 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { void set_size(const batch_dim<2>& size) { size_ = size; } /** - * Creates a batch operator with uniform batches. + * Creates a batch operator storing items of uniform sizes. * * @param exec the executor where all the operations are performed - * @param num_batch_items the number of batches to be stored in the + * @param num_batch_items the number of batch items to be stored in the * operator - * @param size the size of on of the operator in the batched operator + * @param size the common size of the items in the batched operator */ explicit BatchLinOp(std::shared_ptr exec, const size_type num_batch_items = 0, @@ -133,10 +124,10 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { {} /** - * Creates a batch operator. + * Creates a batch operator storing items of uniform sizes. * - * @param exec the executor where all the operations are performed - * @param batch_size the sizes of the batch operator stored as a batch_dim + * @param exec the executor where all the operations are performed + * @param batch_size the size the batched operator, as a batch_dim object */ explicit BatchLinOp(std::shared_ptr exec, const batch_dim<2>& batch_size) @@ -213,9 +204,7 @@ class BatchLinOpFactory * while the library takes care of generating the trivial utility functions. * The mixin will provide default implementations for the entire * PolymorphicObject interface, including a default implementation of - * `copy_from` between objects of the new BatchLinOp type. It will also hide the - * default BatchLinOp::apply() methods with versions that preserve the static - * type of the object. + * `copy_from` between objects of the new BatchLinOp type. * * Implementers of new BatchLinOps are required to specify only the following * aspects: @@ -224,8 +213,6 @@ class BatchLinOpFactory * EnableCreateMethod mixin (used mostly for matrix formats), * or GKO_ENABLE_BATCH_LIN_OP_FACTORY macro (used for operators created from * other operators, like preconditioners and solvers). - * 2. 
Application of the BatchLinOp: Implementers have to override the two - * overloads of the BatchLinOp::apply_impl() virtual methods. * * @tparam ConcreteBatchLinOp the concrete BatchLinOp which is being * implemented [CRTP parameter] diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp index 579411e9af0..7b479192a6b 100644 --- a/include/ginkgo/core/base/batch_lin_op_helpers.hpp +++ b/include/ginkgo/core/base/batch_lin_op_helpers.hpp @@ -61,7 +61,7 @@ namespace batch { * template parameters to enable a subclass of BatchLinOpFactory. * * @tparam ConcreteFactory the concrete factory which is being implemented - * [CRTP parmeter] + * [CRTP parameter] * @tparam ConcreteLinOp the concrete BatchLinOp type which this factory * produces, needs to have a constructor which takes a const ConcreteFactory *, * and an std::shared_ptr as parameters. @@ -128,13 +128,13 @@ using EnableDefaultBatchLinOpFactory = * // create a factory with default `my_value` parameter * auto fact = MyBatchLinOp::build().on(exec); * // create a operator using the factory: - * auto my_op = fact->generate(gko::matrix::BatchIdentity::create(exec, 2)); + * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); * std::cout << my_op->get_my_parameters().my_value; // prints 5 * * // create a factory with custom `my_value` parameter * auto fact = MyLinOp::build().with_my_value(0).on(exec); * // create a operator using the factory: - * auto my_op = fact->generate(gko::matrix::BatchIdentity::create(exec, 2)); + * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); * std::cout << my_op->get_my_parameters().my_value; // prints 0 * ``` * From 18d597ea3f969cd2cc7ef4711058a24a65620594 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 09:43:12 +0200 Subject: [PATCH 333/583] Remove BatchLinOp apply log events --- include/ginkgo/core/log/logger.hpp | 68 +----------------------------- 1 file changed, 2 insertions(+), 66 deletions(-) diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index a6fade087b0..a1607723a75 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -570,62 +570,6 @@ public: \ const PolymorphicObject* input, const PolymorphicObject* output) - /** - * BatchLinOp's apply started event. - * - * @param A the system matrix - * @param b the input vector(s) - * @param x the output vector(s) - */ - GKO_LOGGER_REGISTER_EVENT(24, batch_linop_apply_started, - const batch::BatchLinOp* A, - const batch::BatchLinOp* b, - const batch::BatchLinOp* x) - - /** - * BatchLinOp's apply completed event. - * - * @param A the system matrix - * @param b the input vector(s) - * @param x the output vector(s) - */ - GKO_LOGGER_REGISTER_EVENT(25, batch_linop_apply_completed, - const batch::BatchLinOp* A, - const batch::BatchLinOp* b, - const batch::BatchLinOp* x) - - /** - * BatchLinOp's advanced apply started event. - * - * @param A the system matrix - * @param alpha scaling of the result of op(b) - * @param b the input vector(s) - * @param beta scaling of the input x - * @param x the output vector(s) - */ - GKO_LOGGER_REGISTER_EVENT(26, batch_linop_advanced_apply_started, - const batch::BatchLinOp* A, - const batch::BatchLinOp* alpha, - const batch::BatchLinOp* b, - const batch::BatchLinOp* beta, - const batch::BatchLinOp* x) - - /** - * BatchLinOp's advanced apply completed event. 
- * - * @param A the system matrix - * @param alpha scaling of the result of op(b) - * @param b the input vector(s) - * @param beta scaling of the input x - * @param x the output vector(s) - */ - GKO_LOGGER_REGISTER_EVENT(27, batch_linop_advanced_apply_completed, - const batch::BatchLinOp* A, - const batch::BatchLinOp* alpha, - const batch::BatchLinOp* b, - const batch::BatchLinOp* beta, - const batch::BatchLinOp* x) - /** * BatchLinOp Factory's generate started event. * @@ -633,7 +577,7 @@ public: \ * @param input the BatchLinOp object used as input for the generation * (usually a system matrix) */ - GKO_LOGGER_REGISTER_EVENT(28, batch_linop_factory_generate_started, + GKO_LOGGER_REGISTER_EVENT(24, batch_linop_factory_generate_started, const batch::BatchLinOpFactory* factory, const batch::BatchLinOp* input) @@ -645,7 +589,7 @@ public: \ * (usually a system matrix) * @param output the generated BatchLinOp object */ - GKO_LOGGER_REGISTER_EVENT(29, batch_linop_factory_generate_completed, + GKO_LOGGER_REGISTER_EVENT(25, batch_linop_factory_generate_completed, const batch::BatchLinOpFactory* factory, const batch::BatchLinOp* input, const batch::BatchLinOp* output) @@ -692,14 +636,6 @@ public: \ linop_factory_generate_started_mask | linop_factory_generate_completed_mask; - /** - * Bitset Mask which activates all batch linop events - */ - static constexpr mask_type batch_linop_events_mask = - batch_linop_apply_started_mask | batch_linop_apply_completed_mask | - batch_linop_advanced_apply_started_mask | - batch_linop_advanced_apply_completed_mask; - /** * Bitset Mask which activates all batch linop factory events */ From 8913fc1aad786ec736b282d834306a77c502299a Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Fri, 6 Oct 2023 07:48:31 +0000 Subject: [PATCH 334/583] Format files Co-authored-by: Pratik Nayak --- include/ginkgo/core/log/logger.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index a1607723a75..a10782c0102 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -455,9 +455,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [ - [deprecated("Please use the version with the additional stopping " - "information.")]] virtual void + [[deprecated( + "Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x = nullptr, const LinOp* tau = nullptr) const @@ -476,9 +476,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [ - [deprecated("Please use the version with the additional stopping " - "information.")]] virtual void + [[deprecated( + "Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x, const LinOp* tau, const LinOp* implicit_tau_sq) const @@ -810,8 +810,8 @@ class EnableLogging : public PolymorphicBase { template struct propagate_log_helper< Event, ConcreteLoggableT, - xstd::void_t().get_executor())>> { + xstd::void_t< + decltype(std::declval().get_executor())>> { template static void propagate_log(const ConcreteLoggableT* loggable, Args&&... 
args) From f5d5e2857406c7a28bc4aa2352fe82ff295fe175 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 09:47:15 +0200 Subject: [PATCH 335/583] add common size getter --- include/ginkgo/core/base/batch_lin_op.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 54a1ead1a3d..dd33e63bbd1 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -92,6 +92,13 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { return size_.get_num_batch_items(); } + /** + * Returns the common size of the batch items. + * + * @return the common size stored + */ + dim<2> get_common_size() const { return size_.get_common_size(); } + /** * Returns the size of the batch operator. * From 143aabebb4e2226355ab707fb7b13fef73344e6c Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 9 Oct 2023 23:46:13 +0200 Subject: [PATCH 336/583] Move lin_op_helpers back to lin_op --- include/ginkgo/core/base/batch_lin_op.hpp | 140 +++++++++++- .../ginkgo/core/base/batch_lin_op_helpers.hpp | 202 ------------------ include/ginkgo/ginkgo.hpp | 2 - 3 files changed, 139 insertions(+), 205 deletions(-) delete mode 100644 include/ginkgo/core/base/batch_lin_op_helpers.hpp diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index dd33e63bbd1..31984997b2c 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -40,7 +40,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -241,6 +240,145 @@ class EnableBatchLinOp }; +/** + * This is an alias for the EnableDefaultFactory mixin, which correctly sets the + * template parameters to enable a subclass of BatchLinOpFactory. + * + * @tparam ConcreteFactory the concrete factory which is being implemented + * [CRTP parameter] + * @tparam ConcreteLinOp the concrete BatchLinOp type which this factory + * produces, needs to have a constructor which takes a const ConcreteFactory *, + * and an std::shared_ptr as parameters. + * @tparam ParametersType a subclass of enable_parameters_type template which + * defines all of the parameters of the factory + * @tparam PolymorphicBase parent of ConcreteFactory in the polymorphic + * hierarchy, has to be a subclass of LinOpFactory + * + * @ingroup BatchLinOp + */ +template +using EnableDefaultBatchLinOpFactory = + EnableDefaultFactory; + + +/** + * This macro will generate a default implementation of a BatchLinOpFactory for + * the BatchLinOp subclass it is defined in. + * + * It is required to first call the macro #GKO_CREATE_FACTORY_PARAMETERS() + * before this one in order to instantiate the parameters type first. + * + * The list of parameters for the factory should be defined in a code block + * after the macro definition, and should contain a list of + * GKO_FACTORY_PARAMETER_* declarations. The class should provide a constructor + * with signature + * _batch_lin_op(const _factory_name *, std::shared_ptr) + * which the factory will use a callback to construct the object. 
+ * + * A minimal example of a batch linear operator is the following: + * + * ```c++ + * struct MyBatchLinOp : public EnableBatchLinOp { + * GKO_ENABLE_BATCH_LIN_OP_FACTORY(MyBatchLinOp, my_parameters, Factory) { + * // a factory parameter named "my_value", of type int and default + * // value of 5 + * int GKO_FACTORY_PARAMETER_SCALAR(my_value, 5); + * // a factory parameter named `my_pair` of type `std::pair` + * // and default value {5, 5} + * std::pair GKO_FACTORY_PARAMETER_VECTOR(my_pair, 5, 5); + * }; + * // constructor needed by EnableBatchLinOp + * explicit MyBatchLinOp(std::shared_ptr exec) { + * : EnableBatchLinOp(exec) {} + * // constructor needed by the factory + * explicit MyBatchLinOp(const Factory *factory, + * std::shared_ptr matrix) + * : EnableBatchLinOp(factory->get_executor()), + * matrix->get_size()), + * // store factory's parameters locally + * my_parameters_{factory->get_parameters()} + * { + * int value = my_parameters_.my_value; + * // do something with value + * } + * ``` + * + * MyBatchLinOp can then be created as follows: + * + * ```c++ + * auto exec = gko::ReferenceExecutor::create(); + * // create a factory with default `my_value` parameter + * auto fact = MyBatchLinOp::build().on(exec); + * // create a operator using the factory: + * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); + * std::cout << my_op->get_my_parameters().my_value; // prints 5 + * + * // create a factory with custom `my_value` parameter + * auto fact = MyLinOp::build().with_my_value(0).on(exec); + * // create a operator using the factory: + * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); + * std::cout << my_op->get_my_parameters().my_value; // prints 0 + * ``` + * + * @note It is possible to combine both the #GKO_CREATE_FACTORY_PARAMETER_*() + * macros with this one in a unique macro for class __templates__ (not with + * regular classes). Splitting this into two distinct macros allows to use them + * in all contexts. See for more + * details. 
+ * + * @param _lin_op concrete operator for which the factory is to be created + * [CRTP parameter] + * @param _parameters_name name of the parameters member in the class + * (its type is `<_parameters_name>_type`, the + * protected member's name is `<_parameters_name>_`, + * and the public getter's name is + * `get_<_parameters_name>()`) + * @param _factory_name name of the generated factory type + * + * @ingroup BatchLinOp + */ +#define GKO_ENABLE_BATCH_LIN_OP_FACTORY(_batch_lin_op, _parameters_name, \ + _factory_name) \ +public: \ + const _parameters_name##_type& get_##_parameters_name() const \ + { \ + return _parameters_name##_; \ + } \ + \ + class _factory_name \ + : public ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type> { \ + friend class ::gko::EnablePolymorphicObject< \ + _factory_name, ::gko::batch::BatchLinOpFactory>; \ + friend class ::gko::enable_parameters_type<_parameters_name##_type, \ + _factory_name>; \ + explicit _factory_name(std::shared_ptr exec) \ + : ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>( \ + std::move(exec)) \ + {} \ + explicit _factory_name(std::shared_ptr exec, \ + const _parameters_name##_type& parameters) \ + : ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>( \ + std::move(exec), parameters) \ + {} \ + }; \ + friend ::gko::batch::EnableDefaultBatchLinOpFactory< \ + _factory_name, _batch_lin_op, _parameters_name##_type>; \ + \ + \ +private: \ + _parameters_name##_type _parameters_name##_; \ + \ +public: \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + } // namespace batch } // namespace gko diff --git a/include/ginkgo/core/base/batch_lin_op_helpers.hpp b/include/ginkgo/core/base/batch_lin_op_helpers.hpp deleted file mode 100644 index 7b479192a6b..00000000000 --- a/include/ginkgo/core/base/batch_lin_op_helpers.hpp +++ /dev/null @@ -1,202 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ -#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ - - -#include -#include -#include - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace gko { -namespace batch { - - -/** - * This is an alias for the EnableDefaultFactory mixin, which correctly sets the - * template parameters to enable a subclass of BatchLinOpFactory. - * - * @tparam ConcreteFactory the concrete factory which is being implemented - * [CRTP parameter] - * @tparam ConcreteLinOp the concrete BatchLinOp type which this factory - * produces, needs to have a constructor which takes a const ConcreteFactory *, - * and an std::shared_ptr as parameters. - * @tparam ParametersType a subclass of enable_parameters_type template which - * defines all of the parameters of the factory - * @tparam PolymorphicBase parent of ConcreteFactory in the polymorphic - * hierarchy, has to be a subclass of LinOpFactory - * - * @ingroup BatchLinOp - */ -template -using EnableDefaultBatchLinOpFactory = - EnableDefaultFactory; - - -/** - * This macro will generate a default implementation of a BatchLinOpFactory for - * the BatchLinOp subclass it is defined in. - * - * It is required to first call the macro #GKO_CREATE_FACTORY_PARAMETERS() - * before this one in order to instantiate the parameters type first. - * - * The list of parameters for the factory should be defined in a code block - * after the macro definition, and should contain a list of - * GKO_FACTORY_PARAMETER_* declarations. The class should provide a constructor - * with signature - * _batch_lin_op(const _factory_name *, std::shared_ptr) - * which the factory will use a callback to construct the object. 
- * - * A minimal example of a batch linear operator is the following: - * - * ```c++ - * struct MyBatchLinOp : public EnableBatchLinOp { - * GKO_ENABLE_BATCH_LIN_OP_FACTORY(MyBatchLinOp, my_parameters, Factory) { - * // a factory parameter named "my_value", of type int and default - * // value of 5 - * int GKO_FACTORY_PARAMETER_SCALAR(my_value, 5); - * // a factory parameter named `my_pair` of type `std::pair` - * // and default value {5, 5} - * std::pair GKO_FACTORY_PARAMETER_VECTOR(my_pair, 5, 5); - * }; - * // constructor needed by EnableBatchLinOp - * explicit MyBatchLinOp(std::shared_ptr exec) { - * : EnableBatchLinOp(exec) {} - * // constructor needed by the factory - * explicit MyBatchLinOp(const Factory *factory, - * std::shared_ptr matrix) - * : EnableBatchLinOp(factory->get_executor()), - * matrix->get_size()), - * // store factory's parameters locally - * my_parameters_{factory->get_parameters()} - * { - * int value = my_parameters_.my_value; - * // do something with value - * } - * ``` - * - * MyBatchLinOp can then be created as follows: - * - * ```c++ - * auto exec = gko::ReferenceExecutor::create(); - * // create a factory with default `my_value` parameter - * auto fact = MyBatchLinOp::build().on(exec); - * // create a operator using the factory: - * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); - * std::cout << my_op->get_my_parameters().my_value; // prints 5 - * - * // create a factory with custom `my_value` parameter - * auto fact = MyLinOp::build().with_my_value(0).on(exec); - * // create a operator using the factory: - * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2)); - * std::cout << my_op->get_my_parameters().my_value; // prints 0 - * ``` - * - * @note It is possible to combine both the #GKO_CREATE_FACTORY_PARAMETER_*() - * macros with this one in a unique macro for class __templates__ (not with - * regular classes). Splitting this into two distinct macros allows to use them - * in all contexts. See for more - * details. 
- * - * @param _lin_op concrete operator for which the factory is to be created - * [CRTP parameter] - * @param _parameters_name name of the parameters member in the class - * (its type is `<_parameters_name>_type`, the - * protected member's name is `<_parameters_name>_`, - * and the public getter's name is - * `get_<_parameters_name>()`) - * @param _factory_name name of the generated factory type - * - * @ingroup BatchLinOp - */ -#define GKO_ENABLE_BATCH_LIN_OP_FACTORY(_batch_lin_op, _parameters_name, \ - _factory_name) \ -public: \ - const _parameters_name##_type& get_##_parameters_name() const \ - { \ - return _parameters_name##_; \ - } \ - \ - class _factory_name \ - : public ::gko::batch::EnableDefaultBatchLinOpFactory< \ - _factory_name, _batch_lin_op, _parameters_name##_type> { \ - friend class ::gko::EnablePolymorphicObject< \ - _factory_name, ::gko::batch::BatchLinOpFactory>; \ - friend class ::gko::enable_parameters_type<_parameters_name##_type, \ - _factory_name>; \ - explicit _factory_name(std::shared_ptr exec) \ - : ::gko::batch::EnableDefaultBatchLinOpFactory< \ - _factory_name, _batch_lin_op, _parameters_name##_type>( \ - std::move(exec)) \ - {} \ - explicit _factory_name(std::shared_ptr exec, \ - const _parameters_name##_type& parameters) \ - : ::gko::batch::EnableDefaultBatchLinOpFactory< \ - _factory_name, _batch_lin_op, _parameters_name##_type>( \ - std::move(exec), parameters) \ - {} \ - }; \ - friend ::gko::batch::EnableDefaultBatchLinOpFactory< \ - _factory_name, _batch_lin_op, _parameters_name##_type>; \ - \ - \ -private: \ - _parameters_name##_type _parameters_name##_; \ - \ -public: \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - - -} // namespace batch -} // namespace gko - - -#endif // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HELPERS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 186a5fce061..aed3b5f3572 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -41,7 +41,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -56,7 +55,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include From d332a0e98715a8e7381a7a80c284720ec1a7757e Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 10:22:21 +0200 Subject: [PATCH 337/583] Review updates Co-authored-by: Tobias Ribizel Co-authored-by: Yu-Hsiang Tasi --- core/test/base/batch_lin_op.cpp | 91 ++++++++++++++++++++--- include/ginkgo/core/base/batch_lin_op.hpp | 48 ++++++------ include/ginkgo/core/log/logger.hpp | 20 +++-- 3 files changed, 117 insertions(+), 42 deletions(-) diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp index 2e0bf0fae0e..61dcf89f109 100644 --- a/core/test/base/batch_lin_op.cpp +++ b/core/test/base/batch_lin_op.cpp @@ -44,11 +44,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include namespace { +struct DummyLogger : gko::log::Logger { + DummyLogger() + : gko::log::Logger(gko::log::Logger::batch_linop_factory_events_mask) + {} + + void on_batch_linop_factory_generate_started( + const gko::batch::BatchLinOpFactory*, + const gko::batch::BatchLinOp*) const override + { + batch_linop_factory_generate_started++; + } + + void on_batch_linop_factory_generate_completed( + const gko::batch::BatchLinOpFactory*, const gko::batch::BatchLinOp*, + const gko::batch::BatchLinOp*) const override + { + batch_linop_factory_generate_completed++; + } + + int mutable batch_linop_factory_generate_started = 0; + int mutable batch_linop_factory_generate_completed = 0; +}; + + class DummyBatchLinOp : public gko::batch::EnableBatchLinOp, public gko::EnableCreateMethod { public: @@ -63,33 +88,25 @@ class EnableBatchLinOp : public ::testing::Test { protected: EnableBatchLinOp() : ref{gko::ReferenceExecutor::create()}, - ref2{gko::ReferenceExecutor::create()}, - op{DummyBatchLinOp::create(ref2, - gko::batch_dim<2>(1, gko::dim<2>{3, 5}))}, - op2{DummyBatchLinOp::create(ref2, - gko::batch_dim<2>(2, gko::dim<2>{3, 5}))} + op{DummyBatchLinOp::create(ref, + gko::batch_dim<2>(1, gko::dim<2>{3, 5}))} {} std::shared_ptr ref; - std::shared_ptr ref2; std::unique_ptr op; - std::unique_ptr op2; }; TEST_F(EnableBatchLinOp, KnowsNumBatchItems) { ASSERT_EQ(op->get_num_batch_items(), 1); - ASSERT_EQ(op2->get_num_batch_items(), 2); } TEST_F(EnableBatchLinOp, KnowsItsSizes) { auto op1_sizes = gko::batch_dim<2>(1, gko::dim<2>{3, 5}); - auto op2_sizes = gko::batch_dim<2>(2, gko::dim<2>{3, 5}); ASSERT_EQ(op->get_size(), op1_sizes); - ASSERT_EQ(op2->get_size(), op2_sizes); } @@ -123,9 +140,14 @@ class DummyBatchLinOpWithFactory class EnableBatchLinOpFactory : public ::testing::Test { protected: - EnableBatchLinOpFactory() : ref{gko::ReferenceExecutor::create()} {} + EnableBatchLinOpFactory() + : ref{gko::ReferenceExecutor::create()}, + logger{std::make_shared()} + + {} std::shared_ptr ref; + std::shared_ptr logger; }; @@ -161,4 +183,51 @@ TEST_F(EnableBatchLinOpFactory, PassesParametersToBatchLinOp) } +TEST_F(EnableBatchLinOpFactory, FactoryGenerateIsLogged) +{ + auto before_logger = *logger; + auto factory = DummyBatchLinOpWithFactory<>::build().on(ref); + factory->add_logger(logger); + factory->generate( + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5}))); + + ASSERT_EQ(logger->batch_linop_factory_generate_started, + before_logger.batch_linop_factory_generate_started + 1); + ASSERT_EQ(logger->batch_linop_factory_generate_completed, + before_logger.batch_linop_factory_generate_completed + 1); +} + + +TEST_F(EnableBatchLinOpFactory, WithLoggersWorksAndPropagates) +{ + auto before_logger = *logger; + auto factory = + DummyBatchLinOpWithFactory<>::build().with_loggers(logger).on(ref); + auto op = factory->generate( + DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5}))); + + ASSERT_EQ(logger->batch_linop_factory_generate_started, + before_logger.batch_linop_factory_generate_started + 1); + ASSERT_EQ(logger->batch_linop_factory_generate_completed, + before_logger.batch_linop_factory_generate_completed + 1); +} + + +TEST_F(EnableBatchLinOpFactory, CopiesLinOpToOtherExecutor) +{ + auto ref2 = gko::ReferenceExecutor::create(); + auto dummy = gko::share( + DummyBatchLinOp::create(ref2, gko::batch_dim<2>(1, gko::dim<2>{3, 5}))); + auto factory = DummyBatchLinOpWithFactory<>::build().with_value(6).on(ref); + + auto op = factory->generate(dummy); + + 
ASSERT_EQ(op->get_executor(), ref); + ASSERT_EQ(op->get_parameters().value, 6); + ASSERT_EQ(op->op_->get_executor(), ref); + ASSERT_NE(op->op_.get(), dummy.get()); + ASSERT_TRUE(dynamic_cast(op->op_.get())); +} + + } // namespace diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 31984997b2c..2e507d99a45 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -73,9 +73,13 @@ namespace batch { * * A key difference between the LinOp and the BatchLinOp class is that the apply * between BatchLinOps is no longer supported. The user can apply a BatchLinOp - * to a batch::MultiVector but not to any general BatchLinOp. Therefore, the - * BatchLinOp serves only as a base class providing necessary core functionality - * from Polymorphic object and store the dimensions of the batched object. + * to a batch::MultiVector but not to any general BatchLinOp. This apply to a + * batch::MultiVector is handled by the concrete LinOp and may be moved to the + * base BatchLinOp class in the future. + * + * Therefore, the BatchLinOp serves only as a base class providing necessary + * core functionality from Polymorphic object and store the dimensions of the + * batched object. * * @ref BatchLinOp */ @@ -84,24 +88,24 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { /** * Returns the number of items in the batch operator. * - * @return number of items in the batch operator + * @return number of items in the batch operator */ size_type get_num_batch_items() const noexcept { - return size_.get_num_batch_items(); + return get_size().get_num_batch_items(); } /** * Returns the common size of the batch items. * - * @return the common size stored + * @return the common size stored */ - dim<2> get_common_size() const { return size_.get_common_size(); } + dim<2> get_common_size() const { return get_size().get_common_size(); } /** * Returns the size of the batch operator. * - * @return size of the batch operator, a batch_dim object + * @return size of the batch operator, a batch_dim object */ const batch_dim<2>& get_size() const noexcept { return size_; } @@ -117,27 +121,28 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { * Creates a batch operator storing items of uniform sizes. * * @param exec the executor where all the operations are performed - * @param num_batch_items the number of batch items to be stored in the - * operator - * @param size the common size of the items in the batched operator + * @param batch_size the size the batched operator, as a batch_dim object */ explicit BatchLinOp(std::shared_ptr exec, - const size_type num_batch_items = 0, - const dim<2>& common_size = dim<2>{}) - : EnableAbstractPolymorphicObject(exec), - size_{num_batch_items > 0 ? batch_dim<2>(num_batch_items, common_size) - : batch_dim<2>{}} + const batch_dim<2>& batch_size) + : EnableAbstractPolymorphicObject(exec), size_{batch_size} {} /** * Creates a batch operator storing items of uniform sizes. 
* * @param exec the executor where all the operations are performed - * @param batch_size the size the batched operator, as a batch_dim object + * @param num_batch_items the number of batch items to be stored in the + * operator + * @param size the common size of the items in the batched operator */ explicit BatchLinOp(std::shared_ptr exec, - const batch_dim<2>& batch_size) - : EnableAbstractPolymorphicObject(exec), size_{batch_size} + const size_type num_batch_items = 0, + const dim<2>& common_size = dim<2>{}) + : BatchLinOp{std::move(exec), + num_batch_items > 0 + ? batch_dim<2>(num_batch_items, common_size) + : batch_dim<2>{}} {} private: @@ -158,7 +163,7 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { * --------------------------- * * ```c++ - * // Suppose A is a batch matrix, batch_b a batch rhs vector, and batch_x an + * // Suppose A is a batch matrix, batch_b, a batch rhs vector, and batch_x, an * // initial guess * // Create a BatchCG which runs for at most 1000 iterations, and stops after * // reducing the residual norm by 6 orders of magnitude @@ -234,9 +239,6 @@ class EnableBatchLinOp public: using EnablePolymorphicObject::EnablePolymorphicObject; - -protected: - GKO_ENABLE_SELF(ConcreteBatchLinOp); }; diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index a10782c0102..bef0a44c227 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -60,8 +60,12 @@ class stopping_status; namespace batch { + + class BatchLinOp; class BatchLinOpFactory; + + } // namespace batch @@ -455,9 +459,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [[deprecated( - "Please use the version with the additional stopping " - "information.")]] virtual void + [ + [deprecated("Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x = nullptr, const LinOp* tau = nullptr) const @@ -476,9 +480,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [[deprecated( - "Please use the version with the additional stopping " - "information.")]] virtual void + [ + [deprecated("Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x, const LinOp* tau, const LinOp* implicit_tau_sq) const @@ -810,8 +814,8 @@ class EnableLogging : public PolymorphicBase { template struct propagate_log_helper< Event, ConcreteLoggableT, - xstd::void_t< - decltype(std::declval().get_executor())>> { + xstd::void_t().get_executor())>> { template static void propagate_log(const ConcreteLoggableT* loggable, Args&&... 
args) From f11e389b3e2cc453628b9824af06faed7f63eed6 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Tue, 10 Oct 2023 08:30:45 +0000 Subject: [PATCH 338/583] Format files Co-authored-by: Pratik Nayak --- include/ginkgo/core/log/logger.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index bef0a44c227..47c03b3c572 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -459,9 +459,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [ - [deprecated("Please use the version with the additional stopping " - "information.")]] virtual void + [[deprecated( + "Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x = nullptr, const LinOp* tau = nullptr) const @@ -480,9 +480,9 @@ public: \ * @warning This on_iteration_complete function that this macro declares is * deprecated. Please use the version with the stopping information. */ - [ - [deprecated("Please use the version with the additional stopping " - "information.")]] virtual void + [[deprecated( + "Please use the version with the additional stopping " + "information.")]] virtual void on_iteration_complete(const LinOp* solver, const size_type& it, const LinOp* r, const LinOp* x, const LinOp* tau, const LinOp* implicit_tau_sq) const @@ -814,8 +814,8 @@ class EnableLogging : public PolymorphicBase { template struct propagate_log_helper< Event, ConcreteLoggableT, - xstd::void_t().get_executor())>> { + xstd::void_t< + decltype(std::declval().get_executor())>> { template static void propagate_log(const ConcreteLoggableT* loggable, Args&&... args) From fb75ac39d55a0fe1818621e2390625756a0924c7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 11:08:56 +0200 Subject: [PATCH 339/583] Add batch_linop_fac mask to logger. 
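
This adds the out-of-class definition of the new mask, matching the other
event masks already defined in logger.cpp. For illustration only (a sketch,
not part of this patch; the BatchFactoryCounter name is hypothetical), a
logger restricted to batch factory events can be written the same way as the
DummyLogger used in the batch_lin_op tests earlier in this series:

    #include <ginkgo/ginkgo.hpp>

    // Counts batch factory generate events; only the events selected by
    // batch_linop_factory_events_mask are forwarded to this logger.
    struct BatchFactoryCounter : gko::log::Logger {
        BatchFactoryCounter()
            : gko::log::Logger(
                  gko::log::Logger::batch_linop_factory_events_mask)
        {}

        void on_batch_linop_factory_generate_started(
            const gko::batch::BatchLinOpFactory*,
            const gko::batch::BatchLinOp*) const override
        {
            started++;
        }

        void on_batch_linop_factory_generate_completed(
            const gko::batch::BatchLinOpFactory*,
            const gko::batch::BatchLinOp*,
            const gko::batch::BatchLinOp*) const override
        {
            completed++;
        }

        int mutable started = 0;
        int mutable completed = 0;
    };

    // usage: factory->add_logger(std::make_shared<BatchFactoryCounter>());
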
--- core/log/logger.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/log/logger.cpp b/core/log/logger.cpp index 81f75842474..4b21bfe9b74 100644 --- a/core/log/logger.cpp +++ b/core/log/logger.cpp @@ -43,6 +43,7 @@ constexpr Logger::mask_type Logger::operation_events_mask; constexpr Logger::mask_type Logger::polymorphic_object_events_mask; constexpr Logger::mask_type Logger::linop_events_mask; constexpr Logger::mask_type Logger::linop_factory_events_mask; +constexpr Logger::mask_type Logger::batch_linop_factory_events_mask; constexpr Logger::mask_type Logger::criterion_events_mask; constexpr Logger::mask_type Logger::allocation_started_mask; From 0bd2b24406a7b3c8882c9f0dc50bb73f2b787903 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 11:50:39 +0200 Subject: [PATCH 340/583] Review updates Co-authored-by: Terry Cojean --- include/ginkgo/core/base/batch_lin_op.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 2e507d99a45..320c935a54f 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -248,13 +248,13 @@ class EnableBatchLinOp * * @tparam ConcreteFactory the concrete factory which is being implemented * [CRTP parameter] - * @tparam ConcreteLinOp the concrete BatchLinOp type which this factory + * @tparam ConcreteBatchLinOp the concrete BatchLinOp type which this factory * produces, needs to have a constructor which takes a const ConcreteFactory *, * and an std::shared_ptr as parameters. * @tparam ParametersType a subclass of enable_parameters_type template which * defines all of the parameters of the factory * @tparam PolymorphicBase parent of ConcreteFactory in the polymorphic - * hierarchy, has to be a subclass of LinOpFactory + * hierarchy, has to be a subclass of BatchLinOpFactory * * @ingroup BatchLinOp */ @@ -330,8 +330,8 @@ using EnableDefaultBatchLinOpFactory = * in all contexts. See for more * details. * - * @param _lin_op concrete operator for which the factory is to be created - * [CRTP parameter] + * @param _batch_lin_op concrete operator for which the factory is to be + * created [CRTP parameter] * @param _parameters_name name of the parameters member in the class * (its type is `<_parameters_name>_type`, the * protected member's name is `<_parameters_name>_`, From b13e0df89fb8d740a894950bca63b0a898f024f7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 16:48:19 +0200 Subject: [PATCH 341/583] Doc clarifications --- include/ginkgo/core/base/batch_lin_op.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 320c935a54f..78ce4f4a942 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -73,14 +73,15 @@ namespace batch { * * A key difference between the LinOp and the BatchLinOp class is that the apply * between BatchLinOps is no longer supported. The user can apply a BatchLinOp - * to a batch::MultiVector but not to any general BatchLinOp. This apply to a - * batch::MultiVector is handled by the concrete LinOp and may be moved to the - * base BatchLinOp class in the future. + * to a batch::MultiVector but not to any general BatchLinOp. * * Therefore, the BatchLinOp serves only as a base class providing necessary * core functionality from Polymorphic object and store the dimensions of the * batched object. 
* + * @note Apply to batch::MultiVector objects are handled by the concrete LinOp + * and may be moved to the base BatchLinOp class in the future. + * * @ref BatchLinOp */ class BatchLinOp : public EnableAbstractPolymorphicObject { From ade5bb372bc2725d8c7fef1aefa5d8de9ec8c323 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 1 Oct 2023 14:19:44 +0200 Subject: [PATCH 342/583] Add batch dense base class, core and kernels Co-authored-by: Aditya Kashi --- core/CMakeLists.txt | 1 + core/base/batch_struct.hpp | 76 +++ core/device_hooks/common_kernels.inc.cpp | 10 + core/matrix/batch_dense.cpp | 203 ++++++++ core/matrix/batch_dense_kernels.hpp | 81 +++ core/test/matrix/batch_dense.cpp | 520 +++++++++++++++++++ cuda/CMakeLists.txt | 1 + cuda/matrix/batch_dense_kernels.cu | 90 ++++ dpcpp/CMakeLists.txt | 1 + dpcpp/matrix/batch_dense_kernels.dp.cpp | 83 +++ hip/CMakeLists.txt | 1 + hip/matrix/batch_dense_kernels.hip.cpp | 94 ++++ include/ginkgo/core/matrix/batch_dense.hpp | 341 ++++++++++++ omp/CMakeLists.txt | 1 + omp/matrix/batch_dense_kernels.cpp | 129 +++++ reference/CMakeLists.txt | 1 + reference/base/batch_struct.hpp | 28 + reference/matrix/batch_dense_kernels.cpp | 128 +++++ reference/matrix/batch_dense_kernels.hpp.inc | 88 ++++ 19 files changed, 1877 insertions(+) create mode 100644 core/matrix/batch_dense.cpp create mode 100644 core/matrix/batch_dense_kernels.hpp create mode 100644 core/test/matrix/batch_dense.cpp create mode 100644 cuda/matrix/batch_dense_kernels.cu create mode 100644 dpcpp/matrix/batch_dense_kernels.dp.cpp create mode 100644 hip/matrix/batch_dense_kernels.hip.cpp create mode 100644 include/ginkgo/core/matrix/batch_dense.hpp create mode 100644 omp/matrix/batch_dense_kernels.cpp create mode 100644 reference/matrix/batch_dense_kernels.cpp create mode 100644 reference/matrix/batch_dense_kernels.hpp.inc diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 7932976d6c9..46ea67abc65 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(ginkgo log/vtune.cpp log/record.cpp log/stream.cpp + matrix/batch_dense.cpp matrix/coo.cpp matrix/csr.cpp matrix/dense.cpp diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index caca4577cf7..21bd5b0e8ea 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -81,6 +81,46 @@ struct uniform_batch { } // namespace multi_vector +namespace batch_dense { + + +/** + * Encapsulates one matrix from a batch of multi-vectors. + */ +template +struct batch_item { + using value_type = ValueType; + ValueType* values; + int stride; + int num_rows; + int num_rhs; +}; + + +/** + * A 'simple' structure to store a global uniform batch of multi-vectors. 
+ */ +template +struct uniform_batch { + using value_type = ValueType; + using entry_type = batch_item; + + ValueType* values; + size_type num_batch_items; + int stride; + int num_rows; + int num_rhs; + + size_type get_entry_storage() const + { + return num_rows * stride * sizeof(value_type); + } +}; + + +} // namespace batch_dense + + template GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item to_const( const multi_vector::batch_item& b) @@ -97,6 +137,22 @@ GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch to_const( } +template +GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item +to_const(const matrix::batch_dense::batch_item& b) +{ + return {b.values, b.stride, b.num_rows, b.num_rhs}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::uniform_batch +to_const(const matrix::batch_dense::uniform_batch& ub) +{ + return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; +} + + /** * Extract one object (matrix, vector etc.) from a batch of objects * @@ -126,6 +182,26 @@ extract_batch_item(ValueType* const batch_values, const int stride, } +template +GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item +extract_batch_item(const matrix::batch_dense::uniform_batch& batch, + const size_type batch_idx) +{ + return {batch.values + batch_idx * batch.stride * batch.num_rows, + batch.stride, batch.num_rows, batch.num_rhs}; +} + +template +GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item +extract_batch_item(ValueType* const batch_values, const int stride, + const int num_rows, const int num_rhs, + const size_type batch_idx) +{ + return {batch_values + batch_idx * stride * num_rows, stride, num_rows, + num_rhs}; +} + + } // namespace batch } // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index c8bbd2e0a31..c22f5cd968d 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -299,6 +299,16 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); } // namespace batch_multi_vector +namespace batch_dense { + + +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense + + namespace dense { diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp new file mode 100644 index 00000000000..e6dedcf11fd --- /dev/null +++ b/core/matrix/batch_dense.cpp @@ -0,0 +1,203 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_dense_kernels.hpp" + + +namespace gko { +namespace batch { +namespace matrix { +namespace dense { + + +GKO_REGISTER_OPERATION(simple_apply, batch_dense::simple_apply); +GKO_REGISTER_OPERATION(advanced_apply, batch_dense::advanced_apply); + + +} // namespace dense + + +namespace detail { + + +template +batch_dim<2> compute_batch_size( + const std::vector*>& matrices) +{ + auto common_size = matrices[0]->get_size(); + for (size_type i = 1; i < matrices.size(); ++i) { + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); + } + return batch_dim<2>{matrices.size(), common_size}; +} + + +} // namespace detail + + +template +std::unique_ptr> +BatchDense::create_view_for_item(size_type item_id) +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create( + exec, this->get_common_size(), + make_array_view(exec, num_rows * stride, + this->get_values_for_item(item_id)), + stride); + return mat; +} + + +template +std::unique_ptr> +BatchDense::create_const_view_for_item(size_type item_id) const +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create_const( + exec, this->get_common_size(), + make_const_array_view(exec, num_rows * stride, + this->get_const_values_for_item(item_id)), + stride); + return mat; +} + + +template +std::unique_ptr> +BatchDense::create_with_config_of(ptr_param other) +{ + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. 
+ return (*other).create_with_same_config(); +} + + +template +void BatchDense::set_size(const batch_dim<2>& value) noexcept +{ + batch_size_ = value; +} + + +template +std::unique_ptr> +BatchDense::create_with_same_config() const +{ + return BatchDense::create(this->get_executor(), + this->get_size()); +} + + +inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) +{ + return batch_dim<2>(sizes.get_num_batch_items(), + dim<2>(1, sizes.get_common_size()[1])); +} + + +template +void BatchDense::apply_impl(const MultiVector* b, + MultiVector* x) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(b->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + GKO_ASSERT_CONFORMANT(this->get_common_size(), x->get_common_size()); + this->get_executor()->run(batch_dense::make_simple_apply(this, b, x)); +} + + +template +void BatchDense::apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(b->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + GKO_ASSERT_CONFORMANT(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(alpha->get_common_size(), gko::dim<2>(1, 1)); + GKO_ASSERT_EQUAL_COLS(beta->get_common_size(), gko::dim<2>(1, 1)); + this->get_executor()->run( + batch_dense::make_advanced_apply(alpha, this, b, beta, x)); +} + + +template +void BatchDense::convert_to( + BatchDense>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void BatchDense::move_to( + BatchDense>* result) +{ + this->convert_to(result); +} + + +#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class BatchDense<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX); + + +} // namespace matrix +} // namespace batch +} // namespace gko diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp new file mode 100644 index 00000000000..e801d7aa152 --- /dev/null +++ b/core/matrix/batch_dense_kernels.hpp @@ -0,0 +1,81 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(_type) \ + void simple_apply(std::shared_ptr exec, \ + const batch::matrix::BatchDense<_type>* a, \ + const batch::MultiVector<_type>* b, \ + MultiVector<_type>* c) + +#define GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(_type) \ + void advanced_apply(std::shared_ptr exec, \ + const batch::MultiVector<_type>* alpha, \ + const batch::matrix::BatchDense<_type>* a, \ + const batch::MultiVector<_type>* b, \ + const batch::MultiVector<_type>* beta, \ + batch::MultiVector<_type>* c) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(ValueType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_dense, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp new file mode 100644 index 00000000000..7db7469baf6 --- /dev/null +++ b/core/test/matrix/batch_dense.cpp @@ -0,0 +1,520 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class BatchDense : public ::testing::Test { +protected: + using value_type = T; + using DenseMtx = gko::matrix::Dense; + using size_type = gko::size_type; + BatchDense() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::batch_initialize>( + std::vector{4, 3}, + {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)) + {} + + + static void assert_equal_to_original_mtx( + gko::matrix::BatchDense* m) + { + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_stride().at(1), 3); + ASSERT_EQ(m->get_num_stored_elements(), (2 * 4) + (2 * 3)); + ASSERT_EQ(m->get_num_stored_elements(0), 2 * 4); + ASSERT_EQ(m->get_num_stored_elements(1), 2 * 3); + EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5}); + EXPECT_EQ(m->at(0, 1, 1), value_type{2.5}); + ASSERT_EQ(m->at(0, 1, 2), value_type{3.5}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.5}); + EXPECT_EQ(m->at(1, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(1, 1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{2.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + } + + static void assert_empty(gko::matrix::BatchDense* m) + { + ASSERT_EQ(m->get_num_batch_entries(), 0); + ASSERT_EQ(m->get_num_stored_elements(), 0); + } + + std::shared_ptr exec; + std::unique_ptr> mtx; +}; + +TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); + + +TYPED_TEST(BatchDense, CanBeEmpty) +{ + auto empty = gko::matrix::BatchDense::create(this->exec); + this->assert_empty(empty.get()); +} + + +TYPED_TEST(BatchDense, ReturnsNullValuesArrayWhenEmpty) +{ + auto empty = gko::matrix::BatchDense::create(this->exec); + ASSERT_EQ(empty->get_const_values(), nullptr); +} + + +TYPED_TEST(BatchDense, CanBeConstructedWithSize) +{ + using size_type = gko::size_type; + auto m = gko::matrix::BatchDense::create( + this->exec, + std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 4)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 3)); + EXPECT_EQ(m->get_stride().at(0), 4); + EXPECT_EQ(m->get_stride().at(1), 3); + ASSERT_EQ(m->get_num_stored_elements(), 14); + ASSERT_EQ(m->get_num_stored_elements(0), 8); + ASSERT_EQ(m->get_num_stored_elements(1), 6); +} + + +TYPED_TEST(BatchDense, CanBeConstructedWithSizeAndStride) +{ + using size_type = gko::size_type; + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector>{gko::dim<2>{2, 3}}, + std::vector{4}); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + EXPECT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_num_stored_elements(), 8); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + value_type data[] = { + 1.0, 2.0, -1.0, + 3.0, 4.0, -1.0, + 3.0, 5.0, 1.0, + 5.0, 6.0, -3.0}; + // clang-format on + + auto m = gko::matrix::BatchDense::create( + this->exec, + std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, + gko::array::view(this->exec, 12, data), + 
std::vector{3, 3}); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + const value_type data[] = { + 1.0, 2.0, -1.0, + 3.0, 4.0, -1.0, + 3.0, 5.0, 1.0, + 5.0, 6.0, -3.0}; + // clang-format on + + auto m = gko::matrix::BatchDense::create_const( + this->exec, + std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, + gko::array::const_view(this->exec, 12, data), + std::vector{3, 3}); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 3, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); + auto m2 = + gko::matrix::BatchDense::create(this->exec, 3, m.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto bat_m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = + gko::matrix::BatchDense::create(this->exec, 3, mat1.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); +} + + +TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto m = gko::matrix::BatchDense::create( + this->exec, std::vector{mat1.get(), mat2.get()}); + + this->assert_equal_to_original_mtx(m.get()); +} + + +TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) +{ + using value_type = typename TestFixture::value_type; + using DenseMtx = typename TestFixture::DenseMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize( + 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + this->exec); + + auto dense_mats = this->mtx->unbatch(); + + + GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); + GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); +} + + +TYPED_TEST(BatchDense, KnowsItsSizeAndValues) +{ + 
this->assert_equal_to_original_mtx(this->mtx.get()); +} + + +TYPED_TEST(BatchDense, CanBeListConstructed) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + {{1.0, 2.0}, {1.0, 3.0}}, this->exec); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0), value_type{1}); + EXPECT_EQ(m->at(0, 1), value_type{2}); + EXPECT_EQ(m->at(1, 0), value_type{1}); + EXPECT_EQ(m->at(1, 1), value_type{3}); +} + + +TYPED_TEST(BatchDense, CanBeListConstructedWithstride) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + std::vector{2}, {{1.0, 2.0}}, this->exec); + ASSERT_EQ(m->get_num_batch_entries(), 1); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{2.0}); +} + + +TYPED_TEST(BatchDense, CanBeListConstructedByCopies) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch_initialize>( + 2, I({1.0, 2.0}), this->exec); + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 4); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.0}); +} + + +TYPED_TEST(BatchDense, CanBeDoubleListConstructed) +{ + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::batch_initialize>( + {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, + {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + this->exec); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); + ASSERT_EQ(m->get_stride().at(0), 3); + ASSERT_EQ(m->get_stride().at(1), 2); + EXPECT_EQ(m->get_num_stored_elements(), 15); + ASSERT_EQ(m->get_num_stored_elements(0), 9); + ASSERT_EQ(m->get_num_stored_elements(1), 6); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{1.0}); + EXPECT_EQ(m->at(0, 2), value_type{0.0}); + ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 2), value_type{3.0}); + ASSERT_EQ(m->at(1, 3), value_type{4.0}); + EXPECT_EQ(m->at(1, 4), value_type{5.0}); +} + + +TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) +{ + using value_type = typename TestFixture::value_type; + using T = value_type; + auto m = gko::batch_initialize>( + {4, 3}, + {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, + {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + this->exec); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); + ASSERT_EQ(m->get_stride().at(0), 4); + ASSERT_EQ(m->get_stride().at(1), 3); + EXPECT_EQ(m->get_num_stored_elements(), 21); + ASSERT_EQ(m->get_num_stored_elements(0), 12); + ASSERT_EQ(m->get_num_stored_elements(1), 9); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{1.0}); + EXPECT_EQ(m->at(0, 2), value_type{0.0}); + ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.0}); + 
EXPECT_EQ(m->at(1, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 2), value_type{3.0}); + ASSERT_EQ(m->at(1, 3), value_type{4.0}); + EXPECT_EQ(m->at(1, 4), value_type{5.0}); +} + + +TYPED_TEST(BatchDense, CanBeCopied) +{ + auto mtx_copy = gko::matrix::BatchDense::create(this->exec); + mtx_copy->copy_from(this->mtx.get()); + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->at(0, 0, 0) = 7; + this->mtx->at(0, 1) = 7; + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchDense, CanBeMoved) +{ + auto mtx_copy = gko::matrix::BatchDense::create(this->exec); + mtx_copy->copy_from(std::move(this->mtx)); + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchDense, CanBeCloned) +{ + auto mtx_clone = this->mtx->clone(); + this->assert_equal_to_original_mtx( + dynamic_castmtx.get())>(mtx_clone.get())); +} + + +TYPED_TEST(BatchDense, CanBeCleared) +{ + this->mtx->clear(); + this->assert_empty(this->mtx.get()); +} + + +TYPED_TEST(BatchDense, CanBeReadFromMatrixData) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::matrix::BatchDense::create(this->exec); + // clang-format off + m->read({gko::matrix_data{{2, 3}, + {{0, 0, 1.0}, + {0, 1, 3.0}, + {0, 2, 2.0}, + {1, 0, 0.0}, + {1, 1, 5.0}, + {1, 2, 0.0}}}, + gko::matrix_data{{2, 2}, + {{0, 0, -1.0}, + {0, 1, 0.5}, + {1, 0, 0.0}, + {1, 1, 9.0}}}}); + // clang-format on + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 2)); + ASSERT_EQ(m->get_num_stored_elements(), 10); + ASSERT_EQ(m->get_num_stored_elements(0), 6); + ASSERT_EQ(m->get_num_stored_elements(1), 4); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); + EXPECT_EQ(m->at(0, 1, 2), value_type{0.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); + EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); +} + + +TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) +{ + using value_type = typename TestFixture::value_type; + using tpl = typename gko::matrix_data::nonzero_type; + std::vector> data; + + this->mtx->write(data); + + ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); + ASSERT_EQ(data[0].nonzeros.size(), 6); + EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0})); + EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0})); + EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0})); + EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5})); + EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5})); + EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5})); + ASSERT_EQ(data[1].size, gko::dim<2>(2, 3)); + ASSERT_EQ(data[1].nonzeros.size(), 6); + EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5})); + EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0})); + EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0})); + EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0})); + EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0})); +} + + +TYPED_TEST(BatchDense, CanBeReadFromMatrixAssemblyData) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::matrix::BatchDense::create(this->exec); + gko::matrix_assembly_data data1(gko::dim<2>{2, 3}); + data1.set_value(0, 0, 1.0); + data1.set_value(0, 1, 3.0); + 
data1.set_value(0, 2, 2.0); + data1.set_value(1, 0, 0.0); + data1.set_value(1, 1, 5.0); + data1.set_value(1, 2, 0.0); + gko::matrix_assembly_data data2(gko::dim<2>{2, 1}); + data2.set_value(0, 0, 2.0); + data2.set_value(1, 0, 5.0); + auto data = std::vector>{data1, data2}; + + m->read(data); + + ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); + ASSERT_EQ(m->get_num_stored_elements(), 8); + ASSERT_EQ(m->get_num_stored_elements(0), 6); + ASSERT_EQ(m->get_num_stored_elements(1), 2); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 2), value_type{0.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); + EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); +} + + +} // namespace diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4c972d2a584..dfa1b2177ee 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -38,6 +38,7 @@ target_sources(ginkgo_cuda factorization/par_ilut_select_kernel.cu factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu + matrix/batch_dense_kernels.cu matrix/coo_kernels.cu ${CSR_INSTANTIATE} matrix/dense_kernels.cu diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu new file mode 100644 index 00000000000..5e53a410bf0 --- /dev/null +++ b/cuda/matrix/batch_dense_kernels.cu @@ -0,0 +1,90 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
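The read-from-matrix-data tests above fill each dense batch item from (row, column, value) entries. As a rough, self-contained illustration of what that amounts to for a single row-major item (the helper below is illustrative only, not the library's read implementation):

#include <cstddef>
#include <tuple>
#include <vector>

// Scatter (row, col, value) entries into a zero-initialized, row-major
// dense block, which is essentially what reading one batch item from
// matrix_data does. Illustrative helper, not library code.
std::vector<double> assemble_dense(
    std::size_t num_rows, std::size_t num_cols,
    const std::vector<std::tuple<std::size_t, std::size_t, double>>& entries)
{
    std::vector<double> values(num_rows * num_cols, 0.0);
    for (const auto& entry : entries) {
        const auto row = std::get<0>(entry);
        const auto col = std::get<1>(entry);
        values[row * num_cols + col] = std::get<2>(entry);
    }
    return values;
}

// For the first 2x3 item used in the test above,
// assemble_dense(2, 3, {{0, 0, 1.0}, {0, 1, 3.0}, {0, 2, 2.0}, {1, 1, 5.0}})
// yields the row-major values {1.0, 3.0, 2.0, 0.0, 5.0, 0.0}.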
+*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The BatchDense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_multiplier = 4; + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + MultiVector* x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index dd0d7c4cdfb..4099bb603a3 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -35,6 +35,7 @@ target_sources(ginkgo_dpcpp factorization/par_ilut_select_kernel.dp.cpp factorization/par_ilut_spgeam_kernel.dp.cpp factorization/par_ilut_sweep_kernel.dp.cpp + matrix/batch_dense_kernels.dp.cpp matrix/coo_kernels.dp.cpp matrix/csr_kernels.dp.cpp matrix/fbcsr_kernels.dp.cpp diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp new file mode 100644 index 00000000000..100dbf7e670 --- /dev/null +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -0,0 +1,83 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The BatchDense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + MultiVector* c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 779db13d36a..21b573b6cd0 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -35,6 +35,7 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_select_kernel.hip.cpp factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp + matrix/batch_dense_kernels.hip.cpp matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp new file mode 100644 index 00000000000..640f9c67b6a --- /dev/null +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -0,0 +1,94 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/matrix/batch_struct.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The BatchDense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_multiplier = 4; + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + MultiVector* x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* c) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp new file mode 100644 index 00000000000..60023727c8a --- /dev/null +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -0,0 +1,341 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { +namespace matrix { + + +/** + * BatchDense is a batch matrix format which explicitly stores all values of the + * matrix in each of the batches. + * + * The values in each of the batches are stored in row-major format (values + * belonging to the same row appear consecutive in the memory). Optionally, rows + * can be padded for better memory access. + * + * @tparam ValueType precision of matrix elements + * + * @note While this format is not very useful for storing sparse matrices, it + * is often suitable to store vectors, and sets of vectors. + * @ingroup batch_dense + * @ingroup mat_formats + * @ingroup BatchLinOp + */ +template +class BatchDense : public EnableBatchLinOp>, + public EnableCreateMethod>, + public ConvertibleTo>> { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + friend class BatchDense>; + friend class BatchDense>; + +public: + using EnableBatchLinOp::convert_to; + using EnableBatchLinOp::move_to; + + using value_type = ValueType; + using index_type = int32; + using transposed_type = BatchDense; + using unbatch_type = matrix::Dense; + using absolute_type = remove_complex; + using complex_type = to_complex; + + /** + * Creates a BatchDense matrix with the configuration of another BatchDense + * matrix. + * + * @param other The other matrix whose configuration needs to copied. + */ + static std::unique_ptr create_with_config_of( + const BatchDense* other) + { + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. + return (*other).create_with_same_config(); + } + + void convert_to( + BatchDense>* result) const override; + + void move_to(BatchDense>* result) override; + + + /** + * Creates a mutable view (of matrix::Dense type) of one item of the Batch + * MultiVector object. Does not perform any deep copies, but + * only returns a view of the data. + * + * @param item_id The index of the batch item + * + * @return a matrix::Dense object with the data from the batch item at the + * given index. + */ + std::unique_ptr create_view_for_item(size_type item_id); + + /** + * @copydoc create_view_for_item(size_type) + */ + std::unique_ptr create_const_view_for_item( + size_type item_id) const; + + /** + * Returns the batch size. + * + * @return the batch size + */ + batch_dim<2> get_size() const { return batch_size_; } + + /** + * Returns the number of batch items. + * + * @return the number of batch items + */ + size_type get_num_batch_items() const + { + return batch_size_.get_num_batch_items(); + } + + /** + * Returns the common size of the batch items. 
+ * + * @return the common size stored + */ + dim<2> get_common_size() const { return batch_size_.get_common_size(); } + + /** + * Returns a pointer to the array of values of the multi-vector + * + * @return the pointer to the array of values + */ + value_type* get_values() noexcept { return values_.get_data(); } + + /** + * @copydoc get_values() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values() const noexcept + { + return values_.get_const_data(); + } + + /** + * Returns a pointer to the array of values of the multi-vector for a + * specific batch item. + * + * @param batch_id the id of the batch item. + * + * @return the pointer to the array of values + */ + value_type* get_values_for_item(size_type batch_id) noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_data() + + this->get_size().get_cumulative_offset(batch_id); + } + + /** + * @copydoc get_values_for_item(size_type) + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_const_data() + + this->get_size().get_cumulative_offset(batch_id); + } + + /** + * Returns the number of elements explicitly stored in the batch matrix, + * cumulative across all the batch items. + * + * @return the number of elements explicitly stored in the vector, + * cumulative across all the batch items + */ + size_type get_num_stored_elements() const noexcept + { + return values_.get_num_elems(); + } + + + /** + * Creates a constant (immutable) batch dense matrix from a constant + * array. + * + * @param exec the executor to create the vector on + * @param size the dimensions of the vector + * @param values the value array of the vector + * + * @return A smart pointer to the constant multi-vector wrapping the input + * array (if it resides on the same executor as the vector) or a copy of the + * array on the correct executor. + */ + static std::unique_ptr> + create_const(std::shared_ptr exec, + const batch_dim<2>& sizes, + gko::detail::const_array_view&& values); + +private: + inline size_type compute_num_elems(const batch_dim<2>& size) + { + return size.get_cumulative_offset(size.get_num_batch_items()); + } + + + void apply(const MultiVector* b, + MultiVector* x) const + { + this->apply_impl(b, x); + } + + void apply(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const + { + this->apply_impl(alpha, b, beta, x); + } + +protected: + /** + * Sets the size of the MultiVector. + * + * @param value the new size of the operator + */ + void set_size(const batch_dim<2>& value) noexcept; + + /** + * Creates an uninitialized BatchDense matrix of the specified size. + * + * @param exec Executor associated to the matrix + * @param size size of the matrix + */ + BatchDense(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}); + + /** + * Creates a BatchDense matrix from an already allocated (and initialized) + * array. 
+ * + * @tparam ValuesArray type of array of values + * + * @param exec Executor associated to the matrix + * @param size sizes of the batch matrices in a batch_dim object + * @param values array of matrix values + * @param strides stride of the rows (i.e. offset between the first + * elements of two consecutive rows, expressed as the + * number of matrix elements) + * + * @note If `values` is not an rvalue, not an array of ValueType, or is on + * the wrong executor, an internal copy will be created, and the + * original array data will not be used in the matrix. + */ + template + BatchDense(std::shared_ptr exec, const batch_dim<2>& size, + ValuesArray&& values) + : EnableBatchLinOp(exec, size), + values_{exec, std::forward(values)} + { + // Ensure that the values array has the correct size + auto num_elems = compute_num_elems(size); + GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); + } + + /** + * Creates a BatchDense matrix with the same configuration as the callers + * matrix. + * + * @returns a BatchDense matrix with the same configuration as the caller. + */ + std::unique_ptr create_with_same_config() const; + + virtual void apply_impl(const MultiVector* b, + MultiVector* x) const; + + virtual void apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const; + + size_type linearize_index(size_type batch, size_type row, + size_type col) const noexcept + { + return batch_size_.get_cumulative_offset(batch) + + row * batch_size_.get_common_size()[1] + col; + } + + size_type linearize_index(size_type batch, size_type idx) const noexcept + { + return linearize_index(batch, idx / this->get_common_size()[1], + idx % this->get_common_size()[1]); + } + +private: + batch_dim<2> batch_size_; + array values_; +}; + + +} // namespace matrix +} // namespace batch +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_ diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 47259feeac0..d87399492f5 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -23,6 +23,7 @@ target_sources(ginkgo_omp factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp + matrix/batch_dense_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp new file mode 100644 index 00000000000..ea2e84a8e83 --- /dev/null +++ b/omp/matrix/batch_dense_kernels.cpp @@ -0,0 +1,129 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
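Since the header above stores all batch items contiguously, linearize_index and batch_dim<2>::get_cumulative_offset reduce to simple offset arithmetic once every item has the same rows-by-columns size. A small sketch of that arithmetic, assuming an unpadded row-major layout (stride equal to the number of columns):

#include <cassert>
#include <cstddef>

// Offset arithmetic behind the contiguous batch storage described above,
// assuming every batch item is stored row-major without padding.
struct batch_layout {
    std::size_t num_rows;
    std::size_t num_cols;

    // Start of batch item `batch` in the flat value array
    // (the role played by batch_dim<2>::get_cumulative_offset).
    std::size_t item_offset(std::size_t batch) const
    {
        return batch * num_rows * num_cols;
    }

    // Flat position of element (row, col) in batch item `batch`
    // (the role played by BatchDense::linearize_index).
    std::size_t linearize(std::size_t batch, std::size_t row,
                          std::size_t col) const
    {
        assert(row < num_rows && col < num_cols);
        return item_offset(batch) + row * num_cols + col;
    }
};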
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include + + +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The BatchDense matrix format namespace. + * @ref BatchDense + * @ingroup batch_dense + */ +namespace batch_dense { + + +#include "reference/matrix/batch_dense_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + simple_apply_kernel(mat_item, b_item, x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* c) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); + if (alpha->get_num_batch_items() > 1) { + GKO_ASSERT(alpha->get_num_batch_items() == x->get_num_batch_items()); + GKO_ASSERT(beta->get_num_batch_items() == x->get_num_batch_items()); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } + } else { + const auto alpha_item = batch::extract_batch_item(alpha_ub, 0); + const auto beta_item = batch::extract_batch_item(beta_ub, 0); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = 
batch::extract_batch_item(x_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index dd54e3fb52f..37498588ca7 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -25,6 +25,7 @@ target_sources(ginkgo_reference factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp + matrix/batch_dense_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index ce7c7af5605..b30fa971ed7 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -87,6 +87,34 @@ inline batch::multi_vector::uniform_batch get_batch_struct( } +/** + * Generates an immutable uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch +get_batch_struct(const batch::matrix::BatchDense* const op) +{ + return {op->get_const_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch get_batch_struct( + batch::matrix::BatchDense* const op) +{ + return {op->get_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + } // namespace host } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp new file mode 100644 index 00000000000..aa285a6b01b --- /dev/null +++ b/reference/matrix/batch_dense_kernels.cpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/matrix/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The BatchDense matrix format namespace. + * @ref BatchDense + * @ingroup batch_dense + */ +namespace batch_dense { + + +#include "reference/matrix/batch_dense_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + simple_apply_kernel(mat_item, b_item, x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* a, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* c) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); + if (alpha->get_num_batch_items() > 1) { + GKO_ASSERT(alpha->get_num_batch_items() == x->get_num_batch_items()); + GKO_ASSERT(beta->get_num_batch_items() == x->get_num_batch_items()); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } + } else { + const auto alpha_item = batch::extract_batch_item(alpha_ub, 0); + const auto beta_item = batch::extract_batch_item(beta_ub, 0); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_dense +} // namespace reference +} 
// namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc new file mode 100644 index 00000000000..ae342982de5 --- /dev/null +++ b/reference/matrix/batch_dense_kernels.hpp.inc @@ -0,0 +1,88 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +inline void simple_apply_kernel( + const gko::batch::batch_dense::batch_item& a, + const gko::batch::batch_multi_vector::batch_item& b, + const gko::batch::batch_multi_vector::batch_item& c) +{ + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] = gko::zero(); + } + } + + for (int row = 0; row < c.num_rows; ++row) { + for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] += + a.values[row * a.stride + inner] * + b.values[inner * b.stride + col]; + } + } + } +} + + +template +inline void advanced_apply_kernel( + const ValueType alpha, + const gko::batch::batch_dense::batch_item& a, + const gko::batch::batch_multi_vector::batch_item& b, + const ValueType beta, + const gko::batch::batch_multi_vector::batch_item& c) +{ + if (beta != gko::zero()) { + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] *= beta; + } + } + } else { + for (int row = 0; row < c.num_rows; ++row) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] *= gko::zero(); + } + } + } + + for (int row = 0; row < c.num_rows; ++row) { + for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int col = 0; col < c.num_rhs; ++col) { + c.values[row * c.stride + col] += + alpha * a.values[row * a.stride + inner] * + b.values[inner * b.stride + col]; + } + } + } +} From 4f8c875efa68d4f12641d10f397f4734b0216794 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 2 Oct 2023 19:18:05 +0200 Subject: [PATCH 343/583] add reference kernels WIP --- core/base/batch_multi_vector_kernels.hpp | 1 - core/base/batch_struct.hpp | 76 ----------- core/device_hooks/common_kernels.inc.cpp | 1 + core/matrix/batch_dense.cpp | 10 +- core/matrix/batch_dense_kernels.hpp | 5 +- core/matrix/batch_struct.hpp | 125 +++++++++++++++++++ cuda/matrix/batch_dense_kernels.cu | 3 +- dpcpp/matrix/batch_dense_kernels.dp.cpp | 2 +- hip/matrix/batch_dense_kernels.hip.cpp | 2 +- include/ginkgo/core/matrix/batch_dense.hpp | 10 +- omp/matrix/batch_dense_kernels.cpp | 4 +- reference/base/batch_struct.hpp | 28 ----- reference/matrix/batch_dense_kernels.cpp | 16 ++- reference/matrix/batch_dense_kernels.hpp.inc | 12 +- reference/matrix/batch_struct.hpp | 95 ++++++++++++++ 15 files changed, 259 insertions(+), 131 deletions(-) create mode 100644 core/matrix/batch_struct.hpp create mode 100644 reference/matrix/batch_struct.hpp diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 8603a2b9055..5a39567f470 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include #include "core/base/kernel_declaration.hpp" diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index 21bd5b0e8ea..caca4577cf7 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -81,46 +81,6 @@ struct uniform_batch { } // namespace multi_vector -namespace batch_dense { - - -/** - * Encapsulates one matrix from a batch of multi-vectors. - */ -template -struct batch_item { - using value_type = ValueType; - ValueType* values; - int stride; - int num_rows; - int num_rhs; -}; - - -/** - * A 'simple' structure to store a global uniform batch of multi-vectors. 
- */ -template -struct uniform_batch { - using value_type = ValueType; - using entry_type = batch_item; - - ValueType* values; - size_type num_batch_items; - int stride; - int num_rows; - int num_rhs; - - size_type get_entry_storage() const - { - return num_rows * stride * sizeof(value_type); - } -}; - - -} // namespace batch_dense - - template GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item to_const( const multi_vector::batch_item& b) @@ -137,22 +97,6 @@ GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch to_const( } -template -GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item -to_const(const matrix::batch_dense::batch_item& b) -{ - return {b.values, b.stride, b.num_rows, b.num_rhs}; -} - - -template -GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::uniform_batch -to_const(const matrix::batch_dense::uniform_batch& ub) -{ - return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; -} - - /** * Extract one object (matrix, vector etc.) from a batch of objects * @@ -182,26 +126,6 @@ extract_batch_item(ValueType* const batch_values, const int stride, } -template -GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item -extract_batch_item(const matrix::batch_dense::uniform_batch& batch, - const size_type batch_idx) -{ - return {batch.values + batch_idx * batch.stride * batch.num_rows, - batch.stride, batch.num_rows, batch.num_rhs}; -} - -template -GKO_ATTRIBUTES GKO_INLINE matrix::batch_dense::batch_item -extract_batch_item(ValueType* const batch_values, const int stride, - const int num_rows, const int num_rhs, - const size_type batch_idx) -{ - return {batch_values + batch_idx * stride * num_rows, stride, num_rows, - num_rhs}; -} - - } // namespace batch } // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index c22f5cd968d..87cab3dcf0b 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/factorization/par_ict_kernels.hpp" #include "core/factorization/par_ilu_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/batch_dense_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index e6dedcf11fd..803f7a51c50 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "core/matrix/batch_dense_kernels.hpp" @@ -66,7 +67,7 @@ namespace detail { template batch_dim<2> compute_batch_size( - const std::vector*>& matrices) + const std::vector*>& matrices) { auto common_size = matrices[0]->get_size(); for (size_type i = 1; i < matrices.size(); ++i) { @@ -80,7 +81,7 @@ batch_dim<2> compute_batch_size( template -std::unique_ptr> +std::unique_ptr> BatchDense::create_view_for_item(size_type item_id) { auto exec = this->get_executor(); @@ -96,7 +97,7 @@ BatchDense::create_view_for_item(size_type item_id) template -std::unique_ptr> +std::unique_ptr> BatchDense::create_const_view_for_item(size_type item_id) const { auto exec = this->get_executor(); @@ -113,7 +114,8 @@ BatchDense::create_const_view_for_item(size_type item_id) const template std::unique_ptr> -BatchDense::create_with_config_of(ptr_param other) +BatchDense::create_with_config_of( + ptr_param> other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp index e801d7aa152..7f814e08b50 100644 --- a/core/matrix/batch_dense_kernels.hpp +++ b/core/matrix/batch_dense_kernels.hpp @@ -42,6 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/kernel_declaration.hpp" + + namespace gko { namespace kernels { @@ -50,7 +53,7 @@ namespace kernels { void simple_apply(std::shared_ptr exec, \ const batch::matrix::BatchDense<_type>* a, \ const batch::MultiVector<_type>* b, \ - MultiVector<_type>* c) + batch::MultiVector<_type>* c) #define GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(_type) \ void advanced_apply(std::shared_ptr exec, \ diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp new file mode 100644 index 00000000000..b6926b0894d --- /dev/null +++ b/core/matrix/batch_struct.hpp @@ -0,0 +1,125 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include +#include + + +namespace gko { +namespace batch { +namespace matrix { +namespace batch_dense { + + +/** + * Encapsulates one matrix from a batch of multi-vectors. + */ +template +struct batch_item { + using value_type = ValueType; + ValueType* values; + int stride; + int num_rows; + int num_rhs; +}; + + +/** + * A 'simple' structure to store a global uniform batch of multi-vectors. + */ +template +struct uniform_batch { + using value_type = ValueType; + using entry_type = batch_item; + + ValueType* values; + size_type num_batch_items; + int stride; + int num_rows; + int num_rhs; + + size_type get_entry_storage() const + { + return num_rows * stride * sizeof(value_type); + } +}; + + +} // namespace batch_dense + + +template +GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item to_const( + const batch_dense::batch_item& b) +{ + return {b.values, b.stride, b.num_rows, b.num_rhs}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE batch_dense::uniform_batch to_const( + const batch_dense::uniform_batch& ub) +{ + return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( + const batch_dense::uniform_batch& batch, + const size_type batch_idx) +{ + return {batch.values + batch_idx * batch.stride * batch.num_rows, + batch.stride, batch.num_rows, batch.num_rhs}; +} + +template +GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( + ValueType* const batch_values, const int stride, const int num_rows, + const int num_rhs, const size_type batch_idx) +{ + return {batch_values + batch_idx * stride * num_rows, stride, num_rows, + num_rhs}; +} + + +} // namespace matrix +} // namespace batch +} // namespace gko + + +#endif // GKO_CORE_MATRIX_BATCH_STRUCT_HPP_ diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 5e53a410bf0..4615af581f5 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" @@ -66,7 +67,7 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::BatchDense* mat, const batch::MultiVector* b, - MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 100dbf7e670..964bf094077 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -59,7 +59,7 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::BatchDense* a, const batch::MultiVector* b, - MultiVector* c) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 640f9c67b6a..93570388d50 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -70,7 +70,7 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::BatchDense* mat, const batch::MultiVector* b, - MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 60023727c8a..47aff35b7e7 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -85,7 +86,7 @@ class BatchDense : public EnableBatchLinOp>, using value_type = ValueType; using index_type = int32; using transposed_type = BatchDense; - using unbatch_type = matrix::Dense; + using unbatch_type = gko::matrix::Dense; using absolute_type = remove_complex; using complex_type = to_complex; @@ -227,10 +228,9 @@ class BatchDense : public EnableBatchLinOp>, * array (if it resides on the same executor as the vector) or a copy of the * array on the correct executor. */ - static std::unique_ptr> - create_const(std::shared_ptr exec, - const batch_dim<2>& sizes, - gko::detail::const_array_view&& values); + static std::unique_ptr> create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + gko::detail::const_array_view&& values); private: inline size_type compute_num_elems(const batch_dim<2>& size) diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index ea2e84a8e83..fe742bee402 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -40,6 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" @@ -61,7 +63,7 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::BatchDense* mat, const batch::MultiVector* b, - MultiVector* x) + batch::MultiVector* x) { const auto b_ub = host::get_batch_struct(b); const auto x_ub = host::get_batch_struct(x); diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index b30fa971ed7..ce7c7af5605 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -87,34 +87,6 @@ inline batch::multi_vector::uniform_batch get_batch_struct( } -/** - * Generates an immutable uniform batch struct from a batch of multi-vectors. - */ -template -inline batch::matrix::batch_dense::uniform_batch -get_batch_struct(const batch::matrix::BatchDense* const op) -{ - return {op->get_const_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -/** - * Generates a uniform batch struct from a batch of multi-vectors. - */ -template -inline batch::matrix::batch_dense::uniform_batch get_batch_struct( - batch::matrix::BatchDense* const op) -{ - return {op->get_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - } // namespace host } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index aa285a6b01b..bb5f3e18df7 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -41,7 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" @@ -63,13 +65,13 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::BatchDense* mat, const batch::MultiVector* b, - MultiVector* x) + batch::MultiVector* x) { const auto b_ub = host::get_batch_struct(b); const auto x_ub = host::get_batch_struct(x); const auto mat_ub = host::get_batch_struct(mat); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); simple_apply_kernel(mat_item, b_item, x_item); @@ -83,10 +85,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* a, + const batch::matrix::BatchDense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, - batch::MultiVector* c) + batch::MultiVector* x) { const auto b_ub = host::get_batch_struct(b); const auto x_ub = host::get_batch_struct(x); @@ -97,7 +99,8 @@ void advanced_apply(std::shared_ptr exec, GKO_ASSERT(alpha->get_num_batch_items() == x->get_num_batch_items()); GKO_ASSERT(beta->get_num_batch_items() == x->get_num_batch_items()); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto mat_item = + batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); @@ -109,7 +112,8 @@ void advanced_apply(std::shared_ptr exec, const auto alpha_item = batch::extract_batch_item(alpha_ub, 0); const auto beta_item = batch::extract_batch_item(beta_ub, 0); for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto mat_item = + batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc index ae342982de5..d45183b2faa 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp.inc @@ -32,9 +32,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template inline void simple_apply_kernel( - const gko::batch::batch_dense::batch_item& a, - const gko::batch::batch_multi_vector::batch_item& b, - const gko::batch::batch_multi_vector::batch_item& c) + const gko::batch::matrix::batch_dense::batch_item& a, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& c) { for (int row = 0; row < c.num_rows; ++row) { for (int col = 0; col < c.num_rhs; ++col) { @@ -57,10 +57,10 @@ inline void simple_apply_kernel( template inline void advanced_apply_kernel( const ValueType alpha, - const gko::batch::batch_dense::batch_item& a, - const gko::batch::batch_multi_vector::batch_item& b, + const gko::batch::matrix::batch_dense::batch_item& a, + const gko::batch::multi_vector::batch_item& b, const ValueType beta, - const gko::batch::batch_multi_vector::batch_item& c) + const gko::batch::multi_vector::batch_item& c) { if (beta != gko::zero()) { for (int row = 0; row < c.num_rows; ++row) { diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp new file mode 100644 index 00000000000..1bed5a4e5c9 --- /dev/null +++ b/reference/matrix/batch_struct.hpp @@ -0,0 +1,95 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include + + +#include "core/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +/** + * @brief A namespace for shared functionality between omp and reference + * executors. + */ +namespace host { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of multi-vectors. 
+ */ +template +inline batch::matrix::batch_dense::uniform_batch +get_batch_struct(const batch::matrix::BatchDense* const op) +{ + return {op->get_const_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch get_batch_struct( + batch::matrix::BatchDense* const op) +{ + return {op->get_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +} // namespace host +} // namespace kernels +} // namespace gko + + +#endif // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ From 5bdedb16c86932fdc53760668e503ee97a1161d8 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 3 Oct 2023 11:31:51 +0200 Subject: [PATCH 344/583] Generalize batch utilities --- core/base/batch_multi_vector.cpp | 28 ++++++++-- core/base/batch_utilities.hpp | 47 ++++++++-------- core/test/base/batch_multi_vector.cpp | 54 +++++++++++-------- core/test/utils/assertions.hpp | 8 ++- .../test/base/batch_multi_vector_kernels.cpp | 43 ++++++++------- 5 files changed, 109 insertions(+), 71 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 23591cd1ffe..f6884ef523b 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/base/batch_multi_vector_kernels.hpp" @@ -72,7 +73,7 @@ namespace detail { template batch_dim<2> compute_batch_size( - const std::vector*>& matrices) + const std::vector*>& matrices) { auto common_size = matrices[0]->get_size(); for (size_type i = 1; i < matrices.size(); ++i) { @@ -86,7 +87,7 @@ batch_dim<2> compute_batch_size( template -std::unique_ptr> +std::unique_ptr> MultiVector::create_view_for_item(size_type item_id) { auto exec = this->get_executor(); @@ -102,7 +103,7 @@ MultiVector::create_view_for_item(size_type item_id) template -std::unique_ptr> +std::unique_ptr> MultiVector::create_const_view_for_item(size_type item_id) const { auto exec = this->get_executor(); @@ -290,6 +291,27 @@ void MultiVector::move_to( } +template +void MultiVector::convert_to( + matrix::BatchDense* result) const +{ + auto exec = result->get_executor() != nullptr ? result->get_executor() + : this->get_executor(); + auto tmp = gko::batch::matrix::BatchDense::create_const( + exec, this->get_size(), + make_const_array_view(exec, this->get_num_stored_elements(), + this->get_const_values())); + result->copy_from(tmp); +} + + +template +void MultiVector::move_to(matrix::BatchDense* result) +{ + this->convert_to(result); +} + + #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index e5dc22faeda..d5c5bdb4aa2 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -51,16 +51,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { namespace batch { -namespace multivector { -template -std::unique_ptr> duplicate( - std::shared_ptr exec, size_type num_duplications, - const batch::MultiVector* input) +template +std::unique_ptr duplicate(std::shared_ptr exec, + size_type num_duplications, + const OutputType* input) { auto num_batch_items = input->get_num_batch_items(); - auto tmp = batch::MultiVector::create( + auto tmp = OutputType::create( exec, batch_dim<2>(num_batch_items * num_duplications, input->get_common_size())); @@ -75,13 +74,13 @@ std::unique_ptr> duplicate( } -template -std::unique_ptr> create_from_dense( +template +std::unique_ptr create_from_item( std::shared_ptr exec, const size_type num_duplications, - const matrix::Dense* input) + const typename OutputType::unbatch_type* input) { auto num_batch_items = num_duplications; - auto tmp = batch::MultiVector::create( + auto tmp = OutputType::create( exec, batch_dim<2>(num_batch_items, input->get_size())); for (size_type b = 0; b < num_batch_items; ++b) { @@ -92,13 +91,13 @@ std::unique_ptr> create_from_dense( } -template -std::unique_ptr> create_from_dense( +template +std::unique_ptr create_from_item( std::shared_ptr exec, - const std::vector*>& input) + const std::vector& input) { auto num_batch_items = input.size(); - auto tmp = batch::MultiVector::create( + auto tmp = OutputType::create( exec, batch_dim<2>(num_batch_items, input[0]->get_size())); for (size_type b = 0; b < num_batch_items; ++b) { @@ -109,13 +108,12 @@ std::unique_ptr> create_from_dense( } -template -std::vector>> unbatch( - const batch::MultiVector* batch_multivec) +template +auto unbatch(const InputType* batch_multivec) { auto exec = batch_multivec->get_executor(); auto unbatched_mats = - std::vector>>{}; + std::vector>{}; for (size_type b = 0; b < batch_multivec->get_num_batch_items(); ++b) { unbatched_mats.emplace_back( batch_multivec->create_const_view_for_item(b)->clone()); @@ -124,14 +122,14 @@ std::vector>> unbatch( } -template -std::unique_ptr> read( +template +std::unique_ptr read( std::shared_ptr exec, const std::vector>& data) { auto num_batch_items = data.size(); - auto tmp = MultiVector::create( - exec, batch_dim<2>(num_batch_items, data[0].size)); + auto tmp = + OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size)); for (size_type b = 0; b < num_batch_items; ++b) { tmp->create_view_for_item(b)->read(data[b]); @@ -141,9 +139,9 @@ std::unique_ptr> read( } -template +template std::vector> write( - const MultiVector* mvec) + const OutputType* mvec) { auto data = std::vector>( mvec->get_num_batch_items()); @@ -157,7 +155,6 @@ std::vector> write( } -} // namespace multivector } // namespace batch } // namespace gko diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 85168a406cc..7bdaec30b27 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -188,11 +188,11 @@ TYPED_TEST(MultiVector, CanBeConstructedFromExistingData) using size_type = gko::size_type; // clang-format off value_type data[] = { - 1.0, 2.0, - -1.0,3.0, + 1.0, 2.0, + -1.0, 3.0, 4.0, -1.0, - 3.0, 5.0, - 1.0, 5.0, + 3.0, 5.0, + 1.0, 5.0, 6.0, -3.0}; // clang-format on @@ -218,11 +218,11 @@ TYPED_TEST(MultiVector, CanBeConstructedFromExistingConstData) using size_type = gko::size_type; // clang-format off value_type data[] = { - 1.0, 2.0, - -1.0,3.0, + 1.0, 2.0, + -1.0, 3.0, 4.0, -1.0, - 3.0, 5.0, - 1.0, 5.0, + 3.0, 5.0, + 1.0, 5.0, 6.0, -3.0}; // clang-format on @@ -252,7 +252,7 @@ 
TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::multivector::create_from_dense( + auto m = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); @@ -269,10 +269,12 @@ TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::batch::multivector::create_from_dense( - this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); - auto m = - gko::batch::multivector::create_from_dense(this->exec, 3, mat1.get()); + auto bat_m = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = gko::batch::create_from_item>( + this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } @@ -287,14 +289,16 @@ TYPED_TEST(MultiVector, CanBeConstructedByDuplicatingMultiVectors) this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::multivector::create_from_dense( + auto m = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::batch::multivector::create_from_dense( - this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), - mat2.get(), mat1.get(), mat2.get()}); + auto m_ref = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); - auto m2 = - gko::batch::multivector::duplicate(this->exec, 3, m.get()); + auto m2 = gko::batch::duplicate>( + this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } @@ -385,7 +389,8 @@ TYPED_TEST(MultiVector, CanBeUnbatchedIntoDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto dense_mats = gko::batch::multivector::unbatch(this->mtx.get()); + auto dense_mats = gko::batch::unbatch>( + this->mtx.get()); ASSERT_EQ(dense_mats.size(), 2); GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); @@ -404,7 +409,8 @@ TYPED_TEST(MultiVector, CanBeReadFromMatrixData) vec_data.emplace_back(gko::matrix_data( {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); - auto m = gko::batch::multivector::read(this->exec, + auto m = gko::batch::read>(this->exec, vec_data); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); @@ -429,7 +435,8 @@ TYPED_TEST(MultiVector, CanBeReadFromSparseMatrixData) vec_data.emplace_back(gko::matrix_data( {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); - auto m = gko::batch::multivector::read(this->exec, + auto m = gko::batch::read>(this->exec, vec_data); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); @@ -451,7 +458,8 @@ TYPED_TEST(MultiVector, GeneratesCorrectMatrixData) using tpl = typename gko::matrix_data::nonzero_type; auto data = - gko::batch::multivector::write(this->mtx.get()); + gko::batch::write>(this->mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); ASSERT_EQ(data[0].nonzeros.size(), 6); diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index d723d5a8964..63ed1e5423a 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -720,8 +720,12 @@ ::testing::AssertionResult batch_matrices_near( using value_type1 = typename Mat1::value_type; using value_type2 = typename Mat2::value_type; - auto first_data = gko::batch::multivector::write(first); - auto second_data = 
gko::batch::multivector::write(second); + auto first_data = + gko::batch::write>(first); + auto second_data = + gko::batch::write>(second); if (first_data.size() != second_data.size()) { return ::testing::AssertionFailure() diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 4f922c37703..e0c7643c8d7 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -137,13 +137,14 @@ TYPED_TEST(MultiVector, ScalesData) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec); - auto ualpha = gko::batch::multivector::unbatch(alpha.get()); + auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_0->scale(alpha.get()); this->mtx_00->scale(ualpha[0].get()); this->mtx_01->scale(ualpha[1].get()); - auto res = gko::batch::multivector::unbatch(this->mtx_0.get()); + auto res = + gko::batch::unbatch>(this->mtx_0.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_01.get(), 0.); } @@ -154,13 +155,14 @@ TYPED_TEST(MultiVector, ScalesDataWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = gko::batch::multivector::unbatch(alpha.get()); + auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->scale(alpha.get()); this->mtx_10->scale(ualpha[0].get()); this->mtx_11->scale(ualpha[1].get()); - auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto res = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -172,13 +174,14 @@ TYPED_TEST(MultiVector, ScalesDataWithMultipleScalars) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = gko::batch::multivector::unbatch(alpha.get()); + auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->scale(alpha.get()); this->mtx_10->scale(ualpha[0].get()); this->mtx_11->scale(ualpha[1].get()); - auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto res = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -190,13 +193,14 @@ TYPED_TEST(MultiVector, AddsScaled) using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize( {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec); - auto ualpha = gko::batch::multivector::unbatch(alpha.get()); + auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto res = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -207,13 +211,14 @@ TYPED_TEST(MultiVector, AddsScaledWithScalar) using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; auto alpha = gko::batch::initialize({{2.0}, {-2.0}}, this->exec); - auto ualpha = gko::batch::multivector::unbatch(alpha.get()); + auto ualpha = 
gko::batch::unbatch>(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto res = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.); } @@ -236,13 +241,13 @@ TYPED_TEST(MultiVector, ComputesDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = gko::batch::multivector::unbatch(result.get()); + auto ures = gko::batch::unbatch>(result.get()); this->mtx_0->compute_dot(this->mtx_1.get(), result.get()); this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); - auto res = gko::batch::multivector::unbatch(result.get()); + auto res = gko::batch::unbatch>(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); } @@ -277,13 +282,13 @@ TYPED_TEST(MultiVector, ComputesConjDot) using T = typename TestFixture::value_type; auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); - auto ures = gko::batch::multivector::unbatch(result.get()); + auto ures = gko::batch::unbatch>(result.get()); this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()); this->mtx_00->compute_conj_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_conj_dot(this->mtx_11.get(), ures[1].get()); - auto res = gko::batch::multivector::unbatch(result.get()); + auto res = gko::batch::unbatch>(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); } @@ -359,8 +364,9 @@ TYPED_TEST(MultiVector, ConvertsToPrecision) this->mtx_1->convert_to(tmp.get()); tmp->convert_to(res.get()); - auto ures = gko::batch::multivector::unbatch(res.get()); - auto umtx = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto ures = gko::batch::unbatch>(res.get()); + auto umtx = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); } @@ -382,8 +388,9 @@ TYPED_TEST(MultiVector, MovesToPrecision) this->mtx_1->move_to(tmp.get()); tmp->move_to(res.get()); - auto ures = gko::batch::multivector::unbatch(res.get()); - auto umtx = gko::batch::multivector::unbatch(this->mtx_1.get()); + auto ures = gko::batch::unbatch>(res.get()); + auto umtx = + gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual); GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual); } From 669cc19753275db1827b34bdd5c9744901331c37 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 3 Oct 2023 11:32:30 +0200 Subject: [PATCH 345/583] MultiVector to BatchDense conversion --- core/matrix/batch_dense.cpp | 27 ++- core/test/matrix/CMakeLists.txt | 2 + core/test/matrix/batch_dense.cpp | 222 ++++++++---------- .../ginkgo/core/base/batch_multi_vector.hpp | 25 +- include/ginkgo/core/matrix/batch_dense.hpp | 11 +- 5 files changed, 154 insertions(+), 133 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 803f7a51c50..9f72a26c488 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -53,12 +53,14 @@ namespace gko { namespace batch { namespace 
matrix { namespace dense { +namespace { GKO_REGISTER_OPERATION(simple_apply, batch_dense::simple_apply); GKO_REGISTER_OPERATION(advanced_apply, batch_dense::advanced_apply); +} // namespace } // namespace dense @@ -141,6 +143,19 @@ BatchDense::create_with_same_config() const } +template +std::unique_ptr> +BatchDense::create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + gko::detail::const_array_view&& values) +{ + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr(new BatchDense{ + exec, sizes, gko::detail::array_const_cast(std::move(values))}); +} + + inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) { return batch_dim<2>(sizes.get_num_batch_items(), @@ -148,6 +163,14 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) } +template +BatchDense::BatchDense(std::shared_ptr exec, + const batch_dim<2>& size) + : EnableBatchLinOp>(exec, size), + values_(exec, compute_num_elems(size)) +{} + + template void BatchDense::apply_impl(const MultiVector* b, MultiVector* x) const @@ -157,7 +180,7 @@ void BatchDense::apply_impl(const MultiVector* b, GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); GKO_ASSERT_CONFORMANT(this->get_common_size(), x->get_common_size()); - this->get_executor()->run(batch_dense::make_simple_apply(this, b, x)); + this->get_executor()->run(dense::make_simple_apply(this, b, x)); } @@ -175,7 +198,7 @@ void BatchDense::apply_impl(const MultiVector* alpha, GKO_ASSERT_EQUAL_COLS(alpha->get_common_size(), gko::dim<2>(1, 1)); GKO_ASSERT_EQUAL_COLS(beta->get_common_size(), gko::dim<2>(1, 1)); this->get_executor()->run( - batch_dense::make_advanced_apply(alpha, this, b, beta, x)); + dense::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index 433361a054f..57c2c97e355 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -1,3 +1,5 @@ +# ginkgo_create_test(batch_dense) +# ginkgo_create_test(coo) ginkgo_create_test(coo_builder) ginkgo_create_test(csr) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 7db7469baf6..a1ebdb1061c 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -44,9 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/test/utils.hpp" -namespace { - - template class BatchDense : public ::testing::Test { protected: @@ -55,11 +52,13 @@ class BatchDense : public ::testing::Test { using size_type = gko::size_type; BatchDense() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch_initialize>( - std::vector{4, 3}, + mtx(gko::batch::initialize< + gko::batch::matrix::BatchDense>( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - exec)) + exec)), + dense_mtx(gko::initialize>( + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)) {} @@ -67,13 +66,8 @@ class BatchDense : public ::testing::Test { gko::matrix::BatchDense* m) { ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_stride().at(1), 3); - ASSERT_EQ(m->get_num_stored_elements(), (2 * 4) + (2 * 3)); - ASSERT_EQ(m->get_num_stored_elements(0), 2 * 4); - ASSERT_EQ(m->get_num_stored_elements(1), 2 * 3); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 4)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); @@ -95,7 +89,7 @@ class BatchDense : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); @@ -103,46 +97,85 @@ TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); TYPED_TEST(BatchDense, CanBeEmpty) { - auto empty = gko::matrix::BatchDense::create(this->exec); + auto empty = gko::batch::matrix::BatchDense::create(this->exec); this->assert_empty(empty.get()); } TYPED_TEST(BatchDense, ReturnsNullValuesArrayWhenEmpty) { - auto empty = gko::matrix::BatchDense::create(this->exec); + auto empty = gko::batch::matrix::BatchDense::create(this->exec); ASSERT_EQ(empty->get_const_values(), nullptr); } -TYPED_TEST(BatchDense, CanBeConstructedWithSize) +TYPED_TEST(BatchDense, CanGetValuesForEntry) { - using size_type = gko::size_type; - auto m = gko::matrix::BatchDense::create( - this->exec, - std::vector>{gko::dim<2>{2, 4}, gko::dim<2>{2, 3}}); + using value_type = typename TestFixture::value_type; - ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 4)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 3)); - EXPECT_EQ(m->get_stride().at(0), 4); - EXPECT_EQ(m->get_stride().at(1), 3); - ASSERT_EQ(m->get_num_stored_elements(), 14); - ASSERT_EQ(m->get_num_stored_elements(0), 8); - ASSERT_EQ(m->get_num_stored_elements(1), 6); + ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0}); +} + + +TYPED_TEST(BatchDense, CanCreateDenseItemView) +{ + GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->dense_mtx, + 0.0); +} + + +TYPED_TEST(BatchDense, CanBeCopied) +{ + auto mtx_copy = + gko::batch::matrix::BatchDense::create(this->exec); + + mtx_copy->copy_from(this->mtx.get()); + + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->at(0, 0, 0) = 7; + this->mtx->at(0, 1) = 7; + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchDense, CanBeMoved) +{ + auto mtx_copy = + gko::batch::matrix::BatchDense::create(this->exec); + + mtx_copy->copy_from(std::move(this->mtx)); + + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(BatchDense, CanBeCloned) +{ + auto mtx_clone = this->mtx->clone(); + + this->assert_equal_to_original_mtx( + 
dynamic_castmtx.get())>(mtx_clone.get())); } -TYPED_TEST(BatchDense, CanBeConstructedWithSizeAndStride) +TYPED_TEST(BatchDense, CanBeCleared) +{ + this->mtx->clear(); + + this->assert_empty(this->mtx.get()); +} + + +TYPED_TEST(BatchDense, CanBeConstructedWithSize) { using size_type = gko::size_type; - auto m = gko::matrix::BatchDense::create( - this->exec, std::vector>{gko::dim<2>{2, 3}}, - std::vector{4}); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - EXPECT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_num_stored_elements(), 8); + auto m = gko::batch::matrix::BatchDense::create( + this->exec, gko::batch_dim<2>> {2, gko::dim<2>{5, 3}}); + + ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3)); + ASSERT_EQ(m->get_num_stored_elements(), 30); } @@ -152,23 +185,27 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) using size_type = gko::size_type; // clang-format off value_type data[] = { - 1.0, 2.0, -1.0, - 3.0, 4.0, -1.0, - 3.0, 5.0, 1.0, - 5.0, 6.0, -3.0}; + 1.0, 2.0, + -1.0, 3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchDense::create( - this->exec, - std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, - gko::array::view(this->exec, 12, data), - std::vector{3, 3}); + auto m = gko::batch::matrix::BatchDense::create( + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::view(this->exec, 8, data)); ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); } @@ -178,23 +215,27 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) using size_type = gko::size_type; // clang-format off const value_type data[] = { - 1.0, 2.0, -1.0, - 3.0, 4.0, -1.0, - 3.0, 5.0, 1.0, - 5.0, 6.0, -3.0}; + 1.0, 2.0, + -1.0, 3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; // clang-format on auto m = gko::matrix::BatchDense::create_const( - this->exec, - std::vector>{gko::dim<2>{2, 2}, gko::dim<2>{2, 2}}, - gko::array::const_view(this->exec, 12, data), - std::vector{3, 3}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::const_view(this->exec, 8, data)); ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{5.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{-3.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); } @@ -203,20 +244,15 @@ TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 3, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + 
this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchDense::create( + auto m = gko::batch::multivector::create_from_dense( this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::matrix::BatchDense::create( - this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), - mat2.get(), mat1.get(), mat2.get()}); - auto m2 = - gko::matrix::BatchDense::create(this->exec, 3, m.get()); - GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); + this->assert_equal_to_original_mtx(m.get()); } @@ -297,19 +333,6 @@ TYPED_TEST(BatchDense, CanBeListConstructed) } -TYPED_TEST(BatchDense, CanBeListConstructedWithstride) -{ - using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( - std::vector{2}, {{1.0, 2.0}}, this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 1); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); - EXPECT_EQ(m->at(0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1), value_type{2.0}); -} - - TYPED_TEST(BatchDense, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; @@ -385,40 +408,6 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) } -TYPED_TEST(BatchDense, CanBeCopied) -{ - auto mtx_copy = gko::matrix::BatchDense::create(this->exec); - mtx_copy->copy_from(this->mtx.get()); - this->assert_equal_to_original_mtx(this->mtx.get()); - this->mtx->at(0, 0, 0) = 7; - this->mtx->at(0, 1) = 7; - this->assert_equal_to_original_mtx(mtx_copy.get()); -} - - -TYPED_TEST(BatchDense, CanBeMoved) -{ - auto mtx_copy = gko::matrix::BatchDense::create(this->exec); - mtx_copy->copy_from(std::move(this->mtx)); - this->assert_equal_to_original_mtx(mtx_copy.get()); -} - - -TYPED_TEST(BatchDense, CanBeCloned) -{ - auto mtx_clone = this->mtx->clone(); - this->assert_equal_to_original_mtx( - dynamic_castmtx.get())>(mtx_clone.get())); -} - - -TYPED_TEST(BatchDense, CanBeCleared) -{ - this->mtx->clear(); - this->assert_empty(this->mtx.get()); -} - - TYPED_TEST(BatchDense, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; @@ -515,6 +504,3 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixAssemblyData) EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); } - - -} // namespace diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index d91274526d3..43f35e55f62 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -52,6 +52,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace gko { namespace batch { +namespace matrix { + + +template +class BatchDense; + + +} + /** * MultiVector stores multiple vectors in a batched fashion and is useful @@ -81,21 +90,25 @@ class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, public EnableCreateMethod>, - public ConvertibleTo>> { + public ConvertibleTo>>, + public ConvertibleTo> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; friend class MultiVector>; + friend class matrix::BatchDense; public: using EnablePolymorphicAssignment::convert_to; using EnablePolymorphicAssignment::move_to; using ConvertibleTo>>::convert_to; using ConvertibleTo>>::move_to; + using ConvertibleTo>::convert_to; + using ConvertibleTo>::move_to; using value_type = ValueType; using index_type = int32; - using unbatch_type = matrix::Dense; + using unbatch_type = gko::matrix::Dense; using absolute_type = remove_complex>; using complex_type = to_complex>; @@ -113,6 +126,10 @@ class MultiVector void move_to(MultiVector>* result) override; + void convert_to(matrix::BatchDense* result) const override; + + void move_to(matrix::BatchDense* result) override; + /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. Does not perform any deep copies, but only returns a @@ -196,8 +213,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 47aff35b7e7..1b36cd64869 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -97,14 +97,7 @@ class BatchDense : public EnableBatchLinOp>, * @param other The other matrix whose configuration needs to copied. */ static std::unique_ptr create_with_config_of( - const BatchDense* other) - { - // De-referencing `other` before calling the functions (instead of - // using operator `->`) is currently required to be compatible with - // CUDA 10.1. - // Otherwise, it results in a compile error. - return (*other).create_with_same_config(); - } + ptr_param other); void convert_to( BatchDense>* result) const override; @@ -228,7 +221,7 @@ class BatchDense : public EnableBatchLinOp>, * array (if it resides on the same executor as the vector) or a copy of the * array on the correct executor. 
*/ - static std::unique_ptr> create_const( + static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values); From 989b17cd0ed518a75b5770d6f7a05385bcfce0b5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 3 Oct 2023 14:09:59 +0200 Subject: [PATCH 346/583] Add tests for BatchDense core --- core/matrix/batch_dense.cpp | 7 - core/test/base/batch_multi_vector.cpp | 2 +- core/test/matrix/CMakeLists.txt | 3 +- core/test/matrix/batch_dense.cpp | 231 +++++++++------------ core/test/utils/assertions.hpp | 8 +- include/ginkgo/core/matrix/batch_dense.hpp | 93 +++++---- 6 files changed, 159 insertions(+), 185 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 9f72a26c488..f5d255d901c 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -127,13 +127,6 @@ BatchDense::create_with_config_of( } -template -void BatchDense::set_size(const batch_dim<2>& value) noexcept -{ - batch_size_ = value; -} - - template std::unique_ptr> BatchDense::create_with_same_config() const diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 7bdaec30b27..8390a6c4327 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -412,9 +412,9 @@ TYPED_TEST(MultiVector, CanBeReadFromMatrixData) auto m = gko::batch::read>(this->exec, vec_data); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index 57c2c97e355..cca4b8da1c0 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -1,5 +1,4 @@ -# ginkgo_create_test(batch_dense) -# +ginkgo_create_test(batch_dense) ginkgo_create_test(coo) ginkgo_create_test(coo_builder) ginkgo_create_test(csr) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index a1ebdb1061c..f9210550bea 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -36,12 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include +#include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/batch_helpers.hpp" template @@ -63,11 +66,11 @@ class BatchDense : public ::testing::Test { static void assert_equal_to_original_mtx( - gko::matrix::BatchDense* m) + gko::batch::matrix::BatchDense* m) { - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 4)); + ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3)); EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); @@ -82,19 +85,26 @@ class BatchDense : public ::testing::Test { ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } - static void assert_empty(gko::matrix::BatchDense* m) + static void assert_empty(gko::batch::matrix::BatchDense* m) { - ASSERT_EQ(m->get_num_batch_entries(), 0); + ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); } std::shared_ptr exec; std::unique_ptr> mtx; + std::unique_ptr> dense_mtx; }; TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); +TYPED_TEST(BatchDense, KnowsItsSizeAndValues) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} + + TYPED_TEST(BatchDense, CanBeEmpty) { auto empty = gko::batch::matrix::BatchDense::create(this->exec); @@ -171,9 +181,9 @@ TYPED_TEST(BatchDense, CanBeConstructedWithSize) using size_type = gko::size_type; auto m = gko::batch::matrix::BatchDense::create( - this->exec, gko::batch_dim<2>> {2, gko::dim<2>{5, 3}}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3})); - ASSERT_EQ(m->get_num_batch_entries(), 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3)); ASSERT_EQ(m->get_num_stored_elements(), 30); } @@ -223,7 +233,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) 6.0, -3.0}; // clang-format on - auto m = gko::matrix::BatchDense::create_const( + auto m = gko::batch::matrix::BatchDense::create_const( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), gko::array::const_view(this->exec, 8, data)); @@ -239,17 +249,19 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchDense, CanBeConstructedFromBatchDenseMatrices) +TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::multivector::create_from_dense( + auto m = gko::batch::create_from_item< + gko::batch::matrix::BatchDense>( this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); @@ -261,34 +273,45 @@ TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; + auto mat1 = gko::initialize( 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::matrix::BatchDense::create( + auto bat_m = gko::batch::create_from_item< + gko::batch::matrix::BatchDense>( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); - auto m = - 
gko::matrix::BatchDense::create(this->exec, 3, mat1.get()); + auto m = gko::batch::create_from_item< + gko::batch::matrix::BatchDense>(this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) +TYPED_TEST(BatchDense, CanBeConstructedByDuplicatingBatchDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( - 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); + + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::matrix::BatchDense::create( + auto m = gko::batch::create_from_item< + gko::batch::matrix::BatchDense>( this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = gko::batch::create_from_item< + gko::batch::matrix::BatchDense>( + this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); - this->assert_equal_to_original_mtx(m.get()); + auto m2 = gko::batch::duplicate>( + this->exec, 3, m.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } @@ -302,30 +325,23 @@ TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto dense_mats = this->mtx->unbatch(); - + auto dense_mats = + gko::batch::unbatch>( + this->mtx.get()); GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.); } -TYPED_TEST(BatchDense, KnowsItsSizeAndValues) -{ - this->assert_equal_to_original_mtx(this->mtx.get()); -} - - TYPED_TEST(BatchDense, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + auto m = gko::batch::initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0), value_type{1}); EXPECT_EQ(m->at(0, 1), value_type{2}); EXPECT_EQ(m->at(1, 0), value_type{1}); @@ -336,12 +352,12 @@ TYPED_TEST(BatchDense, CanBeListConstructed) TYPED_TEST(BatchDense, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - auto m = gko::batch_initialize>( + + auto m = gko::batch::initialize>( 2, I({1.0, 2.0}), this->exec); - ASSERT_EQ(m->get_num_batch_entries(), 2); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 4); + + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); @@ -353,18 +369,13 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch_initialize>( + + auto m = gko::batch::initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, - {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, + {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, this->exec); - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); - 
ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); - ASSERT_EQ(m->get_stride().at(0), 3); - ASSERT_EQ(m->get_stride().at(1), 2); - EXPECT_EQ(m->get_num_stored_elements(), 15); - ASSERT_EQ(m->get_num_stored_elements(0), 9); - ASSERT_EQ(m->get_num_stored_elements(1), 6); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3)); EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{1.0}); EXPECT_EQ(m->at(0, 2), value_type{0.0}); @@ -372,72 +383,58 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructed) EXPECT_EQ(m->at(0, 4), value_type{4.0}); EXPECT_EQ(m->at(1, 0), value_type{1.0}); EXPECT_EQ(m->at(1, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 2), value_type{3.0}); - ASSERT_EQ(m->at(1, 3), value_type{4.0}); - EXPECT_EQ(m->at(1, 4), value_type{5.0}); + EXPECT_EQ(m->at(1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 3), value_type{3.0}); + EXPECT_EQ(m->at(1, 4), value_type{4.0}); } -TYPED_TEST(BatchDense, CanBeDoubleListConstructedWithstride) +TYPED_TEST(BatchDense, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - using T = value_type; - auto m = gko::batch_initialize>( - {4, 3}, - {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, - {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}}, - this->exec); + using index_type = int; - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(3, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(3, 2)); - ASSERT_EQ(m->get_stride().at(0), 4); - ASSERT_EQ(m->get_stride().at(1), 3); - EXPECT_EQ(m->get_num_stored_elements(), 21); - ASSERT_EQ(m->get_num_stored_elements(0), 12); - ASSERT_EQ(m->get_num_stored_elements(1), 9); - EXPECT_EQ(m->at(0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1), value_type{1.0}); - EXPECT_EQ(m->at(0, 2), value_type{0.0}); - ASSERT_EQ(m->at(0, 3), value_type{2.0}); - EXPECT_EQ(m->at(0, 4), value_type{4.0}); - EXPECT_EQ(m->at(1, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 2), value_type{3.0}); - ASSERT_EQ(m->at(1, 3), value_type{4.0}); - EXPECT_EQ(m->at(1, 4), value_type{5.0}); + auto vec_data = std::vector>{}; + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); + + auto m = gko::batch::read>( + this->exec, vec_data); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); + EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); } -TYPED_TEST(BatchDense, CanBeReadFromMatrixData) +TYPED_TEST(BatchDense, CanBeReadFromSparseMatrixData) { using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchDense::create(this->exec); - // clang-format off - m->read({gko::matrix_data{{2, 3}, - {{0, 0, 1.0}, - {0, 1, 3.0}, - {0, 2, 2.0}, - {1, 0, 0.0}, - {1, 1, 5.0}, - {1, 2, 0.0}}}, - gko::matrix_data{{2, 2}, - {{0, 0, -1.0}, - {0, 1, 0.5}, - {1, 0, 0.0}, - {1, 1, 9.0}}}}); - // clang-format on - - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 2)); - ASSERT_EQ(m->get_num_stored_elements(), 10); - ASSERT_EQ(m->get_num_stored_elements(0), 6); - ASSERT_EQ(m->get_num_stored_elements(1), 4); + using index_type = int; + auto vec_data = 
std::vector>{}; + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); + + auto m = gko::batch::read>( + this->exec, vec_data); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); - EXPECT_EQ(m->at(0, 1, 2), value_type{0.0}); EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); @@ -448,10 +445,12 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixData) TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; + using index_type = int; using tpl = typename gko::matrix_data::nonzero_type; - std::vector> data; - this->mtx->write(data); + auto data = gko::batch::write>( + this->mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); ASSERT_EQ(data[0].nonzeros.size(), 6); @@ -470,37 +469,3 @@ TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0})); EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0})); } - - -TYPED_TEST(BatchDense, CanBeReadFromMatrixAssemblyData) -{ - using value_type = typename TestFixture::value_type; - auto m = gko::matrix::BatchDense::create(this->exec); - gko::matrix_assembly_data data1(gko::dim<2>{2, 3}); - data1.set_value(0, 0, 1.0); - data1.set_value(0, 1, 3.0); - data1.set_value(0, 2, 2.0); - data1.set_value(1, 0, 0.0); - data1.set_value(1, 1, 5.0); - data1.set_value(1, 2, 0.0); - gko::matrix_assembly_data data2(gko::dim<2>{2, 1}); - data2.set_value(0, 0, 2.0); - data2.set_value(1, 0, 5.0); - auto data = std::vector>{data1, data2}; - - m->read(data); - - ASSERT_EQ(m->get_size().at(0), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size().at(1), gko::dim<2>(2, 1)); - ASSERT_EQ(m->get_num_stored_elements(), 8); - ASSERT_EQ(m->get_num_stored_elements(0), 6); - ASSERT_EQ(m->get_num_stored_elements(1), 2); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); - EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(0, 0, 2), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 2), value_type{0.0}); - EXPECT_EQ(m->at(1, 0, 0), value_type{2.0}); - EXPECT_EQ(m->at(1, 1, 0), value_type{5.0}); -} diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 63ed1e5423a..40034883078 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -720,12 +720,8 @@ ::testing::AssertionResult batch_matrices_near( using value_type1 = typename Mat1::value_type; using value_type2 = typename Mat2::value_type; - auto first_data = - gko::batch::write>(first); - auto second_data = - gko::batch::write>(second); + auto first_data = gko::batch::write(first); + auto second_data = gko::batch::write(second); if (first_data.size() != second_data.size()) { return ::testing::AssertionFailure() diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 1b36cd64869..55a1791a2a5 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -124,46 +124,75 @@ class BatchDense : public EnableBatchLinOp>, size_type 
item_id) const; /** - * Returns the batch size. + * Returns a pointer to the array of values of the multi-vector * - * @return the batch size + * @return the pointer to the array of values */ - batch_dim<2> get_size() const { return batch_size_; } + value_type* get_values() noexcept { return values_.get_data(); } /** - * Returns the number of batch items. + * @copydoc get_values() * - * @return the number of batch items + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. */ - size_type get_num_batch_items() const + const value_type* get_const_values() const noexcept { - return batch_size_.get_num_batch_items(); + return values_.get_const_data(); } /** - * Returns the common size of the batch items. + * Returns a single element for a particular batch item. + * + * @param batch_id the batch item index to be queried + * @param row the row of the requested element + * @param col the column of the requested element * - * @return the common size stored + * @note the method has to be called on the same Executor the vector is + * stored at (e.g. trying to call this method on a GPU multi-vector + * from the OMP results in a runtime error) */ - dim<2> get_common_size() const { return batch_size_.get_common_size(); } + value_type& at(size_type batch_id, size_type row, size_type col) + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_data()[linearize_index(batch_id, row, col)]; + } /** - * Returns a pointer to the array of values of the multi-vector - * - * @return the pointer to the array of values + * @copydoc MultiVector::at(size_type, size_type, size_type) */ - value_type* get_values() noexcept { return values_.get_data(); } + value_type at(size_type batch_id, size_type row, size_type col) const + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_const_data()[linearize_index(batch_id, row, col)]; + } /** - * @copydoc get_values() + * Returns a single element for a particular batch item. * - * @note This is the constant version of the function, which can be - * significantly more memory efficient than the non-constant version, - * so always prefer this version. + * Useful for iterating across all elements of the vector. + * However, it is less efficient than the two-parameter variant of this + * method. + * + * @param batch_id the batch item index to be queried + * @param idx a linear index of the requested element + * + * @note the method has to be called on the same Executor the vector is + * stored at (e.g. 
trying to call this method on a GPU multi-vector + * from the OMP results in a runtime error) */ - const value_type* get_const_values() const noexcept + ValueType& at(size_type batch_id, size_type idx) noexcept { - return values_.get_const_data(); + return values_.get_data()[linearize_index(batch_id, idx)]; + } + + /** + * @copydoc MultiVector::at(size_type, size_type, size_type) + */ + ValueType at(size_type batch_id, size_type idx) const noexcept + { + return values_.get_const_data()[linearize_index(batch_id, idx)]; } /** @@ -225,12 +254,6 @@ class BatchDense : public EnableBatchLinOp>, std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values); -private: - inline size_type compute_num_elems(const batch_dim<2>& size) - { - return size.get_cumulative_offset(size.get_num_batch_items()); - } - void apply(const MultiVector* b, MultiVector* x) const @@ -246,14 +269,13 @@ class BatchDense : public EnableBatchLinOp>, this->apply_impl(alpha, b, beta, x); } -protected: - /** - * Sets the size of the MultiVector. - * - * @param value the new size of the operator - */ - void set_size(const batch_dim<2>& value) noexcept; +private: + inline size_type compute_num_elems(const batch_dim<2>& size) + { + return size.get_cumulative_offset(size.get_num_batch_items()); + } +protected: /** * Creates an uninitialized BatchDense matrix of the specified size. * @@ -310,8 +332,8 @@ class BatchDense : public EnableBatchLinOp>, size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept { - return batch_size_.get_cumulative_offset(batch) + - row * batch_size_.get_common_size()[1] + col; + return this->get_size().get_cumulative_offset(batch) + + row * this->get_size().get_common_size()[1] + col; } size_type linearize_index(size_type batch, size_type idx) const noexcept @@ -321,7 +343,6 @@ class BatchDense : public EnableBatchLinOp>, } private: - batch_dim<2> batch_size_; array values_; }; From 12c652056dfcf0f49f0a66e78456afe3dfca6f00 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 3 Oct 2023 16:18:47 +0200 Subject: [PATCH 347/583] Add reference kernel tests --- core/matrix/batch_dense.cpp | 18 +- reference/test/matrix/CMakeLists.txt | 1 + reference/test/matrix/batch_dense_kernels.cpp | 219 ++++++++++++++++++ 3 files changed, 230 insertions(+), 8 deletions(-) create mode 100644 reference/test/matrix/batch_dense_kernels.cpp diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index f5d255d901c..c9da010c228 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -168,11 +168,12 @@ template void BatchDense::apply_impl(const MultiVector* b, MultiVector* x) const { - GKO_ASSERT_EQUAL_DIMENSIONS(b->get_common_size(), x->get_common_size()); GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - GKO_ASSERT_CONFORMANT(this->get_common_size(), x->get_common_size()); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); this->get_executor()->run(dense::make_simple_apply(this, b, x)); } @@ -183,13 +184,14 @@ void BatchDense::apply_impl(const MultiVector* alpha, const MultiVector* beta, MultiVector* x) const { - GKO_ASSERT_EQUAL_DIMENSIONS(b->get_common_size(), x->get_common_size()); 
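    // Shape contract encoded by the assertions in this hunk (per batch item):
    // if this matrix is n x k, then b must be k x r and x must be n x r,
    // while alpha and beta must be 1 x 1 scaling factors.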
GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - GKO_ASSERT_CONFORMANT(this->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_COLS(alpha->get_common_size(), gko::dim<2>(1, 1)); - GKO_ASSERT_EQUAL_COLS(beta->get_common_size(), gko::dim<2>(1, 1)); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1)); + GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1)); this->get_executor()->run( dense::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt index 9670a5df80c..18634de662d 100644 --- a/reference/test/matrix/CMakeLists.txt +++ b/reference/test/matrix/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_test(batch_dense_kernels) ginkgo_create_test(coo_kernels) ginkgo_create_test(csr_kernels) ginkgo_create_test(dense_kernels) diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp new file mode 100644 index 00000000000..7bf11ba70f9 --- /dev/null +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -0,0 +1,219 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/test/utils.hpp" + + +template +class BatchDense : public ::testing::Test { +protected: + using value_type = T; + using size_type = gko::size_type; + using Mtx = gko::batch::matrix::BatchDense; + using MVec = gko::batch::MultiVector; + using DenseMtx = gko::matrix::Dense; + using ComplexMtx = gko::to_complex; + using RealMtx = gko::remove_complex; + BatchDense() + : exec(gko::ReferenceExecutor::create()), + mtx_0(gko::batch::initialize( + {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, + {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, + exec)), + mtx_00(gko::initialize( + {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), + mtx_01(gko::initialize( + {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), + b_0(gko::batch::initialize( + {{I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), + I({1.0, 0.0, 2.0})}, + {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), + I({1.0, 0.0, 2.0})}}, + exec)), + b_00(gko::initialize( + {I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), + I({1.0, 0.0, 2.0})}, + exec)), + b_01(gko::initialize( + {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), + I({1.0, 0.0, 2.0})}, + exec)), + x_0(gko::batch::initialize( + {{I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, + {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}}, + exec)), + x_00(gko::initialize( + {I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, exec)), + x_01(gko::initialize( + {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}, exec)) + {} + + std::shared_ptr exec; + std::unique_ptr mtx_0; + std::unique_ptr mtx_00; + std::unique_ptr mtx_01; + std::unique_ptr b_0; + std::unique_ptr b_00; + std::unique_ptr b_01; + std::unique_ptr x_0; + std::unique_ptr x_00; + std::unique_ptr x_01; + + std::ranlux48 rand_engine; +}; + + +TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); + + +TYPED_TEST(BatchDense, AppliesToBatchMultiVector) +{ + using T = typename TestFixture::value_type; + + this->mtx_0->apply(this->b_0.get(), this->x_0.get()); + this->mtx_00->apply(this->b_00.get(), this->x_00.get()); + this->mtx_01->apply(this->b_01.get(), this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + +TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchMultiVector) +{ + using Mtx = typename TestFixture::Mtx; + using MVec = typename TestFixture::MVec; + using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({-1.0}, this->exec); + auto beta0 = gko::initialize({2.5}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), + this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), + this->x_00.get()); + this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), + this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultCols) +{ + using MVec = typename 
TestFixture::MVec; + auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); + + ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultRows) +{ + using MVec = typename TestFixture::MVec; + auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); + + ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + + ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, AdvancedApplyFailsOnWrongInnerDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + auto alpha = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + auto beta = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + + ASSERT_THROW( + this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(BatchDense, AdvancedApplyFailsOnWrongAlphaDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); + auto alpha = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); + auto beta = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + + ASSERT_THROW( + this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), + gko::DimensionMismatch); +} From 1b14a262935a267e30a8d24017786e8376a9e5da Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 3 Oct 2023 17:56:16 +0200 Subject: [PATCH 348/583] Add OMP tests and fix kernel --- core/matrix/batch_struct.hpp | 14 +- cuda/matrix/batch_dense_kernels.cu | 2 +- hip/matrix/batch_dense_kernels.hip.cpp | 2 +- omp/matrix/batch_dense_kernels.cpp | 38 ++---- reference/matrix/batch_dense_kernels.cpp | 32 ++--- reference/matrix/batch_dense_kernels.hpp.inc | 4 +- reference/test/matrix/batch_dense_kernels.cpp | 27 ++++ test/matrix/CMakeLists.txt | 1 + test/matrix/batch_dense_kernels.cpp | 129 ++++++++++++++++++ 9 files changed, 188 insertions(+), 61 deletions(-) create mode 100644 test/matrix/batch_dense_kernels.cpp diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index b6926b0894d..37c297bb6b5 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -54,7 +54,7 @@ struct batch_item { ValueType* values; int stride; int num_rows; - int num_rhs; + int num_cols; }; @@ -70,7 +70,7 @@ struct uniform_batch { size_type num_batch_items; int stride; int num_rows; - int num_rhs; + int num_cols; size_type get_entry_storage() const { @@ -86,7 +86,7 @@ template GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item to_const( const batch_dense::batch_item& b) { - return {b.values, b.stride, b.num_rows, b.num_rhs}; + return {b.values, b.stride, b.num_rows, b.num_cols}; } @@ -94,7 +94,7 @@ template GKO_ATTRIBUTES GKO_INLINE batch_dense::uniform_batch to_const( const batch_dense::uniform_batch& ub) { - return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; + return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_cols}; } @@ -104,16 +104,16 @@ GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( const size_type batch_idx) { 
return {batch.values + batch_idx * batch.stride * batch.num_rows, - batch.stride, batch.num_rows, batch.num_rhs}; + batch.stride, batch.num_rows, batch.num_cols}; } template GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( ValueType* const batch_values, const int stride, const int num_rows, - const int num_rhs, const size_type batch_idx) + const int num_cols, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, - num_rhs}; + num_cols}; } diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 4615af581f5..c0a172fd026 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" +// #include "cuda/matrix/batch_struct.hip.hpp" namespace gko { diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 93570388d50..06f0caf81ec 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -48,7 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" +// #include "hip/matrix/batch_struct.hip.hpp" namespace gko { diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index fe742bee402..a767215c844 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
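// Usage sketch for the batched apply that these host kernels back (types and
// sizes are illustrative only; it mirrors the tests added in this patch series):
//
//   auto A = gko::batch::matrix::BatchDense<double>::create(exec, mat_size);
//   auto b = gko::batch::MultiVector<double>::create(exec, vec_size);
//   auto x = gko::batch::MultiVector<double>::create(exec, vec_size);
//   A->apply(b.get(), x.get());  // dispatches to simple_apply, one item per loop iteration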
#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" @@ -70,7 +71,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_ub = host::get_batch_struct(mat); #pragma omp parallel for for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); simple_apply_kernel(mat_item, b_item, x_item); @@ -84,40 +85,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* a, + const batch::matrix::BatchDense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, - batch::MultiVector* c) + batch::MultiVector* x) { const auto b_ub = host::get_batch_struct(b); const auto x_ub = host::get_batch_struct(x); const auto mat_ub = host::get_batch_struct(mat); const auto alpha_ub = host::get_batch_struct(alpha); const auto beta_ub = host::get_batch_struct(beta); - if (alpha->get_num_batch_items() > 1) { - GKO_ASSERT(alpha->get_num_batch_items() == x->get_num_batch_items()); - GKO_ASSERT(beta->get_num_batch_items() == x->get_num_batch_items()); #pragma omp parallel for - for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); - const auto b_item = batch::extract_batch_item(b_ub, batch); - const auto x_item = batch::extract_batch_item(x_ub, batch); - const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); - const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); - } - } else { - const auto alpha_item = batch::extract_batch_item(alpha_ub, 0); - const auto beta_item = batch::extract_batch_item(beta_ub, 0); -#pragma omp parallel for - for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = batch::extract_batch_item(mat_ub, batch); - const auto b_item = batch::extract_batch_item(b_ub, batch); - const auto x_item = batch::extract_batch_item(x_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); - } + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); } } diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index bb5f3e18df7..f42d9a81d1f 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -95,30 +95,14 @@ void advanced_apply(std::shared_ptr exec, const auto mat_ub = host::get_batch_struct(mat); const auto alpha_ub = host::get_batch_struct(alpha); const auto beta_ub = host::get_batch_struct(beta); - if (alpha->get_num_batch_items() > 1) { - GKO_ASSERT(alpha->get_num_batch_items() == 
x->get_num_batch_items()); - GKO_ASSERT(beta->get_num_batch_items() == x->get_num_batch_items()); - for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = - batch::matrix::extract_batch_item(mat_ub, batch); - const auto b_item = batch::extract_batch_item(b_ub, batch); - const auto x_item = batch::extract_batch_item(x_ub, batch); - const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); - const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); - } - } else { - const auto alpha_item = batch::extract_batch_item(alpha_ub, 0); - const auto beta_item = batch::extract_batch_item(beta_ub, 0); - for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { - const auto mat_item = - batch::matrix::extract_batch_item(mat_ub, batch); - const auto b_item = batch::extract_batch_item(b_ub, batch); - const auto x_item = batch::extract_batch_item(x_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); - } + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); } } diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc index d45183b2faa..bff9ad137cf 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp.inc @@ -43,7 +43,7 @@ inline void simple_apply_kernel( } for (int row = 0; row < c.num_rows; ++row) { - for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int inner = 0; inner < a.num_cols; ++inner) { for (int col = 0; col < c.num_rhs; ++col) { c.values[row * c.stride + col] += a.values[row * a.stride + inner] * @@ -77,7 +77,7 @@ inline void advanced_apply_kernel( } for (int row = 0; row < c.num_rows; ++row) { - for (int inner = 0; inner < a.num_rhs; ++inner) { + for (int inner = 0; inner < a.num_cols; ++inner) { for (int col = 0; col < c.num_rhs; ++col) { c.values[row * c.stride + col] += alpha * a.values[row * a.stride + inner] * diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index 7bf11ba70f9..8e2e522e5f4 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -129,6 +129,33 @@ TYPED_TEST(BatchDense, AppliesToBatchMultiVector) } +TYPED_TEST(BatchDense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) +{ + using Mtx = typename TestFixture::Mtx; + using MVec = typename TestFixture::MVec; + using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch::initialize(2, {1.5}, this->exec); + auto beta = gko::batch::initialize(2, {-4.0}, this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({1.5}, this->exec); + auto beta0 = gko::initialize({-4.0}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), + this->x_0.get()); + 
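    // Reference results: apply the same alpha/beta to each batch item through
    // the unbatched Dense matrices, then compare with the batched result.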
this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), + this->x_00.get()); + this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), + this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchMultiVector) { using Mtx = typename TestFixture::Mtx; diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index a9cf267a3c8..91987f3717f 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -1,3 +1,4 @@ +ginkgo_create_common_test(batch_dense_kernels DISABLE_EXECUTORS dpcpp hip cuda) ginkgo_create_common_device_test(csr_kernels) ginkgo_create_common_test(csr_kernels2) ginkgo_create_common_test(coo_kernels) diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp new file mode 100644 index 00000000000..60ef4d61a95 --- /dev/null +++ b/test/matrix/batch_dense_kernels.cpp @@ -0,0 +1,129 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "core/base/batch_utilities.hpp" +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" +#include "core/test/utils/batch_helpers.hpp" +#include "test/utils/executor.hpp" + + +class BatchDense : public CommonTestFixture { +protected: + using vtype = double; + using Mtx = gko::batch::matrix::BatchDense; + using MVec = gko::batch::MultiVector; + + BatchDense() : rand_engine(15) {} + + template + std::unique_ptr gen_mtx(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) + { + return gko::test::generate_random_batch_matrix( + num_batch_items, num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(gko::size_type num_vecs = 1) + { + const int num_rows = 252; + const int num_cols = 32; + x = gen_mtx(batch_size, num_rows, num_cols); + y = gen_mtx(batch_size, num_cols, num_vecs); + alpha = gen_mtx(batch_size, 1, 1); + beta = gen_mtx(batch_size, 1, 1); + dx = gko::clone(exec, x); + dy = gko::clone(exec, y); + dalpha = gko::clone(exec, alpha); + dbeta = gko::clone(exec, beta); + expected = MVec::create( + ref, + gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs})); + expected->fill(gko::one()); + dresult = gko::clone(exec, expected); + } + + std::ranlux48 rand_engine; + + const size_t batch_size = 11; + std::unique_ptr x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(BatchDense, SingleVectorApplyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(BatchDense, SingleVectorAdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); +} From 3273d69a756dde47447e59fb20040f85c9efe581 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Wed, 4 Oct 2023 09:41:28 +0000 Subject: [PATCH 349/583] Format files Co-authored-by: Pratik Nayak --- include/ginkgo/core/base/batch_multi_vector.hpp | 4 ++-- include/ginkgo/core/matrix/batch_dense.hpp | 4 ++-- include/ginkgo/ginkgo.hpp | 1 + test/matrix/batch_dense_kernels.cpp | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 43f35e55f62..6b3b207c76c 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -213,8 +213,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. 
*/ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 55a1791a2a5..0457f444c5a 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -217,8 +217,8 @@ class BatchDense : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index aed3b5f3572..8bb29242e88 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -108,6 +108,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index 60ef4d61a95..7d44f29899c 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include "core/matrix/batch_dense_kernels.hpp" #include @@ -43,10 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/base/batch_utilities.hpp" -#include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" From 579b3e702c26e0f58d6f0dddfab7d42ffd171a3f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 4 Oct 2023 11:49:01 +0200 Subject: [PATCH 350/583] circ dep and typo fixes --- core/test/matrix/batch_dense.cpp | 2 +- reference/matrix/batch_struct.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index f9210550bea..02788e14b7d 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -153,7 +153,7 @@ TYPED_TEST(BatchDense, CanBeMoved) auto mtx_copy = gko::batch::matrix::BatchDense::create(this->exec); - mtx_copy->copy_from(std::move(this->mtx)); + this->mtx->move_to(mtx_copy); this->assert_equal_to_original_mtx(mtx_copy.get()); } diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 1bed5a4e5c9..dee7c71948a 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -36,9 +36,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
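// The includes added in this hunk appear to be the circular-dependency fix
// that the commit subject ("circ dep and typo fixes") refers to.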
#include #include +#include #include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" namespace gko { From e97d62824cc9eb85be7aa65751d7118e38d810c3 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 4 Oct 2023 15:04:36 +0200 Subject: [PATCH 351/583] Add CUDA, HIP kernels and tests Co-authored-by: Aditya Kashi Co-authored-by: Isha Aggarwal --- .../batch_dense_kernel_launcher.hpp.inc | 78 ++++++++ .../matrix/batch_dense_kernels.hpp.inc | 170 ++++++++++++++++++ cuda/base/batch_multi_vector_kernels.cu | 1 + cuda/matrix/batch_dense_kernels.cu | 26 +-- cuda/matrix/batch_struct.hpp | 96 ++++++++++ hip/matrix/batch_dense_kernels.hip.cpp | 28 +-- hip/matrix/batch_struct.hip.hpp | 96 ++++++++++ test/matrix/CMakeLists.txt | 2 +- 8 files changed, 459 insertions(+), 38 deletions(-) create mode 100644 common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc create mode 100644 common/cuda_hip/matrix/batch_dense_kernels.hpp.inc create mode 100644 cuda/matrix/batch_struct.hpp create mode 100644 hip/matrix/batch_struct.hip.hpp diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..668b0278680 --- /dev/null +++ b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::BatchDense* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, + beta_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc new file mode 100644 index 00000000000..43046166abc --- /dev/null +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -0,0 +1,170 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +__device__ __forceinline__ void simple_apply( + const gko::batch::matrix::batch_dense::batch_item& mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x) +{ + constexpr auto tile_size = config::warp_size; + + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); + const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + + for (int row = subwarp_grp_id; row < mat.num_rows; + row += num_subwarp_grps_per_block) { + ValueType temp = zero(); + for (int j = subwarp_grp.thread_rank(); j < mat.num_cols; + j += subwarp_grp.size()) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += val * b[j]; + } + +#pragma unroll + for (int i = static_cast(tile_size) / 2; i > 0; i /= 2) { + temp += subwarp_grp.shfl_down(temp, i); + } + + if (subwarp_grp.thread_rank() == 0) { + x[row] = temp; + } + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: + batch_dense:: + uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + simple_apply(mat_b, b_b.values, x_b.values); + } +} + + +template +__device__ __forceinline__ void advanced_apply( + const ValueType alpha, + const gko::batch::matrix::batch_dense::batch_item& mat, + const ValueType* const __restrict__ b, const ValueType beta, + ValueType* const __restrict__ x) +{ + constexpr auto tile_size = config::warp_size; + + auto thread_block = group::this_thread_block(); + auto subwarp_grp = group::tiled_partition(thread_block); + const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); + const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + + for (int row = subwarp_grp_id; row < mat.num_rows; + row += num_subwarp_grps_per_block) { + ValueType temp = zero(); + for (int j = subwarp_grp.thread_rank(); j < mat.num_cols; + j += subwarp_grp.size()) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += alpha * val * b[j]; + } + +#pragma unroll + for (int i = static_cast(tile_size) / 2; i > 0; i /= 2) { + temp += subwarp_grp.shfl_down(temp, i); + } + + if (subwarp_grp.thread_rank() == 0) { + x[row] = temp + beta * x[row]; + } + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void advanced_apply_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::matrix:: + batch_dense:: + uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + beta, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = 
gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto beta_b = gko::batch::extract_batch_item(beta, batch_id); + advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values); + } +} diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 7729d006b75..5c4d1f5bdc5 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -78,6 +78,7 @@ constexpr int sm_oversubscription = 4; // clang-format on + } // namespace batch_multi_vector } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index c0a172fd026..9d9cfcf6c8e 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" @@ -45,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -// #include "cuda/matrix/batch_struct.hip.hpp" +#include "cuda/matrix/batch_struct.hpp" namespace gko { @@ -60,29 +61,18 @@ namespace batch_dense { constexpr auto default_block_size = 256; -constexpr int sm_multiplier = 4; +constexpr int sm_oversubscription = 4; +// clang-format off -template -void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, - const batch::MultiVector* b, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -template -void advanced_apply(std::shared_ptr exec, - const batch::MultiVector* alpha, - const batch::matrix::BatchDense* a, - const batch::MultiVector* b, - const batch::MultiVector* beta, - batch::MultiVector* c) GKO_NOT_IMPLEMENTED; +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); +// clang-format on } // namespace batch_dense diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp new file mode 100644 index 00000000000..202eb91a366 --- /dev/null +++ b/cuda/matrix/batch_struct.hpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the required CUDA scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch> +get_batch_struct(const batch::matrix::BatchDense* const op) +{ + return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch> +get_batch_struct(batch::matrix::BatchDense* const op) +{ + return {as_cuda_type(op->get_values()), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 06f0caf81ec..51f2237826b 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -40,7 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" @@ -48,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
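// From this commit on, the HIP backend shares its batch dense kernels with
// CUDA: the GKO_NOT_IMPLEMENTED stubs are replaced by including the common
// device kernels and launchers from common/cuda_hip (see the hunks below).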
#include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" -// #include "hip/matrix/batch_struct.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" namespace gko { @@ -63,30 +65,18 @@ namespace batch_dense { constexpr auto default_block_size = 256; -constexpr int sm_multiplier = 4; +constexpr int sm_oversubscription = 4; +// clang-format off -template -void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, - const batch::MultiVector* b, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -template -void advanced_apply(std::shared_ptr exec, - const batch::MultiVector* alpha, - const batch::matrix::BatchDense* a, - const batch::MultiVector* b, - const batch::MultiVector* beta, - batch::MultiVector* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" +// clang-format on } // namespace batch_dense } // namespace hip diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp new file mode 100644 index 00000000000..0d5dfb46a1b --- /dev/null +++ b/hip/matrix/batch_struct.hip.hpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/config.hpp" +#include "hip/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the required HIP scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch> +get_batch_struct(const batch::matrix::BatchDense* const op) +{ + return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch> +get_batch_struct(batch::matrix::BatchDense* const op) +{ + return {as_hip_type(op->get_values()), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index 91987f3717f..a89abfb4ae4 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_test(batch_dense_kernels DISABLE_EXECUTORS dpcpp hip cuda) +ginkgo_create_common_test(batch_dense_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_device_test(csr_kernels) ginkgo_create_common_test(csr_kernels2) ginkgo_create_common_test(coo_kernels) From 0d0b1191039424f0a3b92419e855f9e2a162b28f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 4 Oct 2023 15:36:19 +0200 Subject: [PATCH 352/583] Add SYCL kernels and tests WIP Co-authored-by: Phuong Nguyen --- .../matrix/batch_dense_kernels.hpp.inc | 12 +-- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 55 +++++------ dpcpp/matrix/batch_dense_kernels.dp.cpp | 99 ++++++++++++++++++- dpcpp/matrix/batch_dense_kernels.hpp.inc | 91 +++++++++++++++++ dpcpp/matrix/batch_struct.hpp | 94 ++++++++++++++++++ test/matrix/CMakeLists.txt | 2 +- 6 files changed, 311 insertions(+), 42 deletions(-) create mode 100644 dpcpp/matrix/batch_dense_kernels.hpp.inc create mode 100644 dpcpp/matrix/batch_struct.hpp diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc index 43046166abc..6cae08eadb5 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -52,10 +52,8 @@ __device__ __forceinline__ void simple_apply( temp += val * b[j]; } -#pragma unroll - for (int i = static_cast(tile_size) / 2; i > 0; i /= 2) { - temp += subwarp_grp.shfl_down(temp, i); - } + // subgroup level reduction + temp = reduce(subgroup, temp, thrust::plus{}); if (subwarp_grp.thread_rank() == 0) { x[row] = temp; @@ -116,10 +114,8 @@ __device__ __forceinline__ void advanced_apply( temp += alpha * val * b[j]; } -#pragma unroll - for (int i = 
static_cast(tile_size) / 2; i > 0; i /= 2) { - temp += subwarp_grp.shfl_down(temp, i); - } + // subgroup level reduction + temp = reduce(subgroup, temp, thrust::plus{}); if (subwarp_grp.thread_rank() == 0) { x[row] = temp + beta * x[row]; diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 10e47ba080e..12648b81e00 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -37,11 +37,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include -#include +#include "core/base/batch_struct.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" @@ -193,9 +194,9 @@ void compute_dot(std::shared_ptr exec, // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::extract_batch_item(x_ub, group_id); @@ -231,19 +232,18 @@ void compute_conj_dot(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto y_b = batch::extract_batch_item(y_ub, group_id); - const auto res_b = - batch::extract_batch_item(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return conj(val); }); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); + const auto res_b = batch::extract_batch_item(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); + }); }); } @@ -268,17 +268,16 @@ void compute_norm2(std::shared_ptr exec, const dim3 grid(num_batches); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = - batch::extract_batch_item(x_ub, group_id); - const auto res_b = batch::extract_batch_item( - res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto res_b = batch::extract_batch_item(res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); }); } diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 
964bf094077..118d46d81a5 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -40,8 +40,24 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include -#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" namespace gko { @@ -55,11 +71,46 @@ namespace dpcpp { namespace batch_dense { +#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" + + template void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* a, + const batch::matrix::BatchDense* mat, const batch::MultiVector* b, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const size_type num_rows = x->get_common_size()[0]; + const size_type num_cols = x->get_common_size()[1]; + + const auto num_batch_items = x->get_num_batch_items(); + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batch_items); + const auto x_ub = get_batch_struct(x); + const auto b_ub = get_batch_struct(b); + const auto mat_ub = get_batch_struct(mat); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + + // Launch a kernel that has nbatches blocks, each block has max group size + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b.values, x_b.values, item_ct1); + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); @@ -68,10 +119,48 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* a, + const batch::matrix::BatchDense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, - batch::MultiVector* c) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const auto mat_ub = get_batch_struct(mat); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + + const auto num_batch_items = mat_ub.num_batch_items; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batch_items); + + // Launch a kernel that has nbatches blocks, each block has max group size + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + 
batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, + beta_b.values[0], x_b.values, item_ct1); + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp.inc new file mode 100644 index 00000000000..ba528ac31a4 --- /dev/null +++ b/dpcpp/matrix/batch_dense_kernels.hpp.inc @@ -0,0 +1,91 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +__dpct_inline__ void simple_apply_kernel( + const gko::batch::matrix::batch_dense::batch_item& mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x, + sycl::nd_item<3>& item_ct1) +{ + constexpr auto tile_size = config::warp_size; + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto subgroup = static_cast(subg); + const int subgroup_id = subgroup.get_group_id(); + const int subgroup_size = subgroup.get_local_range().size(); + const int num_subgroup = subgroup.get_group_range().size(); + + for (int row = subgroup_id; row < mat.num_rows; row += num_subgroup) { + ValueType temp = zero(); + for (int j = subgroup.get_local_id(); j < mat.num_cols; + j += subgroup_size) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += val * b[j]; + } + temp = ::gko::kernels::dpcpp::reduce( + subg, temp, [](ValueType v1, ValueType v2) { return v1 + v2; }); + if (subgroup.get_local_id() == 0) { + x[row] = temp; + } + } +} + + +template +__dpct_inline__ void advanced_apply_kernel( + const ValueType alpha, + const gko::batch::matrix::batch_dense::batch_item& mat, + const ValueType* const __restrict__ b, const ValueType beta, + ValueType* const __restrict__ x, sycl::nd_item<3>& item_ct1) +{ + constexpr auto tile_size = config::warp_size; + auto subg = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto subgroup = static_cast(subg); + const int subgroup_id = subgroup.get_group_id(); + const int subgroup_size = subgroup.get_local_range().size(); + const int num_subgroup = subgroup.get_group_range().size(); + + for (int row = subgroup_id; row < mat.num_rows; row += num_subgroup) { + ValueType temp = zero(); + for (int j = subgroup.get_local_id(); j < mat.num_cols; + j += subgroup_size) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += alpha * val * b[j]; + } + temp = ::gko::kernels::dpcpp::reduce( + subg, temp, [](ValueType v1, ValueType v2) { return v1 + v2; }); + if (subgroup.get_local_id() == 0) { + x[row] = temp + beta * x[row]; + } + } +} diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp new file mode 100644 index 00000000000..dd8c1bbbab6 --- /dev/null +++ b/dpcpp/matrix/batch_struct.hpp @@ -0,0 +1,94 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ + + +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/config.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +/** @file batch_struct.hpp + * + * Helper functions to generate a batch struct from a batch LinOp, + * while also shallow-casting to the required DPCPP scalar type. + * + * A specialization is needed for every format of every kind of linear algebra + * object. These are intended to be called on the host. + */ + + +/** + * Generates an immutable uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch +get_batch_struct(const batch::matrix::BatchDense* const op) +{ + return {op->get_const_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +/** + * Generates a uniform batch struct from a batch of multi-vectors. + */ +template +inline batch::matrix::batch_dense::uniform_batch get_batch_struct( + batch::matrix::BatchDense* const op) +{ + return {op->get_values(), op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index a89abfb4ae4..9f3b17cd858 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -1,4 +1,4 @@ -ginkgo_create_common_test(batch_dense_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_test(batch_dense_kernels) ginkgo_create_common_device_test(csr_kernels) ginkgo_create_common_test(csr_kernels2) ginkgo_create_common_test(coo_kernels) From f4b9b869063229544690c6fc99e46ae1bfe75e8a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 5 Oct 2023 13:38:18 +0200 Subject: [PATCH 353/583] HIP and CUDA thrust fixes --- .../matrix/batch_dense_kernels.hpp.inc | 32 +++++++++---------- cuda/matrix/batch_dense_kernels.cu | 5 +++ hip/matrix/batch_dense_kernels.hip.cpp | 5 ++- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc index 6cae08eadb5..2f876332ae7 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -39,15 +39,15 @@ __device__ __forceinline__ void simple_apply( constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); - auto subwarp_grp = group::tiled_partition(thread_block); - const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); - const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); 
+ auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); - for (int row = subwarp_grp_id; row < mat.num_rows; - row += num_subwarp_grps_per_block) { + for (int row = subgroup_id; row < mat.num_rows; + row += num_subgroups_per_block) { ValueType temp = zero(); - for (int j = subwarp_grp.thread_rank(); j < mat.num_cols; - j += subwarp_grp.size()) { + for (int j = subgroup.thread_rank(); j < mat.num_cols; + j += subgroup.size()) { const ValueType val = mat.values[row * mat.stride + j]; temp += val * b[j]; } @@ -55,7 +55,7 @@ __device__ __forceinline__ void simple_apply( // subgroup level reduction temp = reduce(subgroup, temp, thrust::plus{}); - if (subwarp_grp.thread_rank() == 0) { + if (subgroup.thread_rank() == 0) { x[row] = temp; } } @@ -101,15 +101,15 @@ __device__ __forceinline__ void advanced_apply( constexpr auto tile_size = config::warp_size; auto thread_block = group::this_thread_block(); - auto subwarp_grp = group::tiled_partition(thread_block); - const auto subwarp_grp_id = static_cast(threadIdx.x / tile_size); - const int num_subwarp_grps_per_block = ceildiv(blockDim.x, tile_size); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); - for (int row = subwarp_grp_id; row < mat.num_rows; - row += num_subwarp_grps_per_block) { + for (int row = subgroup_id; row < mat.num_rows; + row += num_subgroups_per_block) { ValueType temp = zero(); - for (int j = subwarp_grp.thread_rank(); j < mat.num_cols; - j += subwarp_grp.size()) { + for (int j = subgroup.thread_rank(); j < mat.num_cols; + j += subgroup.size()) { const ValueType val = mat.values[row * mat.stride + j]; temp += alpha * val * b[j]; } @@ -117,7 +117,7 @@ __device__ __forceinline__ void advanced_apply( // subgroup level reduction temp = reduce(subgroup, temp, thrust::plus{}); - if (subwarp_grp.thread_rank() == 0) { + if (subgroup.thread_rank() == 0) { x[row] = temp + beta * x[row]; } } diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 9d9cfcf6c8e..28d61f70731 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_dense_kernels.hpp" +#include +#include + + #include @@ -42,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/thrust.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 51f2237826b..20c46736026 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -34,6 +34,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include #include @@ -42,10 +44,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/thrust.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" From ac61ccdf665447239a1e1273afbfb87673220496 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 5 Oct 2023 15:49:46 +0200 Subject: [PATCH 354/583] SYCL kernel fixes --- dpcpp/matrix/batch_dense_kernels.dp.cpp | 20 +++++++++------ dpcpp/matrix/batch_dense_kernels.hpp.inc | 31 +++++++++++++++--------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 118d46d81a5..7f3dca70a32 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -80,10 +80,10 @@ void simple_apply(std::shared_ptr exec, const batch::MultiVector* b, batch::MultiVector* x) { - const size_type num_rows = x->get_common_size()[0]; - const size_type num_cols = x->get_common_size()[1]; + const size_type num_rows = mat->get_common_size()[0]; + const size_type num_cols = mat->get_common_size()[1]; - const auto num_batch_items = x->get_num_batch_items(); + const auto num_batch_items = mat->get_num_batch_items(); auto device = exec->get_queue()->get_device(); auto group_size = device.get_info(); @@ -100,14 +100,16 @@ void simple_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, item_ct1); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); }); }); } @@ -145,7 +147,9 @@ void advanced_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto mat_b = @@ -156,8 +160,8 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, item_ct1); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); }); }); } diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp.inc index ba528ac31a4..dacd31feade 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp.inc +++ b/dpcpp/matrix/batch_dense_kernels.hpp.inc @@ -33,7 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template __dpct_inline__ void simple_apply_kernel( const gko::batch::matrix::batch_dense::batch_item& mat, - const ValueType* const __restrict__ b, ValueType* const __restrict__ x, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1) { constexpr auto tile_size = config::warp_size; @@ -42,19 +43,21 @@ __dpct_inline__ void simple_apply_kernel( const auto subgroup = static_cast(subg); const int subgroup_id = subgroup.get_group_id(); const int subgroup_size = subgroup.get_local_range().size(); - const int num_subgroup = subgroup.get_group_range().size(); + const int num_subgroups = subgroup.get_group_range().size(); - for (int row = subgroup_id; row < mat.num_rows; row += num_subgroup) { + for (int row = subgroup_id; row < mat.num_rows; row += num_subgroups) { ValueType temp = zero(); for (int j = subgroup.get_local_id(); j < mat.num_cols; j += subgroup_size) { const ValueType val = mat.values[row * mat.stride + j]; - temp += val * b[j]; + temp += val * b.values[j]; } + temp = ::gko::kernels::dpcpp::reduce( - subg, temp, [](ValueType v1, ValueType v2) { return v1 + v2; }); + subg, temp, [](ValueType a, ValueType b) { return a + b; }); + if (subgroup.get_local_id() == 0) { - x[row] = temp; + x.values[row] = temp; } } } @@ -62,10 +65,12 @@ __dpct_inline__ void simple_apply_kernel( template __dpct_inline__ void advanced_apply_kernel( - const ValueType alpha, + const gko::batch::multi_vector::batch_item& alpha, const gko::batch::matrix::batch_dense::batch_item& mat, - const ValueType* const __restrict__ b, const ValueType beta, - ValueType* const __restrict__ x, sycl::nd_item<3>& item_ct1) + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& beta, + const gko::batch::multi_vector::batch_item& x, + sycl::nd_item<3>& item_ct1) { constexpr auto tile_size = config::warp_size; auto subg = @@ -80,12 +85,14 @@ __dpct_inline__ void advanced_apply_kernel( for (int j = subgroup.get_local_id(); j < mat.num_cols; j += subgroup_size) { const ValueType val = mat.values[row * mat.stride + j]; - temp += alpha * val * b[j]; + temp += alpha.values[0] * val * b.values[j]; } + temp = ::gko::kernels::dpcpp::reduce( - subg, temp, [](ValueType v1, ValueType v2) { return v1 + v2; }); + subg, temp, [](ValueType a, ValueType b) { return a + b; }); + if (subgroup.get_local_id() == 0) { - x[row] = temp + beta * x[row]; + x.values[row] = temp + beta.values[0] * x.values[row]; } } } From f73196036f405cc0c13f862cc58843e88cf89cb2 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 5 Oct 2023 16:00:30 +0200 Subject: [PATCH 355/583] BatchDense -> batch::Dense --- .../batch_dense_kernel_launcher.hpp.inc | 4 +- core/base/batch_multi_vector.cpp | 7 +- core/matrix/batch_dense.cpp | 48 +++---- core/matrix/batch_dense_kernels.hpp | 4 +- core/test/matrix/batch_dense.cpp | 128 +++++++++--------- cuda/matrix/batch_dense_kernels.cu | 2 +- cuda/matrix/batch_struct.hpp | 4 +- dpcpp/matrix/batch_dense_kernels.dp.cpp | 6 +- dpcpp/matrix/batch_struct.hpp | 4 +- hip/matrix/batch_dense_kernels.hip.cpp | 2 +- hip/matrix/batch_struct.hip.hpp | 8 +- .../ginkgo/core/base/batch_multi_vector.hpp | 14 +- include/ginkgo/core/matrix/batch_dense.hpp | 59 ++++---- omp/matrix/batch_dense_kernels.cpp | 8 +- reference/matrix/batch_dense_kernels.cpp | 8 +- reference/matrix/batch_struct.hpp | 4 +- reference/test/matrix/batch_dense_kernels.cpp | 24 ++-- test/matrix/batch_dense_kernels.cpp | 10 +- 18 files changed, 169 insertions(+), 175 
deletions(-) diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc index 668b0278680..23ae8ebd5f0 100644 --- a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, batch::MultiVector* x) { @@ -55,7 +55,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, batch::MultiVector* x) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index f6884ef523b..294fe45972a 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -292,12 +292,11 @@ void MultiVector::move_to( template -void MultiVector::convert_to( - matrix::BatchDense* result) const +void MultiVector::convert_to(matrix::Dense* result) const { auto exec = result->get_executor() != nullptr ? result->get_executor() : this->get_executor(); - auto tmp = gko::batch::matrix::BatchDense::create_const( + auto tmp = gko::batch::matrix::Dense::create_const( exec, this->get_size(), make_const_array_view(exec, this->get_num_stored_elements(), this->get_const_values())); @@ -306,7 +305,7 @@ void MultiVector::convert_to( template -void MultiVector::move_to(matrix::BatchDense* result) +void MultiVector::move_to(matrix::Dense* result) { this->convert_to(result); } diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index c9da010c228..75f29bc6b4c 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -84,7 +84,7 @@ batch_dim<2> compute_batch_size( template std::unique_ptr> -BatchDense::create_view_for_item(size_type item_id) +Dense::create_view_for_item(size_type item_id) { auto exec = this->get_executor(); auto num_rows = this->get_common_size()[0]; @@ -100,7 +100,7 @@ BatchDense::create_view_for_item(size_type item_id) template std::unique_ptr> -BatchDense::create_const_view_for_item(size_type item_id) const +Dense::create_const_view_for_item(size_type item_id) const { auto exec = this->get_executor(); auto num_rows = this->get_common_size()[0]; @@ -115,9 +115,8 @@ BatchDense::create_const_view_for_item(size_type item_id) const template -std::unique_ptr> -BatchDense::create_with_config_of( - ptr_param> other) +std::unique_ptr> Dense::create_with_config_of( + ptr_param> other) { // De-referencing `other` before calling the functions (instead of // using operator `->`) is currently required to be compatible with @@ -128,23 +127,21 @@ BatchDense::create_with_config_of( template -std::unique_ptr> -BatchDense::create_with_same_config() const +std::unique_ptr> Dense::create_with_same_config() + const { - return BatchDense::create(this->get_executor(), - this->get_size()); + return Dense::create(this->get_executor(), this->get_size()); } template -std::unique_ptr> -BatchDense::create_const( +std::unique_ptr> Dense::create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values) { // cast const-ness away, but return a const object afterwards, // so we can ensure that no modifications take place. 
- return std::unique_ptr(new BatchDense{ + return std::unique_ptr(new Dense{ exec, sizes, gko::detail::array_const_cast(std::move(values))}); } @@ -157,16 +154,16 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template -BatchDense::BatchDense(std::shared_ptr exec, - const batch_dim<2>& size) - : EnableBatchLinOp>(exec, size), +Dense::Dense(std::shared_ptr exec, + const batch_dim<2>& size) + : EnableBatchLinOp>(exec, size), values_(exec, compute_num_elems(size)) {} template -void BatchDense::apply_impl(const MultiVector* b, - MultiVector* x) const +void Dense::apply_impl(const MultiVector* b, + MultiVector* x) const { GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); @@ -179,10 +176,10 @@ void BatchDense::apply_impl(const MultiVector* b, template -void BatchDense::apply_impl(const MultiVector* alpha, - const MultiVector* b, - const MultiVector* beta, - MultiVector* x) const +void Dense::apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const { GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); @@ -198,8 +195,8 @@ void BatchDense::apply_impl(const MultiVector* alpha, template -void BatchDense::convert_to( - BatchDense>* result) const +void Dense::convert_to( + Dense>* result) const { result->values_ = this->values_; result->set_size(this->get_size()); @@ -207,14 +204,13 @@ void BatchDense::convert_to( template -void BatchDense::move_to( - BatchDense>* result) +void Dense::move_to(Dense>* result) { this->convert_to(result); } -#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class BatchDense<_type> +#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX); diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp index 7f814e08b50..cb46b7291b8 100644 --- a/core/matrix/batch_dense_kernels.hpp +++ b/core/matrix/batch_dense_kernels.hpp @@ -51,14 +51,14 @@ namespace kernels { #define GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(_type) \ void simple_apply(std::shared_ptr exec, \ - const batch::matrix::BatchDense<_type>* a, \ + const batch::matrix::Dense<_type>* a, \ const batch::MultiVector<_type>* b, \ batch::MultiVector<_type>* c) #define GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(_type) \ void advanced_apply(std::shared_ptr exec, \ const batch::MultiVector<_type>* alpha, \ - const batch::matrix::BatchDense<_type>* a, \ + const batch::matrix::Dense<_type>* a, \ const batch::MultiVector<_type>* b, \ const batch::MultiVector<_type>* beta, \ batch::MultiVector<_type>* c) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 02788e14b7d..6afe13a50af 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -48,15 +48,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template -class BatchDense : public ::testing::Test { +class Dense : public ::testing::Test { protected: using value_type = T; using DenseMtx = gko::matrix::Dense; using size_type = gko::size_type; - BatchDense() + Dense() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch::initialize< - gko::batch::matrix::BatchDense>( + mtx(gko::batch::initialize>( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec)), @@ -66,7 +65,7 @@ class BatchDense : public ::testing::Test { static void assert_equal_to_original_mtx( - gko::batch::matrix::BatchDense* m) + gko::batch::matrix::Dense* m) { ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); @@ -85,41 +84,41 @@ class BatchDense : public ::testing::Test { ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); } - static void assert_empty(gko::batch::matrix::BatchDense* m) + static void assert_empty(gko::batch::matrix::Dense* m) { ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; std::unique_ptr> dense_mtx; }; -TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); +TYPED_TEST_SUITE(Dense, gko::test::ValueTypes); -TYPED_TEST(BatchDense, KnowsItsSizeAndValues) +TYPED_TEST(Dense, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); } -TYPED_TEST(BatchDense, CanBeEmpty) +TYPED_TEST(Dense, CanBeEmpty) { - auto empty = gko::batch::matrix::BatchDense::create(this->exec); + auto empty = gko::batch::matrix::Dense::create(this->exec); this->assert_empty(empty.get()); } -TYPED_TEST(BatchDense, ReturnsNullValuesArrayWhenEmpty) +TYPED_TEST(Dense, ReturnsNullValuesArrayWhenEmpty) { - auto empty = gko::batch::matrix::BatchDense::create(this->exec); + auto empty = gko::batch::matrix::Dense::create(this->exec); ASSERT_EQ(empty->get_const_values(), nullptr); } -TYPED_TEST(BatchDense, CanGetValuesForEntry) +TYPED_TEST(Dense, CanGetValuesForEntry) { using value_type = typename TestFixture::value_type; @@ -127,17 +126,16 @@ TYPED_TEST(BatchDense, CanGetValuesForEntry) } -TYPED_TEST(BatchDense, CanCreateDenseItemView) +TYPED_TEST(Dense, CanCreateDenseItemView) { GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->dense_mtx, 0.0); } -TYPED_TEST(BatchDense, CanBeCopied) +TYPED_TEST(Dense, CanBeCopied) { - auto mtx_copy = - gko::batch::matrix::BatchDense::create(this->exec); + auto mtx_copy = gko::batch::matrix::Dense::create(this->exec); mtx_copy->copy_from(this->mtx.get()); @@ -148,10 +146,9 @@ TYPED_TEST(BatchDense, CanBeCopied) } -TYPED_TEST(BatchDense, CanBeMoved) +TYPED_TEST(Dense, CanBeMoved) { - auto mtx_copy = - gko::batch::matrix::BatchDense::create(this->exec); + auto mtx_copy = gko::batch::matrix::Dense::create(this->exec); this->mtx->move_to(mtx_copy); @@ -159,7 +156,7 @@ TYPED_TEST(BatchDense, CanBeMoved) } -TYPED_TEST(BatchDense, CanBeCloned) +TYPED_TEST(Dense, CanBeCloned) { auto mtx_clone = this->mtx->clone(); @@ -168,7 +165,7 @@ TYPED_TEST(BatchDense, CanBeCloned) } -TYPED_TEST(BatchDense, CanBeCleared) +TYPED_TEST(Dense, CanBeCleared) { this->mtx->clear(); @@ -176,11 +173,11 @@ TYPED_TEST(BatchDense, CanBeCleared) } -TYPED_TEST(BatchDense, CanBeConstructedWithSize) +TYPED_TEST(Dense, CanBeConstructedWithSize) { using size_type = gko::size_type; - auto m = gko::batch::matrix::BatchDense::create( + auto m = gko::batch::matrix::Dense::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3})); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -189,7 +186,7 @@ 
TYPED_TEST(BatchDense, CanBeConstructedWithSize) } -TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) +TYPED_TEST(Dense, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -203,7 +200,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) 6.0, -3.0}; // clang-format on - auto m = gko::batch::matrix::BatchDense::create( + auto m = gko::batch::matrix::Dense::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), gko::array::view(this->exec, 8, data)); @@ -219,7 +216,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingData) } -TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) +TYPED_TEST(Dense, CanBeConstructedFromExistingConstData) { using value_type = typename TestFixture::value_type; using size_type = gko::size_type; @@ -233,7 +230,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) 6.0, -3.0}; // clang-format on - auto m = gko::batch::matrix::BatchDense::create_const( + auto m = gko::batch::matrix::Dense::create_const( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), gko::array::const_view(this->exec, 8, data)); @@ -249,7 +246,7 @@ TYPED_TEST(BatchDense, CanBeConstructedFromExistingConstData) } -TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) +TYPED_TEST(Dense, CanBeConstructedFromDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -260,15 +257,15 @@ TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::create_from_item< - gko::batch::matrix::BatchDense>( - this->exec, std::vector{mat1.get(), mat2.get()}); + auto m = + gko::batch::create_from_item>( + this->exec, std::vector{mat1.get(), mat2.get()}); this->assert_equal_to_original_mtx(m.get()); } -TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) +TYPED_TEST(Dense, CanBeConstructedFromDenseMatricesByDuplication) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -279,17 +276,19 @@ TYPED_TEST(BatchDense, CanBeConstructedFromDenseMatricesByDuplication) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::batch::create_from_item< - gko::batch::matrix::BatchDense>( - this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); - auto m = gko::batch::create_from_item< - gko::batch::matrix::BatchDense>(this->exec, 3, mat1.get()); + auto bat_m = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = + gko::batch::create_from_item>( + this->exec, 3, mat1.get()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } -TYPED_TEST(BatchDense, CanBeConstructedByDuplicatingBatchDenseMatrices) +TYPED_TEST(Dense, CanBeConstructedByDuplicatingDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -300,22 +299,23 @@ TYPED_TEST(BatchDense, CanBeConstructedByDuplicatingBatchDenseMatrices) auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::create_from_item< - gko::batch::matrix::BatchDense>( - this->exec, std::vector{mat1.get(), mat2.get()}); - auto m_ref = gko::batch::create_from_item< - gko::batch::matrix::BatchDense>( - this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), - mat2.get(), mat1.get(), mat2.get()}); + auto m = + 
gko::batch::create_from_item>( + this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat2.get(), mat1.get(), + mat2.get(), mat1.get(), mat2.get()}); - auto m2 = gko::batch::duplicate>( + auto m2 = gko::batch::duplicate>( this->exec, 3, m.get()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } -TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) +TYPED_TEST(Dense, CanBeUnbatchedIntoDenseMatrices) { using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; @@ -326,7 +326,7 @@ TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) this->exec); auto dense_mats = - gko::batch::unbatch>( + gko::batch::unbatch>( this->mtx.get()); GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.); @@ -334,10 +334,10 @@ TYPED_TEST(BatchDense, CanBeUnbatchedIntoDenseMatrices) } -TYPED_TEST(BatchDense, CanBeListConstructed) +TYPED_TEST(Dense, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - auto m = gko::batch::initialize>( + auto m = gko::batch::initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -349,11 +349,11 @@ TYPED_TEST(BatchDense, CanBeListConstructed) } -TYPED_TEST(BatchDense, CanBeListConstructedByCopies) +TYPED_TEST(Dense, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - auto m = gko::batch::initialize>( + auto m = gko::batch::initialize>( 2, I({1.0, 2.0}), this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -365,12 +365,12 @@ TYPED_TEST(BatchDense, CanBeListConstructedByCopies) } -TYPED_TEST(BatchDense, CanBeDoubleListConstructed) +TYPED_TEST(Dense, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; using T = value_type; - auto m = gko::batch::initialize>( + auto m = gko::batch::initialize>( {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, this->exec); @@ -389,7 +389,7 @@ TYPED_TEST(BatchDense, CanBeDoubleListConstructed) } -TYPED_TEST(BatchDense, CanBeReadFromMatrixData) +TYPED_TEST(Dense, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; using index_type = int; @@ -401,8 +401,8 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixData) {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); auto m = gko::batch::read>( - this->exec, vec_data); + gko::batch::matrix::Dense>(this->exec, + vec_data); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); @@ -416,7 +416,7 @@ TYPED_TEST(BatchDense, CanBeReadFromMatrixData) } -TYPED_TEST(BatchDense, CanBeReadFromSparseMatrixData) +TYPED_TEST(Dense, CanBeReadFromSparseMatrixData) { using value_type = typename TestFixture::value_type; using index_type = int; @@ -427,8 +427,8 @@ TYPED_TEST(BatchDense, CanBeReadFromSparseMatrixData) {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); auto m = gko::batch::read>( - this->exec, vec_data); + gko::batch::matrix::Dense>(this->exec, + vec_data); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); @@ -442,14 +442,14 @@ TYPED_TEST(BatchDense, CanBeReadFromSparseMatrixData) } -TYPED_TEST(BatchDense, GeneratesCorrectMatrixData) +TYPED_TEST(Dense, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; using index_type = int; using tpl = typename gko::matrix_data::nonzero_type; auto data = gko::batch::write>( + 
gko::batch::matrix::Dense>( this->mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 28d61f70731..4f1dbc8f4d4 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -58,7 +58,7 @@ namespace gko { namespace kernels { namespace cuda { /** - * @brief The BatchDense matrix format namespace. + * @brief The Dense matrix format namespace. * * @ingroup batch_dense */ diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 202eb91a366..56af3c5ba7e 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -65,7 +65,7 @@ namespace cuda { */ template inline batch::matrix::batch_dense::uniform_batch> -get_batch_struct(const batch::matrix::BatchDense* const op) +get_batch_struct(const batch::matrix::Dense* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -79,7 +79,7 @@ get_batch_struct(const batch::matrix::BatchDense* const op) */ template inline batch::matrix::batch_dense::uniform_batch> -get_batch_struct(batch::matrix::BatchDense* const op) +get_batch_struct(batch::matrix::Dense* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 7f3dca70a32..4552f918c60 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -64,7 +64,7 @@ namespace gko { namespace kernels { namespace dpcpp { /** - * @brief The BatchDense matrix format namespace. + * @brief The Dense matrix format namespace. * * @ingroup batch_dense */ @@ -76,7 +76,7 @@ namespace batch_dense { template void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, batch::MultiVector* x) { @@ -121,7 +121,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, batch::MultiVector* x) diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index dd8c1bbbab6..e44bc394667 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -63,7 +63,7 @@ namespace dpcpp { */ template inline batch::matrix::batch_dense::uniform_batch -get_batch_struct(const batch::matrix::BatchDense* const op) +get_batch_struct(const batch::matrix::Dense* const op) { return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -77,7 +77,7 @@ get_batch_struct(const batch::matrix::BatchDense* const op) */ template inline batch::matrix::batch_dense::uniform_batch get_batch_struct( - batch::matrix::BatchDense* const op) + batch::matrix::Dense* const op) { return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 20c46736026..aa6d717438e 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -60,7 +60,7 @@ namespace gko { namespace kernels { namespace hip { /** - * @brief The BatchDense matrix format namespace. + * @brief The Dense matrix format namespace. 
* * @ingroup batch_dense */ diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 0d5dfb46a1b..c75a6c7f0a3 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -41,8 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" -#include "hip/base/config.hpp" -#include "hip/base/types.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" namespace gko { @@ -65,7 +65,7 @@ namespace hip { */ template inline batch::matrix::batch_dense::uniform_batch> -get_batch_struct(const batch::matrix::BatchDense* const op) +get_batch_struct(const batch::matrix::Dense* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -79,7 +79,7 @@ get_batch_struct(const batch::matrix::BatchDense* const op) */ template inline batch::matrix::batch_dense::uniform_batch> -get_batch_struct(batch::matrix::BatchDense* const op) +get_batch_struct(batch::matrix::Dense* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 6b3b207c76c..7830a4c6efb 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -56,7 +56,7 @@ namespace matrix { template -class BatchDense; +class Dense; } @@ -91,20 +91,20 @@ class MultiVector public EnablePolymorphicAssignment>, public EnableCreateMethod>, public ConvertibleTo>>, - public ConvertibleTo> { + public ConvertibleTo> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; friend class MultiVector>; - friend class matrix::BatchDense; + friend class matrix::Dense; public: using EnablePolymorphicAssignment::convert_to; using EnablePolymorphicAssignment::move_to; using ConvertibleTo>>::convert_to; using ConvertibleTo>>::move_to; - using ConvertibleTo>::convert_to; - using ConvertibleTo>::move_to; + using ConvertibleTo>::convert_to; + using ConvertibleTo>::move_to; using value_type = ValueType; using index_type = int32; @@ -126,9 +126,9 @@ class MultiVector void move_to(MultiVector>* result) override; - void convert_to(matrix::BatchDense* result) const override; + void convert_to(matrix::Dense* result) const override; - void move_to(matrix::BatchDense* result) override; + void move_to(matrix::Dense* result) override; /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 0457f444c5a..86cd78eadc8 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -55,7 +55,7 @@ namespace matrix { /** - * BatchDense is a batch matrix format which explicitly stores all values of the + * Dense is a batch matrix format which explicitly stores all values of the * matrix in each of the batches. 
* * The values in each of the batches are stored in row-major format (values @@ -71,38 +71,37 @@ namespace matrix { * @ingroup BatchLinOp */ template -class BatchDense : public EnableBatchLinOp>, - public EnableCreateMethod>, - public ConvertibleTo>> { - friend class EnableCreateMethod; - friend class EnablePolymorphicObject; - friend class BatchDense>; - friend class BatchDense>; +class Dense : public EnableBatchLinOp>, + public EnableCreateMethod>, + public ConvertibleTo>> { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + friend class Dense>; + friend class Dense>; public: - using EnableBatchLinOp::convert_to; - using EnableBatchLinOp::move_to; + using EnableBatchLinOp::convert_to; + using EnableBatchLinOp::move_to; using value_type = ValueType; using index_type = int32; - using transposed_type = BatchDense; + using transposed_type = Dense; using unbatch_type = gko::matrix::Dense; - using absolute_type = remove_complex; - using complex_type = to_complex; + using absolute_type = remove_complex; + using complex_type = to_complex; /** - * Creates a BatchDense matrix with the configuration of another BatchDense + * Creates a Dense matrix with the configuration of another Dense * matrix. * * @param other The other matrix whose configuration needs to copied. */ - static std::unique_ptr create_with_config_of( - ptr_param other); + static std::unique_ptr create_with_config_of( + ptr_param other); - void convert_to( - BatchDense>* result) const override; + void convert_to(Dense>* result) const override; - void move_to(BatchDense>* result) override; + void move_to(Dense>* result) override; /** @@ -250,7 +249,7 @@ class BatchDense : public EnableBatchLinOp>, * array (if it resides on the same executor as the vector) or a copy of the * array on the correct executor. */ - static std::unique_ptr> create_const( + static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values); @@ -277,16 +276,16 @@ class BatchDense : public EnableBatchLinOp>, protected: /** - * Creates an uninitialized BatchDense matrix of the specified size. + * Creates an uninitialized Dense matrix of the specified size. * * @param exec Executor associated to the matrix * @param size size of the matrix */ - BatchDense(std::shared_ptr exec, - const batch_dim<2>& size = batch_dim<2>{}); + Dense(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}); /** - * Creates a BatchDense matrix from an already allocated (and initialized) + * Creates a Dense matrix from an already allocated (and initialized) * array. * * @tparam ValuesArray type of array of values @@ -303,9 +302,9 @@ class BatchDense : public EnableBatchLinOp>, * original array data will not be used in the matrix. */ template - BatchDense(std::shared_ptr exec, const batch_dim<2>& size, - ValuesArray&& values) - : EnableBatchLinOp(exec, size), + Dense(std::shared_ptr exec, const batch_dim<2>& size, + ValuesArray&& values) + : EnableBatchLinOp(exec, size), values_{exec, std::forward(values)} { // Ensure that the values array has the correct size @@ -314,12 +313,12 @@ class BatchDense : public EnableBatchLinOp>, } /** - * Creates a BatchDense matrix with the same configuration as the callers + * Creates a Dense matrix with the same configuration as the callers * matrix. * - * @returns a BatchDense matrix with the same configuration as the caller. + * @returns a Dense matrix with the same configuration as the caller. 
*/ - std::unique_ptr create_with_same_config() const; + std::unique_ptr create_with_same_config() const; virtual void apply_impl(const MultiVector* b, MultiVector* x) const; diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index a767215c844..2d0b7ed4d40 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -50,8 +50,8 @@ namespace gko { namespace kernels { namespace omp { /** - * @brief The BatchDense matrix format namespace. - * @ref BatchDense + * @brief The Dense matrix format namespace. + * @ref Dense * @ingroup batch_dense */ namespace batch_dense { @@ -62,7 +62,7 @@ namespace batch_dense { template void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, batch::MultiVector* x) { @@ -85,7 +85,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, batch::MultiVector* x) diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index f42d9a81d1f..3d7ef03a3bd 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -51,8 +51,8 @@ namespace gko { namespace kernels { namespace reference { /** - * @brief The BatchDense matrix format namespace. - * @ref BatchDense + * @brief The Dense matrix format namespace. + * @ref Dense * @ingroup batch_dense */ namespace batch_dense { @@ -63,7 +63,7 @@ namespace batch_dense { template void simple_apply(std::shared_ptr exec, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, batch::MultiVector* x) { @@ -85,7 +85,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void advanced_apply(std::shared_ptr exec, const batch::MultiVector* alpha, - const batch::matrix::BatchDense* mat, + const batch::matrix::Dense* mat, const batch::MultiVector* b, const batch::MultiVector* beta, batch::MultiVector* x) diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index dee7c71948a..40e2cfc2078 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -66,7 +66,7 @@ namespace host { */ template inline batch::matrix::batch_dense::uniform_batch -get_batch_struct(const batch::matrix::BatchDense* const op) +get_batch_struct(const batch::matrix::Dense* const op) { return {op->get_const_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), @@ -80,7 +80,7 @@ get_batch_struct(const batch::matrix::BatchDense* const op) */ template inline batch::matrix::batch_dense::uniform_batch get_batch_struct( - batch::matrix::BatchDense* const op) + batch::matrix::Dense* const op) { return {op->get_values(), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index 8e2e522e5f4..e1689352cde 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -53,16 +53,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
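// Usage sketch for the two apply overloads declared above: the simple overload
// computes x = A * b and the advanced overload computes x = alpha * A * b +
// beta * x, item by item.  This is an illustration only, assuming an installed
// Ginkgo; the function name and all literal values are made up.
#include <ginkgo/ginkgo.hpp>

void batch_dense_apply_sketch()
{
    auto exec = gko::ReferenceExecutor::create();
    using Mtx = gko::batch::matrix::Dense<double>;
    using MVec = gko::batch::MultiVector<double>;
    // A batch of two 2x2 matrices stored in one Dense object.
    auto A = gko::batch::initialize<Mtx>(
        {{{1.0, 2.0}, {3.0, 4.0}}, {{2.0, 0.0}, {0.0, 2.0}}}, exec);
    // Two right-hand-side columns per batch item.
    auto b = gko::batch::initialize<MVec>(
        {{{1.0, 0.5}, {1.0, 0.5}}, {{1.0, 2.0}, {1.0, 2.0}}}, exec);
    auto x = MVec::create(exec, gko::batch_dim<2>(2, gko::dim<2>{2, 2}));
    A->apply(b.get(), x.get());  // x_i = A_i * b_i for each batch item i
}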
template -class BatchDense : public ::testing::Test { +class Dense : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::batch::matrix::BatchDense; + using Mtx = gko::batch::matrix::Dense; using MVec = gko::batch::MultiVector; using DenseMtx = gko::matrix::Dense; using ComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; - BatchDense() + Dense() : exec(gko::ReferenceExecutor::create()), mtx_0(gko::batch::initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, @@ -111,10 +111,10 @@ class BatchDense : public ::testing::Test { }; -TYPED_TEST_SUITE(BatchDense, gko::test::ValueTypes); +TYPED_TEST_SUITE(Dense, gko::test::ValueTypes); -TYPED_TEST(BatchDense, AppliesToBatchMultiVector) +TYPED_TEST(Dense, AppliesToBatchMultiVector) { using T = typename TestFixture::value_type; @@ -129,7 +129,7 @@ TYPED_TEST(BatchDense, AppliesToBatchMultiVector) } -TYPED_TEST(BatchDense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) +TYPED_TEST(Dense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) { using Mtx = typename TestFixture::Mtx; using MVec = typename TestFixture::MVec; @@ -156,7 +156,7 @@ TYPED_TEST(BatchDense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) } -TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchMultiVector) +TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector) { using Mtx = typename TestFixture::Mtx; using MVec = typename TestFixture::MVec; @@ -183,7 +183,7 @@ TYPED_TEST(BatchDense, AppliesLinearCombinationToBatchMultiVector) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultCols) +TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultCols) { using MVec = typename TestFixture::MVec; auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); @@ -193,7 +193,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultCols) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultRows) +TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultRows) { using MVec = typename TestFixture::MVec; auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); @@ -203,7 +203,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongNumberOfResultRows) } -TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) +TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension) { using MVec = typename TestFixture::MVec; auto res = @@ -214,7 +214,7 @@ TYPED_TEST(BatchDense, ApplyFailsOnWrongInnerDimension) } -TYPED_TEST(BatchDense, AdvancedApplyFailsOnWrongInnerDimension) +TYPED_TEST(Dense, AdvancedApplyFailsOnWrongInnerDimension) { using MVec = typename TestFixture::MVec; auto res = @@ -230,7 +230,7 @@ TYPED_TEST(BatchDense, AdvancedApplyFailsOnWrongInnerDimension) } -TYPED_TEST(BatchDense, AdvancedApplyFailsOnWrongAlphaDimension) +TYPED_TEST(Dense, AdvancedApplyFailsOnWrongAlphaDimension) { using MVec = typename TestFixture::MVec; auto res = diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index 7d44f29899c..b32f1063377 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -53,13 +53,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
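// Shape sketch for the conformity checks exercised by the Apply*Fails* tests
// above: for x = A * b every batch item must satisfy A: n x k, b: k x m,
// x: n x m, with one common size shared by all items.  The sizes below are
// illustrative and the objects stay uninitialized, since only shapes matter.
#include <ginkgo/ginkgo.hpp>

void batch_dense_apply_shapes_sketch()
{
    auto exec = gko::ReferenceExecutor::create();
    using Mtx = gko::batch::matrix::Dense<double>;
    using MVec = gko::batch::MultiVector<double>;
    const gko::size_type num_items = 2;
    auto A = Mtx::create(exec, gko::batch_dim<2>(num_items, gko::dim<2>{2, 3}));
    auto b = MVec::create(exec, gko::batch_dim<2>(num_items, gko::dim<2>{3, 1}));
    auto x = MVec::create(exec, gko::batch_dim<2>(num_items, gko::dim<2>{2, 1}));
    A->apply(b.get(), x.get());  // (2x3) * (3x1) -> (2x1) per item: conforming
    // A result with 3 rows per item would instead fail the check covered by
    // the ApplyFailsOnWrongNumberOfResultRows test above.
}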
#include "test/utils/executor.hpp" -class BatchDense : public CommonTestFixture { +class Dense : public CommonTestFixture { protected: using vtype = double; - using Mtx = gko::batch::matrix::BatchDense; + using Mtx = gko::batch::matrix::Dense; using MVec = gko::batch::MultiVector; - BatchDense() : rand_engine(15) {} + Dense() : rand_engine(15) {} template std::unique_ptr gen_mtx(const gko::size_type num_batch_items, @@ -107,7 +107,7 @@ class BatchDense : public CommonTestFixture { }; -TEST_F(BatchDense, SingleVectorApplyIsEquivalentToRef) +TEST_F(Dense, SingleVectorApplyIsEquivalentToRef) { set_up_apply_data(1); @@ -118,7 +118,7 @@ TEST_F(BatchDense, SingleVectorApplyIsEquivalentToRef) } -TEST_F(BatchDense, SingleVectorAdvancedApplyIsEquivalentToRef) +TEST_F(Dense, SingleVectorAdvancedApplyIsEquivalentToRef) { set_up_apply_data(1); From 37ce4f6095b0462cae1cbe49919f44c63bea5dd7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 5 Oct 2023 16:54:40 +0200 Subject: [PATCH 356/583] Doc updates and multivector view --- core/matrix/batch_dense.cpp | 32 +++++++ core/matrix/batch_struct.hpp | 4 +- core/test/matrix/batch_dense.cpp | 12 +++ cuda/matrix/batch_struct.hpp | 4 +- dpcpp/matrix/batch_struct.hpp | 4 +- hip/matrix/batch_struct.hip.hpp | 4 +- include/ginkgo/core/matrix/batch_dense.hpp | 102 +++++++++++++-------- reference/matrix/batch_struct.hpp | 4 +- 8 files changed, 120 insertions(+), 46 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 75f29bc6b4c..a864b4114c2 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -82,6 +82,38 @@ batch_dim<2> compute_batch_size( } // namespace detail +template +std::unique_ptr> +Dense::create_multi_vector_view() +{ + auto exec = this->get_executor(); + auto num_batch_items = this->get_num_batch_items(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mvec = MultiVector::create( + exec, this->get_size(), + make_array_view(exec, num_batch_items * num_rows * stride, + this->get_values())); + return mvec; +} + + +template +std::unique_ptr> +Dense::create_const_multi_vector_view() const +{ + auto exec = this->get_executor(); + auto num_batch_items = this->get_num_batch_items(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mvec = MultiVector::create_const( + exec, this->get_size(), + make_const_array_view(exec, num_batch_items * num_rows * stride, + this->get_const_values())); + return mvec; +} + + template std::unique_ptr> Dense::create_view_for_item(size_type item_id) diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 37c297bb6b5..93b2b027ceb 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -46,7 +46,7 @@ namespace batch_dense { /** - * Encapsulates one matrix from a batch of multi-vectors. + * Encapsulates one matrix from a batch of dense matrices. */ template struct batch_item { @@ -59,7 +59,7 @@ struct batch_item { /** - * A 'simple' structure to store a global uniform batch of multi-vectors. + * A 'simple' structure to store a global uniform batch of dense matrices. 
*/ template struct uniform_batch { diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 6afe13a50af..36fc3f2ee4a 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -59,6 +59,10 @@ class Dense : public ::testing::Test { {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec)), + mvec(gko::batch::initialize>( + {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)), dense_mtx(gko::initialize>( {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)) {} @@ -92,6 +96,7 @@ class Dense : public ::testing::Test { std::shared_ptr exec; std::unique_ptr> mtx; + std::unique_ptr> mvec; std::unique_ptr> dense_mtx; }; @@ -133,6 +138,13 @@ TYPED_TEST(Dense, CanCreateDenseItemView) } +TYPED_TEST(Dense, CanCreateMultiVectorView) +{ + GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec, + 0.0); +} + + TYPED_TEST(Dense, CanBeCopied) { auto mtx_copy = gko::batch::matrix::Dense::create(this->exec); diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 56af3c5ba7e..19b006d26cd 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -61,7 +61,7 @@ namespace cuda { /** - * Generates an immutable uniform batch struct from a batch of multi-vectors. + * Generates an immutable uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch> @@ -75,7 +75,7 @@ get_batch_struct(const batch::matrix::Dense* const op) /** - * Generates a uniform batch struct from a batch of multi-vectors. + * Generates a uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch> diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index e44bc394667..cd5298a4409 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -59,7 +59,7 @@ namespace dpcpp { /** - * Generates an immutable uniform batch struct from a batch of multi-vectors. + * Generates an immutable uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch @@ -73,7 +73,7 @@ get_batch_struct(const batch::matrix::Dense* const op) /** - * Generates a uniform batch struct from a batch of multi-vectors. + * Generates a uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch get_batch_struct( diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index c75a6c7f0a3..25c73d45abc 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -61,7 +61,7 @@ namespace hip { /** - * Generates an immutable uniform batch struct from a batch of multi-vectors. + * Generates an immutable uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch> @@ -75,7 +75,7 @@ get_batch_struct(const batch::matrix::Dense* const op) /** - * Generates a uniform batch struct from a batch of multi-vectors. + * Generates a uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch> diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 86cd78eadc8..d713760947e 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -59,21 +59,24 @@ namespace matrix { * matrix in each of the batches. 
* * The values in each of the batches are stored in row-major format (values - * belonging to the same row appear consecutive in the memory). Optionally, rows - * can be padded for better memory access. + * belonging to the same row appear consecutive in the memory and the values of + * each batch item are also stored consecutively in memory). + * + * @note Though the storage layout is similar to the multi-vector object, the + * class semantics and the operations it aims to provide is different. Hence it + * is recommended to create multi-vector objects if the user means to view the + * data as a set of vectors. * * @tparam ValueType precision of matrix elements * - * @note While this format is not very useful for storing sparse matrices, it - * is often suitable to store vectors, and sets of vectors. * @ingroup batch_dense * @ingroup mat_formats * @ingroup BatchLinOp */ template -class Dense : public EnableBatchLinOp>, - public EnableCreateMethod>, - public ConvertibleTo>> { +class Dense final : public EnableBatchLinOp>, + public EnableCreateMethod>, + public ConvertibleTo>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Dense>; @@ -103,16 +106,31 @@ class Dense : public EnableBatchLinOp>, void move_to(Dense>* result) override; + /** + * Creates a mutable view (of MultiVector type) of the data owned by the + * matrix::Dense object. Does not perform any deep copies, but only + * returns a view of the underlying data. + * + * @return a MultiVector object with a view of the data from the batch + * dense matrix. + */ + std::unique_ptr> create_multi_vector_view(); + + /** + * @copydoc create_const_multi_vector_view() + */ + std::unique_ptr> + create_const_multi_vector_view() const; /** - * Creates a mutable view (of matrix::Dense type) of one item of the Batch - * MultiVector object. Does not perform any deep copies, but - * only returns a view of the data. + * Creates a mutable view (of matrix::Dense type) of one item of the + * batch::matrix::Dense object. Does not perform any deep + * copies, but only returns a view of the data. * * @param item_id The index of the batch item * - * @return a matrix::Dense object with the data from the batch item at the - * given index. + * @return a batch::matrix::Dense object with the data from the batch item + * at the given index. */ std::unique_ptr create_view_for_item(size_type item_id); @@ -148,8 +166,8 @@ class Dense : public EnableBatchLinOp>, * @param row the row of the requested element * @param col the column of the requested element * - * @note the method has to be called on the same Executor the vector is - * stored at (e.g. trying to call this method on a GPU multi-vector + * @note the method has to be called on the same Executor the matrix is + * stored at (e.g. trying to call this method on a GPU Dense object * from the OMP results in a runtime error) */ value_type& at(size_type batch_id, size_type row, size_type col) @@ -159,7 +177,7 @@ class Dense : public EnableBatchLinOp>, } /** - * @copydoc MultiVector::at(size_type, size_type, size_type) + * @copydoc Dense::at(size_type, size_type, size_type) */ value_type at(size_type batch_id, size_type row, size_type col) const { @@ -170,15 +188,15 @@ class Dense : public EnableBatchLinOp>, /** * Returns a single element for a particular batch item. * - * Useful for iterating across all elements of the vector. + * Useful for iterating across all elements of the matrix. * However, it is less efficient than the two-parameter variant of this * method. 
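// Usage sketch for the item view and element access documented above: the view
// is a (non-batched) gko::matrix::Dense that shares the storage of one batch
// item.  Assumes an installed Ginkgo; indices and values are illustrative.
#include <ginkgo/ginkgo.hpp>

void item_view_sketch()
{
    auto exec = gko::ReferenceExecutor::create();
    using Mtx = gko::batch::matrix::Dense<double>;
    auto mat = gko::batch::initialize<Mtx>(
        {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}}, exec);
    // View of the second batch item (item_id == 1), no deep copy.
    auto item = mat->create_view_for_item(1);
    // Element access on the batch object itself; host executors only.
    auto value = mat->at(1, 0, 1);  // row 0, column 1 of item 1, i.e. 6.0
    // item->at(0, 1) refers to the same stored entry.
    (void)value;
}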
* * @param batch_id the batch item index to be queried * @param idx a linear index of the requested element * - * @note the method has to be called on the same Executor the vector is - * stored at (e.g. trying to call this method on a GPU multi-vector + * @note the method has to be called on the same Executor the matrix is + * stored at (e.g. trying to call this method on a GPU Dense object * from the OMP results in a runtime error) */ ValueType& at(size_type batch_id, size_type idx) noexcept @@ -187,7 +205,7 @@ class Dense : public EnableBatchLinOp>, } /** - * @copydoc MultiVector::at(size_type, size_type, size_type) + * @copydoc Dense::at(size_type, size_type, size_type) */ ValueType at(size_type batch_id, size_type idx) const noexcept { @@ -195,7 +213,7 @@ class Dense : public EnableBatchLinOp>, } /** - * Returns a pointer to the array of values of the multi-vector for a + * Returns a pointer to the array of values of the matrix for a * specific batch item. * * @param batch_id the id of the batch item. @@ -236,30 +254,45 @@ class Dense : public EnableBatchLinOp>, return values_.get_num_elems(); } - /** * Creates a constant (immutable) batch dense matrix from a constant * array. * - * @param exec the executor to create the vector on - * @param size the dimensions of the vector - * @param values the value array of the vector + * @param exec the executor to create the matrix on + * @param size the dimensions of the matrix + * @param values the value array of the matrix * - * @return A smart pointer to the constant multi-vector wrapping the input - * array (if it resides on the same executor as the vector) or a copy of the + * @return A smart pointer to the constant matrix wrapping the input + * array (if it resides on the same executor as the matrix) or a copy of the * array on the correct executor. */ static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, gko::detail::const_array_view&& values); - + /** + * Apply the matrix to a multi-vector. Represents the matrix vector + * multiplication, x = A * b, where x and b are both multi-vectors. + * + * @param b the multi-vector to be applied to + * @param x the output multi-vector + */ void apply(const MultiVector* b, MultiVector* x) const { this->apply_impl(b, x); } + /** + * Apply the matrix to a multi-vector with a linear combination of the given + * input vector. Represents the matrix vector multiplication, x = alpha* A * + * b + beta * x, where x and b are both multi-vectors. + * + * @param alpha the scalar to scale the matrix-vector product with + * @param b the multi-vector to be applied to + * @param beta the scalar to scale the x vector with + * @param x the output multi-vector + */ void apply(const MultiVector* alpha, const MultiVector* b, const MultiVector* beta, @@ -293,9 +326,6 @@ class Dense : public EnableBatchLinOp>, * @param exec Executor associated to the matrix * @param size sizes of the batch matrices in a batch_dim object * @param values array of matrix values - * @param strides stride of the rows (i.e. 
offset between the first - * elements of two consecutive rows, expressed as the - * number of matrix elements) * * @note If `values` is not an rvalue, not an array of ValueType, or is on * the wrong executor, an internal copy will be created, and the @@ -320,13 +350,13 @@ class Dense : public EnableBatchLinOp>, */ std::unique_ptr create_with_same_config() const; - virtual void apply_impl(const MultiVector* b, - MultiVector* x) const; + void apply_impl(const MultiVector* b, + MultiVector* x) const; - virtual void apply_impl(const MultiVector* alpha, - const MultiVector* b, - const MultiVector* beta, - MultiVector* x) const; + void apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const; size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 40e2cfc2078..1a759cec2a9 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -62,7 +62,7 @@ namespace host { /** - * Generates an immutable uniform batch struct from a batch of multi-vectors. + * Generates an immutable uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch @@ -76,7 +76,7 @@ get_batch_struct(const batch::matrix::Dense* const op) /** - * Generates a uniform batch struct from a batch of multi-vectors. + * Generates a uniform batch struct from a batch of dense matrices. */ template inline batch::matrix::batch_dense::uniform_batch get_batch_struct( From 33c71c13955e63c94d817bdc2d3f4fe1d83ac192 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Fri, 6 Oct 2023 07:12:03 +0000 Subject: [PATCH 357/583] Format files Co-authored-by: Pratik Nayak --- cuda/matrix/batch_struct.hpp | 4 +- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 52 ++++++++++--------- dpcpp/matrix/batch_dense_kernels.dp.cpp | 54 ++++++++++---------- dpcpp/matrix/batch_struct.hpp | 4 +- hip/matrix/batch_struct.hip.hpp | 10 ++-- reference/matrix/batch_struct.hpp | 4 +- 6 files changed, 69 insertions(+), 59 deletions(-) diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 19b006d26cd..f191953f7b9 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
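// Index sketch for the storage layout behind linearize_index() above, as
// described in the class documentation: values are stored item by item and
// row-major within an item, with the stride equal to the number of columns of
// the common size.  The numbers below are illustrative.
#include <ginkgo/ginkgo.hpp>

void linearized_index_sketch()
{
    const gko::size_type num_rows = 2;
    const gko::size_type num_cols = 3;  // also the stride for this format
    const gko::size_type batch = 1;
    const gko::size_type row = 0;
    const gko::size_type col = 2;
    // Flat position of element (batch, row, col) in the values array.
    const auto flat = batch * num_rows * num_cols + row * num_cols + col;
    // Here flat == 8, so Dense<double>::at(1, 0, 2) reads get_values()[8].
    (void)flat;
}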
#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#include "core/matrix/batch_struct.hpp" + + #include #include #include #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 12648b81e00..e0bc15fdc61 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -194,9 +194,9 @@ void compute_dot(std::shared_ptr exec, // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::extract_batch_item(x_ub, group_id); @@ -232,18 +232,19 @@ void compute_conj_dot(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto y_b = batch::extract_batch_item(y_ub, group_id); - const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_gen_dot_product_kernel( - x_b, y_b, res_b, item_ct1, - [](auto val) { return conj(val); }); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto y_b = batch::extract_batch_item(y_ub, group_id); + const auto res_b = + batch::extract_batch_item(res_ub, group_id); + compute_gen_dot_product_kernel( + x_b, y_b, res_b, item_ct1, + [](auto val) { return conj(val); }); + }); }); } @@ -268,16 +269,17 @@ void compute_norm2(std::shared_ptr exec, const dim3 grid(num_batches); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = + batch::extract_batch_item(x_ub, group_id); + const auto res_b = batch::extract_batch_item( + res_ub, group_id); + compute_norm2_kernel(x_b, res_b, item_ct1); + }); }); } diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 4552f918c60..6aec3e57fc5 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -100,17 +100,17 @@ void simple_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size 
(exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -147,22 +147,22 @@ void advanced_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index cd5298a4409..f561bf004c7 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -34,12 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ +#include "core/matrix/batch_struct.hpp" + + #include #include #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "dpcpp/base/config.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 25c73d45abc..c0659420661 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ +#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ +#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ + + +#include "core/matrix/batch_struct.hpp" #include @@ -40,7 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/types.hip.hpp" @@ -93,4 +95,4 @@ get_batch_struct(batch::matrix::Dense* const op) } // namespace gko -#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 1a759cec2a9..47d48f1e927 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -34,13 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ +#include "core/matrix/batch_struct.hpp" + + #include #include #include #include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" namespace gko { From fd9a228435a7387f3537633531a7d6a846d04cc3 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 09:11:18 +0200 Subject: [PATCH 358/583] Use CommonTestFixture value_type --- test/matrix/batch_dense_kernels.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index b32f1063377..d6bf85a42c4 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -55,9 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class Dense : public CommonTestFixture { protected: - using vtype = double; - using Mtx = gko::batch::matrix::Dense; - using MVec = gko::batch::MultiVector; + using Mtx = gko::batch::matrix::Dense; + using MVec = gko::batch::MultiVector; Dense() : rand_engine(15) {} @@ -87,7 +86,7 @@ class Dense : public CommonTestFixture { expected = MVec::create( ref, gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs})); - expected->fill(gko::one()); + expected->fill(gko::one()); dresult = gko::clone(exec, expected); } @@ -114,7 +113,7 @@ TEST_F(Dense, SingleVectorApplyIsEquivalentToRef) x->apply(y.get(), expected.get()); dx->apply(dy.get(), dresult.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } @@ -125,5 +124,5 @@ TEST_F(Dense, SingleVectorAdvancedApplyIsEquivalentToRef) x->apply(alpha.get(), y.get(), beta.get(), expected.get()); dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } From a71ce20655df85cedb0753e6161344ce0ff9675e Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 9 Oct 2023 12:48:24 +0200 Subject: [PATCH 359/583] Review updates Co-authored-by: Marcel Koch --- core/base/batch_multi_vector.cpp | 16 +++++++++--- core/base/batch_utilities.hpp | 7 +++--- core/matrix/batch_dense.cpp | 25 ------------------- .../test/preconditioner/jacobi_kernels.dp.cpp | 2 +- include/ginkgo/core/matrix/batch_dense.hpp | 2 +- reference/matrix/batch_dense_kernels.hpp.inc | 2 +- reference/test/matrix/batch_dense_kernels.cpp | 2 +- test/matrix/batch_dense_kernels.cpp | 2 +- 8 files changed, 20 insertions(+), 38 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 294fe45972a..bd2079907a3 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -294,11 +294,12 @@ void MultiVector::move_to( template void MultiVector::convert_to(matrix::Dense* result) const { - auto exec = result->get_executor() != nullptr ? result->get_executor() - : this->get_executor(); + auto exec = result->get_executor() == nullptr ? this->get_executor() + : result->get_executor(); auto tmp = gko::batch::matrix::Dense::create_const( exec, this->get_size(), - make_const_array_view(exec, this->get_num_stored_elements(), + make_const_array_view(this->get_executor(), + this->get_num_stored_elements(), this->get_const_values())); result->copy_from(tmp); } @@ -307,7 +308,14 @@ void MultiVector::convert_to(matrix::Dense* result) const template void MultiVector::move_to(matrix::Dense* result) { - this->convert_to(result); + auto exec = result->get_executor() == nullptr ? 
this->get_executor() + : result->get_executor(); + auto tmp = gko::batch::matrix::Dense::create_const( + exec, this->get_size(), + make_const_array_view(this->get_executor(), + this->get_num_stored_elements(), + this->get_const_values())); + tmp->move_to(result); } diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index d5c5bdb4aa2..834e89c8358 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -109,14 +109,13 @@ std::unique_ptr create_from_item( template -auto unbatch(const InputType* batch_multivec) +auto unbatch(const InputType* batch_object) { - auto exec = batch_multivec->get_executor(); auto unbatched_mats = std::vector>{}; - for (size_type b = 0; b < batch_multivec->get_num_batch_items(); ++b) { + for (size_type b = 0; b < batch_object->get_num_batch_items(); ++b) { unbatched_mats.emplace_back( - batch_multivec->create_const_view_for_item(b)->clone()); + batch_object->create_const_view_for_item(b)->clone()); } return unbatched_mats; } diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index a864b4114c2..b948a2c3afc 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -64,24 +64,6 @@ GKO_REGISTER_OPERATION(advanced_apply, batch_dense::advanced_apply); } // namespace dense -namespace detail { - - -template -batch_dim<2> compute_batch_size( - const std::vector*>& matrices) -{ - auto common_size = matrices[0]->get_size(); - for (size_type i = 1; i < matrices.size(); ++i) { - GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); - } - return batch_dim<2>{matrices.size(), common_size}; -} - - -} // namespace detail - - template std::unique_ptr> Dense::create_multi_vector_view() @@ -178,13 +160,6 @@ std::unique_ptr> Dense::create_const( } -inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) -{ - return batch_dim<2>(sizes.get_num_batch_items(), - dim<2>(1, sizes.get_common_size()[1])); -} - - template Dense::Dense(std::shared_ptr exec, const batch_dim<2>& size) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index aae15245357..b8082a2db32 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -90,7 +90,7 @@ class Jacobi : public ::testing::Test { gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, value_type accuracy = 0.1, bool skip_sorting = true) { - std::ranlux48 engine(42); + std::default_random_engine engine(42); const auto dim = *(end(block_pointers) - 1); if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index d713760947e..d081e5d440e 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -62,7 +62,7 @@ namespace matrix { * belonging to the same row appear consecutive in the memory and the values of * each batch item are also stored consecutively in memory). * - * @note Though the storage layout is similar to the multi-vector object, the + * @note Though the storage layout is the same as the multi-vector object, the * class semantics and the operations it aims to provide is different. Hence it * is recommended to create multi-vector objects if the user means to view the * data as a set of vectors. 
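// Conversion sketch for the MultiVector -> batch::matrix::Dense path updated
// above: convert_to() produces a Dense matrix with the same batch sizes and a
// copy of the values.  Assumes an installed Ginkgo; the values and the
// function name are illustrative.
#include <ginkgo/ginkgo.hpp>

void multi_vector_to_dense_sketch()
{
    auto exec = gko::ReferenceExecutor::create();
    using MVec = gko::batch::MultiVector<double>;
    using Mtx = gko::batch::matrix::Dense<double>;
    auto mvec = gko::batch::initialize<MVec>(
        {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
         {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
        exec);
    auto mat = Mtx::create(exec);
    mvec->convert_to(mat.get());
    // mat->get_common_size() == gko::dim<2>(2, 3) and its values match mvec.
}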
diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc index bff9ad137cf..20e395af5b7 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp.inc @@ -71,7 +71,7 @@ inline void advanced_apply_kernel( } else { for (int row = 0; row < c.num_rows; ++row) { for (int col = 0; col < c.num_rhs; ++col) { - c.values[row * c.stride + col] *= gko::zero(); + c.values[row * c.stride + col] = gko::zero(); } } } diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index e1689352cde..97dbe3e77cb 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -107,7 +107,7 @@ class Dense : public ::testing::Test { std::unique_ptr x_00; std::unique_ptr x_01; - std::ranlux48 rand_engine; + std::default_random_engine rand_engine; }; diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index d6bf85a42c4..a73efcd8753 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -90,7 +90,7 @@ class Dense : public CommonTestFixture { dresult = gko::clone(exec, expected); } - std::ranlux48 rand_engine; + std::default_random_engine rand_engine; const size_t batch_size = 11; std::unique_ptr x; From 2e8e600a047671bd339d6dbe1b43082cbbe470be Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 9 Oct 2023 16:01:46 +0200 Subject: [PATCH 360/583] Review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- .../matrix/batch_dense_kernels.hpp.inc | 18 ++--- core/base/batch_multi_vector.cpp | 9 +-- core/base/batch_struct.hpp | 16 ++-- core/matrix/batch_struct.hpp | 37 +++++----- cuda/base/batch_struct.hpp | 14 ++-- cuda/matrix/batch_struct.hpp | 18 ++--- dpcpp/base/batch_struct.hpp | 14 ++-- dpcpp/matrix/batch_dense_kernels.hpp.inc | 4 +- dpcpp/matrix/batch_struct.hpp | 22 +++--- hip/base/batch_struct.hip.hpp | 14 ++-- hip/matrix/batch_struct.hip.hpp | 18 ++--- include/ginkgo/core/matrix/batch_dense.hpp | 14 ++-- reference/base/batch_struct.hpp | 12 +-- reference/matrix/batch_dense_kernels.hpp.inc | 4 +- reference/matrix/batch_struct.hpp | 19 ++--- .../test/base/batch_multi_vector_kernels.cpp | 14 ++-- reference/test/matrix/batch_dense_kernels.cpp | 74 +++++++++---------- test/matrix/batch_dense_kernels.cpp | 44 +++++------ 18 files changed, 178 insertions(+), 187 deletions(-) diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc index 2f876332ae7..7a38cfea215 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
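// Sketch of a likely motivation for assigning zero instead of scaling by zero
// in the beta == 0 branch of the reference kernel changed above (an assumption
// about intent, illustrated with plain C++): multiplying a NaN entry of x by
// zero keeps it NaN, while the advanced apply should produce x = alpha * A * b
// whenever beta is zero.
#include <limits>

void beta_zero_sketch()
{
    const double nan = std::numeric_limits<double>::quiet_NaN();
    const double scaled = nan * 0.0;  // still NaN
    const double assigned = 0.0;      // well defined regardless of the old x
    (void)scaled;
    (void)assigned;
}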
template __device__ __forceinline__ void simple_apply( - const gko::batch::matrix::batch_dense::batch_item& mat, + const gko::batch::matrix::dense::batch_item& mat, const ValueType* const __restrict__ b, ValueType* const __restrict__ x) { constexpr auto tile_size = config::warp_size; @@ -65,10 +65,9 @@ template __global__ __launch_bounds__( default_block_size, sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - batch_dense:: - uniform_batch< - const ValueType> - mat, + dense::uniform_batch< + const ValueType> + mat, const gko::batch:: multi_vector:: uniform_batch< @@ -94,7 +93,7 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void advanced_apply( const ValueType alpha, - const gko::batch::matrix::batch_dense::batch_item& mat, + const gko::batch::matrix::dense::batch_item& mat, const ValueType* const __restrict__ b, const ValueType beta, ValueType* const __restrict__ x) { @@ -132,10 +131,9 @@ __global__ __launch_bounds__( const ValueType> alpha, const gko::batch::matrix:: - batch_dense:: - uniform_batch< - const ValueType> - mat, + dense::uniform_batch< + const ValueType> + mat, const gko::batch:: multi_vector:: uniform_batch< diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index bd2079907a3..6a14919bf2f 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -308,14 +308,7 @@ void MultiVector::convert_to(matrix::Dense* result) const template void MultiVector::move_to(matrix::Dense* result) { - auto exec = result->get_executor() == nullptr ? this->get_executor() - : result->get_executor(); - auto tmp = gko::batch::matrix::Dense::create_const( - exec, this->get_size(), - make_const_array_view(this->get_executor(), - this->get_num_stored_elements(), - this->get_const_values())); - tmp->move_to(result); + this->convert_to(result); } diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp index caca4577cf7..71445550b87 100644 --- a/core/base/batch_struct.hpp +++ b/core/base/batch_struct.hpp @@ -51,9 +51,9 @@ template struct batch_item { using value_type = ValueType; ValueType* values; - int stride; - int num_rows; - int num_rhs; + int32 stride; + int32 num_rows; + int32 num_rhs; }; @@ -67,9 +67,9 @@ struct uniform_batch { ValueType* values; size_type num_batch_items; - int stride; - int num_rows; - int num_rhs; + int32 stride; + int32 num_rows; + int32 num_rhs; size_type get_entry_storage() const { @@ -117,8 +117,8 @@ extract_batch_item(const multi_vector::uniform_batch& batch, template GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item -extract_batch_item(ValueType* const batch_values, const int stride, - const int num_rows, const int num_rhs, +extract_batch_item(ValueType* const batch_values, const int32 stride, + const int32 num_rows, const int32 num_rhs, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 93b2b027ceb..0bbfde40cc9 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
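// Pointer-arithmetic sketch matching extract_batch_item() above: batch item k
// of a uniform batch starts at values + k * stride * num_rows.  The buffer and
// sizes below are illustrative.
#include <ginkgo/ginkgo.hpp>

void extract_batch_item_sketch()
{
    const gko::int32 stride = 3;  // equals the number of columns here
    const gko::int32 num_rows = 2;
    const gko::size_type batch_idx = 1;
    double values[12] = {};  // two contiguous 2x3 items
    double* item_values = values + batch_idx * stride * num_rows;  // values + 6
    (void)item_values;
}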
namespace gko { namespace batch { namespace matrix { -namespace batch_dense { +namespace dense { /** @@ -51,10 +51,10 @@ namespace batch_dense { template struct batch_item { using value_type = ValueType; - ValueType* values; - int stride; - int num_rows; - int num_cols; + value_type* values; + int32 stride; + int32 num_rows; + int32 num_cols; }; @@ -68,9 +68,9 @@ struct uniform_batch { ValueType* values; size_type num_batch_items; - int stride; - int num_rows; - int num_cols; + int32 stride; + int32 num_rows; + int32 num_cols; size_type get_entry_storage() const { @@ -79,38 +79,37 @@ struct uniform_batch { }; -} // namespace batch_dense +} // namespace dense template -GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item to_const( - const batch_dense::batch_item& b) +GKO_ATTRIBUTES GKO_INLINE dense::batch_item to_const( + const dense::batch_item& b) { return {b.values, b.stride, b.num_rows, b.num_cols}; } template -GKO_ATTRIBUTES GKO_INLINE batch_dense::uniform_batch to_const( - const batch_dense::uniform_batch& ub) +GKO_ATTRIBUTES GKO_INLINE dense::uniform_batch to_const( + const dense::uniform_batch& ub) { return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_cols}; } template -GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( - const batch_dense::uniform_batch& batch, - const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE dense::batch_item extract_batch_item( + const dense::uniform_batch& batch, const size_type batch_idx) { return {batch.values + batch_idx * batch.stride * batch.num_rows, batch.stride, batch.num_rows, batch.num_cols}; } template -GKO_ATTRIBUTES GKO_INLINE batch_dense::batch_item extract_batch_item( - ValueType* const batch_values, const int stride, const int num_rows, - const int num_cols, const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE dense::batch_item extract_batch_item( + ValueType* const batch_values, const int32 stride, const int32 num_rows, + const int32 num_cols, const size_type batch_idx) { return {batch_values + batch_idx * stride * num_rows, stride, num_rows, num_cols}; diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 715332418fb..12f34509275 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -54,7 +54,7 @@ namespace cuda { * while also shallow-casting to the required CUDA scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. 
*/ @@ -66,9 +66,9 @@ inline batch::multi_vector::uniform_batch> get_batch_struct(const batch::MultiVector* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } /** @@ -79,9 +79,9 @@ inline batch::multi_vector::uniform_batch> get_batch_struct(batch::MultiVector* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index f191953f7b9..8daf06f416c 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -58,7 +58,7 @@ namespace cuda { * while also shallow-casting to the required CUDA scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. */ @@ -66,13 +66,13 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::batch_dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(const batch::matrix::Dense* const op) { return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -80,13 +80,13 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::batch_dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(batch::matrix::Dense* const op) { return {as_cuda_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 9c752a94b4f..2a0c03f552e 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -53,7 +53,7 @@ namespace dpcpp { * while also shallow-casting to the required DPCPP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. 
*/ @@ -65,9 +65,9 @@ inline batch::multi_vector::uniform_batch get_batch_struct( const batch::MultiVector* const op) { return {op->get_const_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -79,9 +79,9 @@ inline batch::multi_vector::uniform_batch get_batch_struct( batch::MultiVector* const op) { return {op->get_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp.inc index dacd31feade..88ef5f54764 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp.inc +++ b/dpcpp/matrix/batch_dense_kernels.hpp.inc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __dpct_inline__ void simple_apply_kernel( - const gko::batch::matrix::batch_dense::batch_item& mat, + const gko::batch::matrix::dense::batch_item& mat, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1) @@ -66,7 +66,7 @@ __dpct_inline__ void simple_apply_kernel( template __dpct_inline__ void advanced_apply_kernel( const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::matrix::batch_dense::batch_item& mat, + const gko::batch::matrix::dense::batch_item& mat, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& beta, const gko::batch::multi_vector::batch_item& x, diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index f561bf004c7..1955399d0d8 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -37,8 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" -#include #include +#include #include "core/base/batch_struct.hpp" @@ -56,7 +56,7 @@ namespace dpcpp { * while also shallow-casting to the required DPCPP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. */ @@ -64,13 +64,13 @@ namespace dpcpp { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::batch_dense::uniform_batch -get_batch_struct(const batch::matrix::Dense* const op) +inline batch::matrix::dense::uniform_batch get_batch_struct( + const batch::matrix::Dense* const op) { return {op->get_const_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -78,13 +78,13 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. 
*/ template -inline batch::matrix::batch_dense::uniform_batch get_batch_struct( +inline batch::matrix::dense::uniform_batch get_batch_struct( batch::matrix::Dense* const op) { return {op->get_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 442260e50e6..732c40662aa 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -54,7 +54,7 @@ namespace hip { * while also shallow-casting to the required Hip scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. */ @@ -66,9 +66,9 @@ inline batch::multi_vector::uniform_batch> get_batch_struct(const batch::MultiVector* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } /** @@ -79,9 +79,9 @@ inline batch::multi_vector::uniform_batch> get_batch_struct( batch::MultiVector* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index c0659420661..a22797a03d4 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -58,7 +58,7 @@ namespace hip { * while also shallow-casting to the required HIP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. + * object. */ @@ -66,13 +66,13 @@ namespace hip { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::batch_dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(const batch::matrix::Dense* const op) { return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -80,13 +80,13 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. 
*/ template -inline batch::matrix::batch_dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(batch::matrix::Dense* const op) { return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index d081e5d440e..932c52edfc5 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -63,7 +63,7 @@ namespace matrix { * each batch item are also stored consecutively in memory). * * @note Though the storage layout is the same as the multi-vector object, the - * class semantics and the operations it aims to provide is different. Hence it + * class semantics and the operations it aims to provide are different. Hence it * is recommended to create multi-vector objects if the user means to view the * data as a set of vectors. * @@ -123,13 +123,13 @@ class Dense final : public EnableBatchLinOp>, create_const_multi_vector_view() const; /** - * Creates a mutable view (of matrix::Dense type) of one item of the + * Creates a mutable view (of gko::matrix::Dense type) of one item of the * batch::matrix::Dense object. Does not perform any deep * copies, but only returns a view of the data. * * @param item_id The index of the batch item * - * @return a batch::matrix::Dense object with the data from the batch item + * @return a gko::matrix::Dense object with the data from the batch item * at the given index. */ std::unique_ptr create_view_for_item(size_type item_id); @@ -168,7 +168,7 @@ class Dense final : public EnableBatchLinOp>, * * @note the method has to be called on the same Executor the matrix is * stored at (e.g. trying to call this method on a GPU Dense object - * from the OMP results in a runtime error) + * from the OMP may result in incorrect behaviour) */ value_type& at(size_type batch_id, size_type row, size_type col) { @@ -197,7 +197,7 @@ class Dense final : public EnableBatchLinOp>, * * @note the method has to be called on the same Executor the matrix is * stored at (e.g. trying to call this method on a GPU Dense object - * from the OMP results in a runtime error) + * from the OMP may result in incorrect behaviour) */ ValueType& at(size_type batch_id, size_type idx) noexcept { @@ -268,7 +268,7 @@ class Dense final : public EnableBatchLinOp>, */ static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - gko::detail::const_array_view&& values); + detail::const_array_view&& values); /** * Apply the matrix to a multi-vector. Represents the matrix vector @@ -343,7 +343,7 @@ class Dense final : public EnableBatchLinOp>, } /** - * Creates a Dense matrix with the same configuration as the callers + * Creates a Dense matrix with the same configuration as the caller's * matrix. * * @returns a Dense matrix with the same configuration as the caller. 
diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index ce7c7af5605..0a3dbf37493 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -67,9 +67,9 @@ inline batch::multi_vector::uniform_batch get_batch_struct( const batch::MultiVector* const op) { return {op->get_const_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -81,9 +81,9 @@ inline batch::multi_vector::uniform_batch get_batch_struct( batch::MultiVector* const op) { return {op->get_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc index 20e395af5b7..17144267af1 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp.inc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template inline void simple_apply_kernel( - const gko::batch::matrix::batch_dense::batch_item& a, + const gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) { @@ -57,7 +57,7 @@ inline void simple_apply_kernel( template inline void advanced_apply_kernel( const ValueType alpha, - const gko::batch::matrix::batch_dense::batch_item& a, + const gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, const ValueType beta, const gko::batch::multi_vector::batch_item& c) diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 47d48f1e927..dcd4ce3e71e 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include @@ -67,13 +68,13 @@ namespace host { * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::batch_dense::uniform_batch -get_batch_struct(const batch::matrix::Dense* const op) +inline batch::matrix::dense::uniform_batch get_batch_struct( + const batch::matrix::Dense* const op) { return {op->get_const_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } @@ -81,13 +82,13 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. 
*/ template -inline batch::matrix::batch_dense::uniform_batch get_batch_struct( +inline batch::matrix::dense::uniform_batch get_batch_struct( batch::matrix::Dense* const op) { return {op->get_values(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1])}; } diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index e0c7643c8d7..a49168dc24e 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -140,9 +140,9 @@ TYPED_TEST(MultiVector, ScalesData) auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_0->scale(alpha.get()); + this->mtx_00->scale(ualpha[0].get()); this->mtx_01->scale(ualpha[1].get()); - auto res = gko::batch::unbatch>(this->mtx_0.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_00.get(), 0.); @@ -158,9 +158,9 @@ TYPED_TEST(MultiVector, ScalesDataWithScalar) auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->scale(alpha.get()); + this->mtx_10->scale(ualpha[0].get()); this->mtx_11->scale(ualpha[1].get()); - auto res = gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); @@ -196,9 +196,9 @@ TYPED_TEST(MultiVector, AddsScaled) auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); @@ -214,9 +214,9 @@ TYPED_TEST(MultiVector, AddsScaledWithScalar) auto ualpha = gko::batch::unbatch>(alpha.get()); this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get()); + this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get()); this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get()); - auto res = gko::batch::unbatch>(this->mtx_1.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.); @@ -244,9 +244,9 @@ TYPED_TEST(MultiVector, ComputesDot) auto ures = gko::batch::unbatch>(result.get()); this->mtx_0->compute_dot(this->mtx_1.get(), result.get()); + this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get()); - auto res = gko::batch::unbatch>(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); @@ -256,6 +256,7 @@ TYPED_TEST(MultiVector, ComputesDot) TYPED_TEST(MultiVector, ComputeDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; + auto result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); @@ -285,9 +286,9 @@ TYPED_TEST(MultiVector, ComputesConjDot) auto ures = gko::batch::unbatch>(result.get()); this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()); + this->mtx_00->compute_conj_dot(this->mtx_10.get(), ures[0].get()); this->mtx_01->compute_conj_dot(this->mtx_11.get(), ures[1].get()); - auto res = gko::batch::unbatch>(result.get()); GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.); @@ -297,6 +298,7 @@ TYPED_TEST(MultiVector, ComputesConjDot) TYPED_TEST(MultiVector, ComputeConjDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; + auto 
result = Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3})); diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index 97dbe3e77cb..a85453edee8 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -57,14 +57,12 @@ class Dense : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::batch::matrix::Dense; - using MVec = gko::batch::MultiVector; + using BMtx = gko::batch::matrix::Dense; + using BMVec = gko::batch::MultiVector; using DenseMtx = gko::matrix::Dense; - using ComplexMtx = gko::to_complex; - using RealMtx = gko::remove_complex; Dense() : exec(gko::ReferenceExecutor::create()), - mtx_0(gko::batch::initialize( + mtx_0(gko::batch::initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, exec)), @@ -72,7 +70,7 @@ class Dense : public ::testing::Test { {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), mtx_01(gko::initialize( {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), - b_0(gko::batch::initialize( + b_0(gko::batch::initialize( {{I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), I({1.0, 0.0, 2.0})}, {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), @@ -86,7 +84,7 @@ class Dense : public ::testing::Test { {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), I({1.0, 0.0, 2.0})}, exec)), - x_0(gko::batch::initialize( + x_0(gko::batch::initialize( {{I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}}, exec)), @@ -97,13 +95,13 @@ class Dense : public ::testing::Test { {} std::shared_ptr exec; - std::unique_ptr mtx_0; + std::unique_ptr mtx_0; std::unique_ptr mtx_00; std::unique_ptr mtx_01; - std::unique_ptr b_0; + std::unique_ptr b_0; std::unique_ptr b_00; std::unique_ptr b_01; - std::unique_ptr x_0; + std::unique_ptr x_0; std::unique_ptr x_00; std::unique_ptr x_01; @@ -119,11 +117,10 @@ TYPED_TEST(Dense, AppliesToBatchMultiVector) using T = typename TestFixture::value_type; this->mtx_0->apply(this->b_0.get(), this->x_0.get()); + this->mtx_00->apply(this->b_00.get(), this->x_00.get()); this->mtx_01->apply(this->b_01.get(), this->x_01.get()); - auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); } @@ -131,12 +128,12 @@ TYPED_TEST(Dense, AppliesToBatchMultiVector) TYPED_TEST(Dense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) { - using Mtx = typename TestFixture::Mtx; - using MVec = typename TestFixture::MVec; + using BMtx = typename TestFixture::BMtx; + using BMVec = typename TestFixture::BMVec; using DenseMtx = typename TestFixture::DenseMtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch::initialize(2, {1.5}, this->exec); - auto beta = gko::batch::initialize(2, {-4.0}, this->exec); + auto alpha = gko::batch::initialize(2, {1.5}, this->exec); + auto beta = gko::batch::initialize(2, {-4.0}, this->exec); auto alpha0 = gko::initialize({1.5}, this->exec); auto alpha1 = gko::initialize({1.5}, this->exec); auto beta0 = gko::initialize({-4.0}, this->exec); @@ -144,13 +141,12 @@ TYPED_TEST(Dense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), this->x_00.get()); this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), this->x_01.get()); - auto res = 
gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); } @@ -158,12 +154,12 @@ TYPED_TEST(Dense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector) { - using Mtx = typename TestFixture::Mtx; - using MVec = typename TestFixture::MVec; + using BMtx = typename TestFixture::BMtx; + using BMVec = typename TestFixture::BMVec; using DenseMtx = typename TestFixture::DenseMtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); - auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); + auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); auto alpha0 = gko::initialize({1.5}, this->exec); auto alpha1 = gko::initialize({-1.0}, this->exec); auto beta0 = gko::initialize({2.5}, this->exec); @@ -171,13 +167,12 @@ TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector) this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), this->x_00.get()); this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), this->x_01.get()); - auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); } @@ -185,8 +180,9 @@ TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector) TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultCols) { - using MVec = typename TestFixture::MVec; - auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); + using BMVec = typename TestFixture::BMVec; + + auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), gko::DimensionMismatch); @@ -195,8 +191,9 @@ TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultCols) TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultRows) { - using MVec = typename TestFixture::MVec; - auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); + using BMVec = typename TestFixture::BMVec; + + auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), gko::DimensionMismatch); @@ -205,9 +202,10 @@ TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultRows) TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; + auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()), gko::DimensionMismatch); @@ -216,13 +214,13 @@ TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension) TYPED_TEST(Dense, AdvancedApplyFailsOnWrongInnerDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); auto alpha = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); auto beta = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, 
gko::dim<2>{1, 1}}); ASSERT_THROW( this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), @@ -232,13 +230,13 @@ TYPED_TEST(Dense, AdvancedApplyFailsOnWrongInnerDimension) TYPED_TEST(Dense, AdvancedApplyFailsOnWrongAlphaDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); auto alpha = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); auto beta = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); ASSERT_THROW( this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index a73efcd8753..119a868be09 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -55,17 +55,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. class Dense : public CommonTestFixture { protected: - using Mtx = gko::batch::matrix::Dense; - using MVec = gko::batch::MultiVector; + using BMtx = gko::batch::matrix::Dense; + using BMVec = gko::batch::MultiVector; Dense() : rand_engine(15) {} - template - std::unique_ptr gen_mtx(const gko::size_type num_batch_items, - gko::size_type num_rows, - gko::size_type num_cols) + template + std::unique_ptr gen_mtx(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) { - return gko::test::generate_random_batch_matrix( + return gko::test::generate_random_batch_matrix( num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); @@ -75,15 +75,15 @@ class Dense : public CommonTestFixture { { const int num_rows = 252; const int num_cols = 32; - x = gen_mtx(batch_size, num_rows, num_cols); - y = gen_mtx(batch_size, num_cols, num_vecs); - alpha = gen_mtx(batch_size, 1, 1); - beta = gen_mtx(batch_size, 1, 1); + x = gen_mtx(batch_size, num_rows, num_cols); + y = gen_mtx(batch_size, num_cols, num_vecs); + alpha = gen_mtx(batch_size, 1, 1); + beta = gen_mtx(batch_size, 1, 1); dx = gko::clone(exec, x); dy = gko::clone(exec, y); dalpha = gko::clone(exec, alpha); dbeta = gko::clone(exec, beta); - expected = MVec::create( + expected = BMVec::create( ref, gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs})); expected->fill(gko::one()); @@ -93,16 +93,16 @@ class Dense : public CommonTestFixture { std::default_random_engine rand_engine; const size_t batch_size = 11; - std::unique_ptr x; - std::unique_ptr y; - std::unique_ptr alpha; - std::unique_ptr beta; - std::unique_ptr expected; - std::unique_ptr dresult; - std::unique_ptr dx; - std::unique_ptr dy; - std::unique_ptr dalpha; - std::unique_ptr dbeta; + std::unique_ptr x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; }; From 5ac0ad8a4a0fc20b7565d7438871c82acc214874 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 9 Oct 2023 22:56:15 +0200 Subject: [PATCH 361/583] dpcpp Jacobi needs ranlux --- dpcpp/test/preconditioner/jacobi_kernels.dp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index b8082a2db32..aae15245357 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -90,7 +90,7 @@ class Jacobi : public ::testing::Test { gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, value_type accuracy = 0.1, bool skip_sorting = true) { - std::default_random_engine engine(42); + std::ranlux48 engine(42); const auto dim = *(end(block_pointers) - 1); if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( From 8ab42cb37f0a46f3400639787c377740266168be Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 9 Oct 2023 23:12:59 +0200 Subject: [PATCH 362/583] Remove create_multivector_view --- core/matrix/batch_dense.cpp | 32 ---------------------- core/test/matrix/batch_dense.cpp | 7 ----- include/ginkgo/core/matrix/batch_dense.hpp | 20 ++------------ 3 files changed, 2 insertions(+), 57 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index b948a2c3afc..da092a20229 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -64,38 +64,6 @@ GKO_REGISTER_OPERATION(advanced_apply, batch_dense::advanced_apply); } // namespace dense -template -std::unique_ptr> -Dense::create_multi_vector_view() -{ - auto exec = this->get_executor(); - auto num_batch_items = this->get_num_batch_items(); - auto num_rows = this->get_common_size()[0]; - auto stride = this->get_common_size()[1]; - auto mvec = MultiVector::create( - exec, this->get_size(), - make_array_view(exec, num_batch_items * num_rows * stride, - this->get_values())); - return mvec; -} - - -template -std::unique_ptr> -Dense::create_const_multi_vector_view() const -{ - auto exec = this->get_executor(); - auto num_batch_items = this->get_num_batch_items(); - auto num_rows = this->get_common_size()[0]; - auto stride = this->get_common_size()[1]; - auto mvec = MultiVector::create_const( - exec, this->get_size(), - make_const_array_view(exec, num_batch_items * num_rows * stride, - this->get_const_values())); - return mvec; -} - - template std::unique_ptr> Dense::create_view_for_item(size_type item_id) diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 36fc3f2ee4a..316312bd68f 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -138,13 +138,6 @@ TYPED_TEST(Dense, CanCreateDenseItemView) } -TYPED_TEST(Dense, CanCreateMultiVectorView) -{ - GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec, - 0.0); -} - - TYPED_TEST(Dense, CanBeCopied) { auto mtx_copy = gko::batch::matrix::Dense::create(this->exec); diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 932c52edfc5..50f8fe39727 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -106,22 +106,6 @@ class Dense final : public EnableBatchLinOp>, void move_to(Dense>* result) override; - /** - * Creates a mutable view (of MultiVector type) of the data owned by the - * matrix::Dense object. Does not perform any deep copies, but only - * returns a view of the underlying data. - * - * @return a MultiVector object with a view of the data from the batch - * dense matrix. 
- */ - std::unique_ptr> create_multi_vector_view(); - - /** - * @copydoc create_const_multi_vector_view() - */ - std::unique_ptr> - create_const_multi_vector_view() const; - /** * Creates a mutable view (of gko::matrix::Dense type) of one item of the * batch::matrix::Dense object. Does not perform any deep @@ -234,8 +218,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From 00eba9845969f5cb363d6e22409bd7a73a1aa16f Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 9 Oct 2023 21:50:28 +0000 Subject: [PATCH 363/583] Format files Co-authored-by: Pratik Nayak --- include/ginkgo/core/matrix/batch_dense.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 50f8fe39727..2a33a0a8df3 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -218,8 +218,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From ceb97e01f24a6e74aa732aa8bba34ec3ec71301f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 00:07:15 +0200 Subject: [PATCH 364/583] const_array_view needs to be in gko:: MSVC compiler fails lookup in gko::detail if there exists a gko::x::detail namespace --- include/ginkgo/core/matrix/batch_dense.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 2a33a0a8df3..89f12d69f62 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -252,7 +252,7 @@ class Dense final : public EnableBatchLinOp>, */ static std::unique_ptr> create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - detail::const_array_view&& values); + gko::detail::const_array_view&& values); /** * Apply the matrix to a multi-vector.
Represents the matrix vector From f6d4c4e3fee1ac4af0932a99cc4c01339571a84f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 16:15:32 +0200 Subject: [PATCH 365/583] Review updates Co-authored-by: Yu-Hsiang Tsai Co-authored-by: Marcel Koch --- core/matrix/batch_dense.cpp | 14 +---- core/test/base/batch_dim.cpp | 10 ---- core/test/matrix/batch_dense.cpp | 16 +++-- cuda/base/batch_struct.hpp | 2 +- cuda/matrix/batch_dense_kernels.cu | 1 + cuda/matrix/batch_struct.hpp | 2 +- dpcpp/base/batch_struct.hpp | 2 +- dpcpp/matrix/batch_dense_kernels.dp.cpp | 58 +++++++++---------- dpcpp/matrix/batch_struct.hpp | 2 +- hip/base/batch_struct.hip.hpp | 2 +- hip/matrix/batch_dense_kernels.hip.cpp | 2 + hip/matrix/batch_struct.hip.hpp | 2 +- include/ginkgo/core/base/batch_dim.hpp | 12 ---- .../ginkgo/core/base/batch_multi_vector.hpp | 24 ++++++-- include/ginkgo/core/matrix/batch_dense.hpp | 36 +++++++----- reference/test/matrix/batch_dense_kernels.cpp | 26 --------- 16 files changed, 89 insertions(+), 122 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index da092a20229..7675fcdde9c 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -100,19 +100,7 @@ template std::unique_ptr> Dense::create_with_config_of( ptr_param> other) { - // De-referencing `other` before calling the functions (instead of - // using operator `->`) is currently required to be compatible with - // CUDA 10.1. - // Otherwise, it results in a compile error. - return (*other).create_with_same_config(); -} - - -template -std::unique_ptr> Dense::create_with_same_config() - const -{ - return Dense::create(this->get_executor(), this->get_size()); + return Dense::create(other->get_executor(), other->get_size()); } diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp index 7914eb4d15e..e8722530fba 100644 --- a/core/test/base/batch_dim.cpp +++ b/core/test/base/batch_dim.cpp @@ -85,16 +85,6 @@ TEST(BatchDim, NotEqualWorks) } -TEST(BatchDim, CanGetCumulativeOffsets) -{ - auto d = gko::batch_dim<2>(3, gko::dim<2>(4, 2)); - - ASSERT_EQ(d.get_cumulative_offset(0), 0); - ASSERT_EQ(d.get_cumulative_offset(1), 8); - ASSERT_EQ(d.get_cumulative_offset(2), 16); -} - - TEST(BatchDim, TransposesBatchDimensions) { ASSERT_EQ(gko::transpose(gko::batch_dim<2>(2, gko::dim<2>{4, 2})), diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 316312bd68f..7bde0c708dc 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -289,7 +289,7 @@ TYPED_TEST(Dense, CanBeConstructedFromDenseMatricesByDuplication) gko::batch::create_from_item>( this->exec, 3, mat1.get()); - GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 0); } @@ -316,7 +316,7 @@ TYPED_TEST(Dense, CanBeConstructedByDuplicatingDenseMatrices) auto m2 = gko::batch::duplicate>( this->exec, 3, m.get()); - GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 0); } @@ -384,13 +384,21 @@ TYPED_TEST(Dense, CanBeDoubleListConstructed) EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{1.0}); EXPECT_EQ(m->at(0, 2), value_type{0.0}); - ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 3), value_type{2.0}); EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(0, 5), value_type{3.0}); + EXPECT_EQ(m->at(0, 6), value_type{3.0}); + EXPECT_EQ(m->at(0, 7), value_type{6.0}); + EXPECT_EQ(m->at(0, 8), value_type{1.0}); 
EXPECT_EQ(m->at(1, 0), value_type{1.0}); EXPECT_EQ(m->at(1, 1), value_type{2.0}); EXPECT_EQ(m->at(1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 3), value_type{3.0}); + EXPECT_EQ(m->at(1, 3), value_type{3.0}); EXPECT_EQ(m->at(1, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 5), value_type{-2.0}); + EXPECT_EQ(m->at(1, 6), value_type{5.0}); + EXPECT_EQ(m->at(1, 7), value_type{6.0}); + EXPECT_EQ(m->at(1, 8), value_type{-3.0}); } diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 12f34509275..14b300c9204 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -54,7 +54,7 @@ namespace cuda { * while also shallow-casting to the required CUDA scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. */ diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 4f1dbc8f4d4..47c478864cf 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -77,6 +77,7 @@ constexpr int sm_oversubscription = 4; #include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + // clang-format on diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 8daf06f416c..2ae453b6e61 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -58,7 +58,7 @@ namespace cuda { * while also shallow-casting to the required CUDA scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. */ diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index 2a0c03f552e..dc8301ecb2e 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -53,7 +53,7 @@ namespace dpcpp { * while also shallow-casting to the required DPCPP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. 
*/ diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 6aec3e57fc5..8fca47c27b8 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -98,19 +98,19 @@ void simple_apply(std::shared_ptr exec, } // Launch a kernel that has nbatches blocks, each block has max group size - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -145,24 +145,24 @@ void advanced_apply(std::shared_ptr exec, const dim3 grid(num_batch_items); // Launch a kernel that has nbatches blocks, each block has max group size - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index 1955399d0d8..d452f78644f 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -56,7 +56,7 @@ namespace dpcpp { * while also shallow-casting to the required DPCPP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. 
*/ diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 732c40662aa..5747e202fb7 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -54,7 +54,7 @@ namespace hip { * while also shallow-casting to the required Hip scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. */ diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index aa6d717438e..a0fdea446be 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -79,8 +79,10 @@ constexpr int sm_oversubscription = 4; #include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + // clang-format on + } // namespace batch_dense } // namespace hip } // namespace kernels diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index a22797a03d4..c1bd6441367 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -58,7 +58,7 @@ namespace hip { * while also shallow-casting to the required HIP scalar type. * * A specialization is needed for every format of every kind of linear algebra - * object. + * object. These are intended to be called on the host. */ diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index 3bda352fb9d..e0ade2c872f 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -74,18 +74,6 @@ struct batch_dim { return common_size_; } - /** - * Get the cumulative storage size offset - * - * @param batch_id the batch id - * - * @return the cumulative offset - */ - size_type get_cumulative_offset(size_type batch_id) const - { - return batch_id * common_size_[0] * common_size_[1]; - } - /** * Checks if two batch_dim objects are equal. * diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 7830a4c6efb..61dffba3193 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -202,8 +202,7 @@ class MultiVector value_type* get_values_for_item(size_type batch_id) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return values_.get_data() + - this->get_size().get_cumulative_offset(batch_id); + return values_.get_data() + this->get_cumulative_offset(batch_id); } /** @@ -217,8 +216,7 @@ class MultiVector size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return values_.get_const_data() + - this->get_size().get_cumulative_offset(batch_id); + return values_.get_const_data() + this->get_cumulative_offset(batch_id); } /** @@ -233,6 +231,19 @@ class MultiVector return values_.get_num_elems(); } + /** + * Get the cumulative storage size offset + * + * @param batch_id the batch id + * + * @return the cumulative offset + */ + size_type get_cumulative_offset(size_type batch_id) const + { + return batch_id * this->get_common_size()[0] * + this->get_common_size()[1]; + } + /** * Returns a single element for a particular batch item. 
* @@ -375,7 +386,8 @@ class MultiVector private: inline size_type compute_num_elems(const batch_dim<2>& size) { - return size.get_cumulative_offset(size.get_num_batch_items()); + return size.get_num_batch_items() * size.get_common_size()[0] * + size.get_common_size()[1]; } protected: @@ -434,7 +446,7 @@ class MultiVector size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept { - return batch_size_.get_cumulative_offset(batch) + + return this->get_cumulative_offset(batch) + row * batch_size_.get_common_size()[1] + col; } diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 89f12d69f62..59ab92cd146 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -124,6 +124,19 @@ class Dense final : public EnableBatchLinOp>, std::unique_ptr create_const_view_for_item( size_type item_id) const; + /** + * Get the cumulative storage size offset + * + * @param batch_id the batch id + * + * @return the cumulative offset + */ + size_type get_cumulative_offset(size_type batch_id) const + { + return batch_id * this->get_common_size()[0] * + this->get_common_size()[1]; + } + /** * Returns a pointer to the array of values of the multi-vector * @@ -207,8 +220,7 @@ class Dense final : public EnableBatchLinOp>, value_type* get_values_for_item(size_type batch_id) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return values_.get_data() + - this->get_size().get_cumulative_offset(batch_id); + return values_.get_data() + this->get_cumulative_offset(batch_id); } /** @@ -222,8 +234,7 @@ class Dense final : public EnableBatchLinOp>, size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return values_.get_const_data() + - this->get_size().get_cumulative_offset(batch_id); + return values_.get_const_data() + this->get_cumulative_offset(batch_id); } /** @@ -269,8 +280,8 @@ class Dense final : public EnableBatchLinOp>, /** * Apply the matrix to a multi-vector with a linear combination of the given - * input vector. Represents the matrix vector multiplication, x = alpha* A * - * b + beta * x, where x and b are both multi-vectors. + * input vector. Represents the matrix vector multiplication, x = alpha * A + * * b + beta * x, where x and b are both multi-vectors. * * @param alpha the scalar to scale the matrix-vector product with * @param b the multi-vector to be applied to @@ -288,7 +299,8 @@ class Dense final : public EnableBatchLinOp>, private: inline size_type compute_num_elems(const batch_dim<2>& size) { - return size.get_cumulative_offset(size.get_num_batch_items()); + return size.get_num_batch_items() * size.get_common_size()[0] * + size.get_common_size()[1]; } protected: @@ -326,14 +338,6 @@ class Dense final : public EnableBatchLinOp>, GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); } - /** - * Creates a Dense matrix with the same configuration as the caller's - * matrix. - * - * @returns a Dense matrix with the same configuration as the caller. 
- */ - std::unique_ptr create_with_same_config() const; - void apply_impl(const MultiVector* b, MultiVector* x) const; @@ -345,7 +349,7 @@ class Dense final : public EnableBatchLinOp>, size_type linearize_index(size_type batch, size_type row, size_type col) const noexcept { - return this->get_size().get_cumulative_offset(batch) + + return this->get_cumulative_offset(batch) + row * this->get_size().get_common_size()[1] + col; } diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index a85453edee8..6a23374f7cb 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -126,32 +126,6 @@ TYPED_TEST(Dense, AppliesToBatchMultiVector) } -TYPED_TEST(Dense, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) -{ - using BMtx = typename TestFixture::BMtx; - using BMVec = typename TestFixture::BMVec; - using DenseMtx = typename TestFixture::DenseMtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch::initialize(2, {1.5}, this->exec); - auto beta = gko::batch::initialize(2, {-4.0}, this->exec); - auto alpha0 = gko::initialize({1.5}, this->exec); - auto alpha1 = gko::initialize({1.5}, this->exec); - auto beta0 = gko::initialize({-4.0}, this->exec); - auto beta1 = gko::initialize({-4.0}, this->exec); - - this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), - this->x_0.get()); - - this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), - this->x_00.get()); - this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), - this->x_01.get()); - auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); -} - - TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector) { using BMtx = typename TestFixture::BMtx; From a34315fe22a25709e2cd437b34c5958db9ea2d0d Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 16:35:46 +0200 Subject: [PATCH 366/583] Move apply validation to BatchLinOp --- core/matrix/batch_dense.cpp | 16 ++------- include/ginkgo/core/base/batch_lin_op.hpp | 40 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 7675fcdde9c..758635cea7f 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -128,12 +128,7 @@ template void Dense::apply_impl(const MultiVector* b, MultiVector* x) const { - GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + this->validate_application_parameters(b, x); this->get_executor()->run(dense::make_simple_apply(this, b, x)); } @@ -144,14 +139,7 @@ void Dense::apply_impl(const MultiVector* alpha, const MultiVector* beta, MultiVector* x) const { - GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1)); - 
GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1)); + this->validate_application_parameters(alpha, b, beta, x); this->get_executor()->run( dense::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 78ce4f4a942..a0efb2ea324 100644 --- a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -110,6 +111,45 @@ class BatchLinOp : public EnableAbstractPolymorphicObject { */ const batch_dim<2>& get_size() const noexcept { return size_; } + /** + * Validates the sizes for the apply(b,x) operation in the + * concrete BatchLinOp. + * + */ + template + void validate_application_parameters(const MultiVector* b, + MultiVector* x) const + { + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + } + + /** + * Validates the sizes for the apply(alpha, b , beta, x) operation in the + * concrete BatchLinOp. + * + */ + template + void validate_application_parameters(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const + { + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), + gko::dim<2>(1, 1)); + GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1)); + } + protected: /** * Sets the size of the batch operator. 
From 2b65f1433eed447c944ec2e89d0db1eca5c3753a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 16:38:25 +0200 Subject: [PATCH 367/583] Add to test_install --- test/test_install/test_install.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index d442647a985..325773f0b75 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -219,6 +219,13 @@ int main() auto test = batch_multi_vector_type::create(exec); } + // core/base/batch_dense.hpp + { + using type1 = float; + using batch_dense_type = gko::batch::Dense; + auto test = batch_dense_type::create(exec); + } + // core/base/combination.hpp { using type1 = int; From 5928b9ff4041490539ce182ae96738902a12f53e Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Tue, 10 Oct 2023 14:46:12 +0000 Subject: [PATCH 368/583] Format files Co-authored-by: Pratik Nayak --- dpcpp/matrix/batch_dense_kernels.dp.cpp | 54 ++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 8fca47c27b8..a6fba2df8e3 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -100,17 +100,17 @@ void simple_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -147,22 +147,22 @@ void advanced_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = 
batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } From ed59e2fe570e5a0b44245d80ec6f6cbe1b62ae00 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 22:53:30 +0200 Subject: [PATCH 369/583] Review updates Co-authored-by: Terry Cojean --- core/matrix/batch_dense_kernels.hpp | 1 - core/test/matrix/batch_dense.cpp | 8 ++------ cuda/matrix/batch_dense_kernels.cu | 3 --- cuda/matrix/batch_struct.hpp | 3 --- dpcpp/matrix/batch_struct.hpp | 2 -- hip/matrix/batch_dense_kernels.hip.cpp | 3 --- hip/matrix/batch_struct.hip.hpp | 3 --- include/ginkgo/core/matrix/batch_dense.hpp | 3 +++ reference/matrix/batch_struct.hpp | 2 -- test/matrix/batch_dense_kernels.cpp | 16 ++++++++-------- test/test_install/test_install.cpp | 2 +- 11 files changed, 14 insertions(+), 32 deletions(-) diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp index cb46b7291b8..ef59ff3e9cc 100644 --- a/core/matrix/batch_dense_kernels.hpp +++ b/core/matrix/batch_dense_kernels.hpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 7bde0c708dc..8e64c913a6a 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -256,7 +256,6 @@ TYPED_TEST(Dense, CanBeConstructedFromDenseMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, @@ -275,16 +274,15 @@ TYPED_TEST(Dense, CanBeConstructedFromDenseMatricesByDuplication) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize( 4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto bat_m = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = gko::batch::create_from_item>( this->exec, 3, mat1.get()); @@ -298,12 +296,10 @@ TYPED_TEST(Dense, CanBeConstructedByDuplicatingDenseMatrices) using value_type = typename TestFixture::value_type; using DenseMtx = typename TestFixture::DenseMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat2.get()}); @@ -342,6 +338,7 @@ TYPED_TEST(Dense, CanBeUnbatchedIntoDenseMatrices) TYPED_TEST(Dense, CanBeListConstructed) { using value_type = typename TestFixture::value_type; + auto m = gko::batch::initialize>( {{1.0, 2.0}, {1.0, 3.0}}, this->exec); @@ -406,7 +403,6 @@ TYPED_TEST(Dense, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; using index_type = int; - auto vec_data = std::vector>{}; vec_data.emplace_back(gko::matrix_data( {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}})); diff --git a/cuda/matrix/batch_dense_kernels.cu 
b/cuda/matrix/batch_dense_kernels.cu index 47c478864cf..dd82e15b8cc 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include @@ -44,8 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 2ae453b6e61..73712a7b81b 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -37,13 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" -#include -#include #include #include "core/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/types.hpp" diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index d452f78644f..b0393daf55d 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -37,12 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" -#include #include #include "core/base/batch_struct.hpp" -#include "dpcpp/base/config.hpp" namespace gko { diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index a0fdea446be..eb3da83760a 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include #include @@ -46,8 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index c1bd6441367..4670cf0988b 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -37,13 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/batch_struct.hpp" -#include -#include #include #include "core/base/batch_struct.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/types.hip.hpp" diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 59ab92cd146..7f3ce5890e4 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -133,6 +133,7 @@ class Dense final : public EnableBatchLinOp>, */ size_type get_cumulative_offset(size_type batch_id) const { + GKO_ASSERT(batch_id < this->get_num_batch_items()); return batch_id * this->get_common_size()[0] * this->get_common_size()[1]; } @@ -198,6 +199,7 @@ class Dense final : public EnableBatchLinOp>, */ ValueType& at(size_type batch_id, size_type idx) noexcept { + GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_data()[linearize_index(batch_id, idx)]; } @@ -206,6 +208,7 @@ class Dense final : public EnableBatchLinOp>, */ ValueType at(size_type batch_id, size_type idx) const noexcept { + GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data()[linearize_index(batch_id, idx)]; } diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index dcd4ce3e71e..483d7717718 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -37,8 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_struct.hpp" -#include -#include #include #include diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index 119a868be09..a243d51f3c1 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -75,11 +75,11 @@ class Dense : public CommonTestFixture { { const int num_rows = 252; const int num_cols = 32; - x = gen_mtx(batch_size, num_rows, num_cols); + mat = gen_mtx(batch_size, num_rows, num_cols); y = gen_mtx(batch_size, num_cols, num_vecs); alpha = gen_mtx(batch_size, 1, 1); beta = gen_mtx(batch_size, 1, 1); - dx = gko::clone(exec, x); + dmat = gko::clone(exec, mat); dy = gko::clone(exec, y); dalpha = gko::clone(exec, alpha); dbeta = gko::clone(exec, beta); @@ -93,13 +93,13 @@ class Dense : public CommonTestFixture { std::default_random_engine rand_engine; const size_t batch_size = 11; - std::unique_ptr x; + std::unique_ptr mat; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; std::unique_ptr dresult; - std::unique_ptr dx; + std::unique_ptr dmat; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; @@ -110,8 +110,8 @@ TEST_F(Dense, SingleVectorApplyIsEquivalentToRef) { set_up_apply_data(1); - x->apply(y.get(), expected.get()); - dx->apply(dy.get(), dresult.get()); + mat->apply(y.get(), expected.get()); + dmat->apply(dy.get(), dresult.get()); GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } @@ -121,8 +121,8 @@ TEST_F(Dense, SingleVectorAdvancedApplyIsEquivalentToRef) { set_up_apply_data(1); - x->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + mat->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmat->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 325773f0b75..7e53ea8f165 100644 --- a/test/test_install/test_install.cpp +++ 
b/test/test_install/test_install.cpp @@ -222,7 +222,7 @@ int main() // core/base/batch_dense.hpp { using type1 = float; - using batch_dense_type = gko::batch::Dense; + using batch_dense_type = gko::batch::matrix::Dense; auto test = batch_dense_type::create(exec); } From 17d54f3771b45c7991f0c3d57d4ebd34c17e6b72 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 14 Sep 2023 15:08:47 +0200 Subject: [PATCH 370/583] destroy rand_generator --- cuda/base/curand_bindings.hpp | 6 ++++++ cuda/solver/idr_kernels.cu | 1 + hip/base/hiprand_bindings.hip.hpp | 5 +++++ hip/solver/idr_kernels.hip.cpp | 1 + 4 files changed, 13 insertions(+) diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index 429481ec9b6..4bf12dd9064 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -83,6 +83,12 @@ inline curandGenerator_t rand_generator(int64 seed, } +inline void destroy(curandGenerator_t gen) +{ + GKO_ASSERT_NO_CURAND_ERRORS(curandDestroyGenerator(gen)); +} + + #define GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, CurandName) \ inline void rand_vector( \ curandGenerator_t& gen, int n, remove_complex mean, \ diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 10e8a7b2fc3..7bfe56987f4 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -104,6 +104,7 @@ void initialize_subspace_vectors(std::shared_ptr exec, gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); + curand::destroy(gen); } } diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 14e144f6d84..dfef3bb84b4 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -87,6 +87,11 @@ inline hiprandGenerator_t rand_generator(int64 seed, return gen; } +inline void destroy(hiprandGenerator_t gen) +{ + GKO_ASSERT_NO_HIPRAND_ERRORS(hiprandDestroyGenerator(gen)); +} + #define GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, HiprandName) \ inline void rand_vector( \ diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 9e6f353abe4..1a3d2931897 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -106,6 +106,7 @@ void initialize_subspace_vectors(std::shared_ptr exec, gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); + hiprand::destroy(gen); } } From c2649ded0206f072cfd9fa7688d7259d4a38241e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" <19565938+yhmtsai@users.noreply.github.com> Date: Fri, 13 Oct 2023 00:42:58 +0200 Subject: [PATCH 371/583] Fix PAPI segmentation fault (#1419) This PR fix the PAPI segmentation fault Related PR: https://github.com/ginkgo-project/ginkgo/pull/1419 --- include/ginkgo/core/log/papi.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp index bf22f7c876f..2b2a3326cce 100644 --- a/include/ginkgo/core/log/papi.hpp +++ b/include/ginkgo/core/log/papi.hpp @@ -208,10 +208,7 @@ class Papi : public Logger { create(std::shared_ptr, const Logger::mask_type& enabled_events = Logger::all_events_mask) { - return std::shared_ptr(new Papi(enabled_events), [](auto logger) { - papi_sde_shutdown(logger->get_handle()); - delete logger; - }); + return Papi::create(enabled_events); } /** @@ -223,8 +220,9 @@ class Papi : public Logger { const Logger::mask_type& enabled_events = Logger::all_events_mask) { return std::shared_ptr(new Papi(enabled_events), [](auto logger) { - papi_sde_shutdown(logger->get_handle()); + auto handle = logger->get_handle(); delete logger; + papi_sde_shutdown(handle); }); } From 74997fc2f2883c8ab88b425f67aa404611cc10ac Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Thu, 12 Oct 2023 11:09:25 -0700 Subject: [PATCH 372/583] Update tau.cpp Fixing order of includes. The ifdef check has to happen after config.h has been included, or else the perfstubs header file won't get included. Also, adding an argument name for the `end_tau()` function. --- core/log/tau.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/core/log/tau.cpp b/core/log/tau.cpp index 62b68732de1..5db95375da9 100644 --- a/core/log/tau.cpp +++ b/core/log/tau.cpp @@ -30,16 +30,14 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include +#include + #if GKO_HAVE_TAU #define PERFSTUBS_USE_TIMERS #include #endif - -#include -#include - - namespace gko { namespace log { @@ -56,7 +54,7 @@ void begin_tau(const char* name, profile_event_category) } -void end_tau(const char*, profile_event_category) +void end_tau(const char* name, profile_event_category) { PERFSTUBS_STOP_STRING(name); } From 1ca0debdaaec04ee635c86e5f198d40cf1ed1f12 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 13 Oct 2023 11:25:07 +0200 Subject: [PATCH 373/583] review updates Co-authored-by: Yuhsiang M. Tsai --- core/log/tau.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/log/tau.cpp b/core/log/tau.cpp index 5db95375da9..e1b29c9c953 100644 --- a/core/log/tau.cpp +++ b/core/log/tau.cpp @@ -30,6 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include #include #include From fe60c741b1aa818cca1894a72b48cbc5d5a96af6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 14 Jun 2023 16:19:54 +0200 Subject: [PATCH 374/583] add icpx support --- .github/workflows/intel.yml | 6 ++++-- CMakeLists.txt | 4 +++- README.md | 2 +- benchmark/CMakeLists.txt | 2 ++ cmake/autodetect_executors.cmake | 4 ++++ cmake/build_helpers.cmake | 7 ++++++- cmake/create_test.cmake | 2 ++ dpcpp/CMakeLists.txt | 2 ++ test/solver/CMakeLists.txt | 3 +++ third_party/gtest/CMakeLists.txt | 2 +- 10 files changed, 28 insertions(+), 6 deletions(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 9fd85708737..4652b3996e1 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -6,6 +6,7 @@ on: - 'master' - 'develop' - 'release/**' + - 'icpx_compilation' tags: - '**' pull_request: @@ -21,7 +22,8 @@ jobs: fail-fast: false matrix: config: - - {build_type: "Release", name: "intel/release/shared", "mixed": "ON"} + - {compiler: "dpcpp", build_type: "Release", name: "intel/dpcpp/release/shared", mixed: "ON"} + - {compiler: "icpx", build_type: "Release", name: "intel/icpx/release/shared", mixed: "OFF"} name: ${{ matrix.config.name }} runs-on: [gpu_intel] @@ -35,7 +37,7 @@ jobs: spack find --loaded mkdir build cd build - cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DGINKGO_COMPILER_FLAGS="-ffp-model=precise" -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON + cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DGINKGO_COMPILER_FLAGS="-ffp-model=precise" -DCMAKE_CXX_COMPILER=${{ matrix.config.compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON make -j8 ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dbce4a29c6..5dff9bcbaac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,7 @@ set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING if(MSVC) set(GINKGO_COMPILER_FLAGS "" CACHE STRING "Set the required CXX compiler flags, mainly used for warnings. Current default is ``") -elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp") +elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") # For now always use `-ffp-model=precise` with DPC++. This can be removed when # the floating point issues are fixed. 
set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING @@ -298,6 +298,8 @@ endif() if(GINKGO_BUILD_DPCPP) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION) + get_filename_component(GINKGO_SYCL_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) + set(SYCL_INCLUDE_PATH "${GINKGO_SYCL_DIR}/../include;${GINKGO_SYCL_DIR}/../include/sycl") else() set(GINKGO_DPCPP_MAJOR_VERSION "0") endif() diff --git a/README.md b/README.md index 44428386b83..102005e4a18 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ The Ginkgo HIP module has the following __additional__ requirements: The Ginkgo DPC++ module has the following __additional__ requirements: * _OneAPI 2021.3+_ -* Set `dpcpp` as the `CMAKE_CXX_COMPILER` +* Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER` * `c++17` is used to compile Ginkgo * The following oneAPI packages should be available: * oneMKL diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index fd04620f595..7a4f5b1ca43 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -140,6 +140,8 @@ if (GINKGO_BUILD_DPCPP) ginkgo_benchmark_onemkl_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_onemkl_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp) + target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) + target_link_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_libraries(dpcpp_timer ginkgo) endif() diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 315e0eb3e38..86949cecda9 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -40,6 +40,10 @@ endif() if (NOT DEFINED GINKGO_BUILD_DPCPP) try_compile(GKO_CAN_COMPILE_DPCPP ${PROJECT_BINARY_DIR}/dpcpp SOURCES ${PROJECT_SOURCE_DIR}/dpcpp/test_dpcpp.dp.cpp + # try_compile will pass the project CMAKE_CXX_FLAGS so passing -DCMAKE_CXX_FLAGS does not affect it. + # They append COMPILE_DEFINITIONS into CMAKE_CXX_FLAGS. + # Note. 
it is different from try_compile COMPILE_DEFINITIONS affect + CMAKE_FLAGS -DCOMPILE_DEFINITIONS=-fsycl CXX_STANDARD 17) if (GKO_CAN_COMPILE_DPCPP) message(STATUS "Enabling DPCPP executor") diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index a7b8c48acf3..25add05c60f 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -9,6 +9,10 @@ function(ginkgo_default_includes name) $ $ ) + if(DEFINED SYCL_INCLUDE_PATH) + # avoid -fsycl in all place + target_include_directories("${name}" PUBLIC ${SYCL_INCLUDE_PATH}) + endif() if(GINKGO_HAVE_HWLOC) target_include_directories("${name}" PUBLIC @@ -139,7 +143,8 @@ function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION MACRO_ "int main() {std::cout << ${MACRO_VAR} << '\\n'\;" "return 0\;}") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp" ${DPCPP_VERSION_PROG}) - execute_process(COMMAND ${DPCPP_COMPILER} ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp + # we always add -fsycl + execute_process(COMMAND ${DPCPP_COMPILER} -fsycl ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver ERROR_VARIABLE DPCPP_EXTRACT_VER_ERROR) execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index cec47fced74..55b70bbaeaa 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -124,6 +124,7 @@ function(ginkgo_create_dpcpp_test test_name) add_executable(${test_target_name} ${test_name}.dp.cpp) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) + target_link_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} "_dpcpp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE sycl) @@ -298,6 +299,7 @@ function(ginkgo_create_common_device_test test_name) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) + target_link_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) endif() if(GINKGO_BUILD_OMP) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 4099bb603a3..9d0952480be 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -88,8 +88,10 @@ configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common. 
ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) +set(GINKGO_DPCPP_FLAGS "-fsycl") set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +target_link_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt index 4cec6b05d22..3231956beb7 100644 --- a/test/solver/CMakeLists.txt +++ b/test/solver/CMakeLists.txt @@ -13,3 +13,6 @@ ginkgo_create_common_test(lower_trs_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(multigrid_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(solver DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(upper_trs_kernels DISABLE_EXECUTORS dpcpp) +if(GINKGO_BUILD_DPCPP) + target_link_options(test_solver_idr_kernels_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) +endif() diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt index 45b564dbfbf..378a7cdc705 100644 --- a/third_party/gtest/CMakeLists.txt +++ b/third_party/gtest/CMakeLists.txt @@ -22,7 +22,7 @@ set_target_properties(gtest gtest_main PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${GINKGO_LIBRARY_PATH}") # If CXX compiler is dpcpp, use -ffp-model=precise # Otherwise, it will throw src/gtest.cc:1583:8: error: comparison with NaN always evaluates to false in fast floating point modes -if(CMAKE_CXX_COMPILER MATCHES "dpcpp") +if(CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") target_compile_options(gtest PRIVATE "-ffp-model=precise") target_compile_options(gtest_main PRIVATE "-ffp-model=precise") endif() From 27d7512c9f2aca5e67ae5c5771c5bd0b99578f02 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 21 Aug 2023 09:41:42 +0200 Subject: [PATCH 375/583] add gko_add_sycl_to_target --- .github/workflows/intel.yml | 1 - .gitlab-ci.yml | 17 ++++++++++++++++- CMakeLists.txt | 5 +++-- benchmark/CMakeLists.txt | 2 +- cmake/build_helpers.cmake | 4 ---- cmake/create_test.cmake | 6 ++++-- cmake/sycl.cmake | 33 +++++++++++++++++++++++++++++++++ dpcpp/CMakeLists.txt | 6 ++++-- test/solver/CMakeLists.txt | 2 +- 9 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 cmake/sycl.cmake diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 4652b3996e1..db18b510e21 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -6,7 +6,6 @@ on: - 'master' - 'develop' - 'release/**' - - 'icpx_compilation' tags: - '**' pull_request: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6185608864f..e1f1eb8be3d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -581,7 +581,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static: BUILD_SHARED_LIBS: "OFF" MIXED_PRECISION: "OFF" -build/dpcpp/2022-1/cpu/release/static: +build/dpcpp/2022-1/cpu/release/shared: extends: - .build_and_test_template - .default_variables @@ -665,6 +665,21 @@ build/dpcpp/level_zero_dgpu/release/shared: DPCPP_SINGLE_MODE: "ON" ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" +build/icpx/level_zero_dgpu/release/shared: + extends: + - .build_and_test_template + - .default_variables + - .quick_test_condition + - .use_gko-oneapi-dgpu + variables: + C_COMPILER: "icx" + CXX_COMPILER: "icpx" + BUILD_DPCPP: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" + BUILD_TYPE: "Release" + DPCPP_SINGLE_MODE: "ON" + ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" + # Job with important warnings as error warnings: stage: code_quality diff --git a/CMakeLists.txt b/CMakeLists.txt index 5dff9bcbaac..3f38c1e7165 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,9 @@ endif() if(GINKGO_BUILD_HIP) include(cmake/hip.cmake) endif() +if(GINKGO_BUILD_DPCPP) + include(cmake/sycl.cmake) +endif() if(GINKGO_BUILD_OMP) find_package(OpenMP 3.0 REQUIRED) endif() @@ -298,8 +301,6 @@ endif() if(GINKGO_BUILD_DPCPP) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION) - get_filename_component(GINKGO_SYCL_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) - set(SYCL_INCLUDE_PATH "${GINKGO_SYCL_DIR}/../include;${GINKGO_SYCL_DIR}/../include/sycl") else() set(GINKGO_DPCPP_MAJOR_VERSION "0") endif() diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 7a4f5b1ca43..5cffddd51aa 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -141,7 +141,7 @@ if (GINKGO_BUILD_DPCPP) ginkgo_benchmark_onemkl_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp) target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) - target_link_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) + gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp) target_link_libraries(dpcpp_timer ginkgo) endif() diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 25add05c60f..34189a09450 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -9,10 +9,6 @@ function(ginkgo_default_includes name) $ $ ) - if(DEFINED SYCL_INCLUDE_PATH) - # avoid -fsycl in all place - target_include_directories("${name}" PUBLIC ${SYCL_INCLUDE_PATH}) - endif() if(GINKGO_HAVE_HWLOC) 
target_include_directories("${name}" PUBLIC diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 55b70bbaeaa..3794a8026e1 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -124,7 +124,7 @@ function(ginkgo_create_dpcpp_test test_name) add_executable(${test_target_name} ${test_name}.dp.cpp) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) - target_link_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) + gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_target_name} "_dpcpp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE sycl) @@ -299,7 +299,9 @@ function(ginkgo_create_common_device_test test_name) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) - target_link_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) + # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property. + configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY) + gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) endif() if(GINKGO_BUILD_OMP) diff --git a/cmake/sycl.cmake b/cmake/sycl.cmake new file mode 100644 index 00000000000..b0f4eab91f1 --- /dev/null +++ b/cmake/sycl.cmake @@ -0,0 +1,33 @@ +# IntelSYCL for dpcpp and icpx if the config is existed and cmake reaches the requirement +if(CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") + if(CMAKE_HOST_WIN32 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.25) + find_package(IntelSYCL QUIET) + elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20.5) + find_package(IntelSYCL QUIET) + endif() +endif() +# If we do not have the config from compiler, try to set components to make it work. +if(NOT COMMAND add_sycl_to_target) + if(NOT DEFINED SYCL_FLAGS) + set(SYCL_FLAGS "-fsycl" CACHE STRING "SYCL flags for compiler") + endif() +endif() + +# Provide a uniform way for those package without add_sycl_to_target +function(gko_add_sycl_to_target) + if(COMMAND add_sycl_to_target) + add_sycl_to_target(${ARGN}) + return() + endif() + # We handle them by adding SYCL_FLAGS to compile and link to the target + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SYCL + "" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + target_compile_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}") + target_link_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}") +endfunction() + diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 9d0952480be..0041b7cbd18 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -88,10 +88,12 @@ configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common. 
ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) -set(GINKGO_DPCPP_FLAGS "-fsycl") set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") -target_link_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +# all file in target ginkgo_dpcpp are necessarily compiled with sycl, so we can ignore the warning. +# If we would like to use SOURCES, please use the new files copied from GKO_UNIFIED_COMMON_SOURCES. +# Otherwise, the source's properties will be changed by add_sycl_to_target +gko_add_sycl_to_target(TARGET ginkgo_dpcpp) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt index 3231956beb7..f870ecfbf19 100644 --- a/test/solver/CMakeLists.txt +++ b/test/solver/CMakeLists.txt @@ -14,5 +14,5 @@ ginkgo_create_common_test(multigrid_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(solver DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(upper_trs_kernels DISABLE_EXECUTORS dpcpp) if(GINKGO_BUILD_DPCPP) - target_link_options(test_solver_idr_kernels_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) + gko_add_sycl_to_target(TARGET test_solver_idr_kernels_dpcpp SOURCES idr_kernels.cpp) endif() From e31912e1e02701b5812830b279112450e018ef0d Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 21 Aug 2023 22:55:52 +0200 Subject: [PATCH 376/583] rename GINKGO_BUILD_DPCPP to GINKGO_BUILD_SYCL Co-authored-by: Terry Cojean Co-authored-by: Tobias Ribizel --- .github/workflows/bot-pr-updated.yml | 2 +- .gitlab-ci.yml | 14 +++++++------- .gitlab/scripts.yml | 4 ++-- .gitlab/variables.yml | 1 + CMakeLists.txt | 16 ++++++++++------ INSTALL.md | 7 ++++--- README.md | 6 +++--- benchmark/CMakeLists.txt | 6 +++--- cmake/GinkgoConfig.cmake.in | 4 ++-- cmake/autodetect_executors.cmake | 6 +++--- cmake/create_test.cmake | 4 ++-- cmake/get_info.cmake | 4 ++-- cmake/rename.cmake | 25 +++++++++++++++++++++++++ core/device_hooks/CMakeLists.txt | 2 +- core/test/gtest/CMakeLists.txt | 2 +- doc/examples/examples.hpp.in | 2 +- test/solver/CMakeLists.txt | 2 +- 17 files changed, 69 insertions(+), 38 deletions(-) create mode 100644 cmake/rename.cmake diff --git a/.github/workflows/bot-pr-updated.yml b/.github/workflows/bot-pr-updated.yml index ae357c9db96..8554ca3b1e9 100644 --- a/.github/workflows/bot-pr-updated.yml +++ b/.github/workflows/bot-pr-updated.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest if: github.event.pull_request.author_association == 'COLLABORATOR' || github.event.pull_request.author_association == 'MEMBER' || github.event.pull_request.author_association == 'OWNER' env: - CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF steps: - name: Checkout the new code (shallow clone) uses: 
actions/checkout@v3 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e1f1eb8be3d..ffd037e45ff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -590,7 +590,7 @@ build/dpcpp/2022-1/cpu/release/shared: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" @@ -609,7 +609,7 @@ build/dpcpp/igpu/release/shared: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" @@ -626,7 +626,7 @@ build/dpcpp/igpu/release/shared: # variables: # C_COMPILER: "gcc" # CXX_COMPILER: "dpcpp" -# BUILD_DPCPP: "ON" +# BUILD_SYCL: "ON" # GKO_COMPILER_FLAGS: "-ffp-model=precise" # BUILD_TYPE: "Debug" # BUILD_SHARED_LIBS: "ON" @@ -643,7 +643,7 @@ build/dpcpp/dgpu/release/static: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OF" @@ -659,7 +659,7 @@ build/dpcpp/level_zero_dgpu/release/shared: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" @@ -674,7 +674,7 @@ build/icpx/level_zero_dgpu/release/shared: variables: C_COMPILER: "icx" CXX_COMPILER: "icpx" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" @@ -834,7 +834,7 @@ gh-pages: -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF - -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF + -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF -DGINKGO_BUILD_MPI=OFF -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON - make usr diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 15a2004bde6..504aa7dad40 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -40,7 +40,7 @@ -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} - -DGINKGO_BUILD_HIP=${BUILD_HIP} + -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} @@ -85,7 +85,7 @@ -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} - -DGINKGO_BUILD_HIP=${BUILD_HIP} + -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml index 2316b5abc71..6c75d60d069 100644 --- a/.gitlab/variables.yml +++ b/.gitlab/variables.yml @@ -11,6 +11,7 @@ BUILD_OMP: "OFF" BUILD_CUDA: "OFF" BUILD_HIP: "OFF" + BUILD_SYCL: "OFF" BUILD_HWLOC: "ON" BUILD_PAPI_SDE: "OFF" BUILD_MPI: "OFF" diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f38c1e7165..216feb658f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,9 @@ include(cmake/autodetect_executors.cmake) 
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/") include(cmake/autodetect_system_libs.cmake) +# rename helper +include(cmake/rename.cmake) + # Ginkgo configuration options option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF) option(GINKGO_BUILD_TESTS "Generate build files for unit tests" ON) @@ -21,8 +24,9 @@ option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON) option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON) option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP}) option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI}) -option(GINKGO_BUILD_DPCPP - "Compile DPC++ kernels for Intel GPUs or other DPC++ enabled hardware" ${GINKGO_HAS_DPCPP}) +gko_rename_cache(GINKGO_BUILD_DPCPP GINKGO_BUILD_SYCL BOOL) +option(GINKGO_BUILD_SYCL + "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware" ${GINKGO_HAS_SYCL}) option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) @@ -50,7 +54,7 @@ set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING if(MSVC) set(GINKGO_COMPILER_FLAGS "" CACHE STRING "Set the required CXX compiler flags, mainly used for warnings. Current default is ``") -elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") +elseif(GINKGO_BUILD_SYCL OR CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") # For now always use `-ffp-model=precise` with DPC++. This can be removed when # the floating point issues are fixed. set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING @@ -99,7 +103,7 @@ endif() if(GINKGO_BUILD_HIP) include(cmake/hip.cmake) endif() -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) include(cmake/sycl.cmake) endif() if(GINKGO_BUILD_OMP) @@ -298,7 +302,7 @@ if(MSVC) endif() endif() -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION) else() @@ -321,7 +325,7 @@ endif() if(GINKGO_BUILD_HIP) add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs endif() -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) add_subdirectory(dpcpp) # High-performance DPC++ kernels endif() if(GINKGO_BUILD_OMP) diff --git a/INSTALL.md b/INSTALL.md index b29358d4eb6..4da58010ba8 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,9 +42,10 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_CUDA={ON, OFF}` builds optimized cuda versions of the kernels (requires CUDA), default is `ON` if a CUDA compiler could be detected, `OFF` otherwise. -* `-DGINKGO_BUILD_DPCPP={ON, OFF}` builds optimized DPC++ versions of the - kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` compiler). - The default is `ON` if `CMAKE_CXX_COMPILER` is a DPC++ compiler, `OFF` +* `-DGINKGO_BUILD_DPCPP={ON, OFF}` is deprecated. Please use `GINKGO_BUILD_SYCL` instead. +* `-DGINKGO_BUILD_SYCL={ON, OFF}` builds optimized SYCL versions of the + kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` or `icpx` compiler). + The default is `ON` if `CMAKE_CXX_COMPILER` is a SYCL compiler, `OFF` otherwise. 
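As a quick sketch of how the renamed option described above is meant to be used from a stand-alone build (the source/build paths, compiler choice, and job count here are assumptions, following the intel.yml job earlier in this series):

    cmake -S . -B build \
          -DCMAKE_CXX_COMPILER=icpx \
          -DGINKGO_BUILD_SYCL=ON \
          -DGINKGO_COMPILER_FLAGS="-ffp-model=precise"
    cmake --build build -j8
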
* `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels (requires HIP), default is `ON` if an installation of HIP could be detected, diff --git a/README.md b/README.md index 102005e4a18..7f64d42599d 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Ginkgo is a high-performance linear algebra library for manycore systems, with a focus on the solution of sparse linear systems. It is implemented using modern C++ (you will need an at least C++14 compliant compiler to build it), with GPU kernels -implemented in CUDA, HIP, and DPC++. +implemented in CUDA, HIP, and DPC++(SYCL). Performance @@ -62,7 +62,7 @@ The Ginkgo HIP module has the following __additional__ requirements: * _10.1 <= CUDA < 11_ backend * if the hipFFT package is available, it is used to implement the FFT LinOps. -The Ginkgo DPC++ module has the following __additional__ requirements: +The Ginkgo DPC++(SYCL) module has the following __additional__ requirements: * _OneAPI 2021.3+_ * Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER` @@ -123,7 +123,7 @@ cmake -G "Unix Makefiles" .. && make By default, `GINKGO_BUILD_REFERENCE` is enabled. You should be able to run examples with this executor. By default, Ginkgo tries to enable the relevant modules depending on your machine environment (present of CUDA, ...). You can -also explicitly compile with the OpenMP, CUDA, HIP or DPC++ modules enabled to +also explicitly compile with the OpenMP, CUDA, HIP or DPC++(SYCL) modules enabled to run the examples with these executors. Please refer to the [Installation page](./INSTALL.md) for more details. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5cffddd51aa..347ecec7699 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -67,7 +67,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE HAS_HIP_TIMER=1) target_link_libraries("${name}" hip_timer) endif() - if (GINKGO_BUILD_DPCPP) + if (GINKGO_BUILD_SYCL) target_compile_definitions("${name}" PRIVATE HAS_DPCPP_TIMER=1) target_link_libraries("${name}" dpcpp_timer) endif() @@ -87,7 +87,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE HAS_HIP=1) target_link_libraries("${name}" hipsparse_linops_${type}) endif() - if (GINKGO_BUILD_DPCPP) + if (GINKGO_BUILD_SYCL) target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) target_link_libraries("${name}" onemkl_linops_${type}) endif() @@ -134,7 +134,7 @@ if (GINKGO_BUILD_HIP) target_link_libraries(hip_timer ginkgo) endif() -if (GINKGO_BUILD_DPCPP) +if (GINKGO_BUILD_SYCL) ginkgo_benchmark_onemkl_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) ginkgo_benchmark_onemkl_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_onemkl_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 352cf1dde8d..41f3b8f2879 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -37,7 +37,7 @@ set(GINKGO_BUILD_OMP @GINKGO_BUILD_OMP@) set(GINKGO_BUILD_CUDA @GINKGO_BUILD_CUDA@) set(GINKGO_BUILD_HIP @GINKGO_BUILD_HIP@) set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@) -set(GINKGO_BUILD_DPCPP @GINKGO_BUILD_DPCPP@) +set(GINKGO_BUILD_SYCL @GINKGO_BUILD_SYCL@) set(GINKGO_DEVEL_TOOLS @GINKGO_DEVEL_TOOLS@) set(GINKGO_BUILD_TESTS @GINKGO_BUILD_TESTS@) @@ -184,7 +184,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) endif() endif() -if((NOT 
GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_SYCL) find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") find_package(oneDPL REQUIRED HINTS "${GINKGO_DPL_ROOT}") endif() diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 86949cecda9..757262f1ea1 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -1,7 +1,7 @@ set(GINKGO_HAS_OMP OFF) set(GINKGO_HAS_MPI OFF) set(GINKGO_HAS_CUDA OFF) -set(GINKGO_HAS_DPCPP OFF) +set(GINKGO_HAS_SYCL OFF) set(GINKGO_HAS_HIP OFF) include(CheckLanguage) @@ -37,7 +37,7 @@ if (NOT DEFINED GINKGO_BUILD_HIP) endif() endif() -if (NOT DEFINED GINKGO_BUILD_DPCPP) +if (NOT DEFINED GINKGO_BUILD_DPCPP AND NOT DEFINED GINKGO_BUILD_SYCL) try_compile(GKO_CAN_COMPILE_DPCPP ${PROJECT_BINARY_DIR}/dpcpp SOURCES ${PROJECT_SOURCE_DIR}/dpcpp/test_dpcpp.dp.cpp # try_compile will pass the project CMAKE_CXX_FLAGS so passing -DCMAKE_CXX_FLAGS does not affect it. @@ -47,6 +47,6 @@ if (NOT DEFINED GINKGO_BUILD_DPCPP) CXX_STANDARD 17) if (GKO_CAN_COMPILE_DPCPP) message(STATUS "Enabling DPCPP executor") - set(GINKGO_HAS_DPCPP ON) + set(GINKGO_HAS_SYCL ON) endif() endif() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 3794a8026e1..522ad5f2ba7 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -247,7 +247,7 @@ function(ginkgo_create_common_test test_name) if(GINKGO_BUILD_CUDA) ginkgo_create_common_test_internal(${test_name} CudaExecutor cuda ${ARGN}) endif() - if(GINKGO_BUILD_DPCPP) + if(GINKGO_BUILD_SYCL) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) endif() endfunction(ginkgo_create_common_test) @@ -295,7 +295,7 @@ endfunction(ginkgo_create_common_test_internal) function(ginkgo_create_common_device_test test_name) cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) - if(GINKGO_BUILD_DPCPP) + if(GINKGO_BUILD_SYCL) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 2dd068abb50..6b904189151 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -127,7 +127,7 @@ foreach(log_type ${log_types}) ginkgo_print_module_footer(${${log_type}} "User configuration:") ginkgo_print_module_footer(${${log_type}} " Enabled modules:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") + "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL") ginkgo_print_module_footer(${${log_type}} " Enabled features:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI") @@ -167,7 +167,7 @@ IF(GINKGO_BUILD_HIP) include(hip/get_info.cmake) ENDIF() -IF(GINKGO_BUILD_DPCPP) +IF(GINKGO_BUILD_SYCL) include(dpcpp/get_info.cmake) ENDIF() diff --git a/cmake/rename.cmake b/cmake/rename.cmake new file mode 100644 index 00000000000..d9837b84a1b --- /dev/null +++ b/cmake/rename.cmake @@ -0,0 +1,25 @@ +# Only for CACHE variable (option) +macro(gko_rename_cache deprecated actual type) + if(DEFINED ${deprecated}) + if(DEFINED ${actual}) + message("actual ${actual} and deprecated 
${deprecated}") + if("${${actual}}" STREQUAL "${${deprecated}}") + # They are the same, so only throw warning + message(WARNING "${deprecated} was deprecated, please only use ${actual} instead.") + else() + # They are different + set(${deprecated}_copy ${${deprecated}}) + unset(${deprecated} CACHE) + message(FATAL_ERROR "Both ${deprecated} and ${actual} were specified, please use ${actual} instead. " + "We remove ${deprecated}:${${deprecated}_copy} and keep ${actual}:${${actual}}") + endif() + else() + # Only set `deprecated`, move it to `actual`. + message(WARNING "${deprecated} was deprecated, please use ${actual} instead. " + "We copy ${${deprecated}} to ${actual} and unset ${deprecated}.") + set(${actual} ${${deprecated}} CACHE ${type} "") + endif() + # We always unset the deprecated for easier next setup + unset(${deprecated} CACHE) + endif() +endmacro() \ No newline at end of file diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt index 901acef7797..573f87fad93 100644 --- a/core/device_hooks/CMakeLists.txt +++ b/core/device_hooks/CMakeLists.txt @@ -8,7 +8,7 @@ if(NOT GINKGO_BUILD_CUDA) ginkgo_install_library(ginkgo_cuda) endif() -if (NOT GINKGO_BUILD_DPCPP) +if (NOT GINKGO_BUILD_SYCL) add_library(ginkgo_dpcpp $ dpcpp_hooks.cpp) diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt index 6d77b663e84..cdfc67fafdf 100644 --- a/core/test/gtest/CMakeLists.txt +++ b/core/test/gtest/CMakeLists.txt @@ -27,6 +27,6 @@ endif() if (GINKGO_BUILD_HIP) add_gtest_main("_hip" "GKO_COMPILING_HIP") endif() -if (GINKGO_BUILD_DPCPP) +if (GINKGO_BUILD_SYCL) add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP") endif() diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in index a75ac59f186..7234a3ca8aa 100644 --- a/doc/examples/examples.hpp.in +++ b/doc/examples/examples.hpp.in @@ -64,7 +64,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
    *
  1. -DGINKGO_BUILD_CUDA=ON option for NVIDIA GPUs. *
  2. -DGINKGO_BUILD_HIP=ON option for AMD or NVIDIA GPUs. - *
  3. -DGINKGO_BUILD_DPCPP=ON option for Intel GPUs (and + *
  4. -DGINKGO_BUILD_SYCL=ON option for Intel GPUs (and * possibly any other platform). *
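A brief sketch of what the gko_rename_cache helper introduced above implies for existing build scripts (the option values here are only examples): passing the deprecated flag alone still configures, with a warning, while passing both flags with conflicting values stops configuration.

    # deprecated flag alone: warns and copies the value to GINKGO_BUILD_SYCL
    cmake -S . -B build -DGINKGO_BUILD_DPCPP=ON
    # conflicting values for old and new flag: configuration fails with FATAL_ERROR
    cmake -S . -B build -DGINKGO_BUILD_DPCPP=ON -DGINKGO_BUILD_SYCL=OFF
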
* diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt index f870ecfbf19..3cfe2db8ac3 100644 --- a/test/solver/CMakeLists.txt +++ b/test/solver/CMakeLists.txt @@ -13,6 +13,6 @@ ginkgo_create_common_test(lower_trs_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(multigrid_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(solver DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(upper_trs_kernels DISABLE_EXECUTORS dpcpp) -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) gko_add_sycl_to_target(TARGET test_solver_idr_kernels_dpcpp SOURCES idr_kernels.cpp) endif() From b08c49048920c2a83c9b4465bf3a55843ae515b0 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 13 Oct 2023 10:09:53 +0200 Subject: [PATCH 377/583] do not delete deprecated var from CMake, keep doc Co-authored-by: Tobias Ribizel --- CMakeLists.txt | 2 +- cmake/rename.cmake | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 216feb658f5..9eca64fa3fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON) option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON) option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP}) option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI}) -gko_rename_cache(GINKGO_BUILD_DPCPP GINKGO_BUILD_SYCL BOOL) +gko_rename_cache(GINKGO_BUILD_DPCPP GINKGO_BUILD_SYCL BOOL "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware") option(GINKGO_BUILD_SYCL "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware" ${GINKGO_HAS_SYCL}) option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) diff --git a/cmake/rename.cmake b/cmake/rename.cmake index d9837b84a1b..6c386bc24c6 100644 --- a/cmake/rename.cmake +++ b/cmake/rename.cmake @@ -1,5 +1,5 @@ # Only for CACHE variable (option) -macro(gko_rename_cache deprecated actual type) +macro(gko_rename_cache deprecated actual type doc_string) if(DEFINED ${deprecated}) if(DEFINED ${actual}) message("actual ${actual} and deprecated ${deprecated}") @@ -8,18 +8,13 @@ macro(gko_rename_cache deprecated actual type) message(WARNING "${deprecated} was deprecated, please only use ${actual} instead.") else() # They are different - set(${deprecated}_copy ${${deprecated}}) - unset(${deprecated} CACHE) - message(FATAL_ERROR "Both ${deprecated} and ${actual} were specified, please use ${actual} instead. " - "We remove ${deprecated}:${${deprecated}_copy} and keep ${actual}:${${actual}}") + message(FATAL_ERROR "Both ${deprecated} and ${actual} were specified differently, please only use ${actual} instead.") endif() else() # Only set `deprecated`, move it to `actual`. message(WARNING "${deprecated} was deprecated, please use ${actual} instead. " - "We copy ${${deprecated}} to ${actual} and unset ${deprecated}.") - set(${actual} ${${deprecated}} CACHE ${type} "") + "We copy ${${deprecated}} to ${actual}") + set(${actual} ${${deprecated}} CACHE ${type} "${doc_string}") endif() - # We always unset the deprecated for easier next setup - unset(${deprecated} CACHE) endif() endmacro() \ No newline at end of file From dfb9607836e37d1969fe4cac8e4cf026b60a00d7 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Fri, 13 Oct 2023 10:47:20 +0200 Subject: [PATCH 378/583] adapt MKL and oneDPL env --- dpcpp/CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 0041b7cbd18..9990496c98f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -1,7 +1,8 @@ -find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") -set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) -find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") -set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) +find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}" "$ENV{MKL_ROOT}") +find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}") +# use the parameter from cmake +set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE) +set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE) From 392a626fe68672ea6c32a5427fb5314681b0d760 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 15 Oct 2023 19:49:54 +0200 Subject: [PATCH 379/583] add factorization unpack functions --- core/factorization/factorization.cpp | 74 ++++++++++++++++++- .../test/factorization/factorization.cpp | 45 +++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp index d38d18ca3e5..436359a417a 100644 --- a/core/factorization/factorization.cpp +++ b/core/factorization/factorization.cpp @@ -33,18 +33,88 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include +#include "core/factorization/factorization_kernels.hpp" + + namespace gko { namespace experimental { namespace factorization { +namespace { + + +GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u, + factorization::initialize_row_ptrs_l_u); +GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u); +GKO_REGISTER_OPERATION(initialize_row_ptrs_l, + factorization::initialize_row_ptrs_l); +GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); + + +} // namespace template std::unique_ptr> -Factorization::unpack() const GKO_NOT_IMPLEMENTED; +Factorization::unpack() const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size(); + switch (this->get_storage_type()) { + case storage_type::empty: + GKO_NOT_SUPPORTED(nullptr); + case storage_type::composition: + case storage_type::symm_composition: + return this->clone(); + case storage_type::combined_lu: { + // count nonzeros + array l_row_ptrs{exec, size[0] + 1}; + array u_row_ptrs{exec, size[0] + 1}; + const auto mtx = this->get_combined(); + exec->run(make_initialize_row_ptrs_l_u(mtx.get(), l_row_ptrs.get_data(), + u_row_ptrs.get_data())); + const auto l_nnz = static_cast( + exec->copy_val_to_host(l_row_ptrs.get_const_data() + size[0])); + const auto u_nnz = static_cast( + exec->copy_val_to_host(u_row_ptrs.get_const_data() + size[0])); + // create matrices + auto l_mtx = matrix_type::create( + exec, size, array{exec, l_nnz}, + array{exec, l_nnz}, std::move(l_row_ptrs)); + auto u_mtx = matrix_type::create( + exec, size, array{exec, u_nnz}, + array{exec, u_nnz}, std::move(u_row_ptrs)); + // fill matrices + exec->run(make_initialize_l_u(mtx.get(), l_mtx.get(), u_mtx.get())); + return create_from_composition( + composition_type::create(std::move(l_mtx), std::move(u_mtx))); + } + case storage_type::symm_combined_cholesky: 
{ + // count nonzeros + array l_row_ptrs{exec, size[0] + 1}; + const auto mtx = this->get_combined(); + exec->run(make_initialize_row_ptrs_l(mtx.get(), l_row_ptrs.get_data())); + const auto l_nnz = static_cast( + exec->copy_val_to_host(l_row_ptrs.get_const_data() + size[0])); + // create matrices + auto l_mtx = matrix_type::create( + exec, size, array{exec, l_nnz}, + array{exec, l_nnz}, std::move(l_row_ptrs)); + // fill matrices + exec->run(make_initialize_l(mtx.get(), l_mtx.get(), false)); + auto u_mtx = l_mtx->conj_transpose(); + return create_from_symm_composition( + composition_type::create(std::move(l_mtx), std::move(u_mtx))); + } + case storage_type::combined_ldu: + case storage_type::symm_combined_ldl: + GKO_NOT_IMPLEMENTED; + } +} template @@ -58,7 +128,7 @@ template std::shared_ptr> Factorization::get_lower_factor() const { - switch (storage_type_) { + switch (this->get_storage_type()) { case storage_type::composition: case storage_type::symm_composition: GKO_ASSERT(factors_->get_operators().size() == 2 || diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp index d9928491771..6abfd470385 100644 --- a/reference/test/factorization/factorization.cpp +++ b/reference/test/factorization/factorization.cpp @@ -71,9 +71,13 @@ class Factorization : public ::testing::Test { : ref(gko::ReferenceExecutor::create()), lower_mtx{gko::initialize( {{1.0, 0.0, 0.0}, {3.0, 1.0, 0.0}, {1.0, 2.0, 1.0}}, ref)}, + lower_cholesky_mtx{gko::initialize( + {{1.0, 0.0, 0.0}, {3.0, -1.0, 0.0}, {1.0, 2.0, 5.0}}, ref)}, diagonal{diag_type::create(ref, 3)}, upper_mtx(gko::initialize( {{1.0, 2.0, 1.0}, {0.0, 1.0, 3.0}, {0.0, 0.0, 1.0}}, ref)), + upper_nonunit_mtx(gko::initialize( + {{1.0, 2.0, 1.0}, {0.0, -1.0, 3.0}, {0.0, 0.0, 5.0}}, ref)), combined_mtx(gko::initialize( {{1.0, 2.0, 1.0}, {3.0, -1.0, 3.0}, {1.0, 2.0, 5.0}}, ref)), input(gko::initialize({1.0, 2.0, 3.0}, ref)), @@ -88,8 +92,10 @@ class Factorization : public ::testing::Test { std::shared_ptr ref; std::shared_ptr lower_mtx; + std::shared_ptr lower_cholesky_mtx; std::shared_ptr diagonal; std::shared_ptr upper_mtx; + std::shared_ptr upper_nonunit_mtx; std::shared_ptr combined_mtx; std::shared_ptr input; std::shared_ptr output; @@ -261,6 +267,45 @@ TYPED_TEST(Factorization, CreateSymmCombinedLDLWorks) } +TYPED_TEST(Factorization, UnpackCombinedLUWorks) +{ + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::create_from_combined_lu( + this->combined_mtx->clone()); + + auto separated = fact->unpack(); + + ASSERT_EQ(separated->get_storage_type(), + gko::experimental::factorization::storage_type::composition); + ASSERT_EQ(separated->get_combined(), nullptr); + ASSERT_EQ(separated->get_diagonal(), nullptr); + GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0); + GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx, + 0.0); +} + + +TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks) +{ + using matrix_type = typename TestFixture::matrix_type; + using factorization_type = typename TestFixture::factorization_type; + auto fact = factorization_type::create_from_combined_cholesky( + this->combined_mtx->clone()); + + auto separated = fact->unpack(); + + ASSERT_EQ(separated->get_storage_type(), + gko::experimental::factorization::storage_type::symm_composition); + ASSERT_EQ(separated->get_combined(), nullptr); + ASSERT_EQ(separated->get_diagonal(), nullptr); + GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), 
this->lower_cholesky_mtx, + 0.0); + GKO_ASSERT_MTX_NEAR( + separated->get_upper_factor(), + gko::as(this->lower_cholesky_mtx->conj_transpose()), 0.0); +} + + TYPED_TEST(Factorization, ApplyFromCompositionWorks) { using factorization_type = typename TestFixture::factorization_type; From aee10ee7d1e7e7045057ebadaae52ee459721cb4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 16 Oct 2023 11:06:00 +0200 Subject: [PATCH 380/583] test composition unpacking --- .../test/factorization/factorization.cpp | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp index 6abfd470385..2f8231f1da7 100644 --- a/reference/test/factorization/factorization.cpp +++ b/reference/test/factorization/factorization.cpp @@ -306,6 +306,48 @@ TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks) } +TYPED_TEST(Factorization, UnpackCompositionWorks) +{ + using factorization_type = typename TestFixture::factorization_type; + using composition_type = typename TestFixture::composition_type; + auto fact = factorization_type::create_from_composition( + composition_type::create(this->lower_mtx, this->upper_nonunit_mtx)); + + auto separated = fact->unpack(); + + ASSERT_EQ(separated->get_storage_type(), + gko::experimental::factorization::storage_type::composition); + ASSERT_EQ(separated->get_combined(), nullptr); + ASSERT_EQ(separated->get_diagonal(), nullptr); + GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0); + GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx, + 0.0); +} + + +TYPED_TEST(Factorization, UnpackSymmCompositionWorks) +{ + using matrix_type = typename TestFixture::matrix_type; + using factorization_type = typename TestFixture::factorization_type; + using composition_type = typename TestFixture::composition_type; + auto fact = factorization_type::create_from_symm_composition( + composition_type::create(this->lower_cholesky_mtx, + this->lower_cholesky_mtx->conj_transpose())); + + auto separated = fact->unpack(); + + ASSERT_EQ(separated->get_storage_type(), + gko::experimental::factorization::storage_type::symm_composition); + ASSERT_EQ(separated->get_combined(), nullptr); + ASSERT_EQ(separated->get_diagonal(), nullptr); + GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_cholesky_mtx, + 0.0); + GKO_ASSERT_MTX_NEAR( + separated->get_upper_factor(), + gko::as(this->lower_cholesky_mtx->conj_transpose()), 0.0); +} + + TYPED_TEST(Factorization, ApplyFromCompositionWorks) { using factorization_type = typename TestFixture::factorization_type; From 671b9752cb20ca53d24fb3efbc5045dda95b1834 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 16 Oct 2023 10:42:21 +0200 Subject: [PATCH 381/583] refactor tests to use index_type template param --- reference/test/matrix/dense_kernels.cpp | 2219 +++++++---------------- 1 file changed, 630 insertions(+), 1589 deletions(-) diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 9edab89e382..3a4cfb6826b 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -97,7 +97,6 @@ class Dense : public ::testing::Test { std::unique_ptr mtx6; std::unique_ptr mtx7; std::unique_ptr mtx8; - gko::int32 invalid_index = gko::invalid_index(); std::default_random_engine rand_engine; template @@ -115,6 +114,23 @@ class Dense : public ::testing::Test { TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator); 
+template +class DenseWithIndexType + : public Dense< + typename std::tuple_element<0, decltype(ValueIndexType())>::type> { +public: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); + + TYPED_TEST(Dense, CopyRespectsStride) { using value_type = typename TestFixture::value_type; @@ -780,41 +796,9 @@ TYPED_TEST(Dense, MovesToPrecision) } -TYPED_TEST(Dense, ConvertsToCoo32) -{ - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; - auto coo_mtx = Coo::create(this->mtx4->get_executor()); - - this->mtx4->convert_to(coo_mtx); - auto v = coo_mtx->get_const_values(); - auto c = coo_mtx->get_const_col_idxs(); - auto r = coo_mtx->get_const_row_idxs(); - - ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); -} - - -TYPED_TEST(Dense, MovesToCoo32) +template +void assert_coo_eq_mtx4(const gko::matrix::Coo* coo_mtx) { - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; - auto coo_mtx = Coo::create(this->mtx4->get_executor()); - - this->mtx4->move_to(coo_mtx); auto v = coo_mtx->get_const_values(); auto c = coo_mtx->get_const_col_idxs(); auto r = coo_mtx->get_const_row_idxs(); @@ -829,121 +813,47 @@ TYPED_TEST(Dense, MovesToCoo32) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, ConvertsToCoo64) +TYPED_TEST(DenseWithIndexType, ConvertsToCoo) { - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Coo = typename gko::matrix::Coo; auto coo_mtx = Coo::create(this->mtx4->get_executor()); this->mtx4->convert_to(coo_mtx); - auto v = coo_mtx->get_const_values(); - auto c = coo_mtx->get_const_col_idxs(); - auto r = coo_mtx->get_const_row_idxs(); - ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + assert_coo_eq_mtx4(coo_mtx.get()); } -TYPED_TEST(Dense, MovesToCoo64) +TYPED_TEST(DenseWithIndexType, MovesToCoo) { - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Coo = typename gko::matrix::Coo; auto coo_mtx = Coo::create(this->mtx4->get_executor()); this->mtx4->move_to(coo_mtx); - auto v = coo_mtx->get_const_values(); - auto c = 
coo_mtx->get_const_col_idxs(); - auto r = coo_mtx->get_const_row_idxs(); - - ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); -} - -TYPED_TEST(Dense, ConvertsToCsr32) -{ - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; - auto csr_s_classical = std::make_shared(); - auto csr_s_merge = std::make_shared(); - auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - - this->mtx4->convert_to(csr_mtx_c); - this->mtx4->convert_to(csr_mtx_m); - - auto v = csr_mtx_c->get_const_values(); - auto c = csr_mtx_c->get_const_col_idxs(); - auto r = csr_mtx_c->get_const_row_ptrs(); - ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); - ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); - GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); + assert_coo_eq_mtx4(coo_mtx.get()); } -TYPED_TEST(Dense, MovesToCsr32) +template +void assert_csr_eq_mtx4(const gko::matrix::Csr* csr_mtx) { - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; - auto csr_s_classical = std::make_shared(); - auto csr_s_merge = std::make_shared(); - auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - auto mtx_clone = this->mtx4->clone(); - - this->mtx4->move_to(csr_mtx_c); - mtx_clone->move_to(csr_mtx_m); - - auto v = csr_mtx_c->get_const_values(); - auto c = csr_mtx_c->get_const_col_idxs(); - auto r = csr_mtx_c->get_const_row_ptrs(); - ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); + auto v = csr_mtx->get_const_values(); + auto c = csr_mtx->get_const_col_idxs(); + auto r = csr_mtx->get_const_row_ptrs(); + ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4); EXPECT_EQ(r[0], 0); EXPECT_EQ(r[1], 3); EXPECT_EQ(r[2], 4); @@ -951,20 +861,18 @@ TYPED_TEST(Dense, MovesToCsr32) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); - ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); - GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, ConvertsToCsr64) +TYPED_TEST(DenseWithIndexType, ConvertsToCsr) { - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename 
gko::matrix::Csr; auto csr_s_classical = std::make_shared(); auto csr_s_merge = std::make_shared(); auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); @@ -973,32 +881,18 @@ TYPED_TEST(Dense, ConvertsToCsr64) this->mtx4->convert_to(csr_mtx_c); this->mtx4->convert_to(csr_mtx_m); - auto v = csr_mtx_c->get_const_values(); - auto c = csr_mtx_c->get_const_col_idxs(); - auto r = csr_mtx_c->get_const_row_ptrs(); - ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + assert_csr_eq_mtx4(csr_mtx_c.get()); ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TYPED_TEST(Dense, MovesToCsr64) +TYPED_TEST(DenseWithIndexType, MovesToCsr) { - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename gko::matrix::Csr; auto csr_s_classical = std::make_shared(); auto csr_s_merge = std::make_shared(); auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); @@ -1008,59 +902,17 @@ TYPED_TEST(Dense, MovesToCsr64) this->mtx4->move_to(csr_mtx_c); mtx_clone->move_to(csr_mtx_m); - auto v = csr_mtx_c->get_const_values(); - auto c = csr_mtx_c->get_const_col_idxs(); - auto r = csr_mtx_c->get_const_row_ptrs(); - ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + assert_csr_eq_mtx4(csr_mtx_c.get()); ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TYPED_TEST(Dense, ConvertsToSparsityCsr32) -{ - using T = typename TestFixture::value_type; - using SparsityCsr = typename gko::matrix::SparsityCsr; - auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - - this->mtx4->convert_to(sparsity_csr_mtx); - auto v = sparsity_csr_mtx->get_const_value(); - auto c = sparsity_csr_mtx->get_const_col_idxs(); - auto r = sparsity_csr_mtx->get_const_row_ptrs(); - - ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); -} - - -TYPED_TEST(Dense, MovesToSparsityCsr32) +template +void assert_sparsity_csr_eq_mtx4( + const gko::matrix::SparsityCsr* sparsity_csr_mtx) { - using T = typename TestFixture::value_type; - using SparsityCsr = typename gko::matrix::SparsityCsr; - auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - - this->mtx4->move_to(sparsity_csr_mtx); auto v = sparsity_csr_mtx->get_const_value(); auto c = sparsity_csr_mtx->get_const_col_idxs(); auto r = 
sparsity_csr_mtx->get_const_row_ptrs(); @@ -1074,65 +926,41 @@ TYPED_TEST(Dense, MovesToSparsityCsr32) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); + EXPECT_EQ(v[0], ValueType{1.0}); } -TYPED_TEST(Dense, ConvertsToSparsityCsr64) +TYPED_TEST(DenseWithIndexType, ConvertsToSparsityCsr) { - using T = typename TestFixture::value_type; - using SparsityCsr = typename gko::matrix::SparsityCsr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using SparsityCsr = + typename gko::matrix::SparsityCsr; auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); this->mtx4->convert_to(sparsity_csr_mtx); - auto v = sparsity_csr_mtx->get_const_value(); - auto c = sparsity_csr_mtx->get_const_col_idxs(); - auto r = sparsity_csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); + assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); } -TYPED_TEST(Dense, MovesToSparsityCsr64) +TYPED_TEST(DenseWithIndexType, MovesToSparsityCsr) { - using T = typename TestFixture::value_type; - using SparsityCsr = typename gko::matrix::SparsityCsr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using SparsityCsr = + typename gko::matrix::SparsityCsr; auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); this->mtx4->move_to(sparsity_csr_mtx); - auto v = sparsity_csr_mtx->get_const_value(); - auto c = sparsity_csr_mtx->get_const_col_idxs(); - auto r = sparsity_csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); + assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); } -TYPED_TEST(Dense, ConvertsToEll32) +template +void assert_ell_eq_mtx6(const gko::matrix::Ell* ell_mtx) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor()); - - this->mtx6->convert_to(ell_mtx); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); @@ -1143,156 +971,99 @@ TYPED_TEST(Dense, ConvertsToEll32) EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); + EXPECT_EQ(c[3], gko::invalid_index()); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{0.0}); } -TYPED_TEST(Dense, MovesToEll32) +TYPED_TEST(DenseWithIndexType, ConvertsToEll) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto ell_mtx = Ell::create(this->mtx6->get_executor()); - this->mtx6->move_to(ell_mtx); - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); + this->mtx6->convert_to(ell_mtx); - 
ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); - ASSERT_EQ(ell_mtx->get_stride(), 2); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); + assert_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, ConvertsToEll64) +TYPED_TEST(DenseWithIndexType, MovesToEll) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto ell_mtx = Ell::create(this->mtx6->get_executor()); - this->mtx6->convert_to(ell_mtx); - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); + this->mtx6->move_to(ell_mtx); - ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); - ASSERT_EQ(ell_mtx->get_stride(), 2); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); + assert_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, MovesToEll64) +template +void assert_strided_ell_eq_mtx6( + const gko::matrix::Ell* ell_mtx) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor()); - - this->mtx6->move_to(ell_mtx); + constexpr auto invalid_index = gko::invalid_index(); auto v = ell_mtx->get_const_values(); auto c = ell_mtx->get_const_col_idxs(); ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); - ASSERT_EQ(ell_mtx->get_stride(), 2); + ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6); + ASSERT_EQ(ell_mtx->get_stride(), 3); EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); + EXPECT_EQ(c[2], invalid_index); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(c[4], invalid_index); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{0.0}); + EXPECT_EQ(v[3], ValueType{2.0}); + EXPECT_EQ(v[4], ValueType{0.0}); + EXPECT_EQ(v[5], ValueType{0.0}); } -TYPED_TEST(Dense, ConvertsToEllWithStride) +TYPED_TEST(DenseWithIndexType, ConvertsToEllWithStride) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); this->mtx6->convert_to(ell_mtx); - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); - ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6); - ASSERT_EQ(ell_mtx->get_stride(), 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); - EXPECT_EQ(c[3], 1); - 
EXPECT_EQ(c[4], this->invalid_index); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{0.0}); - EXPECT_EQ(v[3], T{2.0}); - EXPECT_EQ(v[4], T{0.0}); - EXPECT_EQ(v[5], T{0.0}); + assert_strided_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, MovesToEllWithStride) +TYPED_TEST(DenseWithIndexType, MovesToEllWithStride) { - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto ell_mtx = Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); this->mtx6->move_to(ell_mtx); - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); - ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6); - ASSERT_EQ(ell_mtx->get_stride(), 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], this->invalid_index); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{0.0}); - EXPECT_EQ(v[3], T{2.0}); - EXPECT_EQ(v[4], T{0.0}); - EXPECT_EQ(v[5], T{0.0}); + assert_strided_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, MovesToHybridAutomatically32) +template +void assert_hybrid_auto_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - - this->mtx4->move_to(hybrid_mtx); auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -1312,20 +1083,43 @@ TYPED_TEST(Dense, MovesToHybridAutomatically32) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, ConvertsToHybridAutomatically32) +TYPED_TEST(DenseWithIndexType, MovesToHybridAutomatically) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); + + this->mtx4->move_to(hybrid_mtx); + + assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); +} + + +TYPED_TEST(DenseWithIndexType, ConvertsToHybridAutomatically) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); this->mtx4->convert_to(hybrid_mtx); + + assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); +} + + +template +void assert_hybrid_strided_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) +{ auto v = hybrid_mtx->get_const_coo_values(); auto c = hybrid_mtx->get_const_coo_col_idxs(); auto r = hybrid_mtx->get_const_coo_row_idxs(); @@ -1336,7 +1130,7 @@ TYPED_TEST(Dense, ConvertsToHybridAutomatically32) ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); 
ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); EXPECT_EQ(n, 0); - EXPECT_EQ(p, 2); + EXPECT_EQ(p, 3); EXPECT_EQ(r[0], 0); EXPECT_EQ(r[1], 0); EXPECT_EQ(r[2], 0); @@ -1345,156 +1139,46 @@ TYPED_TEST(Dense, ConvertsToHybridAutomatically32) EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 2); EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, MovesToHybridAutomatically64) +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAutomatically) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - - this->mtx4->move_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 2); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); -} - - -TYPED_TEST(Dense, ConvertsToHybridAutomatically64) -{ - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - - this->mtx4->convert_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 2); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); -} - - -TYPED_TEST(Dense, MovesToHybridWithStrideAutomatically) -{ - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); this->mtx4->move_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 3); - EXPECT_EQ(r[0], 0); - 
EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, ConvertsToHybridWithStrideAutomatically) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAutomatically) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); this->mtx4->convert_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 3); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{3.0}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{5.0}); + assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) +template +void assert_hybrid_limited_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, - std::make_shared(2)); - - this->mtx4->move_to(hybrid_mtx); + constexpr auto invalid_index = gko::invalid_index(); auto v = hybrid_mtx->get_const_ell_values(); auto c = hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -1507,68 +1191,56 @@ TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2) EXPECT_EQ(p, 3); EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); + EXPECT_EQ(c[2], invalid_index); EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], this->invalid_index); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{5.0}); - EXPECT_EQ(v[2], T{0.0}); - EXPECT_EQ(v[3], T{3.0}); - EXPECT_EQ(v[4], T{0.0}); - EXPECT_EQ(v[5], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); + EXPECT_EQ(c[4], invalid_index); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{5.0}); + EXPECT_EQ(v[2], ValueType{0.0}); + EXPECT_EQ(v[3], ValueType{3.0}); + EXPECT_EQ(v[4], ValueType{0.0}); + EXPECT_EQ(v[5], ValueType{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], ValueType{2.0}); EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); } -TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2) +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAndCooLengthByColumns2) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = 
typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, std::make_shared(2)); - this->mtx4->convert_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_ell_values(); - auto c = hybrid_mtx->get_const_ell_col_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); + this->mtx4->move_to(hybrid_mtx); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1); - EXPECT_EQ(n, 2); - EXPECT_EQ(p, 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], this->invalid_index); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{5.0}); - EXPECT_EQ(v[2], T{0.0}); - EXPECT_EQ(v[3], T{3.0}); - EXPECT_EQ(v[4], T{0.0}); - EXPECT_EQ(v[5], T{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0}); + assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAndCooLengthByColumns2) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, - std::make_shared(0.4)); + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, + std::make_shared(2)); - this->mtx4->move_to(hybrid_mtx); + this->mtx4->convert_to(hybrid_mtx); + + assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); +} + + +template +void assert_hybrid_percent_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) +{ auto v = hybrid_mtx->get_const_ell_values(); auto c = hybrid_mtx->get_const_ell_col_idxs(); auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); @@ -1583,13 +1255,13 @@ TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40) EXPECT_EQ(p, 3); EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{5.0}); - EXPECT_EQ(v[2], T{0.0}); + EXPECT_EQ(c[2], gko::invalid_index()); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{5.0}); + EXPECT_EQ(v[2], ValueType{0.0}); ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], T{3.0}); - EXPECT_EQ(coo_v[1], T{2.0}); + EXPECT_EQ(coo_v[0], ValueType{3.0}); + EXPECT_EQ(coo_v[1], ValueType{2.0}); EXPECT_EQ(coo_c[0], 1); EXPECT_EQ(coo_c[1], 2); EXPECT_EQ(coo_r[0], 0); @@ -1597,87 +1269,41 @@ TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40) } -TYPED_TEST(Dense, ConvertsToHybridWithStrideByPercent40) +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideByPercent40) { - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, std::make_shared(0.4)); - this->mtx4->convert_to(hybrid_mtx); - auto v = hybrid_mtx->get_const_ell_values(); - 
auto c = hybrid_mtx->get_const_ell_col_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - auto coo_v = hybrid_mtx->get_const_coo_values(); - auto coo_c = hybrid_mtx->get_const_coo_col_idxs(); - auto coo_r = hybrid_mtx->get_const_coo_row_idxs(); + this->mtx4->move_to(hybrid_mtx); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3); - EXPECT_EQ(n, 1); - EXPECT_EQ(p, 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{5.0}); - EXPECT_EQ(v[2], T{0.0}); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], T{3.0}); - EXPECT_EQ(coo_v[1], T{2.0}); - EXPECT_EQ(coo_c[0], 1); - EXPECT_EQ(coo_c[1], 2); - EXPECT_EQ(coo_r[0], 0); - EXPECT_EQ(coo_r[1], 0); + assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, ConvertsToSellp32) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideByPercent40) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, + std::make_shared(0.4)); - this->mtx7->convert_to(sellp_mtx); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = sellp_mtx->get_const_slice_lengths(); + this->mtx4->convert_to(hybrid_mtx); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 3); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), - 3 * gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_stride_factor(), - gko::matrix::default_stride_factor); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 3); - EXPECT_EQ(l[0], 3); + assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, MovesToSellp32) +template +void assert_sellp_eq_mtx7( + const gko::matrix::Sellp* sellp_mtx) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - - this->mtx7->move_to(sellp_mtx); + constexpr auto invalid_index = gko::invalid_index(); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -1693,103 +1319,52 @@ TYPED_TEST(Dense, MovesToSellp32) EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); EXPECT_EQ(c[gko::matrix::default_slice_size], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index); + EXPECT_EQ(c[gko::matrix::default_slice_size + 
1], invalid_index); EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], ValueType{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], ValueType{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], ValueType{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], ValueType{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 3); EXPECT_EQ(l[0], 3); } -TYPED_TEST(Dense, ConvertsToSellp64) +TYPED_TEST(DenseWithIndexType, ConvertsToSellp) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); this->mtx7->convert_to(sellp_mtx); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = sellp_mtx->get_const_slice_lengths(); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 3); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), - 3 * gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_stride_factor(), - gko::matrix::default_stride_factor); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 3); - EXPECT_EQ(l[0], 3); + assert_sellp_eq_mtx7(sellp_mtx.get()); } -TYPED_TEST(Dense, MovesToSellp64) +TYPED_TEST(DenseWithIndexType, MovesToSellp) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); this->mtx7->move_to(sellp_mtx); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = sellp_mtx->get_const_slice_lengths(); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 3); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), - 3 * gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_stride_factor(), - gko::matrix::default_stride_factor); - 
EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0}); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 3); - EXPECT_EQ(l[0], 3); + assert_sellp_eq_mtx7(sellp_mtx.get()); } -TYPED_TEST(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor) +template +void assert_sellp_strided_eq_mtx7( + const gko::matrix::Sellp* sellp_mtx) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = - Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); - - this->mtx7->convert_to(sellp_mtx); + constexpr auto invalid_index = gko::invalid_index(); auto v = sellp_mtx->get_const_values(); auto c = sellp_mtx->get_const_col_idxs(); auto s = sellp_mtx->get_const_slice_sets(); @@ -1803,70 +1378,59 @@ TYPED_TEST(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor) EXPECT_EQ(c[0], 0); EXPECT_EQ(c[1], 1); EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); + EXPECT_EQ(c[3], invalid_index); EXPECT_EQ(c[4], 2); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(c[6], this->invalid_index); - EXPECT_EQ(c[7], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); - EXPECT_EQ(v[4], T{3.0}); - EXPECT_EQ(v[5], T{0.0}); - EXPECT_EQ(v[6], T{0.0}); - EXPECT_EQ(v[7], T{0.0}); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(c[6], invalid_index); + EXPECT_EQ(c[7], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{0.0}); + EXPECT_EQ(v[4], ValueType{3.0}); + EXPECT_EQ(v[5], ValueType{0.0}); + EXPECT_EQ(v[6], ValueType{0.0}); + EXPECT_EQ(v[7], ValueType{0.0}); EXPECT_EQ(s[0], 0); EXPECT_EQ(s[1], 4); EXPECT_EQ(l[0], 4); } -TYPED_TEST(Dense, MovesToSellpWithSliceSizeAndStrideFactor) +TYPED_TEST(DenseWithIndexType, ConvertsToSellpWithSliceSizeAndStrideFactor) { - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = + Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); + + this->mtx7->convert_to(sellp_mtx); + + assert_sellp_strided_eq_mtx7(sellp_mtx.get()); +} + + +TYPED_TEST(DenseWithIndexType, MovesToSellpWithSliceSizeAndStrideFactor) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; auto sellp_mtx = Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); this->mtx7->move_to(sellp_mtx); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = sellp_mtx->get_const_slice_lengths(); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 4); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8); - 
ASSERT_EQ(sellp_mtx->get_slice_size(), 2); - ASSERT_EQ(sellp_mtx->get_stride_factor(), 2); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], this->invalid_index); - EXPECT_EQ(c[4], 2); - EXPECT_EQ(c[5], this->invalid_index); - EXPECT_EQ(c[6], this->invalid_index); - EXPECT_EQ(c[7], this->invalid_index); - EXPECT_EQ(v[0], T{1.0}); - EXPECT_EQ(v[1], T{1.5}); - EXPECT_EQ(v[2], T{2.0}); - EXPECT_EQ(v[3], T{0.0}); - EXPECT_EQ(v[4], T{3.0}); - EXPECT_EQ(v[5], T{0.0}); - EXPECT_EQ(v[6], T{0.0}); - EXPECT_EQ(v[7], T{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 4); - EXPECT_EQ(l[0], 4); + assert_sellp_strided_eq_mtx7(sellp_mtx.get()); } -TYPED_TEST(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice) +TYPED_TEST(DenseWithIndexType, ConvertsToAndFromSellpWithMoreThanOneSlice) { - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; using Mtx = typename TestFixture::Mtx; - using Sellp = typename gko::matrix::Sellp; + using Sellp = typename gko::matrix::Sellp; auto x = this->template gen_mtx(65, 25); auto sellp_mtx = Sellp::create(this->exec); @@ -1908,11 +1472,12 @@ TYPED_TEST(Dense, MovesEmptyToPrecision) } -TYPED_TEST(Dense, ConvertsEmptyToCoo) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToCoo) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Coo = typename gko::matrix::Coo; auto empty = Dense::create(this->exec); auto res = Coo::create(this->exec); @@ -1923,11 +1488,12 @@ TYPED_TEST(Dense, ConvertsEmptyToCoo) } -TYPED_TEST(Dense, MovesEmptyToCoo) +TYPED_TEST(DenseWithIndexType, MovesEmptyToCoo) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Coo = typename gko::matrix::Coo; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Coo = typename gko::matrix::Coo; auto empty = Dense::create(this->exec); auto res = Coo::create(this->exec); @@ -1938,11 +1504,12 @@ TYPED_TEST(Dense, MovesEmptyToCoo) } -TYPED_TEST(Dense, ConvertsEmptyMatrixToCsr) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyMatrixToCsr) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename gko::matrix::Csr; auto empty = Dense::create(this->exec); auto res = Csr::create(this->exec); @@ -1954,11 +1521,12 @@ TYPED_TEST(Dense, ConvertsEmptyMatrixToCsr) } -TYPED_TEST(Dense, MovesEmptyMatrixToCsr) +TYPED_TEST(DenseWithIndexType, MovesEmptyMatrixToCsr) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Csr = typename gko::matrix::Csr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Csr = typename gko::matrix::Csr; auto empty = Dense::create(this->exec); auto res = Csr::create(this->exec); @@ -1970,11 +1538,13 @@ TYPED_TEST(Dense, MovesEmptyMatrixToCsr) } -TYPED_TEST(Dense, ConvertsEmptyToSparsityCsr) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSparsityCsr) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using SparsityCsr = typename 
gko::matrix::SparsityCsr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using SparsityCsr = + typename gko::matrix::SparsityCsr; auto empty = Dense::create(this->exec); auto res = SparsityCsr::create(this->exec); @@ -1986,11 +1556,13 @@ TYPED_TEST(Dense, ConvertsEmptyToSparsityCsr) } -TYPED_TEST(Dense, MovesEmptyToSparsityCsr) +TYPED_TEST(DenseWithIndexType, MovesEmptyToSparsityCsr) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using SparsityCsr = typename gko::matrix::SparsityCsr; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using SparsityCsr = + typename gko::matrix::SparsityCsr; auto empty = Dense::create(this->exec); auto res = SparsityCsr::create(this->exec); @@ -2002,11 +1574,12 @@ TYPED_TEST(Dense, MovesEmptyToSparsityCsr) } -TYPED_TEST(Dense, ConvertsEmptyToEll) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToEll) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto empty = Dense::create(this->exec); auto res = Ell::create(this->exec); @@ -2017,11 +1590,12 @@ TYPED_TEST(Dense, ConvertsEmptyToEll) } -TYPED_TEST(Dense, MovesEmptyToEll) +TYPED_TEST(DenseWithIndexType, MovesEmptyToEll) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Ell = typename gko::matrix::Ell; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; auto empty = Dense::create(this->exec); auto res = Ell::create(this->exec); @@ -2032,11 +1606,12 @@ TYPED_TEST(Dense, MovesEmptyToEll) } -TYPED_TEST(Dense, ConvertsEmptyToHybrid) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToHybrid) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto empty = Dense::create(this->exec); auto res = Hybrid::create(this->exec); @@ -2047,11 +1622,12 @@ TYPED_TEST(Dense, ConvertsEmptyToHybrid) } -TYPED_TEST(Dense, MovesEmptyToHybrid) +TYPED_TEST(DenseWithIndexType, MovesEmptyToHybrid) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Hybrid = typename gko::matrix::Hybrid; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; auto empty = Dense::create(this->exec); auto res = Hybrid::create(this->exec); @@ -2062,11 +1638,12 @@ TYPED_TEST(Dense, MovesEmptyToHybrid) } -TYPED_TEST(Dense, ConvertsEmptyToSellp) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSellp) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; auto empty = Dense::create(this->exec); auto res = Sellp::create(this->exec); @@ -2078,11 +1655,12 @@ TYPED_TEST(Dense, ConvertsEmptyToSellp) } -TYPED_TEST(Dense, 
MovesEmptyToSellp) +TYPED_TEST(DenseWithIndexType, MovesEmptyToSellp) { using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using Sellp = typename gko::matrix::Sellp; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Sellp = typename gko::matrix::Sellp; auto empty = Dense::create(this->exec); auto res = Sellp::create(this->exec); @@ -2159,961 +1737,400 @@ TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDense) auto trans = Mtx::create(this->exec, gko::transpose(this->mtx4->get_size())); - this->mtx4->transpose(trans); - - GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); -} - - -TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5); - - this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans); - - GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0); - ASSERT_EQ(trans->get_stride(), 5); -} - - -TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) -{ - using Mtx = typename TestFixture::Mtx; - - ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixCanGatherRows) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - - auto row_collection = this->mtx5->row_gather(&permute_idxs); - - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixCanGatherRowsIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3}); - - this->mtx5->row_gather(&permute_idxs, row_collection); - - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixCanGatherRowsIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {1, 3}) - ->row_gather(&permute_idxs, row_collection); - - GKO_ASSERT_MTX_NEAR(row_collection, l({{2.0, 4.5}, {-1.0, -0.5}}), 0.0); - ASSERT_EQ(row_collection->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareSubmatrixCanGatherRowsIntoMixedDense) -{ - using Mtx = typename TestFixture::Mtx; - using MixedMtx = typename TestFixture::MixedMtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array gather_index{exec, {1, 0, 1}}; - auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4); - - this->mtx4->row_gather(&gather_index, row_collection); - - GKO_ASSERT_MTX_NEAR( - row_collection, - l( - {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}), - 0.0); -} - - -TYPED_TEST(Dense, NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense) -{ - using Mtx = typename TestFixture::Mtx; - using MixedMtx = typename TestFixture::MixedMtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array gather_index{exec, {1, 0, 1}}; - auto row_collection = gko::initialize( - 
{{1.0, 0.5, -1.0}, {-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec); - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({2.0}, exec); - - this->mtx4->row_gather(alpha, &gather_index, beta, row_collection); - - GKO_ASSERT_MTX_NEAR( - row_collection, - l( - {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}), - 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - - ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixCanGatherRows64) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - - auto row_collection = this->mtx5->row_gather(&permute_idxs); - - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixCanGatherRowsIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3}); - - this->mtx5->row_gather(&permute_idxs, row_collection); - - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixCanGatherRowsIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {1, 3}) - ->row_gather(&permute_idxs, row_collection); - - GKO_ASSERT_MTX_NEAR(row_collection, l({{2.0, 4.5}, {-1.0, -0.5}}), 0.0); - ASSERT_EQ(row_collection->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareSubmatrixCanGatherRowsIntoMixedDense64) -{ - using Mtx = typename TestFixture::Mtx; - using MixedMtx = typename TestFixture::MixedMtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array gather_index{exec, {1, 0, 1}}; - auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4); - - this->mtx4->row_gather(&gather_index, row_collection); - - GKO_ASSERT_MTX_NEAR( - row_collection, - l( - {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}), - 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions64) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - - ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixIsPermutable) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - auto permuted = gko::as(this->mtx5->permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixIsPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 
2, 0}}; - auto permuted = Mtx::create(exec, this->mtx5->get_size()); - - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - this->mtx5->permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixIsPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - - auto ref_permuted = - gko::as(gko::as(mtx->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - mtx->permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); - ASSERT_EQ(permuted->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareMatrixPermuteIntoDenseFails) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - - ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongDimensions) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixIsInversePermutable) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto ref_permuted = gko::as( - gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - auto permuted = gko::as(this->mtx5->inverse_permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixIsInversePermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto permuted = Mtx::create(exec, this->mtx5->get_size()); - - auto ref_permuted = gko::as( - gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - this->mtx5->inverse_permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixIsInversePermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - - auto ref_permuted = - gko::as(gko::as(mtx->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - mtx->inverse_permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); - ASSERT_EQ(permuted->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareMatrixInversePermuteIntoDenseFails) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW( - 
this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, - SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {0, 1}}; - - ASSERT_THROW( - this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixIsPermutable64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - auto permuted = gko::as(this->mtx5->permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixIsPermutableIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto permuted = Mtx::create(exec, this->mtx5->get_size()); - - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - this->mtx5->permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixIsPermutableIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - - auto ref_permuted = - gko::as(gko::as(mtx->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - mtx->permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); - ASSERT_EQ(permuted->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareMatrixPermuteIntoDenseFails64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - - ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongDimensions64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixIsInversePermutable64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto ref_permuted = gko::as( - gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - auto permuted = gko::as(this->mtx5->inverse_permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 
0.0); -} - - -TYPED_TEST(Dense, SquareMatrixIsInversePermutableIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto permuted = Mtx::create(exec, this->mtx5->get_size()); - - auto ref_permuted = gko::as( - gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - this->mtx5->inverse_permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixIsInversePermutableIntoDense64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - - auto ref_permuted = - gko::as(gko::as(mtx->inverse_row_permute(&permute_idxs)) - ->inverse_column_permute(&permute_idxs)); - mtx->inverse_permute(&permute_idxs, permuted); - - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); - ASSERT_EQ(permuted->get_stride(), 4); -} - - -TYPED_TEST(Dense, NonSquareMatrixInversePermuteIntoDenseFails64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW( - this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, - SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - - ASSERT_THROW( - this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions64) -{ - using Mtx = typename TestFixture::Mtx; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, SquareMatrixIsRowPermutable) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto row_permute = gko::as(this->mtx5->row_permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR( - row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - - auto row_permute = gko::as(this->mtx4->row_permute(&permute_idxs)); - - GKO_ASSERT_MTX_NEAR(row_permute, l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), - 0.0); -} - - -TYPED_TEST(Dense, SquareMatrixIsRowPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - - this->mtx5->row_permute(&permute_idxs, row_permute); - - GKO_ASSERT_MTX_NEAR( - row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); -} - - -TYPED_TEST(Dense, SquareSubmatrixIsRowPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array 
permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->row_permute(&permute_idxs, row_permute); + this->mtx4->transpose(trans); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5); - ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute), - gko::ValueMismatch); + this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans); + + GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0); + ASSERT_EQ(trans->get_stride(), 5); } -TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)), + ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)), gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsColPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRows) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = gko::as(this->mtx5->column_permute(&permute_idxs)); + auto row_collection = this->mtx5->row_gather(&permute_idxs); - GKO_ASSERT_MTX_NEAR( - c_permute, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), - 0.0); + GKO_ASSERT_MTX_NEAR(row_collection, + l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), + 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsColPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRowsIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3}); - auto c_permute = gko::as(this->mtx4->column_permute(&permute_idxs)); + this->mtx5->row_gather(&permute_idxs, row_collection); - GKO_ASSERT_MTX_NEAR(c_permute, l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), + GKO_ASSERT_MTX_NEAR(row_collection, + l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixIsColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixCanGatherRowsIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - 
gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {1, 0}}; + auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - this->mtx5->column_permute(&permute_idxs, c_permute); + this->mtx5->create_submatrix({0, 2}, {1, 3}) + ->row_gather(&permute_idxs, row_collection); - GKO_ASSERT_MTX_NEAR( - c_permute, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), - 0.0); + GKO_ASSERT_MTX_NEAR(row_collection, + l({{2.0, 4.5}, {-1.0, -0.5}}), 0.0); + ASSERT_EQ(row_collection->get_stride(), 4); } -TYPED_TEST(Dense, SquareSubmatrixIsColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, NonSquareSubmatrixCanGatherRowsIntoMixedDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + using MixedMtx = typename TestFixture::MixedMtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array gather_index{exec, {1, 0, 1}}; + auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4); - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); + this->mtx4->row_gather(&gather_index, row_collection); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); + GKO_ASSERT_MTX_NEAR( + row_collection, + l( + {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}), + 0.0); } -TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, + NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + using MixedMtx = typename TestFixture::MixedMtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array gather_index{exec, {1, 0, 1}}; + auto row_collection = gko::initialize( + {{1.0, 0.5, -1.0}, {-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec); + auto alpha = gko::initialize({1.0}, exec); + auto beta = gko::initialize({2.0}, exec); - ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute), - gko::ValueMismatch); + this->mtx4->row_gather(alpha, &gather_index, beta, row_collection); + + GKO_ASSERT_MTX_NEAR( + row_collection, + l( + {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}), + 0.0); } -TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, + SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 0}}; - ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)), + ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)), gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutable) { 
using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; - auto inverse_row_permute = - gko::as(this->mtx5->inverse_row_permute(&inverse_permute_idxs)); + auto ref_permuted = + gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR( - inverse_row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 0}}; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - auto inverse_row_permute = - gko::as(this->mtx4->inverse_row_permute(&inverse_permute_idxs)); + auto ref_permuted = + gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + this->mtx5->permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(inverse_row_permute, - l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {1, 0}}; + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - this->mtx5->inverse_row_permute(&permute_idxs, row_permute); + auto ref_permuted = + gko::as(gko::as(mtx->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + mtx->permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR( - row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + ASSERT_EQ(permuted->get_stride(), 4); } -TYPED_TEST(Dense, SquareSubmatrixIsInverseRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixPermuteIntoDenseFails) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->inverse_row_permute(&permute_idxs, row_permute); + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()), + gko::DimensionMismatch); } -TYPED_TEST(Dense, - SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize) 
+TYPED_TEST(DenseWithIndexType, + SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {1, 2}}; - ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()), gko::ValueMismatch); } -TYPED_TEST(Dense, SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, + SquareMatrixPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_THROW( - this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; - auto inverse_c_permute = - gko::as(this->mtx5->inverse_column_permute(&inverse_permute_idxs)); + auto ref_permuted = gko::as( + gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->inverse_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR( - inverse_c_permute, - l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - auto inverse_c_permute = - gko::as(this->mtx4->inverse_column_permute(&inverse_permute_idxs)); + auto ref_permuted = gko::as( + gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + this->mtx5->inverse_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(inverse_c_permute, - l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, SquareMatrixIsInverseColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInversePermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {1, 0}}; + auto permuted = Mtx::create(exec, 
gko::dim<2>{2, 2}, 4); + auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - this->mtx5->inverse_column_permute(&permute_idxs, c_permute); + auto ref_permuted = + gko::as(gko::as(mtx->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + mtx->inverse_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR( - c_permute, l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), - 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + ASSERT_EQ(permuted->get_stride(), 4); } -TYPED_TEST(Dense, SquareSubmatrixIsInverseColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixInversePermuteIntoDenseFails) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); + ASSERT_THROW( + this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()), + gko::DimensionMismatch); } -TYPED_TEST(Dense, - SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {0, 1}}; - ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute), - gko::ValueMismatch); + ASSERT_THROW( + this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()), + gko::ValueMismatch); } -TYPED_TEST(Dense, SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_THROW( - this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsRowPermutable64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto row_permute = gko::as(this->mtx5->row_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); + l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), + 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable64) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsRowPermutable) { using Mtx = 
typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + gko::array permute_idxs{exec, {1, 0}}; auto row_permute = gko::as(this->mtx4->row_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR(row_permute, l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), - 0.0); + GKO_ASSERT_MTX_NEAR(row_permute, + l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixIsRowPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); this->mtx5->row_permute(&permute_idxs, row_permute); GKO_ASSERT_MTX_NEAR( row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); + l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), + 0.0); } -TYPED_TEST(Dense, SquareSubmatrixIsRowPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsRowPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + gko::array permute_idxs{exec, {1, 0}}; auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) ->row_permute(&permute_idxs, row_permute); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + 0.0); ASSERT_EQ(row_permute->get_stride(), 4); } -TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; + gko::array permute_idxs{exec, {1, 2}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute), @@ -3121,85 +2138,94 @@ TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize64) } -TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)), gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsColPermutable64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type 
= typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto c_permute = gko::as(this->mtx5->column_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( - c_permute, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), + c_permute, + l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsColPermutable64) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsColPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto c_permute = gko::as(this->mtx4->column_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR(c_permute, l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), - 0.0); + GKO_ASSERT_MTX_NEAR(c_permute, + l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixIsColPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto c_permute = Mtx::create(exec, this->mtx5->get_size()); this->mtx5->column_permute(&permute_idxs, c_permute); GKO_ASSERT_MTX_NEAR( - c_permute, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), + c_permute, + l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } -TYPED_TEST(Dense, SquareSubmatrixIsColPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsColPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + gko::array permute_idxs{exec, {1, 0}}; auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) ->column_permute(&permute_idxs, c_permute); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + 0.0); ASSERT_EQ(c_permute->get_stride(), 4); } -TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; + gko::array permute_idxs{exec, {1, 2}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute), @@ -3207,88 +2233,96 @@ TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize64) } -TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongDimensions64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixColPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using 
index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)), gko::DimensionMismatch); } -TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto inverse_row_permute = + auto inv_row_permute = gko::as(this->mtx5->inverse_row_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR( - inverse_row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); + inv_row_permute, + l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), + 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable64) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseRowPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 0}}; + gko::array inverse_permute_idxs{exec, {1, 0}}; auto inverse_row_permute = gko::as(this->mtx4->inverse_row_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR(inverse_row_permute, - l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); + l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); this->mtx5->inverse_row_permute(&permute_idxs, row_permute); GKO_ASSERT_MTX_NEAR( row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); + l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), + 0.0); } -TYPED_TEST(Dense, SquareSubmatrixIsInverseRowPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseRowPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + gko::array permute_idxs{exec, {1, 0}}; auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) ->inverse_row_permute(&permute_idxs, row_permute); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + 0.0); ASSERT_EQ(row_permute->get_stride(), 4); } -TYPED_TEST(Dense, - SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename 
TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; + gko::array permute_idxs{exec, {1, 2}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute), @@ -3296,13 +2330,13 @@ TYPED_TEST(Dense, } -TYPED_TEST(Dense, - SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; ASSERT_THROW( this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)), @@ -3310,76 +2344,83 @@ TYPED_TEST(Dense, } -TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto inverse_c_permute = + auto inv_c_permute = gko::as(this->mtx5->inverse_column_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR( - inverse_c_permute, - l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); + inv_c_permute, + l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), + 0.0); } -TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable64) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseColPermutable) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; auto inverse_c_permute = gko::as(this->mtx4->inverse_column_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR(inverse_c_permute, - l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); + l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); } -TYPED_TEST(Dense, SquareMatrixIsInverseColPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; auto c_permute = Mtx::create(exec, this->mtx5->get_size()); this->mtx5->inverse_column_permute(&permute_idxs, c_permute); GKO_ASSERT_MTX_NEAR( - c_permute, l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), + c_permute, + l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); } -TYPED_TEST(Dense, SquareSubmatrixIsInverseColPermutableIntoDense64) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseColPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using value_type = typename TestFixture::value_type; + using index_type 
= typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + gko::array permute_idxs{exec, {1, 0}}; auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) ->column_permute(&permute_idxs, c_permute); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + 0.0); ASSERT_EQ(c_permute->get_stride(), 4); } -TYPED_TEST(Dense, - SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; + gko::array permute_idxs{exec, {1, 2}}; auto row_permute = Mtx::create(exec, this->mtx5->get_size()); ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute), @@ -3387,13 +2428,13 @@ TYPED_TEST(Dense, } -TYPED_TEST(Dense, - SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions64) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 2, 0}}; ASSERT_THROW( this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)), From 6fbcd9f744f95adea22ffef60c162078e16aca9c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 16 Oct 2023 10:43:50 +0200 Subject: [PATCH 382/583] group tests by fixture --- reference/test/matrix/dense_kernels.cpp | 2582 +++++++++++------------ 1 file changed, 1291 insertions(+), 1291 deletions(-) diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 3a4cfb6826b..56f082243e6 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -114,23 +114,6 @@ class Dense : public ::testing::Test { TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator); -template -class DenseWithIndexType - : public Dense< - typename std::tuple_element<0, decltype(ValueIndexType())>::type> { -public: - using value_type = - typename std::tuple_element<0, decltype(ValueIndexType())>::type; - using index_type = - typename std::tuple_element<1, decltype(ValueIndexType())>::type; - - index_type invalid_index = gko::invalid_index(); -}; - -TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, - PairTypenameNameGenerator); - - TYPED_TEST(Dense, CopyRespectsStride) { using value_type = typename TestFixture::value_type; @@ -796,2086 +779,2103 @@ TYPED_TEST(Dense, MovesToPrecision) } -template -void assert_coo_eq_mtx4(const gko::matrix::Coo* coo_mtx) +TYPED_TEST(Dense, SquareMatrixIsTransposable) { - auto v = coo_mtx->get_const_values(); - auto c = coo_mtx->get_const_col_idxs(); - auto r = coo_mtx->get_const_row_idxs(); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = gko::as(this->mtx5->transpose()); - ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - 
EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{3.0}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{5.0}); + GKO_ASSERT_MTX_NEAR( + trans, l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, ConvertsToCoo) +TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Coo = typename gko::matrix::Coo; - auto coo_mtx = Coo::create(this->mtx4->get_executor()); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = Mtx::create(this->exec, this->mtx5->get_size()); - this->mtx4->convert_to(coo_mtx); + this->mtx5->transpose(trans); - assert_coo_eq_mtx4(coo_mtx.get()); + GKO_ASSERT_MTX_NEAR( + trans, l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, MovesToCoo) +TYPED_TEST(Dense, SquareSubmatrixIsTransposableIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Coo = typename gko::matrix::Coo; - auto coo_mtx = Coo::create(this->mtx4->get_executor()); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = Mtx::create(this->exec, gko::dim<2>{2, 2}, 4); - this->mtx4->move_to(coo_mtx); + this->mtx5->create_submatrix({0, 2}, {0, 2})->transpose(trans); - assert_coo_eq_mtx4(coo_mtx.get()); + GKO_ASSERT_MTX_NEAR(trans, l({{1.0, -2.0}, {-1.0, 2.0}}), 0.0); + ASSERT_EQ(trans->get_stride(), 4); } -template -void assert_csr_eq_mtx4(const gko::matrix::Csr* csr_mtx) +TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) { - auto v = csr_mtx->get_const_values(); - auto c = csr_mtx->get_const_col_idxs(); - auto r = csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{3.0}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{5.0}); + using Mtx = typename TestFixture::Mtx; + + ASSERT_THROW(this->mtx5->transpose(Mtx::create(this->exec)), + gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, ConvertsToCsr) +TYPED_TEST(Dense, NonSquareMatrixIsTransposable) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Csr = typename gko::matrix::Csr; - auto csr_s_classical = std::make_shared(); - auto csr_s_merge = std::make_shared(); - auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - - this->mtx4->convert_to(csr_mtx_c); - this->mtx4->convert_to(csr_mtx_m); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = gko::as(this->mtx4->transpose()); - assert_csr_eq_mtx4(csr_mtx_c.get()); - ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); - GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); + GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); } -TYPED_TEST(DenseWithIndexType, MovesToCsr) +TYPED_TEST(Dense, 
NonSquareMatrixIsTransposableIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Csr = typename gko::matrix::Csr; - auto csr_s_classical = std::make_shared(); - auto csr_s_merge = std::make_shared(); - auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); - auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - auto mtx_clone = this->mtx4->clone(); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = + Mtx::create(this->exec, gko::transpose(this->mtx4->get_size())); - this->mtx4->move_to(csr_mtx_c); - mtx_clone->move_to(csr_mtx_m); + this->mtx4->transpose(trans); - assert_csr_eq_mtx4(csr_mtx_c.get()); - ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); - GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); - ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); + GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); } -template -void assert_sparsity_csr_eq_mtx4( - const gko::matrix::SparsityCsr* sparsity_csr_mtx) +TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense) { - auto v = sparsity_csr_mtx->get_const_value(); - auto c = sparsity_csr_mtx->get_const_col_idxs(); - auto r = sparsity_csr_mtx->get_const_row_ptrs(); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5); - ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 3); - EXPECT_EQ(r[2], 4); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], ValueType{1.0}); + this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans); + + GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0); + ASSERT_EQ(trans->get_stride(), 5); } -TYPED_TEST(DenseWithIndexType, ConvertsToSparsityCsr) +TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using SparsityCsr = - typename gko::matrix::SparsityCsr; - auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - - this->mtx4->convert_to(sparsity_csr_mtx); + using Mtx = typename TestFixture::Mtx; - assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); + ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)), + gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, MovesToSparsityCsr) +TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrix) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using SparsityCsr = - typename gko::matrix::SparsityCsr; - auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); + using T = typename TestFixture::value_type; - this->mtx4->move_to(sparsity_csr_mtx); + auto diag = this->mtx5->extract_diagonal(); - assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); + ASSERT_EQ(diag->get_size()[0], 3); + ASSERT_EQ(diag->get_size()[1], 3); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{2.}); + ASSERT_EQ(diag->get_values()[2], T{1.2}); } -template -void assert_ell_eq_mtx6(const gko::matrix::Ell* ell_mtx) +TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrix) { - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); + using T = typename 
TestFixture::value_type; - ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); - ASSERT_EQ(ell_mtx->get_stride(), 2); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], gko::invalid_index()); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{1.5}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{0.0}); + auto diag = this->mtx4->extract_diagonal(); + + ASSERT_EQ(diag->get_size()[0], 2); + ASSERT_EQ(diag->get_size()[1], 2); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{5.}); } -TYPED_TEST(DenseWithIndexType, ConvertsToEll) +TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrix) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor()); + using T = typename TestFixture::value_type; - this->mtx6->convert_to(ell_mtx); + auto diag = this->mtx8->extract_diagonal(); - assert_ell_eq_mtx6(ell_mtx.get()); + ASSERT_EQ(diag->get_size()[0], 2); + ASSERT_EQ(diag->get_size()[1], 2); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{2.}); } -TYPED_TEST(DenseWithIndexType, MovesToEll) +TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrixIntoDiagonal) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = Ell::create(this->mtx6->get_executor()); + using T = typename TestFixture::value_type; + auto diag = gko::matrix::Diagonal::create(this->exec, 3); - this->mtx6->move_to(ell_mtx); + this->mtx5->extract_diagonal(diag); - assert_ell_eq_mtx6(ell_mtx.get()); + ASSERT_EQ(diag->get_size()[0], 3); + ASSERT_EQ(diag->get_size()[1], 3); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{2.}); + ASSERT_EQ(diag->get_values()[2], T{1.2}); } -template -void assert_strided_ell_eq_mtx6( - const gko::matrix::Ell* ell_mtx) +TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrixIntoDiagonal) { - constexpr auto invalid_index = gko::invalid_index(); - auto v = ell_mtx->get_const_values(); - auto c = ell_mtx->get_const_col_idxs(); + using T = typename TestFixture::value_type; + auto diag = gko::matrix::Diagonal::create(this->exec, 2); - ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); - ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6); - ASSERT_EQ(ell_mtx->get_stride(), 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], invalid_index); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], invalid_index); - EXPECT_EQ(c[5], invalid_index); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{1.5}); - EXPECT_EQ(v[2], ValueType{0.0}); - EXPECT_EQ(v[3], ValueType{2.0}); - EXPECT_EQ(v[4], ValueType{0.0}); - EXPECT_EQ(v[5], ValueType{0.0}); + this->mtx4->extract_diagonal(diag); + + ASSERT_EQ(diag->get_size()[0], 2); + ASSERT_EQ(diag->get_size()[1], 2); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{5.}); } -TYPED_TEST(DenseWithIndexType, ConvertsToEllWithStride) +TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrixIntoDiagonal) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = - 
Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); + using T = typename TestFixture::value_type; + auto diag = gko::matrix::Diagonal::create(this->exec, 2); - this->mtx6->convert_to(ell_mtx); + this->mtx8->extract_diagonal(diag); - assert_strided_ell_eq_mtx6(ell_mtx.get()); + ASSERT_EQ(diag->get_size()[0], 2); + ASSERT_EQ(diag->get_size()[1], 2); + ASSERT_EQ(diag->get_values()[0], T{1.}); + ASSERT_EQ(diag->get_values()[1], T{2.}); } -TYPED_TEST(DenseWithIndexType, MovesToEllWithStride) +TYPED_TEST(Dense, InplaceAbsolute) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto ell_mtx = - Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); + using T = typename TestFixture::value_type; - this->mtx6->move_to(ell_mtx); + this->mtx5->compute_absolute_inplace(); - assert_strided_ell_eq_mtx6(ell_mtx.get()); + GKO_ASSERT_MTX_NEAR( + this->mtx5, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + 0.0); } -template -void assert_hybrid_auto_eq_mtx4( - const gko::matrix::Hybrid* hybrid_mtx) +TYPED_TEST(Dense, InplaceAbsoluteSubMatrix) { - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); + using T = typename TestFixture::value_type; + auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 2); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{3.0}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{5.0}); + mtx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR( + this->mtx5, l({{1.0, 1.0, -0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, MovesToHybridAutomatically) +TYPED_TEST(Dense, OutplaceAbsolute) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); + using T = typename TestFixture::value_type; - this->mtx4->move_to(hybrid_mtx); + auto abs_mtx = this->mtx5->compute_absolute(); - assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR( + abs_mtx, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, ConvertsToHybridAutomatically) +TYPED_TEST(Dense, OutplaceAbsoluteIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto abs_mtx = + gko::remove_complex::create(this->exec, this->mtx5->get_size()); - this->mtx4->convert_to(hybrid_mtx); + this->mtx5->compute_absolute(abs_mtx); - assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR( + abs_mtx, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + 0.0); } 
-template -void assert_hybrid_strided_eq_mtx4( - const gko::matrix::Hybrid* hybrid_mtx) +TYPED_TEST(Dense, OutplaceAbsoluteSubMatrix) { - auto v = hybrid_mtx->get_const_coo_values(); - auto c = hybrid_mtx->get_const_coo_col_idxs(); - auto r = hybrid_mtx->get_const_coo_row_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); + using T = typename TestFixture::value_type; + auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); - EXPECT_EQ(n, 0); - EXPECT_EQ(p, 3); - EXPECT_EQ(r[0], 0); - EXPECT_EQ(r[1], 0); - EXPECT_EQ(r[2], 0); - EXPECT_EQ(r[3], 1); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 2); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{3.0}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{5.0}); + auto abs_mtx = mtx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_mtx, l({{1.0, 1.0}, {2.0, 2.0}}), 0); + GKO_ASSERT_EQ(abs_mtx->get_stride(), 2); } -TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAutomatically) +TYPED_TEST(Dense, OutplaceSubmatrixAbsoluteIntoDense) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); + auto abs_mtx = + gko::remove_complex::create(this->exec, gko::dim<2>{2, 2}, 4); + + mtx->compute_absolute(abs_mtx); + + GKO_ASSERT_MTX_NEAR(abs_mtx, l({{1.0, 1.0}, {2.0, 2.0}}), 0); + GKO_ASSERT_EQ(abs_mtx->get_stride(), 4); +} + + +TYPED_TEST(Dense, AppliesToComplex) { using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); + using complex_type = gko::to_complex; + using Vec = gko::matrix::Dense; + auto exec = gko::ReferenceExecutor::create(); + auto b = + gko::initialize({{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, + {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}, + {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}}, + exec); + auto x = Vec::create(exec, gko::dim<2>{2, 2}); - this->mtx4->move_to(hybrid_mtx); + this->mtx1->apply(b, x); - assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR( + x, + l({{complex_type{14.0, 16.0}, complex_type{20.0, 22.0}}, + {complex_type{17.0, 19.0}, complex_type{24.5, 26.5}}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAutomatically) +TYPED_TEST(Dense, AppliesToMixedComplex) +{ + using mixed_value_type = + gko::next_precision; + using mixed_complex_type = gko::to_complex; + using Vec = gko::matrix::Dense; + auto exec = gko::ReferenceExecutor::create(); + auto b = gko::initialize( + {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, + {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}, + {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}}, + exec); + auto x = Vec::create(exec, gko::dim<2>{2, 2}); + + this->mtx1->apply(b, x); + + GKO_ASSERT_MTX_NEAR( + x, + l({{mixed_complex_type{14.0, 16.0}, mixed_complex_type{20.0, 22.0}}, + {mixed_complex_type{17.0, 19.0}, mixed_complex_type{24.5, 26.5}}}), + 0.0); +} + + +TYPED_TEST(Dense, AdvancedAppliesToComplex) { using value_type = typename TestFixture::value_type; - using index_type = typename 
TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); + using complex_type = gko::to_complex; + using Dense = gko::matrix::Dense; + using DenseComplex = gko::matrix::Dense; + auto exec = gko::ReferenceExecutor::create(); - this->mtx4->convert_to(hybrid_mtx); + auto b = gko::initialize( + {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, + {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}, + {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}}, + exec); + auto x = gko::initialize( + {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, + {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}}, + exec); + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); - assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); + this->mtx1->apply(alpha, b, beta, x); + + GKO_ASSERT_MTX_NEAR( + x, + l({{complex_type{-12.0, -16.0}, complex_type{-16.0, -20.0}}, + {complex_type{-13.0, -15.0}, complex_type{-18.5, -20.5}}}), + 0.0); } -template -void assert_hybrid_limited_eq_mtx4( - const gko::matrix::Hybrid* hybrid_mtx) +TYPED_TEST(Dense, AdvancedAppliesToMixedComplex) { - constexpr auto invalid_index = gko::invalid_index(); - auto v = hybrid_mtx->get_const_ell_values(); - auto c = hybrid_mtx->get_const_ell_col_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); + using mixed_value_type = + gko::next_precision; + using mixed_complex_type = gko::to_complex; + using MixedDense = gko::matrix::Dense; + using MixedDenseComplex = gko::matrix::Dense; + auto exec = gko::ReferenceExecutor::create(); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1); - EXPECT_EQ(n, 2); - EXPECT_EQ(p, 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], invalid_index); - EXPECT_EQ(c[3], 1); - EXPECT_EQ(c[4], invalid_index); - EXPECT_EQ(c[5], invalid_index); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{5.0}); - EXPECT_EQ(v[2], ValueType{0.0}); - EXPECT_EQ(v[3], ValueType{3.0}); - EXPECT_EQ(v[4], ValueType{0.0}); - EXPECT_EQ(v[5], ValueType{0.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], ValueType{2.0}); - EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); - EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); + auto b = gko::initialize( + {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, + {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}, + {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}}, + exec); + auto x = gko::initialize( + {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, + {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}}, + exec); + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + + this->mtx1->apply(alpha, b, beta, x); + + GKO_ASSERT_MTX_NEAR( + x, + l({{mixed_complex_type{-12.0, -16.0}, mixed_complex_type{-16.0, -20.0}}, + {mixed_complex_type{-13.0, -15.0}, + mixed_complex_type{-18.5, -20.5}}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAndCooLengthByColumns2) +TYPED_TEST(Dense, MakeComplex) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 
3, 3, - std::make_shared(2)); + using T = typename TestFixture::value_type; - this->mtx4->move_to(hybrid_mtx); + auto complex_mtx = this->mtx5->make_complex(); - assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0); } -TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAndCooLengthByColumns2) +TYPED_TEST(Dense, MakeComplexIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, - std::make_shared(2)); + using T = typename TestFixture::value_type; + using ComplexMtx = typename TestFixture::ComplexMtx; + auto exec = this->mtx5->get_executor(); - this->mtx4->convert_to(hybrid_mtx); + auto complex_mtx = ComplexMtx::create(exec, this->mtx5->get_size()); + this->mtx5->make_complex(complex_mtx); - assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0); } -template -void assert_hybrid_percent_eq_mtx4( - const gko::matrix::Hybrid* hybrid_mtx) +TYPED_TEST(Dense, MakeComplexIntoDenseFailsForWrongDimensions) { - auto v = hybrid_mtx->get_const_ell_values(); - auto c = hybrid_mtx->get_const_ell_col_idxs(); - auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); - auto p = hybrid_mtx->get_ell_stride(); - auto coo_v = hybrid_mtx->get_const_coo_values(); - auto coo_c = hybrid_mtx->get_const_coo_col_idxs(); - auto coo_r = hybrid_mtx->get_const_coo_row_idxs(); + using T = typename TestFixture::value_type; + using ComplexMtx = typename TestFixture::ComplexMtx; + auto exec = this->mtx5->get_executor(); - ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3); - EXPECT_EQ(n, 1); - EXPECT_EQ(p, 3); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], gko::invalid_index()); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{5.0}); - EXPECT_EQ(v[2], ValueType{0.0}); - ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); - EXPECT_EQ(coo_v[0], ValueType{3.0}); - EXPECT_EQ(coo_v[1], ValueType{2.0}); - EXPECT_EQ(coo_c[0], 1); - EXPECT_EQ(coo_c[1], 2); - EXPECT_EQ(coo_r[0], 0); - EXPECT_EQ(coo_r[1], 0); + auto complex_mtx = ComplexMtx::create(exec); + + ASSERT_THROW(this->mtx5->make_complex(complex_mtx), gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideByPercent40) +TYPED_TEST(Dense, GetReal) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, - std::make_shared(0.4)); + using T = typename TestFixture::value_type; - this->mtx4->move_to(hybrid_mtx); + auto real_mtx = this->mtx5->get_real(); - assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0); } -TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideByPercent40) +TYPED_TEST(Dense, GetRealIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto hybrid_mtx = - Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, - std::make_shared(0.4)); + using T = typename TestFixture::value_type; + using RealMtx = typename TestFixture::RealMtx; + auto exec = this->mtx5->get_executor(); - 
this->mtx4->convert_to(hybrid_mtx); + auto real_mtx = RealMtx::create(exec, this->mtx5->get_size()); + this->mtx5->get_real(real_mtx); - assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); + GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0); } -template -void assert_sellp_eq_mtx7( - const gko::matrix::Sellp* sellp_mtx) +TYPED_TEST(Dense, GetRealIntoDenseFailsForWrongDimensions) { - constexpr auto invalid_index = gko::invalid_index(); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = sellp_mtx->get_const_slice_lengths(); + using T = typename TestFixture::value_type; + using RealMtx = typename TestFixture::RealMtx; + auto exec = this->mtx5->get_executor(); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 3); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), - 3 * gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); - ASSERT_EQ(sellp_mtx->get_stride_factor(), - gko::matrix::default_stride_factor); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size], 1); - EXPECT_EQ(c[gko::matrix::default_slice_size + 1], invalid_index); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); - EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], invalid_index); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{1.5}); - EXPECT_EQ(v[gko::matrix::default_slice_size], ValueType{2.0}); - EXPECT_EQ(v[gko::matrix::default_slice_size + 1], ValueType{0.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size], ValueType{3.0}); - EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], ValueType{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 3); - EXPECT_EQ(l[0], 3); + auto real_mtx = RealMtx::create(exec); + ASSERT_THROW(this->mtx5->get_real(real_mtx), gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, ConvertsToSellp) +TYPED_TEST(Dense, GetImag) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); + using T = typename TestFixture::value_type; - this->mtx7->convert_to(sellp_mtx); + auto imag_mtx = this->mtx5->get_imag(); - assert_sellp_eq_mtx7(sellp_mtx.get()); + GKO_ASSERT_MTX_NEAR( + imag_mtx, l({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, MovesToSellp) +TYPED_TEST(Dense, GetImagIntoDense) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); + using T = typename TestFixture::value_type; + using RealMtx = typename TestFixture::RealMtx; + auto exec = this->mtx5->get_executor(); - this->mtx7->move_to(sellp_mtx); + auto imag_mtx = RealMtx::create(exec, this->mtx5->get_size()); + this->mtx5->get_imag(imag_mtx); - assert_sellp_eq_mtx7(sellp_mtx.get()); + GKO_ASSERT_MTX_NEAR( + imag_mtx, l({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}), + 0.0); } -template -void assert_sellp_strided_eq_mtx7( - const gko::matrix::Sellp* sellp_mtx) +TYPED_TEST(Dense, GetImagIntoDenseFailsForWrongDimensions) { - constexpr auto invalid_index = gko::invalid_index(); - auto v = sellp_mtx->get_const_values(); - auto c = sellp_mtx->get_const_col_idxs(); - auto s = sellp_mtx->get_const_slice_sets(); - auto l = 
sellp_mtx->get_const_slice_lengths(); + using T = typename TestFixture::value_type; + using RealMtx = typename TestFixture::RealMtx; + auto exec = this->mtx5->get_executor(); - ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(sellp_mtx->get_total_cols(), 4); - ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8); - ASSERT_EQ(sellp_mtx->get_slice_size(), 2); - ASSERT_EQ(sellp_mtx->get_stride_factor(), 2); - EXPECT_EQ(c[0], 0); - EXPECT_EQ(c[1], 1); - EXPECT_EQ(c[2], 1); - EXPECT_EQ(c[3], invalid_index); - EXPECT_EQ(c[4], 2); - EXPECT_EQ(c[5], invalid_index); - EXPECT_EQ(c[6], invalid_index); - EXPECT_EQ(c[7], invalid_index); - EXPECT_EQ(v[0], ValueType{1.0}); - EXPECT_EQ(v[1], ValueType{1.5}); - EXPECT_EQ(v[2], ValueType{2.0}); - EXPECT_EQ(v[3], ValueType{0.0}); - EXPECT_EQ(v[4], ValueType{3.0}); - EXPECT_EQ(v[5], ValueType{0.0}); - EXPECT_EQ(v[6], ValueType{0.0}); - EXPECT_EQ(v[7], ValueType{0.0}); - EXPECT_EQ(s[0], 0); - EXPECT_EQ(s[1], 4); - EXPECT_EQ(l[0], 4); + auto imag_mtx = RealMtx::create(exec); + ASSERT_THROW(this->mtx5->get_imag(imag_mtx), gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, ConvertsToSellpWithSliceSizeAndStrideFactor) +TYPED_TEST(Dense, MakeTemporaryConversionDoesntConvertOnMatch) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = - Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); - - this->mtx7->convert_to(sellp_mtx); + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto alpha = gko::initialize({8.0}, this->exec); - assert_sellp_strided_eq_mtx7(sellp_mtx.get()); + ASSERT_EQ(gko::make_temporary_conversion(alpha).get(), alpha.get()); } -TYPED_TEST(DenseWithIndexType, MovesToSellpWithSliceSizeAndStrideFactor) +TYPED_TEST(Dense, MakeTemporaryConversionConvertsBack) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto sellp_mtx = - Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); + using MixedMtx = typename TestFixture::MixedMtx; + using T = typename TestFixture::value_type; + using MixedT = typename MixedMtx::value_type; + auto alpha = gko::initialize({8.0}, this->exec); - this->mtx7->move_to(sellp_mtx); + { + auto conversion = gko::make_temporary_conversion(alpha); + conversion->at(0, 0) = T{7.0}; + } - assert_sellp_strided_eq_mtx7(sellp_mtx.get()); + ASSERT_EQ(alpha->at(0, 0), MixedT{7.0}); } -TYPED_TEST(DenseWithIndexType, ConvertsToAndFromSellpWithMoreThanOneSlice) +TYPED_TEST(Dense, MakeTemporaryConversionConstDoesntConvertBack) { - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Mtx = typename TestFixture::Mtx; - using Sellp = typename gko::matrix::Sellp; - auto x = this->template gen_mtx(65, 25); + using MixedMtx = typename TestFixture::MixedMtx; + using T = typename TestFixture::value_type; + using MixedT = typename MixedMtx::value_type; + auto alpha = gko::initialize({8.0}, this->exec); - auto sellp_mtx = Sellp::create(this->exec); - auto dense_mtx = Mtx::create(this->exec); - x->convert_to(sellp_mtx); - sellp_mtx->convert_to(dense_mtx); + { + auto conversion = gko::make_temporary_conversion( + static_cast(alpha.get())); + alpha->at(0, 0) = MixedT{7.0}; + } - GKO_ASSERT_MTX_NEAR(dense_mtx, x, 0.0); + ASSERT_EQ(alpha->at(0, 0), MixedT{7.0}); } -TYPED_TEST(Dense, 
ConvertsEmptyToPrecision) +TYPED_TEST(Dense, ScaleAddIdentityRectangular) { - using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; - using OtherDense = typename gko::matrix::Dense; - auto empty = OtherDense::create(this->exec); - auto res = Dense::create(this->exec); + using Vec = typename TestFixture::Mtx; + using MixedVec = typename TestFixture::MixedMtx; + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto b = gko::initialize( + {I{2.0, 0.0}, I{1.0, 2.5}, I{0.0, -4.0}}, this->exec); - empty->convert_to(res); + b->add_scaled_identity(alpha, beta); - ASSERT_FALSE(res->get_size()); + GKO_ASSERT_MTX_NEAR(b, l({{0.0, 0.0}, {-1.0, -0.5}, {0.0, 4.0}}), 0.0); } -TYPED_TEST(Dense, MovesEmptyToPrecision) -{ - using Dense = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; - using OtherDense = typename gko::matrix::Dense; - auto empty = OtherDense::create(this->exec); - auto res = Dense::create(this->exec); +template +class DenseWithIndexType + : public Dense< + typename std::tuple_element<0, decltype(ValueIndexType())>::type> { +public: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; - empty->move_to(res); + index_type invalid_index = gko::invalid_index(); +}; - ASSERT_FALSE(res->get_size()); -} +TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); -TYPED_TEST(DenseWithIndexType, ConvertsEmptyToCoo) +template +void assert_coo_eq_mtx4(const gko::matrix::Coo* coo_mtx) { - using Dense = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Coo = typename gko::matrix::Coo; - auto empty = Dense::create(this->exec); - auto res = Coo::create(this->exec); - - empty->convert_to(res); + auto v = coo_mtx->get_const_values(); + auto c = coo_mtx->get_const_col_idxs(); + auto r = coo_mtx->get_const_row_idxs(); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(DenseWithIndexType, MovesEmptyToCoo) +TYPED_TEST(DenseWithIndexType, ConvertsToCoo) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using Coo = typename gko::matrix::Coo; - auto empty = Dense::create(this->exec); - auto res = Coo::create(this->exec); + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - empty->move_to(res); + this->mtx4->convert_to(coo_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + assert_coo_eq_mtx4(coo_mtx.get()); } -TYPED_TEST(DenseWithIndexType, ConvertsEmptyMatrixToCsr) +TYPED_TEST(DenseWithIndexType, MovesToCoo) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using 
Csr = typename gko::matrix::Csr; - auto empty = Dense::create(this->exec); - auto res = Csr::create(this->exec); + using Coo = typename gko::matrix::Coo; + auto coo_mtx = Coo::create(this->mtx4->get_executor()); - empty->convert_to(res); + this->mtx4->move_to(coo_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - ASSERT_FALSE(res->get_size()); + assert_coo_eq_mtx4(coo_mtx.get()); } -TYPED_TEST(DenseWithIndexType, MovesEmptyMatrixToCsr) +template +void assert_csr_eq_mtx4(const gko::matrix::Csr* csr_mtx) { - using Dense = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Csr = typename gko::matrix::Csr; - auto empty = Dense::create(this->exec); - auto res = Csr::create(this->exec); - - empty->move_to(res); - - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - ASSERT_FALSE(res->get_size()); + auto v = csr_mtx->get_const_values(); + auto c = csr_mtx->get_const_col_idxs(); + auto r = csr_mtx->get_const_row_ptrs(); + ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSparsityCsr) +TYPED_TEST(DenseWithIndexType, ConvertsToCsr) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using SparsityCsr = - typename gko::matrix::SparsityCsr; - auto empty = Dense::create(this->exec); - auto res = SparsityCsr::create(this->exec); + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); - empty->convert_to(res); + this->mtx4->convert_to(csr_mtx_c); + this->mtx4->convert_to(csr_mtx_m); - ASSERT_EQ(res->get_num_nonzeros(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - ASSERT_FALSE(res->get_size()); + assert_csr_eq_mtx4(csr_mtx_c.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TYPED_TEST(DenseWithIndexType, MovesEmptyToSparsityCsr) +TYPED_TEST(DenseWithIndexType, MovesToCsr) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using SparsityCsr = - typename gko::matrix::SparsityCsr; - auto empty = Dense::create(this->exec); - auto res = SparsityCsr::create(this->exec); + using Csr = typename gko::matrix::Csr; + auto csr_s_classical = std::make_shared(); + auto csr_s_merge = std::make_shared(); + auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical); + auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge); + auto mtx_clone = this->mtx4->clone(); - empty->move_to(res); + this->mtx4->move_to(csr_mtx_c); + mtx_clone->move_to(csr_mtx_m); - ASSERT_EQ(res->get_num_nonzeros(), 0); - ASSERT_EQ(*res->get_const_row_ptrs(), 0); - 
ASSERT_FALSE(res->get_size()); + assert_csr_eq_mtx4(csr_mtx_c.get()); + ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical"); + GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0); + ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path"); } -TYPED_TEST(DenseWithIndexType, ConvertsEmptyToEll) +template +void assert_sparsity_csr_eq_mtx4( + const gko::matrix::SparsityCsr* sparsity_csr_mtx) { - using Dense = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto empty = Dense::create(this->exec); - auto res = Ell::create(this->exec); - - empty->convert_to(res); + auto v = sparsity_csr_mtx->get_const_value(); + auto c = sparsity_csr_mtx->get_const_col_idxs(); + auto r = sparsity_csr_mtx->get_const_row_ptrs(); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 3); + EXPECT_EQ(r[2], 4); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], ValueType{1.0}); } -TYPED_TEST(DenseWithIndexType, MovesEmptyToEll) +TYPED_TEST(DenseWithIndexType, ConvertsToSparsityCsr) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using Ell = typename gko::matrix::Ell; - auto empty = Dense::create(this->exec); - auto res = Ell::create(this->exec); + using SparsityCsr = + typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - empty->move_to(res); + this->mtx4->convert_to(sparsity_csr_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); } -TYPED_TEST(DenseWithIndexType, ConvertsEmptyToHybrid) +TYPED_TEST(DenseWithIndexType, MovesToSparsityCsr) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto empty = Dense::create(this->exec); - auto res = Hybrid::create(this->exec); + using SparsityCsr = + typename gko::matrix::SparsityCsr; + auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor()); - empty->convert_to(res); + this->mtx4->move_to(sparsity_csr_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get()); } -TYPED_TEST(DenseWithIndexType, MovesEmptyToHybrid) +template +void assert_ell_eq_mtx6(const gko::matrix::Ell* ell_mtx) { - using Dense = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - using Hybrid = typename gko::matrix::Hybrid; - auto empty = Dense::create(this->exec); - auto res = Hybrid::create(this->exec); - - empty->move_to(res); + auto v = ell_mtx->get_const_values(); + auto c = ell_mtx->get_const_col_idxs(); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_FALSE(res->get_size()); + ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); + ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4); + ASSERT_EQ(ell_mtx->get_stride(), 2); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 1); + 
EXPECT_EQ(c[3], gko::invalid_index()); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{0.0}); } -TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSellp) +TYPED_TEST(DenseWithIndexType, ConvertsToEll) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto empty = Dense::create(this->exec); - auto res = Sellp::create(this->exec); + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); - empty->convert_to(res); + this->mtx6->convert_to(ell_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_slice_sets(), 0); - ASSERT_FALSE(res->get_size()); + assert_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(DenseWithIndexType, MovesEmptyToSellp) +TYPED_TEST(DenseWithIndexType, MovesToEll) { - using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using Sellp = typename gko::matrix::Sellp; - auto empty = Dense::create(this->exec); - auto res = Sellp::create(this->exec); + using Ell = typename gko::matrix::Ell; + auto ell_mtx = Ell::create(this->mtx6->get_executor()); - empty->move_to(res); + this->mtx6->move_to(ell_mtx); - ASSERT_EQ(res->get_num_stored_elements(), 0); - ASSERT_EQ(*res->get_const_slice_sets(), 0); - ASSERT_FALSE(res->get_size()); + assert_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, SquareMatrixIsTransposable) +template +void assert_strided_ell_eq_mtx6( + const gko::matrix::Ell* ell_mtx) { - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = gko::as(this->mtx5->transpose()); + constexpr auto invalid_index = gko::invalid_index(); + auto v = ell_mtx->get_const_values(); + auto c = ell_mtx->get_const_col_idxs(); - GKO_ASSERT_MTX_NEAR( - trans, l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), - 0.0); + ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2); + ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6); + ASSERT_EQ(ell_mtx->get_stride(), 3); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], invalid_index); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(c[4], invalid_index); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{0.0}); + EXPECT_EQ(v[3], ValueType{2.0}); + EXPECT_EQ(v[4], ValueType{0.0}); + EXPECT_EQ(v[5], ValueType{0.0}); } -TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDense) +TYPED_TEST(DenseWithIndexType, ConvertsToEllWithStride) { - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = Mtx::create(this->exec, this->mtx5->get_size()); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = + Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); - this->mtx5->transpose(trans); + this->mtx6->convert_to(ell_mtx); - GKO_ASSERT_MTX_NEAR( - trans, l({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}), - 0.0); + assert_strided_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, SquareSubmatrixIsTransposableIntoDense) +TYPED_TEST(DenseWithIndexType, MovesToEllWithStride) { - using Mtx = typename TestFixture::Mtx; - using T = typename 
TestFixture::value_type; - auto trans = Mtx::create(this->exec, gko::dim<2>{2, 2}, 4); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Ell = typename gko::matrix::Ell; + auto ell_mtx = + Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3); - this->mtx5->create_submatrix({0, 2}, {0, 2})->transpose(trans); + this->mtx6->move_to(ell_mtx); - GKO_ASSERT_MTX_NEAR(trans, l({{1.0, -2.0}, {-1.0, 2.0}}), 0.0); - ASSERT_EQ(trans->get_stride(), 4); + assert_strided_ell_eq_mtx6(ell_mtx.get()); } -TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) +template +void assert_hybrid_auto_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) { - using Mtx = typename TestFixture::Mtx; + auto v = hybrid_mtx->get_const_coo_values(); + auto c = hybrid_mtx->get_const_coo_col_idxs(); + auto r = hybrid_mtx->get_const_coo_row_idxs(); + auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); - ASSERT_THROW(this->mtx5->transpose(Mtx::create(this->exec)), - gko::DimensionMismatch); + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); + EXPECT_EQ(n, 0); + EXPECT_EQ(p, 2); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, NonSquareMatrixIsTransposable) +TYPED_TEST(DenseWithIndexType, MovesToHybridAutomatically) { - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = gko::as(this->mtx4->transpose()); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); + this->mtx4->move_to(hybrid_mtx); + + assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDense) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridAutomatically) { - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = - Mtx::create(this->exec, gko::transpose(this->mtx4->get_size())); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor()); - this->mtx4->transpose(trans); + this->mtx4->convert_to(hybrid_mtx); - GKO_ASSERT_MTX_NEAR(trans, l({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0); + assert_hybrid_auto_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense) +template +void assert_hybrid_strided_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) { - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5); - - this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans); + auto v = hybrid_mtx->get_const_coo_values(); + auto c = hybrid_mtx->get_const_coo_col_idxs(); + auto r = hybrid_mtx->get_const_coo_row_idxs(); + auto n = 
hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); - GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0); - ASSERT_EQ(trans->get_stride(), 5); + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4); + EXPECT_EQ(n, 0); + EXPECT_EQ(p, 3); + EXPECT_EQ(r[0], 0); + EXPECT_EQ(r[1], 0); + EXPECT_EQ(r[2], 0); + EXPECT_EQ(r[3], 1); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 2); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{3.0}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{5.0}); } -TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAutomatically) { - using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); - ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)), - gko::DimensionMismatch); + this->mtx4->move_to(hybrid_mtx); + + assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRows) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAutomatically) { - using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3); - auto row_collection = this->mtx5->row_gather(&permute_idxs); + this->mtx4->convert_to(hybrid_mtx); - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), - 0.0); + assert_hybrid_strided_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRowsIntoDense) +template +void assert_hybrid_limited_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) +{ + constexpr auto invalid_index = gko::invalid_index(); + auto v = hybrid_mtx->get_const_ell_values(); + auto c = hybrid_mtx->get_const_ell_col_idxs(); + auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); + + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1); + EXPECT_EQ(n, 2); + EXPECT_EQ(p, 3); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], invalid_index); + EXPECT_EQ(c[3], 1); + EXPECT_EQ(c[4], invalid_index); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{5.0}); + EXPECT_EQ(v[2], ValueType{0.0}); + EXPECT_EQ(v[3], ValueType{3.0}); + EXPECT_EQ(v[4], ValueType{0.0}); + EXPECT_EQ(v[5], ValueType{0.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], ValueType{2.0}); + EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0); + EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2); +} + + +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAndCooLengthByColumns2) { - using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array 
permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3}); + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, + std::make_shared(2)); - this->mtx5->row_gather(&permute_idxs, row_collection); + this->mtx4->move_to(hybrid_mtx); - GKO_ASSERT_MTX_NEAR(row_collection, - l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), - 0.0); + assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, SquareSubmatrixCanGatherRowsIntoDense) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAndCooLengthByColumns2) { - using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3, + std::make_shared(2)); - this->mtx5->create_submatrix({0, 2}, {1, 3}) - ->row_gather(&permute_idxs, row_collection); + this->mtx4->convert_to(hybrid_mtx); - GKO_ASSERT_MTX_NEAR(row_collection, - l({{2.0, 4.5}, {-1.0, -0.5}}), 0.0); - ASSERT_EQ(row_collection->get_stride(), 4); + assert_hybrid_limited_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, NonSquareSubmatrixCanGatherRowsIntoMixedDense) +template +void assert_hybrid_percent_eq_mtx4( + const gko::matrix::Hybrid* hybrid_mtx) { - using Mtx = typename TestFixture::Mtx; - using MixedMtx = typename TestFixture::MixedMtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array gather_index{exec, {1, 0, 1}}; - auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4); - - this->mtx4->row_gather(&gather_index, row_collection); + auto v = hybrid_mtx->get_const_ell_values(); + auto c = hybrid_mtx->get_const_ell_col_idxs(); + auto n = hybrid_mtx->get_ell_num_stored_elements_per_row(); + auto p = hybrid_mtx->get_ell_stride(); + auto coo_v = hybrid_mtx->get_const_coo_values(); + auto coo_c = hybrid_mtx->get_const_coo_col_idxs(); + auto coo_r = hybrid_mtx->get_const_coo_row_idxs(); - GKO_ASSERT_MTX_NEAR( - row_collection, - l( - {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}), - 0.0); + ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3); + EXPECT_EQ(n, 1); + EXPECT_EQ(p, 3); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], gko::invalid_index()); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{5.0}); + EXPECT_EQ(v[2], ValueType{0.0}); + ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2); + EXPECT_EQ(coo_v[0], ValueType{3.0}); + EXPECT_EQ(coo_v[1], ValueType{2.0}); + EXPECT_EQ(coo_c[0], 1); + EXPECT_EQ(coo_c[1], 2); + EXPECT_EQ(coo_r[0], 0); + EXPECT_EQ(coo_r[1], 0); } -TYPED_TEST(DenseWithIndexType, - NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense) +TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideByPercent40) { - using Mtx = typename TestFixture::Mtx; - using MixedMtx = typename TestFixture::MixedMtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array gather_index{exec, {1, 0, 1}}; - auto row_collection = gko::initialize( - {{1.0, 0.5, -1.0}, 
{-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec); - auto alpha = gko::initialize({1.0}, exec); - auto beta = gko::initialize({2.0}, exec); + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, + std::make_shared(0.4)); - this->mtx4->row_gather(alpha, &gather_index, beta, row_collection); + this->mtx4->move_to(hybrid_mtx); - GKO_ASSERT_MTX_NEAR( - row_collection, - l( - {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}), - 0.0); + assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideByPercent40) { - using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + using Hybrid = typename gko::matrix::Hybrid; + auto hybrid_mtx = + Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3, + std::make_shared(0.4)); - ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + this->mtx4->convert_to(hybrid_mtx); + + assert_hybrid_percent_eq_mtx4(hybrid_mtx.get()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutable) +template +void assert_sellp_eq_mtx7( + const gko::matrix::Sellp* sellp_mtx) { - using Mtx = typename TestFixture::Mtx; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - auto permuted = gko::as(this->mtx5->permute(&permute_idxs)); + constexpr auto invalid_index = gko::invalid_index(); + auto v = sellp_mtx->get_const_values(); + auto c = sellp_mtx->get_const_col_idxs(); + auto s = sellp_mtx->get_const_slice_sets(); + auto l = sellp_mtx->get_const_slice_lengths(); - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sellp_mtx->get_total_cols(), 3); + ASSERT_EQ(sellp_mtx->get_num_stored_elements(), + 3 * gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size); + ASSERT_EQ(sellp_mtx->get_stride_factor(), + gko::matrix::default_stride_factor); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size], 1); + EXPECT_EQ(c[gko::matrix::default_slice_size + 1], invalid_index); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2); + EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[gko::matrix::default_slice_size], ValueType{2.0}); + EXPECT_EQ(v[gko::matrix::default_slice_size + 1], ValueType{0.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size], ValueType{3.0}); + EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], ValueType{0.0}); + EXPECT_EQ(s[0], 0); + EXPECT_EQ(s[1], 3); + EXPECT_EQ(l[0], 3); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, ConvertsToSellp) { - using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto permuted = Mtx::create(exec, 
this->mtx5->get_size()); + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - auto ref_permuted = - gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - this->mtx5->permute(&permute_idxs, permuted); + this->mtx7->convert_to(sellp_mtx); - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + assert_sellp_eq_mtx7(sellp_mtx.get()); } -TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, MovesToSellp) { - using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = Sellp::create(this->mtx7->get_executor()); - auto ref_permuted = - gko::as(gko::as(mtx->row_permute(&permute_idxs)) - ->column_permute(&permute_idxs)); - mtx->permute(&permute_idxs, permuted); + this->mtx7->move_to(sellp_mtx); - GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); - ASSERT_EQ(permuted->get_stride(), 4); + assert_sellp_eq_mtx7(sellp_mtx.get()); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixPermuteIntoDenseFails) +template +void assert_sellp_strided_eq_mtx7( + const gko::matrix::Sellp* sellp_mtx) { - using Mtx = typename TestFixture::Mtx; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + constexpr auto invalid_index = gko::invalid_index(); + auto v = sellp_mtx->get_const_values(); + auto c = sellp_mtx->get_const_col_idxs(); + auto s = sellp_mtx->get_const_slice_sets(); + auto l = sellp_mtx->get_const_slice_lengths(); - ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); + ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(sellp_mtx->get_total_cols(), 4); + ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8); + ASSERT_EQ(sellp_mtx->get_slice_size(), 2); + ASSERT_EQ(sellp_mtx->get_stride_factor(), 2); + EXPECT_EQ(c[0], 0); + EXPECT_EQ(c[1], 1); + EXPECT_EQ(c[2], 1); + EXPECT_EQ(c[3], invalid_index); + EXPECT_EQ(c[4], 2); + EXPECT_EQ(c[5], invalid_index); + EXPECT_EQ(c[6], invalid_index); + EXPECT_EQ(c[7], invalid_index); + EXPECT_EQ(v[0], ValueType{1.0}); + EXPECT_EQ(v[1], ValueType{1.5}); + EXPECT_EQ(v[2], ValueType{2.0}); + EXPECT_EQ(v[3], ValueType{0.0}); + EXPECT_EQ(v[4], ValueType{3.0}); + EXPECT_EQ(v[5], ValueType{0.0}); + EXPECT_EQ(v[6], ValueType{0.0}); + EXPECT_EQ(v[7], ValueType{0.0}); + EXPECT_EQ(s[0], 0); + EXPECT_EQ(s[1], 4); + EXPECT_EQ(l[0], 4); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, ConvertsToSellpWithSliceSizeAndStrideFactor) { - using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; + using Sellp = typename gko::matrix::Sellp; + auto sellp_mtx = + Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0); - ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); + this->mtx7->convert_to(sellp_mtx); + + assert_sellp_strided_eq_mtx7(sellp_mtx.get()); } 
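The Sellp conversion tests above rely on the generic convert_to/move_to mechanism of Dense. A minimal sketch of the same conversion outside the fixture, assuming double values and 32-bit indices (illustrative only, not part of this patch):

    // Illustrative sketch (not part of the patch): Dense -> Sellp conversion.
    #include <ginkgo/ginkgo.hpp>

    auto exec = gko::ReferenceExecutor::create();
    auto dense = gko::initialize<gko::matrix::Dense<double>>(
        {{1.0, 0.0, 3.0}, {0.0, 2.0, 0.0}}, exec);
    auto sellp = gko::matrix::Sellp<double, gko::int32>::create(exec);
    dense->convert_to(sellp);  // copy conversion; dense keeps its values
    // dense->move_to(sellp);  // alternative that may take over dense's data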
-TYPED_TEST(DenseWithIndexType,
-           SquareMatrixPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, MovesToSellpWithSliceSizeAndStrideFactor)
 {
-    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto sellp_mtx =
+        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
 
-    ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    this->mtx7->move_to(sellp_mtx);
+
+    assert_sellp_strided_eq_mtx7(sellp_mtx.get());
 }
 
 
-TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutable)
+TYPED_TEST(DenseWithIndexType, ConvertsToAndFromSellpWithMoreThanOneSlice)
 {
-    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    using Mtx = typename TestFixture::Mtx;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto x = this->template gen_mtx<Mtx>(65, 25);
 
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    auto permuted = gko::as<Mtx>(this->mtx5->inverse_permute(&permute_idxs));
+    auto sellp_mtx = Sellp::create(this->exec);
+    auto dense_mtx = Mtx::create(this->exec);
+    x->convert_to(sellp_mtx);
+    sellp_mtx->convert_to(dense_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_NEAR(dense_mtx, x, 0.0);
 }
 
 
-TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutableIntoDense)
+TYPED_TEST(Dense, ConvertsEmptyToPrecision)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using index_type = typename TestFixture::index_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
-    auto permuted = Mtx::create(exec, this->mtx5->get_size());
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto empty = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
 
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    this->mtx5->inverse_permute(&permute_idxs, permuted);
+    empty->convert_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInversePermutableIntoDense)
+TYPED_TEST(Dense, MovesEmptyToPrecision)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using index_type = typename TestFixture::index_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<index_type> permute_idxs{exec, {1, 0}};
-    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto empty = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(mtx->inverse_row_permute(&permute_idxs))
-                         ->inverse_column_permute(&permute_idxs));
-    mtx->inverse_permute(&permute_idxs, permuted);
+    empty->move_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
- ASSERT_EQ(permuted->get_stride(), 4); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixInversePermuteIntoDenseFails) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToCoo) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Coo = typename gko::matrix::Coo; + auto empty = Dense::create(this->exec); + auto res = Coo::create(this->exec); - ASSERT_THROW( - this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()), - gko::DimensionMismatch); + empty->convert_to(res); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, MovesEmptyToCoo) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {0, 1}}; + using Coo = typename gko::matrix::Coo; + auto empty = Dense::create(this->exec); + auto res = Coo::create(this->exec); - ASSERT_THROW( - this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()), - gko::ValueMismatch); + empty->move_to(res); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyMatrixToCsr) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Csr = typename gko::matrix::Csr; + auto empty = Dense::create(this->exec); + auto res = Csr::create(this->exec); - ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + empty->convert_to(res); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutable) +TYPED_TEST(DenseWithIndexType, MovesEmptyMatrixToCsr) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Csr = typename gko::matrix::Csr; + auto empty = Dense::create(this->exec); + auto res = Csr::create(this->exec); - auto row_permute = gko::as(this->mtx5->row_permute(&permute_idxs)); + empty->move_to(res); - GKO_ASSERT_MTX_NEAR( - row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), - 0.0); + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsRowPermutable) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSparsityCsr) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename 
TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; + using SparsityCsr = + typename gko::matrix::SparsityCsr; + auto empty = Dense::create(this->exec); + auto res = SparsityCsr::create(this->exec); - auto row_permute = gko::as(this->mtx4->row_permute(&permute_idxs)); + empty->convert_to(res); - GKO_ASSERT_MTX_NEAR(row_permute, - l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, MovesEmptyToSparsityCsr) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + using SparsityCsr = + typename gko::matrix::SparsityCsr; + auto empty = Dense::create(this->exec); + auto res = SparsityCsr::create(this->exec); - this->mtx5->row_permute(&permute_idxs, row_permute); + empty->move_to(res); - GKO_ASSERT_MTX_NEAR( - row_permute, - l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), - 0.0); + ASSERT_EQ(res->get_num_nonzeros(), 0); + ASSERT_EQ(*res->get_const_row_ptrs(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToEll) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + using Ell = typename gko::matrix::Ell; + auto empty = Dense::create(this->exec); + auto res = Ell::create(this->exec); - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->row_permute(&permute_idxs, row_permute); + empty->convert_to(res); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), - 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, MovesEmptyToEll) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + using Ell = typename gko::matrix::Ell; + auto empty = Dense::create(this->exec); + auto res = Ell::create(this->exec); - ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute), - gko::ValueMismatch); + empty->move_to(res); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToHybrid) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = 
this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Hybrid = typename gko::matrix::Hybrid; + auto empty = Dense::create(this->exec); + auto res = Hybrid::create(this->exec); - ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + empty->convert_to(res); + + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutable) +TYPED_TEST(DenseWithIndexType, MovesEmptyToHybrid) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Hybrid = typename gko::matrix::Hybrid; + auto empty = Dense::create(this->exec); + auto res = Hybrid::create(this->exec); - auto c_permute = gko::as(this->mtx5->column_permute(&permute_idxs)); + empty->move_to(res); - GKO_ASSERT_MTX_NEAR( - c_permute, - l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), - 0.0); + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsColPermutable) +TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSellp) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx4->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + using Sellp = typename gko::matrix::Sellp; + auto empty = Dense::create(this->exec); + auto res = Sellp::create(this->exec); - auto c_permute = gko::as(this->mtx4->column_permute(&permute_idxs)); + empty->convert_to(res); - GKO_ASSERT_MTX_NEAR(c_permute, - l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), 0.0); + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, MovesEmptyToSellp) { - using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); + using Sellp = typename gko::matrix::Sellp; + auto empty = Dense::create(this->exec); + auto res = Sellp::create(this->exec); - this->mtx5->column_permute(&permute_idxs, c_permute); + empty->move_to(res); - GKO_ASSERT_MTX_NEAR( - c_permute, - l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), - 0.0); + ASSERT_EQ(res->get_num_stored_elements(), 0); + ASSERT_EQ(*res->get_const_slice_sets(), 0); + ASSERT_FALSE(res->get_size()); } -TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsColPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRows) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); + auto row_collection = this->mtx5->row_gather(&permute_idxs); - GKO_ASSERT_MTX_NEAR(c_permute, 
l({{-1.0, 1.0}, {2.0, -2.0}}), + GKO_ASSERT_MTX_NEAR(row_collection, + l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRowsIntoDense) { using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - - ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute), - gko::ValueMismatch); -} - + gko::array permute_idxs{exec, {1, 0}}; + auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3}); -TYPED_TEST(DenseWithIndexType, - SquareMatrixColPermuteIntoDenseFailsForWrongDimensions) -{ - using Mtx = typename TestFixture::Mtx; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; + this->mtx5->row_gather(&permute_idxs, row_collection); - ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + GKO_ASSERT_MTX_NEAR(row_collection, + l({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutable) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixCanGatherRowsIntoDense) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 0}}; + auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto inv_row_permute = - gko::as(this->mtx5->inverse_row_permute(&inverse_permute_idxs)); + this->mtx5->create_submatrix({0, 2}, {1, 3}) + ->row_gather(&permute_idxs, row_collection); - GKO_ASSERT_MTX_NEAR( - inv_row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), - 0.0); + GKO_ASSERT_MTX_NEAR(row_collection, + l({{2.0, 4.5}, {-1.0, -0.5}}), 0.0); + ASSERT_EQ(row_collection->get_stride(), 4); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseRowPermutable) +TYPED_TEST(DenseWithIndexType, NonSquareSubmatrixCanGatherRowsIntoMixedDense) { using Mtx = typename TestFixture::Mtx; + using MixedMtx = typename TestFixture::MixedMtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 0}}; + gko::array gather_index{exec, {1, 0, 1}}; + auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4); - auto inverse_row_permute = - gko::as(this->mtx4->inverse_row_permute(&inverse_permute_idxs)); + this->mtx4->row_gather(&gather_index, row_collection); - GKO_ASSERT_MTX_NEAR(inverse_row_permute, - l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); + GKO_ASSERT_MTX_NEAR( + row_collection, + l( + {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}), + 0.0); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, + NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense) { using Mtx = typename TestFixture::Mtx; + using MixedMtx = typename TestFixture::MixedMtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - auto exec = 
this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto exec = this->mtx4->get_executor(); + gko::array gather_index{exec, {1, 0, 1}}; + auto row_collection = gko::initialize( + {{1.0, 0.5, -1.0}, {-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec); + auto alpha = gko::initialize({1.0}, exec); + auto beta = gko::initialize({2.0}, exec); - this->mtx5->inverse_row_permute(&permute_idxs, row_permute); + this->mtx4->row_gather(alpha, &gather_index, beta, row_collection); GKO_ASSERT_MTX_NEAR( - row_permute, - l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), + row_collection, + l( + {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}), 0.0); } -TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseRowPermutableIntoDense) +TYPED_TEST(DenseWithIndexType, + SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->inverse_row_permute(&permute_idxs, row_permute); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), - 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutable) { using Mtx = typename TestFixture::Mtx; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute), - gko::ValueMismatch); + auto ref_permuted = + gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->permute(&permute_idxs)); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(DenseWithIndexType, - SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_THROW( - this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); + auto ref_permuted = + gko::as(gko::as(this->mtx5->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + this->mtx5->permute(&permute_idxs, permuted); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutable) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; + gko::array permute_idxs{exec, {1, 0}}; + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto mtx = this->mtx5->create_submatrix({0, 2}, 
{1, 3}); - auto inv_c_permute = - gko::as(this->mtx5->inverse_column_permute(&inverse_permute_idxs)); + auto ref_permuted = + gko::as(gko::as(mtx->row_permute(&permute_idxs)) + ->column_permute(&permute_idxs)); + mtx->permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR( - inv_c_permute, - l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), - 0.0); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + ASSERT_EQ(permuted->get_stride(), 4); } -TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseColPermutable) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixPermuteIntoDenseFails) { using Mtx = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; auto exec = this->mtx4->get_executor(); - gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - - auto inverse_c_permute = - gko::as(this->mtx4->inverse_column_permute(&inverse_permute_idxs)); - - GKO_ASSERT_MTX_NEAR(inverse_c_permute, - l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); -} - - -TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); - - this->mtx5->inverse_column_permute(&permute_idxs, c_permute); - - GKO_ASSERT_MTX_NEAR( - c_permute, - l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), - 0.0); -} - - -TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseColPermutableIntoDense) -{ - using Mtx = typename TestFixture::Mtx; - using value_type = typename TestFixture::value_type; - using index_type = typename TestFixture::index_type; - auto exec = this->mtx5->get_executor(); - gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - - this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), - 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); + ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()), + gko::DimensionMismatch); } TYPED_TEST(DenseWithIndexType, - SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize) + SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()), gko::ValueMismatch); } TYPED_TEST(DenseWithIndexType, - SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions) + SquareMatrixPermuteIntoDenseFailsForWrongDimensions) { using Mtx = typename TestFixture::Mtx; using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_THROW( - this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)), - gko::DimensionMismatch); -} - - -TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrix) -{ - using T = typename TestFixture::value_type; - - auto diag = this->mtx5->extract_diagonal(); - - ASSERT_EQ(diag->get_size()[0], 3); - ASSERT_EQ(diag->get_size()[1], 3); - 
ASSERT_EQ(diag->get_values()[0], T{1.}); - ASSERT_EQ(diag->get_values()[1], T{2.}); - ASSERT_EQ(diag->get_values()[2], T{1.2}); + ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrix) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutable) { - using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - auto diag = this->mtx4->extract_diagonal(); + auto ref_permuted = gko::as( + gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->inverse_permute(&permute_idxs)); - ASSERT_EQ(diag->get_size()[0], 2); - ASSERT_EQ(diag->get_size()[1], 2); - ASSERT_EQ(diag->get_values()[0], T{1.}); - ASSERT_EQ(diag->get_values()[1], T{5.}); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrix) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutableIntoDense) { - using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - auto diag = this->mtx8->extract_diagonal(); + auto ref_permuted = gko::as( + gko::as(this->mtx5->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + this->mtx5->inverse_permute(&permute_idxs, permuted); - ASSERT_EQ(diag->get_size()[0], 2); - ASSERT_EQ(diag->get_size()[1], 2); - ASSERT_EQ(diag->get_values()[0], T{1.}); - ASSERT_EQ(diag->get_values()[1], T{2.}); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); } -TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrixIntoDiagonal) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInversePermutableIntoDense) { - using T = typename TestFixture::value_type; - auto diag = gko::matrix::Diagonal::create(this->exec, 3); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3}); - this->mtx5->extract_diagonal(diag); + auto ref_permuted = + gko::as(gko::as(mtx->inverse_row_permute(&permute_idxs)) + ->inverse_column_permute(&permute_idxs)); + mtx->inverse_permute(&permute_idxs, permuted); - ASSERT_EQ(diag->get_size()[0], 3); - ASSERT_EQ(diag->get_size()[1], 3); - ASSERT_EQ(diag->get_values()[0], T{1.}); - ASSERT_EQ(diag->get_values()[1], T{2.}); - ASSERT_EQ(diag->get_values()[2], T{1.2}); + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + ASSERT_EQ(permuted->get_stride(), 4); } -TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrixIntoDiagonal) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixInversePermuteIntoDenseFails) { - using T = typename TestFixture::value_type; - auto diag = gko::matrix::Diagonal::create(this->exec, 2); - - this->mtx4->extract_diagonal(diag); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - ASSERT_EQ(diag->get_size()[0], 2); - ASSERT_EQ(diag->get_size()[1], 2); - ASSERT_EQ(diag->get_values()[0], 
T{1.}); - ASSERT_EQ(diag->get_values()[1], T{5.}); + ASSERT_THROW( + this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()), + gko::DimensionMismatch); } -TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrixIntoDiagonal) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize) { - using T = typename TestFixture::value_type; - auto diag = gko::matrix::Diagonal::create(this->exec, 2); - - this->mtx8->extract_diagonal(diag); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {0, 1}}; - ASSERT_EQ(diag->get_size()[0], 2); - ASSERT_EQ(diag->get_size()[1], 2); - ASSERT_EQ(diag->get_values()[0], T{1.}); - ASSERT_EQ(diag->get_values()[1], T{2.}); + ASSERT_THROW( + this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()), + gko::ValueMismatch); } -TYPED_TEST(Dense, InplaceAbsolute) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions) { - using T = typename TestFixture::value_type; - - this->mtx5->compute_absolute_inplace(); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - GKO_ASSERT_MTX_NEAR( - this->mtx5, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), - 0.0); + ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, InplaceAbsoluteSubMatrix) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutable) { - using T = typename TestFixture::value_type; - auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - mtx->compute_absolute_inplace(); + auto row_permute = gko::as(this->mtx5->row_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( - this->mtx5, l({{1.0, 1.0, -0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + row_permute, + l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); } -TYPED_TEST(Dense, OutplaceAbsolute) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsRowPermutable) { - using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; - auto abs_mtx = this->mtx5->compute_absolute(); + auto row_permute = gko::as(this->mtx4->row_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR( - abs_mtx, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), - 0.0); + GKO_ASSERT_MTX_NEAR(row_permute, + l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } -TYPED_TEST(Dense, OutplaceAbsoluteIntoDense) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto abs_mtx = - gko::remove_complex::create(this->exec, this->mtx5->get_size()); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - 
this->mtx5->compute_absolute(abs_mtx); + this->mtx5->row_permute(&permute_idxs, row_permute); GKO_ASSERT_MTX_NEAR( - abs_mtx, l({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}), + row_permute, + l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); } -TYPED_TEST(Dense, OutplaceAbsoluteSubMatrix) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsRowPermutableIntoDense) { - using T = typename TestFixture::value_type; - auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto abs_mtx = mtx->compute_absolute(); + this->mtx5->create_submatrix({0, 2}, {0, 2}) + ->row_permute(&permute_idxs, row_permute); - GKO_ASSERT_MTX_NEAR(abs_mtx, l({{1.0, 1.0}, {2.0, 2.0}}), 0); - GKO_ASSERT_EQ(abs_mtx->get_stride(), 2); + GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + 0.0); + ASSERT_EQ(row_permute->get_stride(), 4); } -TYPED_TEST(Dense, OutplaceSubmatrixAbsoluteIntoDense) +TYPED_TEST(DenseWithIndexType, + SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2}); - auto abs_mtx = - gko::remove_complex::create(this->exec, gko::dim<2>{2, 2}, 4); - - mtx->compute_absolute(abs_mtx); + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - GKO_ASSERT_MTX_NEAR(abs_mtx, l({{1.0, 1.0}, {2.0, 2.0}}), 0); - GKO_ASSERT_EQ(abs_mtx->get_stride(), 4); + ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute), + gko::ValueMismatch); } -TYPED_TEST(Dense, AppliesToComplex) +TYPED_TEST(DenseWithIndexType, + SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions) { - using value_type = typename TestFixture::value_type; - using complex_type = gko::to_complex; - using Vec = gko::matrix::Dense; - auto exec = gko::ReferenceExecutor::create(); - auto b = - gko::initialize({{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, - {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}, - {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}}, - exec); - auto x = Vec::create(exec, gko::dim<2>{2, 2}); - - this->mtx1->apply(b, x); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - GKO_ASSERT_MTX_NEAR( - x, - l({{complex_type{14.0, 16.0}, complex_type{20.0, 22.0}}, - {complex_type{17.0, 19.0}, complex_type{24.5, 26.5}}}), - 0.0); + ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, AppliesToMixedComplex) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutable) { - using mixed_value_type = - gko::next_precision; - using mixed_complex_type = gko::to_complex; - using Vec = gko::matrix::Dense; - auto exec = gko::ReferenceExecutor::create(); - auto b = gko::initialize( - {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, - {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}, - {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}}, - exec); - auto x = 
Vec::create(exec, gko::dim<2>{2, 2}); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - this->mtx1->apply(b, x); + auto c_permute = gko::as(this->mtx5->column_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( - x, - l({{mixed_complex_type{14.0, 16.0}, mixed_complex_type{20.0, 22.0}}, - {mixed_complex_type{17.0, 19.0}, mixed_complex_type{24.5, 26.5}}}), + c_permute, + l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } -TYPED_TEST(Dense, AdvancedAppliesToComplex) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsColPermutable) { + using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; - using complex_type = gko::to_complex; - using Dense = gko::matrix::Dense; - using DenseComplex = gko::matrix::Dense; - auto exec = gko::ReferenceExecutor::create(); - - auto b = gko::initialize( - {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, - {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}, - {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}}, - exec); - auto x = gko::initialize( - {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}}, - {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}}, - exec); - auto alpha = gko::initialize({-1.0}, this->exec); - auto beta = gko::initialize({2.0}, this->exec); + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - this->mtx1->apply(alpha, b, beta, x); + auto c_permute = gko::as(this->mtx4->column_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR( - x, - l({{complex_type{-12.0, -16.0}, complex_type{-16.0, -20.0}}, - {complex_type{-13.0, -15.0}, complex_type{-18.5, -20.5}}}), - 0.0); + GKO_ASSERT_MTX_NEAR(c_permute, + l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), 0.0); } -TYPED_TEST(Dense, AdvancedAppliesToMixedComplex) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutableIntoDense) { - using mixed_value_type = - gko::next_precision; - using mixed_complex_type = gko::to_complex; - using MixedDense = gko::matrix::Dense; - using MixedDenseComplex = gko::matrix::Dense; - auto exec = gko::ReferenceExecutor::create(); - - auto b = gko::initialize( - {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, - {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}, - {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}}, - exec); - auto x = gko::initialize( - {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}}, - {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}}, - exec); - auto alpha = gko::initialize({-1.0}, this->exec); - auto beta = gko::initialize({2.0}, this->exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto c_permute = Mtx::create(exec, this->mtx5->get_size()); - this->mtx1->apply(alpha, b, beta, x); + this->mtx5->column_permute(&permute_idxs, c_permute); GKO_ASSERT_MTX_NEAR( - x, - l({{mixed_complex_type{-12.0, -16.0}, mixed_complex_type{-16.0, -20.0}}, - {mixed_complex_type{-13.0, -15.0}, - mixed_complex_type{-18.5, -20.5}}}), + c_permute, + l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } -TYPED_TEST(Dense, MakeComplex) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsColPermutableIntoDense) { - 
using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto complex_mtx = this->mtx5->make_complex(); + this->mtx5->create_submatrix({0, 2}, {0, 2}) + ->column_permute(&permute_idxs, c_permute); - GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0); + GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + 0.0); + ASSERT_EQ(c_permute->get_stride(), 4); } -TYPED_TEST(Dense, MakeComplexIntoDense) +TYPED_TEST(DenseWithIndexType, + SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize) { - using T = typename TestFixture::value_type; - using ComplexMtx = typename TestFixture::ComplexMtx; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - auto complex_mtx = ComplexMtx::create(exec, this->mtx5->get_size()); - this->mtx5->make_complex(complex_mtx); + ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute), + gko::ValueMismatch); +} - GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0); + +TYPED_TEST(DenseWithIndexType, + SquareMatrixColPermuteIntoDenseFailsForWrongDimensions) +{ + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + + ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, MakeComplexIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutable) { - using T = typename TestFixture::value_type; - using ComplexMtx = typename TestFixture::ComplexMtx; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto complex_mtx = ComplexMtx::create(exec); + auto inv_row_permute = + gko::as(this->mtx5->inverse_row_permute(&inverse_permute_idxs)); - ASSERT_THROW(this->mtx5->make_complex(complex_mtx), gko::DimensionMismatch); + GKO_ASSERT_MTX_NEAR( + inv_row_permute, + l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), + 0.0); } -TYPED_TEST(Dense, GetReal) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseRowPermutable) { - using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array inverse_permute_idxs{exec, {1, 0}}; - auto real_mtx = this->mtx5->get_real(); + auto inverse_row_permute = + gko::as(this->mtx4->inverse_row_permute(&inverse_permute_idxs)); - GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0); + GKO_ASSERT_MTX_NEAR(inverse_row_permute, + l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } -TYPED_TEST(Dense, GetRealIntoDense) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutableIntoDense) { - using T = typename TestFixture::value_type; - using RealMtx = typename TestFixture::RealMtx; + using Mtx = typename TestFixture::Mtx; + using value_type = typename 
TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - auto real_mtx = RealMtx::create(exec, this->mtx5->get_size()); - this->mtx5->get_real(real_mtx); + this->mtx5->inverse_row_permute(&permute_idxs, row_permute); - GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0); + GKO_ASSERT_MTX_NEAR( + row_permute, + l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), + 0.0); } -TYPED_TEST(Dense, GetRealIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseRowPermutableIntoDense) { - using T = typename TestFixture::value_type; - using RealMtx = typename TestFixture::RealMtx; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - auto real_mtx = RealMtx::create(exec); - ASSERT_THROW(this->mtx5->get_real(real_mtx), gko::DimensionMismatch); + this->mtx5->create_submatrix({0, 2}, {0, 2}) + ->inverse_row_permute(&permute_idxs, row_permute); + + GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + 0.0); + ASSERT_EQ(row_permute->get_stride(), 4); } -TYPED_TEST(Dense, GetImag) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize) { - using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - auto imag_mtx = this->mtx5->get_imag(); + ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute), + gko::ValueMismatch); +} - GKO_ASSERT_MTX_NEAR( - imag_mtx, l({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}), - 0.0); + +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions) +{ + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + + ASSERT_THROW( + this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } -TYPED_TEST(Dense, GetImagIntoDense) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutable) { - using T = typename TestFixture::value_type; - using RealMtx = typename TestFixture::RealMtx; + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto imag_mtx = RealMtx::create(exec, this->mtx5->get_size()); - this->mtx5->get_imag(imag_mtx); + auto inv_c_permute = + gko::as(this->mtx5->inverse_column_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR( - imag_mtx, l({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}), + inv_c_permute, + l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); } -TYPED_TEST(Dense, GetImagIntoDenseFailsForWrongDimensions) +TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseColPermutable) { - using T = typename TestFixture::value_type; - using RealMtx = typename TestFixture::RealMtx; - auto exec = 
this->mtx5->get_executor(); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx4->get_executor(); + gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto imag_mtx = RealMtx::create(exec); - ASSERT_THROW(this->mtx5->get_imag(imag_mtx), gko::DimensionMismatch); + auto inverse_c_permute = + gko::as(this->mtx4->inverse_column_permute(&inverse_permute_idxs)); + + GKO_ASSERT_MTX_NEAR(inverse_c_permute, + l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); } -TYPED_TEST(Dense, MakeTemporaryConversionDoesntConvertOnMatch) +TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutableIntoDense) { using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto alpha = gko::initialize({8.0}, this->exec); + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; + auto c_permute = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_EQ(gko::make_temporary_conversion(alpha).get(), alpha.get()); + this->mtx5->inverse_column_permute(&permute_idxs, c_permute); + + GKO_ASSERT_MTX_NEAR( + c_permute, + l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), + 0.0); } -TYPED_TEST(Dense, MakeTemporaryConversionConvertsBack) +TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseColPermutableIntoDense) { - using MixedMtx = typename TestFixture::MixedMtx; - using T = typename TestFixture::value_type; - using MixedT = typename MixedMtx::value_type; - auto alpha = gko::initialize({8.0}, this->exec); + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 0}}; + auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); - { - auto conversion = gko::make_temporary_conversion(alpha); - conversion->at(0, 0) = T{7.0}; - } + this->mtx5->create_submatrix({0, 2}, {0, 2}) + ->column_permute(&permute_idxs, c_permute); - ASSERT_EQ(alpha->at(0, 0), MixedT{7.0}); + GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + 0.0); + ASSERT_EQ(c_permute->get_stride(), 4); } -TYPED_TEST(Dense, MakeTemporaryConversionConstDoesntConvertBack) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize) { - using MixedMtx = typename TestFixture::MixedMtx; - using T = typename TestFixture::value_type; - using MixedT = typename MixedMtx::value_type; - auto alpha = gko::initialize({8.0}, this->exec); - - { - auto conversion = gko::make_temporary_conversion( - static_cast(alpha.get())); - alpha->at(0, 0) = MixedT{7.0}; - } + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2}}; + auto row_permute = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_EQ(alpha->at(0, 0), MixedT{7.0}); + ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute), + gko::ValueMismatch); } -TYPED_TEST(Dense, ScaleAddIdentityRectangular) +TYPED_TEST(DenseWithIndexType, + SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions) { - using T = typename TestFixture::value_type; - using Vec = typename TestFixture::Mtx; - using MixedVec = typename TestFixture::MixedMtx; - auto alpha = gko::initialize({2.0}, this->exec); - 
auto beta = gko::initialize({-1.0}, this->exec); - auto b = gko::initialize( - {I{2.0, 0.0}, I{1.0, 2.5}, I{0.0, -4.0}}, this->exec); - - b->add_scaled_identity(alpha, beta); + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto exec = this->mtx5->get_executor(); + gko::array permute_idxs{exec, {1, 2, 0}}; - GKO_ASSERT_MTX_NEAR(b, l({{0.0, 0.0}, {-1.0, -0.5}, {0.0, 4.0}}), 0.0); + ASSERT_THROW( + this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)), + gko::DimensionMismatch); } From 07f01db4c8aa6c76d54db81e981eb91d4d5ffc7e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 17 Oct 2023 09:52:27 +0200 Subject: [PATCH 383/583] review updates - Remove unused declarations - Consistent variable naming Co-authored-by: Marcel Koch --- reference/test/matrix/dense_kernels.cpp | 108 ++++++++++++------------ 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 56f082243e6..60713c815de 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -68,7 +68,6 @@ class Dense : public ::testing::Test { using Mtx = gko::matrix::Dense; using MixedMtx = gko::matrix::Dense>; using ComplexMtx = gko::to_complex; - using MixedComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; Dense() : exec(gko::ReferenceExecutor::create()), @@ -1297,7 +1296,6 @@ TYPED_TEST(Dense, ScaleAddIdentityRectangular) { using T = typename TestFixture::value_type; using Vec = typename TestFixture::Mtx; - using MixedVec = typename TestFixture::MixedMtx; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); auto b = gko::initialize( @@ -1318,8 +1316,6 @@ class DenseWithIndexType typename std::tuple_element<0, decltype(ValueIndexType())>::type; using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; - - index_type invalid_index = gko::invalid_index(); }; TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, @@ -2501,10 +2497,10 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutable) auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = gko::as(this->mtx5->row_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->row_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( - row_permute, + permuted, l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); } @@ -2518,9 +2514,9 @@ TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsRowPermutable) auto exec = this->mtx4->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = gko::as(this->mtx4->row_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx4->row_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR(row_permute, + GKO_ASSERT_MTX_NEAR(permuted, l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } @@ -2532,12 +2528,12 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - this->mtx5->row_permute(&permute_idxs, row_permute); + this->mtx5->row_permute(&permute_idxs, permuted); GKO_ASSERT_MTX_NEAR( - row_permute, + permuted, l({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0); } @@ -2550,14 +2546,14 @@ 
TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsRowPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->row_permute(&permute_idxs, row_permute); + ->row_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + GKO_ASSERT_MTX_NEAR(permuted, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + ASSERT_EQ(permuted->get_stride(), 4); } @@ -2568,9 +2564,9 @@ TYPED_TEST(DenseWithIndexType, using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, permuted), gko::ValueMismatch); } @@ -2596,10 +2592,10 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutable) auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = gko::as(this->mtx5->column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx5->column_permute(&permute_idxs)); GKO_ASSERT_MTX_NEAR( - c_permute, + permuted, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } @@ -2613,9 +2609,9 @@ TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsColPermutable) auto exec = this->mtx4->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = gko::as(this->mtx4->column_permute(&permute_idxs)); + auto permuted = gko::as(this->mtx4->column_permute(&permute_idxs)); - GKO_ASSERT_MTX_NEAR(c_permute, + GKO_ASSERT_MTX_NEAR(permuted, l({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), 0.0); } @@ -2627,12 +2623,12 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - this->mtx5->column_permute(&permute_idxs, c_permute); + this->mtx5->column_permute(&permute_idxs, permuted); GKO_ASSERT_MTX_NEAR( - c_permute, + permuted, l({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}), 0.0); } @@ -2645,14 +2641,14 @@ TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsColPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); + ->column_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + GKO_ASSERT_MTX_NEAR(permuted, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); + ASSERT_EQ(permuted->get_stride(), 4); } @@ -2663,9 +2659,9 @@ TYPED_TEST(DenseWithIndexType, using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, 
this->mtx5->get_size()); - ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, permuted), gko::ValueMismatch); } @@ -2691,11 +2687,11 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutable) auto exec = this->mtx5->get_executor(); gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto inv_row_permute = + auto permuted = gko::as(this->mtx5->inverse_row_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR( - inv_row_permute, + permuted, l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); } @@ -2709,10 +2705,10 @@ TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseRowPermutable) auto exec = this->mtx4->get_executor(); gko::array inverse_permute_idxs{exec, {1, 0}}; - auto inverse_row_permute = + auto permuted = gko::as(this->mtx4->inverse_row_permute(&inverse_permute_idxs)); - GKO_ASSERT_MTX_NEAR(inverse_row_permute, + GKO_ASSERT_MTX_NEAR(permuted, l({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0); } @@ -2724,12 +2720,12 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - this->mtx5->inverse_row_permute(&permute_idxs, row_permute); + this->mtx5->inverse_row_permute(&permute_idxs, permuted); GKO_ASSERT_MTX_NEAR( - row_permute, + permuted, l({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0); } @@ -2742,14 +2738,14 @@ TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseRowPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->inverse_row_permute(&permute_idxs, row_permute); + ->inverse_row_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(row_permute, l({{-2.0, 2.0}, {1.0, -1.0}}), + GKO_ASSERT_MTX_NEAR(permuted, l({{-2.0, 2.0}, {1.0, -1.0}}), 0.0); - ASSERT_EQ(row_permute->get_stride(), 4); + ASSERT_EQ(permuted->get_stride(), 4); } @@ -2760,9 +2756,9 @@ TYPED_TEST(DenseWithIndexType, using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, permuted), gko::ValueMismatch); } @@ -2789,11 +2785,11 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutable) auto exec = this->mtx5->get_executor(); gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto inv_c_permute = + auto permuted = gko::as(this->mtx5->inverse_column_permute(&inverse_permute_idxs)); GKO_ASSERT_MTX_NEAR( - inv_c_permute, + permuted, l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); } @@ -2807,10 +2803,10 @@ TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseColPermutable) auto exec = this->mtx4->get_executor(); gko::array inverse_permute_idxs{exec, {1, 2, 0}}; - auto inverse_c_permute = + auto permuted = gko::as(this->mtx4->inverse_column_permute(&inverse_permute_idxs)); - 
GKO_ASSERT_MTX_NEAR(inverse_c_permute, + GKO_ASSERT_MTX_NEAR(permuted, l({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0); } @@ -2822,12 +2818,12 @@ TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2, 0}}; - auto c_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - this->mtx5->inverse_column_permute(&permute_idxs, c_permute); + this->mtx5->inverse_column_permute(&permute_idxs, permuted); GKO_ASSERT_MTX_NEAR( - c_permute, + permuted, l({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0); } @@ -2840,14 +2836,14 @@ TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseColPermutableIntoDense) using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 0}}; - auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4); + auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4); this->mtx5->create_submatrix({0, 2}, {0, 2}) - ->column_permute(&permute_idxs, c_permute); + ->column_permute(&permute_idxs, permuted); - GKO_ASSERT_MTX_NEAR(c_permute, l({{-1.0, 1.0}, {2.0, -2.0}}), + GKO_ASSERT_MTX_NEAR(permuted, l({{-1.0, 1.0}, {2.0, -2.0}}), 0.0); - ASSERT_EQ(c_permute->get_stride(), 4); + ASSERT_EQ(permuted->get_stride(), 4); } @@ -2858,9 +2854,9 @@ TYPED_TEST(DenseWithIndexType, using index_type = typename TestFixture::index_type; auto exec = this->mtx5->get_executor(); gko::array permute_idxs{exec, {1, 2}}; - auto row_permute = Mtx::create(exec, this->mtx5->get_size()); + auto permuted = Mtx::create(exec, this->mtx5->get_size()); - ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute), + ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, permuted), gko::ValueMismatch); } From c459676141f8dc7ca0a16ad802bd109259169b03 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 14:39:38 +0200 Subject: [PATCH 384/583] Add batch::matrix::Ell class and core Co-authored-by: Aditya Kashi --- core/matrix/batch_ell.cpp | 235 ++++++++++++++ include/ginkgo/core/matrix/batch_ell.hpp | 390 +++++++++++++++++++++++ 2 files changed, 625 insertions(+) create mode 100644 core/matrix/batch_ell.cpp create mode 100644 include/ginkgo/core/matrix/batch_ell.hpp diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp new file mode 100644 index 00000000000..63d4f0dda8a --- /dev/null +++ b/core/matrix/batch_ell.cpp @@ -0,0 +1,235 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_ell_kernels.hpp" + + +namespace gko { +namespace batch { +namespace matrix { +namespace ell { +namespace { + + +GKO_REGISTER_OPERATION(simple_apply, batch_ell::simple_apply); +GKO_REGISTER_OPERATION(advanced_apply, batch_ell::advanced_apply); + + +} // namespace +} // namespace ell + + +namespace detail { + + +template +batch_dim<2> compute_batch_size( + const std::vector*>& matrices) +{ + auto common_size = matrices[0]->get_size(); + for (size_type i = 1; i < matrices.size(); ++i) { + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); + } + return batch_dim<2>{matrices.size(), common_size}; +} + + +} // namespace detail + + +template +std::unique_ptr> +Ell::create_view_for_item(size_type item_id) +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create( + exec, this->get_common_size(), + make_array_view(exec, this->get_num_elements_per_item(), + this->get_values_for_item(item_id)), + make_array_view(exec, this->get_num_elements_per_item(), + this->get_col_idxs_for_item(item_id)), + this->get_num_stored_elements_per_row(), stride); + return mat; +} + + +template +std::unique_ptr> +Ell::create_const_view_for_item(size_type item_id) const +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create_const( + exec, this->get_common_size(), + make_const_array_view(exec, this->get_num_elements_per_item(), + this->get_const_values_for_item(item_id)), + make_const_array_view(exec, this->get_num_elements_per_item(), + this->get_const_col_idxs_for_item(item_id)), + this->get_num_stored_elements_per_row(), stride); + return mat; +} + + +template +std::unique_ptr> +Ell::create_with_config_of( + ptr_param> other) +{ + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. 
+ return (*other).create_with_same_config(); +} + + +template +std::unique_ptr> +Ell::create_with_same_config() const +{ + return Ell::create( + this->get_executor(), this->get_size(), + this->get_num_stored_elements_per_row()); +} + + +template +std::unique_ptr> +Ell::create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + int num_elems_per_row, gko::detail::const_array_view&& values, + gko::detail::const_array_view&& col_idxs) +{ + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr( + new Ell{exec, sizes, num_elems_per_row, + gko::detail::array_const_cast(std::move(values)), + gko::detail::array_const_cast(std::move(col_idxs))}); +} + + +inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) +{ + return batch_dim<2>(sizes.get_num_batch_items(), + dim<2>(1, sizes.get_common_size()[1])); +} + + +template +Ell::Ell(std::shared_ptr exec, + const batch_dim<2>& size, int num_elems_per_row) + : EnableBatchLinOp>(exec, size), + num_elems_per_row_(num_elems_per_row), + values_(exec, compute_num_elems(size, num_elems_per_row)), + col_idxs_(exec, compute_num_elems(size, num_elems_per_row)) +{} + + +template +void Ell::apply_impl(const MultiVector* b, + MultiVector* x) const +{ + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + this->get_executor()->run(ell::make_simple_apply(this, b, x)); +} + + +template +void Ell::apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const +{ + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); + + GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1)); + GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1)); + this->get_executor()->run( + ell::make_advanced_apply(alpha, this, b, beta, x)); +} + + +template +void Ell::convert_to( + Ell>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_elems_per_row_ = this->num_elems_per_row_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>* result) +{ + this->convert_to(result); +} + + +#define GKO_DECLARE_BATCH_ELL_MATRIX(_type) class Ell<_vtype, _itype> +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX); + + +} // namespace matrix +} // namespace batch +} // namespace gko diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp new file mode 100644 index 00000000000..374f1479664 --- /dev/null +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -0,0 +1,390 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_ + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { +namespace matrix { + + +/** + * Ell is a batch matrix format which explicitly stores all values of the + * matrix in each of the batches. + * + * The values in each of the batches are stored in row-major format (values + * belonging to the same row appear consecutive in the memory and the values of + * each batch item are also stored consecutively in memory). + * + * @note Though the storage layout is similar to the multi-vector object, the + * class semantics and the operations it aims to provide is different. Hence it + * is recommended to create multi-vector objects if the user means to view the + * data as a set of vectors. + * + * @tparam ValueType precision of matrix elements + * + * @ingroup batch_ell + * @ingroup mat_formats + * @ingroup BatchLinOp + */ +template +class Ell final + : public EnableBatchLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>> { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + friend class Ell, IndexType>; + friend class Ell, IndexType>; + +public: + using EnableBatchLinOp::convert_to; + using EnableBatchLinOp::move_to; + + using value_type = ValueType; + using index_type = int32; + using transposed_type = Ell; + using unbatch_type = gko::matrix::Ell; + using absolute_type = remove_complex; + using complex_type = to_complex; + + /** + * Creates a Ell matrix with the configuration of another Ell + * matrix. + * + * @param other The other matrix whose configuration needs to copied. + */ + static std::unique_ptr create_with_config_of( + ptr_param other); + + void convert_to( + Ell, IndexType>* result) const override; + + void move_to(Ell, IndexType>* result) override; + + /** + * Creates a mutable view (of matrix::Ell type) of one item of the + * batch::matrix::Ell object. Does not perform any deep + * copies, but only returns a view of the data. 
+ * + * @param item_id The index of the batch item + * + * @return a batch::matrix::Ell object with the data from the batch item + * at the given index. + */ + std::unique_ptr create_view_for_item(size_type item_id); + + /** + * @copydoc create_view_for_item(size_type) + */ + std::unique_ptr create_const_view_for_item( + size_type item_id) const; + + /** + * Returns a pointer to the array of values of the matrix + * + * @return the pointer to the array of values + */ + value_type* get_values() noexcept { return values_.get_data(); } + + /** + * @copydoc get_values() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values() const noexcept + { + return values_.get_const_data(); + } + + /** + * Returns a pointer to the array of column indices of the matrix + * + * @return the pointer to the array of column indices + */ + index_type* get_col_idxs() noexcept { return col_idxs_.get_data(); } + + /** + * @copydoc get_col_idxs() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const index_type* get_const_col_idxs() const noexcept + { + return col_idxs_.get_const_data(); + } + + /** + * Returns the number of elements per row explicitly stored. + * + * @return the number of elements stored in each row of the ELL matrix. Same + * for each batch item + */ + int get_num_stored_elements_per_row() const noexcept + { + return num_elems_per_row_; + } + + /** + * Returns the number of elements explicitly stored in the batch matrix, + * cumulative across all the batch items. + * + * @return the number of elements explicitly stored in the vector, + * cumulative across all the batch items + */ + size_type get_num_stored_elements() const noexcept + { + return values_.get_num_elems(); + } + + /** + * Returns the number of stored elements in each batch item. + * + * @return the number of stored elements per batch item. + */ + size_type get_num_elements_per_item() const noexcept + { + return this->get_num_stored_elements() / this->get_num_batch_items(); + } + + /** + * Returns a pointer to the array of col_idxs of the matrix for a + * specific batch item. + * + * @param batch_id the id of the batch item. + * + * @return the pointer to the array of col_idxs + */ + value_type* get_col_idxs_for_item(size_type batch_id) noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return col_idxs_.get_data() + + batch_id * this->get_num_elements_per_item(); + } + + /** + * @copydoc get_col_idxs_for_item(size_type) + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_col_idxs_for_item( + size_type batch_id) const noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return col_idxs_.get_const_data() + + batch_id * this->get_num_elements_per_item(); + } + + /** + * Returns a pointer to the array of values of the matrix for a + * specific batch item. + * + * @param batch_id the id of the batch item. 
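The *_for_item accessors documented here amount to pointer arithmetic: each batch item owns num_rows * num_stored_elems_per_row consecutive entries of the shared values and col_idxs arrays, so item batch_id starts at offset batch_id * num_elements_per_item. A minimal standalone sketch of that idea follows; item_view and extract_item are invented names, not part of the Ginkgo API.

#include <cassert>
#include <cstddef>

// Non-owning view of one batch item's data; illustrative only.
template <typename ValueType, typename IndexType>
struct item_view {
    ValueType* values;
    const IndexType* col_idxs;
};

template <typename ValueType, typename IndexType>
item_view<ValueType, IndexType> extract_item(ValueType* values,
                                             const IndexType* col_idxs,
                                             std::size_t num_batch_items,
                                             std::size_t num_elems_per_item,
                                             std::size_t batch_id)
{
    assert(batch_id < num_batch_items);
    // Each item stores num_elems_per_item consecutive entries.
    return {values + batch_id * num_elems_per_item,
            col_idxs + batch_id * num_elems_per_item};
}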
+ * + * @return the pointer to the array of values + */ + value_type* get_values_for_item(size_type batch_id) noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_data() + + batch_id * this->get_num_elements_per_item(); + } + + /** + * @copydoc get_values_for_item(size_type) + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept + { + GKO_ASSERT(batch_id < this->get_num_batch_items()); + return values_.get_const_data() + + batch_id * this->get_num_elements_per_item(); + } + + /** + * Creates a constant (immutable) batch ell matrix from a constant + * array. + * + * @param exec the executor to create the matrix on + * @param size the dimensions of the matrix + * @param num_elems_per_row the number of elements to be stored in each row + * @param values the value array of the matrix + * @param col_idxs the col_idxs array of the matrix + * + * @return A smart pointer to the constant matrix wrapping the input + * array (if it resides on the same executor as the matrix) or a copy of the + * array on the correct executor. + */ + static std::unique_ptr> create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + const int num_elems_per_row, + gko::detail::const_array_view&& values, + gko::detail::const_array_view&& col_idxs); + + /** + * Apply the matrix to a multi-vector. Represents the matrix vector + * multiplication, x = A * b, where x and b are both multi-vectors. + * + * @param b the multi-vector to be applied to + * @param x the output multi-vector + */ + void apply(const MultiVector* b, + MultiVector* x) const + { + this->apply_impl(b, x); + } + + /** + * Apply the matrix to a multi-vector with a linear combination of the given + * input vector. Represents the matrix vector multiplication, x = alpha* A * + * b + beta * x, where x and b are both multi-vectors. + * + * @param alpha the scalar to scale the matrix-vector product with + * @param b the multi-vector to be applied to + * @param beta the scalar to scale the x vector with + * @param x the output multi-vector + */ + void apply(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const + { + this->apply_impl(alpha, b, beta, x); + } + +private: + size_type compute_num_elems(const batch_dim<2>& size, int num_elems_per_row) + { + return size->get_common_size()[0] * num_elems_per_row; + } + + +protected: + /** + * Creates an uninitialized Ell matrix of the specified size. + * + * @param exec Executor associated to the matrix + * @param size size of the matrix + * @param num_elems_per_row the number of elements to be stored in each row + */ + Ell(std::shared_ptr exec, + const batch_dim<2>& size = batch_dim<2>{}, + const int num_elems_per_row = 0); + + /** + * Creates a Ell matrix from an already allocated (and initialized) + * array. + * + * @tparam ValuesArray type of array of values + * + * @param exec Executor associated to the matrix + * @param size size of the matrix + * @param num_elems_per_row the number of elements to be stored in each row + * @param values array of matrix values + * @param col_idxs the col_idxs array of the matrix + * + * @note If `values` is not an rvalue, not an array of ValueType, or is on + * the wrong executor, an internal copy will be created, and the + * original array data will not be used in the matrix. 
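The array-based constructor documented above (and defined just below) has to guarantee that the caller-provided arrays are large enough for the requested configuration: num_rows * num_elems_per_row stored entries per item, times the number of batch items. A standalone sketch of that size check is shown here; it is not the Ginkgo implementation, which performs the equivalent check with GKO_ENSURE_IN_BOUNDS on gko::array sizes.

#include <cstddef>
#include <stdexcept>
#include <vector>

// Illustrative size check for an array-based batch ELL constructor.
template <typename ValueType, typename IndexType>
void check_batch_ell_array_sizes(const std::vector<ValueType>& values,
                                 const std::vector<IndexType>& col_idxs,
                                 std::size_t num_batch_items,
                                 std::size_t num_rows,
                                 std::size_t num_elems_per_row)
{
    const std::size_t required =
        num_batch_items * num_rows * num_elems_per_row;
    if (values.size() < required || col_idxs.size() < required) {
        throw std::out_of_range(
            "values/col_idxs arrays are too small for this batch ELL size");
    }
}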
+ */ + template + Ell(std::shared_ptr exec, const batch_dim<2>& size, + const int num_elems_per_row, ValuesArray&& values, + IndicesArray&& col_idxs) + : EnableBatchLinOp(exec, size), + num_elems_per_row_{num_elems_per_row}, + values_{exec, std::forward(values)}, + col_idxs_{exec, std::forward(col_idxs)} + { + // Ensure that the value and col_idxs arrays have the correct size + auto num_elems = this->get_size()[0] * num_elems_per_row() * + this->get_num_batch_items(); + GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); + GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1); + } + + /** + * Creates a Ell matrix with the same configuration as the callers + * matrix. + * + * @returns a Ell matrix with the same configuration as the caller. + */ + std::unique_ptr create_with_same_config() const; + + void apply_impl(const MultiVector* b, + MultiVector* x) const; + + void apply_impl(const MultiVector* alpha, + const MultiVector* b, + const MultiVector* beta, + MultiVector* x) const; + +private: + int num_elems_per_row_; + array values_; + array col_idxs_; +}; + + +} // namespace matrix +} // namespace batch +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_ From 79fb2c12bbdba47e19246e96f61b759f1b80cc84 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 15:13:34 +0200 Subject: [PATCH 385/583] Add ref, omp kernels and scaffold Co-authored-by: Aditya Kashi --- core/CMakeLists.txt | 1 + core/device_hooks/common_kernels.inc.cpp | 10 + core/matrix/batch_ell_kernels.hpp | 84 ++++ core/test/matrix/CMakeLists.txt | 1 + core/test/matrix/batch_ell.cpp | 478 +++++++++++++++++++++ cuda/CMakeLists.txt | 1 + cuda/matrix/batch_ell_kernels.cu | 86 ++++ dpcpp/CMakeLists.txt | 1 + dpcpp/matrix/batch_ell_kernels.dp.cpp | 102 +++++ hip/CMakeLists.txt | 1 + hip/matrix/batch_ell_kernels.hip.cpp | 86 ++++ omp/CMakeLists.txt | 1 + omp/matrix/batch_ell_kernels.cpp | 117 +++++ reference/CMakeLists.txt | 1 + reference/matrix/batch_ell_kernels.cpp | 116 +++++ reference/matrix/batch_ell_kernels.hpp.inc | 78 ++++ 16 files changed, 1164 insertions(+) create mode 100644 core/matrix/batch_ell_kernels.hpp create mode 100644 core/test/matrix/batch_ell.cpp create mode 100644 cuda/matrix/batch_ell_kernels.cu create mode 100644 dpcpp/matrix/batch_ell_kernels.dp.cpp create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp create mode 100644 omp/matrix/batch_ell_kernels.cpp create mode 100644 reference/matrix/batch_ell_kernels.cpp create mode 100644 reference/matrix/batch_ell_kernels.hpp.inc diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 46ea67abc65..ae8035bcbf9 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -40,6 +40,7 @@ target_sources(ginkgo log/record.cpp log/stream.cpp matrix/batch_dense.cpp + matrix/batch_ell.cpp matrix/coo.cpp matrix/csr.cpp matrix/dense.cpp diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 87cab3dcf0b..b685063da10 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -310,6 +310,16 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); } // namespace batch_dense +namespace batch_ell { + + +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_ell + + namespace dense { diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp new file mode 100644 index 
00000000000..1b1ef345ae0 --- /dev/null +++ b/core/matrix/batch_ell_kernels.hpp @@ -0,0 +1,84 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_ +#define GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_ + + +#include + + +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(_vtype, _itype) \ + void simple_apply(std::shared_ptr exec, \ + const batch::matrix::Ell<_vtype, _itype>* a, \ + const batch::MultiVector<_vtype, _itype>* b, \ + batch::MultiVector<_vtype, _itype>* c) + +#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype) \ + void advanced_apply(std::shared_ptr exec, \ + const batch::MultiVector<_vtype, _itype>* alpha, \ + const batch::matrix::Ell<_vtype, _itype>* a, \ + const batch::MultiVector<_vtype, _itype>* b, \ + const batch::MultiVector<_vtype, _itype>* beta, \ + batch::MultiVector<_vtype, _itype>* c) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(ValueType, IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_ell, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_ diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt index cca4b8da1c0..ec7ef93e517 100644 --- a/core/test/matrix/CMakeLists.txt +++ b/core/test/matrix/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(batch_dense) +ginkgo_create_test(batch_ell) ginkgo_create_test(coo) ginkgo_create_test(coo_builder) ginkgo_create_test(csr) diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp new file mode 100644 index 00000000000..931efb47d2e --- /dev/null +++ 
b/core/test/matrix/batch_ell.cpp @@ -0,0 +1,478 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/base/batch_utilities.hpp" +#include "core/test/utils.hpp" +#include "core/test/utils/batch_helpers.hpp" + + +template +class Ell : public ::testing::Test { +protected: + using value_type = T; + using EllMtx = gko::matrix::Ell; + using size_type = gko::size_type; + Ell() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::batch::initialize>( + {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)), + mvec(gko::batch::initialize>( + {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, + exec)), + ell_mtx(gko::initialize>( + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)) + {} + + + static void assert_equal_to_original_mtx( + gko::batch::matrix::Ell* m) + { + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3)); + EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5}); + EXPECT_EQ(m->at(0, 1, 1), value_type{2.5}); + ASSERT_EQ(m->at(0, 1, 2), value_type{3.5}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.5}); + EXPECT_EQ(m->at(1, 0, 2), value_type{3.0}); + EXPECT_EQ(m->at(1, 1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{2.0}); + ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + } + + static void assert_empty(gko::batch::matrix::Ell* m) + { + ASSERT_EQ(m->get_num_batch_items(), 0); + ASSERT_EQ(m->get_num_stored_elements(), 0); + } + + std::shared_ptr exec; + std::unique_ptr> mtx; + std::unique_ptr> mvec; + std::unique_ptr> ell_mtx; +}; + +TYPED_TEST_SUITE(Ell, gko::test::ValueTypes); + + +TYPED_TEST(Ell, 
KnowsItsSizeAndValues) +{ + this->assert_equal_to_original_mtx(this->mtx.get()); +} + + +TYPED_TEST(Ell, CanBeEmpty) +{ + auto empty = gko::batch::matrix::Ell::create(this->exec); + this->assert_empty(empty.get()); +} + + +TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty) +{ + auto empty = gko::batch::matrix::Ell::create(this->exec); + ASSERT_EQ(empty->get_const_values(), nullptr); +} + + +TYPED_TEST(Ell, CanGetValuesForEntry) +{ + using value_type = typename TestFixture::value_type; + + ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0}); +} + + +TYPED_TEST(Ell, CanCreateEllItemView) +{ + GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->ell_mtx, 0.0); +} + + +TYPED_TEST(Ell, CanCreateMultiVectorView) +{ + GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec, + 0.0); +} + + +TYPED_TEST(Ell, CanBeCopied) +{ + auto mtx_copy = gko::batch::matrix::Ell::create(this->exec); + + mtx_copy->copy_from(this->mtx.get()); + + this->assert_equal_to_original_mtx(this->mtx.get()); + this->mtx->at(0, 0, 0) = 7; + this->mtx->at(0, 1) = 7; + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(Ell, CanBeMoved) +{ + auto mtx_copy = gko::batch::matrix::Ell::create(this->exec); + + this->mtx->move_to(mtx_copy); + + this->assert_equal_to_original_mtx(mtx_copy.get()); +} + + +TYPED_TEST(Ell, CanBeCloned) +{ + auto mtx_clone = this->mtx->clone(); + + this->assert_equal_to_original_mtx( + dynamic_castmtx.get())>(mtx_clone.get())); +} + + +TYPED_TEST(Ell, CanBeCleared) +{ + this->mtx->clear(); + + this->assert_empty(this->mtx.get()); +} + + +TYPED_TEST(Ell, CanBeConstructedWithSize) +{ + using size_type = gko::size_type; + + auto m = gko::batch::matrix::Ell::create( + this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3})); + + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3)); + ASSERT_EQ(m->get_num_stored_elements(), 30); +} + + +TYPED_TEST(Ell, CanBeConstructedFromExistingData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + value_type data[] = { + 1.0, 2.0, + -1.0, 3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; + // clang-format on + + auto m = gko::batch::matrix::Ell::create( + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::view(this->exec, 8, data)); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); + ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); +} + + +TYPED_TEST(Ell, CanBeConstructedFromExistingConstData) +{ + using value_type = typename TestFixture::value_type; + using size_type = gko::size_type; + // clang-format off + const value_type data[] = { + 1.0, 2.0, + -1.0, 3.0, + 4.0, -1.0, + 3.0, 5.0, + 1.0, 5.0, + 6.0, -3.0}; + // clang-format on + + auto m = gko::batch::matrix::Ell::create_const( + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), + gko::array::const_view(this->exec, 8, data)); + + ASSERT_EQ(m->get_const_values(), data); + ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); + ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); + ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); + ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); + ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); + ASSERT_EQ(m->at(1, 0, 1), 
value_type{-1.0}); + ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); + ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); +} + + +TYPED_TEST(Ell, CanBeConstructedFromEllMatrices) +{ + using value_type = typename TestFixture::value_type; + using EllMtx = typename TestFixture::EllMtx; + using size_type = gko::size_type; + + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); + auto mat2 = + gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + + auto m = gko::batch::create_from_item>( + this->exec, std::vector{mat1.get(), mat2.get()}); + + this->assert_equal_to_original_mtx(m.get()); +} + + +TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) +{ + using value_type = typename TestFixture::value_type; + using EllMtx = typename TestFixture::EllMtx; + using size_type = gko::size_type; + + auto mat1 = gko::initialize(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); + auto mat2 = + gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + + auto bat_m = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat1.get(), mat1.get()}); + auto m = gko::batch::create_from_item>( + this->exec, 3, mat1.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); +} + + +TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) +{ + using value_type = typename TestFixture::value_type; + using EllMtx = typename TestFixture::EllMtx; + using size_type = gko::size_type; + + auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); + auto mat2 = + gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + + auto m = gko::batch::create_from_item>( + this->exec, std::vector{mat1.get(), mat2.get()}); + auto m_ref = + gko::batch::create_from_item>( + this->exec, + std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), + mat1.get(), mat2.get()}); + + auto m2 = gko::batch::duplicate>( + this->exec, 3, m.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); +} + + +TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices) +{ + using value_type = typename TestFixture::value_type; + using EllMtx = typename TestFixture::EllMtx; + using size_type = gko::size_type; + auto mat1 = gko::initialize(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + this->exec); + auto mat2 = + gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + + auto ell_mats = gko::batch::unbatch>( + this->mtx.get()); + + GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.); + GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.); +} + + +TYPED_TEST(Ell, CanBeListConstructed) +{ + using value_type = typename TestFixture::value_type; + auto m = gko::batch::initialize>( + {{1.0, 2.0}, {1.0, 3.0}}, this->exec); + + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); + EXPECT_EQ(m->at(0, 0), value_type{1}); + EXPECT_EQ(m->at(0, 1), value_type{2}); + EXPECT_EQ(m->at(1, 0), value_type{1}); + EXPECT_EQ(m->at(1, 1), value_type{3}); +} + + +TYPED_TEST(Ell, CanBeListConstructedByCopies) +{ + using value_type = typename TestFixture::value_type; + + auto m = gko::batch::initialize>( + 2, I({1.0, 2.0}), this->exec); + + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{2.0}); +} + + +TYPED_TEST(Ell, CanBeDoubleListConstructed) +{ + using value_type = typename 
TestFixture::value_type; + using T = value_type; + + auto m = gko::batch::initialize>( + {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, + {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, + this->exec); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3)); + EXPECT_EQ(m->at(0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 1), value_type{1.0}); + EXPECT_EQ(m->at(0, 2), value_type{0.0}); + ASSERT_EQ(m->at(0, 3), value_type{2.0}); + EXPECT_EQ(m->at(0, 4), value_type{4.0}); + EXPECT_EQ(m->at(1, 0), value_type{1.0}); + EXPECT_EQ(m->at(1, 1), value_type{2.0}); + EXPECT_EQ(m->at(1, 2), value_type{-1.0}); + ASSERT_EQ(m->at(1, 3), value_type{3.0}); + EXPECT_EQ(m->at(1, 4), value_type{4.0}); +} + + +TYPED_TEST(Ell, CanBeReadFromMatrixData) +{ + using value_type = typename TestFixture::value_type; + using index_type = int; + + auto vec_data = std::vector>{}; + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); + + auto m = gko::batch::read>(this->exec, + vec_data); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); + EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); +} + + +TYPED_TEST(Ell, CanBeReadFromSparseMatrixData) +{ + using value_type = typename TestFixture::value_type; + using index_type = int; + auto vec_data = std::vector>{}; + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}})); + vec_data.emplace_back(gko::matrix_data( + {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); + + auto m = gko::batch::read>(this->exec, + vec_data); + + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); + EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); + EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); + EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); + EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); + EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); + EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); + EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); +} + + +TYPED_TEST(Ell, GeneratesCorrectMatrixData) +{ + using value_type = typename TestFixture::value_type; + using index_type = int; + using tpl = typename gko::matrix_data::nonzero_type; + + auto data = + gko::batch::write>(this->mtx.get()); + + ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); + ASSERT_EQ(data[0].nonzeros.size(), 6); + EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0})); + EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0})); + EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0})); + EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5})); + EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5})); + EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5})); + ASSERT_EQ(data[1].size, gko::dim<2>(2, 3)); + ASSERT_EQ(data[1].nonzeros.size(), 6); + EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0})); + EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5})); + EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0})); + EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0})); + EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0})); + 
EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0})); +} diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index dfa1b2177ee..f5b7932ed39 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(ginkgo_cuda factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu matrix/batch_dense_kernels.cu + matrix/batch_ell_kernels.cu matrix/coo_kernels.cu ${CSR_INSTANTIATE} matrix/dense_kernels.cu diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu new file mode 100644 index 00000000000..c41b436daed --- /dev/null +++ b/cuda/matrix/batch_ell_kernels.cu @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include +#include + + +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Ell matrix format namespace. 
+ * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 9990496c98f..9c2e799ede9 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -37,6 +37,7 @@ target_sources(ginkgo_dpcpp factorization/par_ilut_spgeam_kernel.dp.cpp factorization/par_ilut_sweep_kernel.dp.cpp matrix/batch_dense_kernels.dp.cpp + matrix/batch_ell_kernels.dp.cpp matrix/coo_kernels.dp.cpp matrix/csr_kernels.dp.cpp matrix/fbcsr_kernels.dp.cpp diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp new file mode 100644 index 00000000000..f886b7dd790 --- /dev/null +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -0,0 +1,102 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +// #include "dpcpp/matrix/batch_dense_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_ell +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 21b573b6cd0..ccc88769a4e 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -36,6 +36,7 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp matrix/batch_dense_kernels.hip.cpp + matrix/batch_ell_kernels.hip.cpp matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp new file mode 100644 index 00000000000..c41b436daed --- /dev/null +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -0,0 +1,86 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include +#include + + +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index d87399492f5..aa8e30cd590 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -24,6 +24,7 @@ target_sources(ginkgo_omp factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp matrix/batch_dense_kernels.cpp + matrix/batch_ell_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp new file mode 100644 index 00000000000..282920c05f3 --- /dev/null +++ b/omp/matrix/batch_ell_kernels.cpp @@ -0,0 +1,117 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_ell_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +#include "reference/matrix/batch_ell_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + batch::MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + simple_apply_kernel(mat_item, b_item, x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); +#pragma omp parallel for + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_ell +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 37498588ca7..21dfc0dfb5a 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -26,6 +26,7 @@ target_sources(ginkgo_reference factorization/par_ilu_kernels.cpp factorization/par_ilut_kernels.cpp matrix/batch_dense_kernels.cpp + matrix/batch_ell_kernels.cpp matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp new file mode 100644 
index 00000000000..1fab322dc5f --- /dev/null +++ b/reference/matrix/batch_ell_kernels.cpp @@ -0,0 +1,116 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/batch_ell_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The Ell matrix format namespace. 
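The OpenMP backend above and the reference kernels that follow share the same host-side structure: a loop over the independent batch items that extracts per-item views and hands them to a small per-item kernel, with the OpenMP version only adding a parallel-for pragma on that loop. A condensed standalone sketch of the pattern is given below; for_each_batch_item is an invented helper, not Ginkgo API.

#include <cstddef>

// Run a per-item kernel once for every independent batch item. In the OpenMP
// backend the same loop simply gains `#pragma omp parallel for`.
template <typename PerItemKernel>
void for_each_batch_item(std::size_t num_batch_items, PerItemKernel per_item)
{
    for (std::size_t batch = 0; batch < num_batch_items; ++batch) {
        // e.g. extract the item's views and call simple_apply_kernel on them
        per_item(batch);
    }
}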
+ * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +#include "reference/matrix/batch_ell_kernels.hpp.inc" + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + batch::MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + simple_apply_kernel(mat_item, b_item, x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) +{ + const auto b_ub = host::get_batch_struct(b); + const auto x_ub = host::get_batch_struct(x); + const auto mat_ub = host::get_batch_struct(mat); + const auto alpha_ub = host::get_batch_struct(alpha); + const auto beta_ub = host::get_batch_struct(beta); + for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { + const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); + const auto b_item = batch::extract_batch_item(b_ub, batch); + const auto x_item = batch::extract_batch_item(x_ub, batch); + const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); + const auto beta_item = batch::extract_batch_item(beta_ub, batch); + advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, + beta_item.values[0], x_item); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); + + +} // namespace batch_ell +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc new file mode 100644 index 00000000000..1874d1db9f3 --- /dev/null +++ b/reference/matrix/batch_ell_kernels.hpp.inc @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +inline void simple_apply_kernel( + const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& c) +{ + for (int row = 0; row < a.num_rows; ++row) { + for (int j = 0; j < b.num_rhs; ++j) { + c.values[row * c.stride + j] = zero(); + } + for (auto k = 0; k < a.num_stored_elems_per_row; ++k) { + auto val = a.values[row + k * a.stride]; + auto col = a.col_idxs[row + k * a.stride]; + for (int j = 0; j < b.num_rhs; ++j) { + c.values[row * c.stride + j] += + val * b.values[col * b.stride + j]; + } + } + } +} + + +template +inline void advanced_apply_kernel( + const ValueType alpha, + const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::multi_vector::batch_item& b, + const ValueType beta, + const gko::batch::multi_vector::batch_item& c) +{ + for (int row = 0; row < a.num_rows; ++row) { + for (int j = 0; j < c.num_rhs; ++j) { + c.values[row * c.stride + j] *= beta; + } + for (auto k = 0; k < a.num_stored_elems_per_row; ++k) { + auto val = a.values[row + k * a.stride]; + auto col = a.col_idxs[row + k * a.stride]; + for (int j = 0; j < b.num_rhs; ++j) { + c.values[row * c.stride + j] += + alpha * val * b.values[col * b.stride + j]; + } + } + } +} From e0683e489f5044ac4f3de241c1c767665abb1c40 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 6 Oct 2023 17:19:37 +0200 Subject: [PATCH 386/583] Use only int32 --- .../matrix/batch_ell_kernel_launcher.hpp.inc | 53 +++++++++++ core/device_hooks/common_kernels.inc.cpp | 10 +- core/matrix/batch_ell.cpp | 34 ++----- core/matrix/batch_ell_kernels.hpp | 20 ++-- core/matrix/batch_struct.hpp | 95 +++++++++++++++++++ cuda/matrix/batch_ell_kernels.cu | 4 +- dpcpp/matrix/batch_ell_kernels.dp.cpp | 4 +- hip/matrix/batch_ell_kernels.hip.cpp | 4 +- include/ginkgo/core/base/types.hpp | 16 ++++ include/ginkgo/core/matrix/batch_ell.hpp | 31 +++--- omp/matrix/batch_ell_kernels.cpp | 4 +- reference/matrix/batch_ell_kernels.cpp | 4 +- reference/matrix/batch_ell_kernels.hpp.inc | 10 +- reference/matrix/batch_struct.hpp | 35 +++++++ 14 files changed, 256 insertions(+), 68 deletions(-) create mode 100644 common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..263e911c31a --- /dev/null +++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc @@ -0,0 +1,53 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index b685063da10..462675c15db 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -58,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/factorization/par_ilu_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_ell_kernels.hpp" #include "core/matrix/coo_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/dense_kernels.hpp" @@ -137,6 +138,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) +#define GKO_STUB_VALUE_AND_INT32_TYPE(_macro) \ + template \ + _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) + #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ template \ @@ -313,8 +319,8 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); namespace batch_ell { -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); +GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); +GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); } // namespace batch_ell diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 63d4f0dda8a..3aea6e1aae4 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -64,24 +64,6 @@ GKO_REGISTER_OPERATION(advanced_apply, batch_ell::advanced_apply); } // namespace ell -namespace detail { - - -template -batch_dim<2> compute_batch_size( - const std::vector*>& matrices) -{ - auto common_size = matrices[0]->get_size(); - for (size_type i = 1; i < matrices.size(); ++i) { - GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); - } - return batch_dim<2>{matrices.size(), common_size}; -} - - -} // namespace detail - - template std::unique_ptr> Ell::create_view_for_item(size_type item_id) @@ -145,7 +127,8 @@ template std::unique_ptr> Ell::create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - int num_elems_per_row, gko::detail::const_array_view&& values, + const IndexType num_elems_per_row, + gko::detail::const_array_view&& values, gko::detail::const_array_view&& col_idxs) { // cast const-ness away, but return a const object afterwards, @@ -166,7 +149,8 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) template Ell::Ell(std::shared_ptr exec, - const batch_dim<2>& size, int num_elems_per_row) + const batch_dim<2>& size, + IndexType num_elems_per_row) : EnableBatchLinOp>(exec, size), num_elems_per_row_(num_elems_per_row), values_(exec, compute_num_elems(size, num_elems_per_row)), @@ -209,7 +193,7 @@ void Ell::apply_impl(const MultiVector* alpha, template void Ell::convert_to( - Ell>* result) const + Ell, IndexType>* result) const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -218,16 +202,16 @@ void Ell::convert_to( } -template +template void Ell::move_to( - Ell>* result) + Ell, IndexType>* result) { this->convert_to(result); } -#define GKO_DECLARE_BATCH_ELL_MATRIX(_type) class Ell<_vtype, _itype> -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX); +#define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX); } // namespace matrix diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp index 1b1ef345ae0..d3acc582f9b 100644 --- a/core/matrix/batch_ell_kernels.hpp +++ b/core/matrix/batch_ell_kernels.hpp @@ -52,16 +52,16 @@ namespace kernels { #define GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(_vtype, _itype) \ void simple_apply(std::shared_ptr exec, \ const batch::matrix::Ell<_vtype, _itype>* a, \ - const batch::MultiVector<_vtype, _itype>* b, \ - batch::MultiVector<_vtype, _itype>* c) - -#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype) \ - void advanced_apply(std::shared_ptr exec, \ - const 
batch::MultiVector<_vtype, _itype>* alpha, \ - const batch::matrix::Ell<_vtype, _itype>* a, \ - const batch::MultiVector<_vtype, _itype>* b, \ - const batch::MultiVector<_vtype, _itype>* beta, \ - batch::MultiVector<_vtype, _itype>* c) + const batch::MultiVector<_vtype>* b, \ + batch::MultiVector<_vtype>* c) + +#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype) \ + void advanced_apply(std::shared_ptr exec, \ + const batch::MultiVector<_vtype>* alpha, \ + const batch::matrix::Ell<_vtype, _itype>* a, \ + const batch::MultiVector<_vtype>* b, \ + const batch::MultiVector<_vtype>* beta, \ + batch::MultiVector<_vtype>* c) #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 0bbfde40cc9..272bb506df2 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -82,6 +83,53 @@ struct uniform_batch { } // namespace dense +namespace batch_ell { + + +/** + * Encapsulates one matrix from a batch of ell matrices. + */ +template +struct batch_item { + using value_type = ValueType; + using index_type = int32; + + ValueType* values; + const index_type* col_idxs; + index_type stride; + index_type num_rows; + index_type num_cols; + index_type num_stored_elems_per_row; +}; + + +/** + * A 'simple' structure to store a global uniform batch of ell matrices. + */ +template +struct uniform_batch { + using value_type = ValueType; + using index_type = int; + using entry_type = batch_item; + + ValueType* values; + const index_type* col_idxs; + size_type num_batch_items; + index_type stride; + index_type num_rows; + index_type num_cols; + index_type num_stored_elems_per_row; + + size_type get_entry_storage() const + { + return num_rows * num_stored_elems_per_row * sizeof(value_type); + } +}; + + +} // namespace batch_ell + + template GKO_ATTRIBUTES GKO_INLINE dense::batch_item to_const( const dense::batch_item& b) @@ -116,6 +164,53 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item extract_batch_item( } +template +GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item to_const( + const batch_ell::batch_item& b) +{ + return {b.values, b.col_idxs, b.stride, + b.num_rows, b.num_cols, b.num_stored_elems_per_row}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch to_const( + const batch_ell::uniform_batch& ub) +{ + return {ub.values, ub.col_idxs, ub.num_batch_items, ub.stride, + ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( + const batch_ell::uniform_batch& batch, const size_type batch_idx) +{ + return {batch.values + + batch_idx * batch.num_stored_elems_per_row * batch.num_rows, + batch.col_idxs + + batch_idx * batch.num_stored_elems_per_row * batch.num_rows, + batch.stride, + batch.num_rows, + batch.num_cols, + batch.num_stored_elems_per_row}; +} + +template +GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( + ValueType* const batch_values, int* const batch_col_idxs, const int stride, + const int num_rows, const int num_cols, int num_elems_per_row, + const size_type batch_idx) +{ + return {batch_values + batch_idx * num_elems_per_row * num_rows, + batch_col_idxs + batch_idx * num_elems_per_row * num_rows, + stride, + num_rows, + num_cols, + num_elems_per_row}; +} + + } // namespace matrix } // namespace batch } // namespace gko diff --git 
a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index c41b436daed..567d863d95c 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_ell_kernels.hpp" #include @@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index f886b7dd790..cdcd5abd024 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -80,7 +80,7 @@ void simple_apply(std::shared_ptr exec, const batch::MultiVector* b, batch::MultiVector* x) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); @@ -92,7 +92,7 @@ void advanced_apply(std::shared_ptr exec, const batch::MultiVector* beta, batch::MultiVector* x) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index c41b436daed..567d863d95c 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_ell_kernels.hpp" #include @@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 68b5da6e3eb..f5a75c7448e 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -531,6 +531,22 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(double, int64) #endif +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ + template _macro(float, int32); \ + template <> \ + _macro(double, int32) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, int32); \ + template <> \ + _macro(std::complex, int32) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ + template _macro(float, int32); \ + template _macro(double, int32); \ + template _macro(std::complex, int32); \ + template _macro(std::complex, int32) +#endif + /** * Instantiates a template for each value and index type compiled by Ginkgo. 
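// Illustrative sketch of what the new macro is meant to produce, assuming the
// per-backend DefaultExecutor alias used by Ginkgo's kernel declarations: the
// non-DPCPP branch of GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE pairs each
// value type with int32 only, so applying it to
// GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL yields explicit instantiations of
// roughly this form (shown for float and double; the complex value types follow
// the same pattern, and the GINKGO_DPCPP_SINGLE_MODE branch stubs the
// double-precision variants with GKO_NOT_IMPLEMENTED):
template void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                           const batch::matrix::Ell<float, int32>* a,
                           const batch::MultiVector<float>* b,
                           batch::MultiVector<float>* c);
template void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                           const batch::matrix::Ell<double, int32>* a,
                           const batch::MultiVector<double>* b,
                           batch::MultiVector<double>* c);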
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 374f1479664..af77fc1e390 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -88,7 +88,7 @@ class Ell final using EnableBatchLinOp::move_to; using value_type = ValueType; - using index_type = int32; + using index_type = IndexType; using transposed_type = Ell; using unbatch_type = gko::matrix::Ell; using absolute_type = remove_complex; @@ -170,7 +170,7 @@ class Ell final * @return the number of elements stored in each row of the ELL matrix. Same * for each batch item */ - int get_num_stored_elements_per_row() const noexcept + index_type get_num_stored_elements_per_row() const noexcept { return num_elems_per_row_; } @@ -205,7 +205,7 @@ class Ell final * * @return the pointer to the array of col_idxs */ - value_type* get_col_idxs_for_item(size_type batch_id) noexcept + index_type* get_col_idxs_for_item(size_type batch_id) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_data() + @@ -219,8 +219,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_col_idxs_for_item( - size_type batch_id) const noexcept + const index_type* get_const_col_idxs_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data() + @@ -249,8 +249,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + @@ -271,9 +271,9 @@ class Ell final * array (if it resides on the same executor as the matrix) or a copy of the * array on the correct executor. 
*/ - static std::unique_ptr> create_const( + static std::unique_ptr create_const( std::shared_ptr exec, const batch_dim<2>& sizes, - const int num_elems_per_row, + const index_type num_elems_per_row, gko::detail::const_array_view&& values, gko::detail::const_array_view&& col_idxs); @@ -309,9 +309,10 @@ class Ell final } private: - size_type compute_num_elems(const batch_dim<2>& size, int num_elems_per_row) + size_type compute_num_elems(const batch_dim<2>& size, + IndexType num_elems_per_row) { - return size->get_common_size()[0] * num_elems_per_row; + return size.get_common_size()[0] * num_elems_per_row; } @@ -325,7 +326,7 @@ class Ell final */ Ell(std::shared_ptr exec, const batch_dim<2>& size = batch_dim<2>{}, - const int num_elems_per_row = 0); + const IndexType num_elems_per_row = 0); /** * Creates a Ell matrix from an already allocated (and initialized) @@ -345,7 +346,7 @@ class Ell final */ template Ell(std::shared_ptr exec, const batch_dim<2>& size, - const int num_elems_per_row, ValuesArray&& values, + const IndexType num_elems_per_row, ValuesArray&& values, IndicesArray&& col_idxs) : EnableBatchLinOp(exec, size), num_elems_per_row_{num_elems_per_row}, @@ -353,7 +354,7 @@ class Ell final col_idxs_{exec, std::forward(col_idxs)} { // Ensure that the value and col_idxs arrays have the correct size - auto num_elems = this->get_size()[0] * num_elems_per_row() * + auto num_elems = this->get_common_size()[0] * num_elems_per_row * this->get_num_batch_items(); GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1); @@ -376,7 +377,7 @@ class Ell final MultiVector* x) const; private: - int num_elems_per_row_; + index_type num_elems_per_row_; array values_; array col_idxs_; }; diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp index 282920c05f3..20ea4614e7d 100644 --- a/omp/matrix/batch_ell_kernels.cpp +++ b/omp/matrix/batch_ell_kernels.cpp @@ -78,7 +78,7 @@ void simple_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); @@ -107,7 +107,7 @@ void advanced_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index 1fab322dc5f..a3f69827c02 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -78,7 +78,7 @@ void simple_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); @@ -106,7 +106,7 @@ void advanced_apply(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc index 1874d1db9f3..37370261d44 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp.inc @@ -30,10 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -template +template inline void simple_apply_kernel( - const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::matrix::batch_ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) { @@ -53,11 +52,10 @@ inline void simple_apply_kernel( } -template +template inline void advanced_apply_kernel( const ValueType alpha, - const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::matrix::batch_ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const ValueType beta, const gko::batch::multi_vector::batch_item& c) diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 483d7717718..b5eacd80d18 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include "core/base/batch_struct.hpp" @@ -90,6 +91,40 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( } +/** + * Generates an immutable uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch +get_batch_struct(const batch::matrix::Ell* const op) +{ + return {op->get_const_values(), + op->get_const_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + +/** + * Generates a uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch get_batch_struct( + batch::matrix::Ell* const op) +{ + return {op->get_values(), + op->get_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + } // namespace host } // namespace kernels } // namespace gko From 251914e01044906c762db4a0e368b03a0862a089 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 7 Oct 2023 17:31:07 +0200 Subject: [PATCH 387/583] Generalize and rewrite batch utils --- core/base/batch_utilities.hpp | 273 ++++++++++++++- core/matrix/batch_ell.cpp | 15 +- core/test/matrix/batch_ell.cpp | 330 +++++++++--------- .../ginkgo/core/base/batch_multi_vector.hpp | 222 +----------- include/ginkgo/core/matrix/batch_ell.hpp | 18 +- 5 files changed, 449 insertions(+), 409 deletions(-) diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index 834e89c8358..c37c0cae721 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include #include @@ -53,15 +52,18 @@ namespace gko { namespace batch { -template +template std::unique_ptr duplicate(std::shared_ptr exec, size_type num_duplications, - const OutputType* input) + const OutputType* input, + TArgs&&... 
create_args) { auto num_batch_items = input->get_num_batch_items(); - auto tmp = OutputType::create( - exec, batch_dim<2>(num_batch_items * num_duplications, - input->get_common_size())); + auto tmp = + OutputType::create(exec, + batch_dim<2>(num_batch_items * num_duplications, + input->get_common_size()), + std::forward(create_args)...); for (size_type i = 0; i < num_duplications; ++i) { for (size_type b = 0; b < num_batch_items; ++b) { @@ -74,14 +76,15 @@ std::unique_ptr duplicate(std::shared_ptr exec, } -template +template std::unique_ptr create_from_item( std::shared_ptr exec, const size_type num_duplications, - const typename OutputType::unbatch_type* input) + const typename OutputType::unbatch_type* input, TArgs&&... create_args) { auto num_batch_items = num_duplications; auto tmp = OutputType::create( - exec, batch_dim<2>(num_batch_items, input->get_size())); + exec, batch_dim<2>(num_batch_items, input->get_size()), + std::forward(create_args)...); for (size_type b = 0; b < num_batch_items; ++b) { tmp->create_view_for_item(b)->copy_from(input); @@ -91,14 +94,16 @@ std::unique_ptr create_from_item( } -template +template std::unique_ptr create_from_item( std::shared_ptr exec, - const std::vector& input) + const std::vector& input, + TArgs&&... create_args) { auto num_batch_items = input.size(); auto tmp = OutputType::create( - exec, batch_dim<2>(num_batch_items, input[0]->get_size())); + exec, batch_dim<2>(num_batch_items, input[0]->get_size()), + std::forward(create_args)...); for (size_type b = 0; b < num_batch_items; ++b) { tmp->create_view_for_item(b)->copy_from(input[b]); @@ -121,14 +126,17 @@ auto unbatch(const InputType* batch_object) } -template +template std::unique_ptr read( std::shared_ptr exec, - const std::vector>& data) + const std::vector>& data, + TArgs&&... create_args) { auto num_batch_items = data.size(); auto tmp = - OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size)); + OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size), + std::forward(create_args)...); for (size_type b = 0; b < num_batch_items; ++b) { tmp->create_view_for_item(b)->read(data[b]); @@ -154,6 +162,241 @@ std::vector> write( } +/** + * Creates and initializes a batch of single column-vectors. + * + * This function first creates a temporary MultiVector, fills it with + * passed in values, and then converts the vector to the requested type. + * + * @tparam Matrix matrix type to initialize + * (MultiVector has to implement the ConvertibleTo + * interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the batch vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup MultiVector + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin ? 
vals_begin->size() : 0; + auto common_size = dim<2>(common_num_rows, 1); + for (auto& val : vals) { + GKO_ASSERT_EQ(common_num_rows, val.size()); + } + auto b_size = batch_dim<2>(num_batch_items, common_size); + size_type batch = 0; + std::vector input_mat_data(num_batch_items, common_size); + for (const auto& b : vals) { + input_mat_data[batch].nonzeros.reserve(b.size()); + size_type idx = 0; + for (const auto& elem : b) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem); + } + ++idx; + } + ++batch; + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch of multi-vectors. + * + * This function first creates a temporary MultiVector, fills it with + * passed in values, and then converts the vector to the requested type. + * + * @tparam Matrix matrix type to initialize + * (Dense has to implement the ConvertibleTo interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup MultiVector + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + std::initializer_list>> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin ? vals_begin->size() : 0; + size_type common_num_cols = + vals_begin->begin() ? vals_begin->begin()->size() : 0; + auto common_size = dim<2>(common_num_rows, common_num_cols); + for (const auto& b : vals) { + auto num_rows = b.size(); + auto num_cols = begin(b)->size(); + auto b_size = dim<2>(num_rows, num_cols); + GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); + } + + auto b_size = batch_dim<2>(num_batch_items, common_size); + size_type batch = 0; + std::vector input_mat_data(num_batch_items, common_size); + for (const auto& b : vals) { + size_type ridx = 0; + for (const auto& row : b) { + size_type cidx = 0; + for (const auto& elem : row) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(ridx, cidx, + elem); + } + ++cidx; + } + ++ridx; + } + ++batch; + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch single column-vector by making copies of the + * single input column vector. + * + * This function first creates a temporary batch multi-vector, fills it with + * passed in values, and then converts the vector to the requested type. + * + * @tparam Matrix matrix type to initialize + * (MultiVector has to implement the ConvertibleTo + * interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_vectors The number of times the input vector is to be duplicated + * @param vals values used to initialize each vector in the temp. 
batch + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup MultiVector + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + const size_type num_vectors, + std::initializer_list vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + size_type num_batch_items = num_vectors; + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, + "Input data is empty"); + auto num_rows = begin(vals) ? vals.size() : 0; + auto common_size = dim<2>(num_rows, 1); + auto b_size = batch_dim<2>(num_batch_items, common_size); + std::vector input_mat_data(num_batch_items, common_size); + for (size_type batch = 0; batch < num_vectors; batch++) { + input_mat_data[batch].nonzeros.reserve(num_rows); + size_type idx = 0; + for (const auto& elem : vals) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem); + } + ++idx; + } + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a matrix from copies of a given matrix. + * + * This function first creates a temporary batch multi-vector, fills it with + * passed in values, and then converts the vector to the requested type. + * + * @tparam Matrix matrix type to initialize + * (MultiVector has to implement the ConvertibleTo + * interface) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_batch_items The number of times the input matrix is duplicated + * @param vals values used to initialize each vector in the temp. batch + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup LinOp + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + const size_type num_batch_items, + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, + "Input data is empty"); + auto common_size = dim<2>(begin(vals) ? vals.size() : 0, + begin(vals) ? 
begin(vals)->size() : 0); + batch_dim<2> b_size(num_batch_items, common_size); + std::vector input_mat_data(num_batch_items, common_size); + for (size_type batch = 0; batch < num_batch_items; batch++) { + size_type ridx = 0; + for (const auto& row : vals) { + size_type cidx = 0; + for (const auto& elem : row) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(ridx, cidx, + elem); + } + ++cidx; + } + ++ridx; + } + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + } // namespace batch } // namespace gko diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 3aea6e1aae4..0d903b10968 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -70,13 +70,13 @@ Ell::create_view_for_item(size_type item_id) { auto exec = this->get_executor(); auto num_rows = this->get_common_size()[0]; - auto stride = this->get_common_size()[1]; + auto stride = this->get_common_size()[0]; auto mat = unbatch_type::create( exec, this->get_common_size(), make_array_view(exec, this->get_num_elements_per_item(), this->get_values_for_item(item_id)), make_array_view(exec, this->get_num_elements_per_item(), - this->get_col_idxs_for_item(item_id)), + this->get_col_idxs()), this->get_num_stored_elements_per_row(), stride); return mat; } @@ -88,13 +88,13 @@ Ell::create_const_view_for_item(size_type item_id) const { auto exec = this->get_executor(); auto num_rows = this->get_common_size()[0]; - auto stride = this->get_common_size()[1]; + auto stride = this->get_common_size()[0]; auto mat = unbatch_type::create_const( exec, this->get_common_size(), make_const_array_view(exec, this->get_num_elements_per_item(), this->get_const_values_for_item(item_id)), make_const_array_view(exec, this->get_num_elements_per_item(), - this->get_const_col_idxs_for_item(item_id)), + this->get_const_col_idxs()), this->get_num_stored_elements_per_row(), stride); return mat; } @@ -152,9 +152,10 @@ Ell::Ell(std::shared_ptr exec, const batch_dim<2>& size, IndexType num_elems_per_row) : EnableBatchLinOp>(exec, size), - num_elems_per_row_(num_elems_per_row), - values_(exec, compute_num_elems(size, num_elems_per_row)), - col_idxs_(exec, compute_num_elems(size, num_elems_per_row)) + num_elems_per_row_(num_elems_per_row == 0 ? 
size.get_common_size()[1] + : num_elems_per_row), + values_(exec, compute_num_elems(size, num_elems_per_row_)), + col_idxs_(exec, this->get_common_size()[0] * num_elems_per_row_) {} diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index 931efb47d2e..2830705bf5f 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -51,6 +51,7 @@ template class Ell : public ::testing::Test { protected: using value_type = T; + using index_type = gko::int32; using EllMtx = gko::matrix::Ell; using size_type = gko::size_type; Ell() @@ -58,46 +59,71 @@ class Ell : public ::testing::Test { mtx(gko::batch::initialize>( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - exec)), - mvec(gko::batch::initialize>( - {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, - {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, - exec)), + exec, 3)), + sp_mtx(gko::batch::initialize>( + {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, + {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}}, + exec, 2)), ell_mtx(gko::initialize>( - {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec)) + {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 3)), + sp_ell_mtx(gko::initialize>( + {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 2)) {} + static void assert_equal_to_original_sparse_mtx( + const gko::batch::matrix::Ell* m) + { + ASSERT_EQ(m->get_num_batch_items(), 2); + ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); + ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 2)); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 2); + EXPECT_EQ(m->get_const_values()[0], value_type{-1.0}); + EXPECT_EQ(m->get_const_values()[1], value_type{2.5}); + EXPECT_EQ(m->get_const_values()[2], value_type{0.0}); + EXPECT_EQ(m->get_const_values()[3], value_type{3.5}); + EXPECT_EQ(m->get_const_values()[4], value_type{1.0}); + EXPECT_EQ(m->get_const_values()[5], value_type{2.0}); + EXPECT_EQ(m->get_const_values()[6], value_type{0.0}); + EXPECT_EQ(m->get_const_values()[7], value_type{3.0}); + EXPECT_EQ(m->get_const_col_idxs()[0], index_type{0}); + EXPECT_EQ(m->get_const_col_idxs()[1], index_type{1}); + EXPECT_EQ(m->get_const_col_idxs()[2], index_type{-1}); + ASSERT_EQ(m->get_const_col_idxs()[3], index_type{2}); + } static void assert_equal_to_original_mtx( - gko::batch::matrix::Ell* m) + const gko::batch::matrix::Ell* m) { ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3)); - EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); - EXPECT_EQ(m->at(0, 0, 2), value_type{3.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5}); - EXPECT_EQ(m->at(0, 1, 1), value_type{2.5}); - ASSERT_EQ(m->at(0, 1, 2), value_type{3.5}); - EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 0, 1), value_type{2.5}); - EXPECT_EQ(m->at(1, 0, 2), value_type{3.0}); - EXPECT_EQ(m->at(1, 1, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 1, 1), value_type{2.0}); - ASSERT_EQ(m->at(1, 1, 2), value_type{3.0}); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 3); + EXPECT_EQ(m->get_const_values()[0], value_type{-1.0}); + EXPECT_EQ(m->get_const_values()[1], value_type{-1.5}); + EXPECT_EQ(m->get_const_values()[2], value_type{2.0}); + EXPECT_EQ(m->get_const_values()[3], value_type{2.5}); + EXPECT_EQ(m->get_const_values()[4], value_type{3.0}); + EXPECT_EQ(m->get_const_values()[5], value_type{3.5}); + EXPECT_EQ(m->get_const_values()[6], value_type{1.0}); + EXPECT_EQ(m->get_const_values()[7], value_type{1.0}); + 
EXPECT_EQ(m->get_const_values()[8], value_type{2.5}); + EXPECT_EQ(m->get_const_values()[9], value_type{2.0}); + EXPECT_EQ(m->get_const_values()[10], value_type{3.0}); + ASSERT_EQ(m->get_const_values()[11], value_type{3.0}); } static void assert_empty(gko::batch::matrix::Ell* m) { ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 0); } std::shared_ptr exec; std::unique_ptr> mtx; - std::unique_ptr> mvec; + std::unique_ptr> sp_mtx; std::unique_ptr> ell_mtx; + std::unique_ptr> sp_ell_mtx; }; TYPED_TEST_SUITE(Ell, gko::test::ValueTypes); @@ -109,6 +135,12 @@ TYPED_TEST(Ell, KnowsItsSizeAndValues) } +TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues) +{ + this->assert_equal_to_original_sparse_mtx(this->sp_mtx.get()); +} + + TYPED_TEST(Ell, CanBeEmpty) { auto empty = gko::batch::matrix::Ell::create(this->exec); @@ -137,10 +169,10 @@ TYPED_TEST(Ell, CanCreateEllItemView) } -TYPED_TEST(Ell, CanCreateMultiVectorView) +TYPED_TEST(Ell, CanCreateSpEllItemView) { - GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec, - 0.0); + GKO_ASSERT_MTX_NEAR(this->sp_mtx->create_view_for_item(1), this->sp_ell_mtx, + 0.0); } @@ -151,8 +183,7 @@ TYPED_TEST(Ell, CanBeCopied) mtx_copy->copy_from(this->mtx.get()); this->assert_equal_to_original_mtx(this->mtx.get()); - this->mtx->at(0, 0, 0) = 7; - this->mtx->at(0, 1) = 7; + this->mtx->get_values()[0] = 7; this->assert_equal_to_original_mtx(mtx_copy.get()); } @@ -189,71 +220,62 @@ TYPED_TEST(Ell, CanBeConstructedWithSize) using size_type = gko::size_type; auto m = gko::batch::matrix::Ell::create( - this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3})); + this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3)); - ASSERT_EQ(m->get_num_stored_elements(), 30); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 2); + ASSERT_EQ(m->get_num_stored_elements(), 20); } TYPED_TEST(Ell, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; using size_type = gko::size_type; // clang-format off - value_type data[] = { + value_type values[] = { + -1.0, 2.5, + 0.0, 3.5, 1.0, 2.0, - -1.0, 3.0, - 4.0, -1.0, - 3.0, 5.0, - 1.0, 5.0, - 6.0, -3.0}; + 0.0, 3.0}; + index_type col_idxs[] = { + 0, 1, + -1, 2}; // clang-format on auto m = gko::batch::matrix::Ell::create( - this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), - gko::array::view(this->exec, 8, data)); - - ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); - ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); - ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); - ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); - ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); - ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2, + gko::array::view(this->exec, 8, values), + gko::array::view(this->exec, 4, col_idxs)); + + this->assert_equal_to_original_sparse_mtx(m.get()); } TYPED_TEST(Ell, CanBeConstructedFromExistingConstData) { using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; using size_type = gko::size_type; // clang-format off - const value_type data[] = { + value_type values[] = { + -1.0, 2.5, + 0.0, 3.5, 1.0, 2.0, - -1.0, 3.0, - 4.0, -1.0, - 3.0, 5.0, - 
1.0, 5.0, - 6.0, -3.0}; + 0.0, 3.0}; + index_type col_idxs[] = { + 0, 1, + -1, 2}; // clang-format on auto m = gko::batch::matrix::Ell::create_const( - this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)), - gko::array::const_view(this->exec, 8, data)); - - ASSERT_EQ(m->get_const_values(), data); - ASSERT_EQ(m->at(0, 0, 0), value_type{1.0}); - ASSERT_EQ(m->at(0, 0, 1), value_type{2.0}); - ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0}); - ASSERT_EQ(m->at(0, 1, 1), value_type{3.0}); - ASSERT_EQ(m->at(1, 0, 0), value_type{4.0}); - ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0}); - ASSERT_EQ(m->at(1, 1, 0), value_type{3.0}); - ASSERT_EQ(m->at(1, 1, 1), value_type{5.0}); + this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2, + gko::array::const_view(this->exec, 8, values), + gko::array::const_view(this->exec, 4, col_idxs)); + + this->assert_equal_to_original_sparse_mtx(m.get()); } @@ -263,35 +285,36 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices) using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, this->exec); auto mat2 = - gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec); auto m = gko::batch::create_from_item>( - this->exec, std::vector{mat1.get(), mat2.get()}); + this->exec, std::vector{mat1.get(), mat2.get()}, + mat1->get_num_stored_elements_per_row()); - this->assert_equal_to_original_mtx(m.get()); + this->assert_equal_to_original_sparse_mtx(m.get()); } TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) { using value_type = typename TestFixture::value_type; + using index_type = int; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, - this->exec); - auto mat2 = - gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + auto mat1 = + gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec); auto bat_m = gko::batch::create_from_item>( this->exec, - std::vector{mat1.get(), mat1.get(), mat1.get()}); + std::vector{mat1.get(), mat1.get(), mat1.get()}, + mat1->get_num_stored_elements_per_row()); auto m = gko::batch::create_from_item>( - this->exec, 3, mat1.get()); + this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); } @@ -300,24 +323,27 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) { using value_type = typename TestFixture::value_type; + using index_type = int; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}}, this->exec); auto mat2 = - gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec); auto m = gko::batch::create_from_item>( - this->exec, std::vector{mat1.get(), mat2.get()}); + this->exec, std::vector{mat1.get(), mat2.get()}, + mat1->get_num_stored_elements_per_row()); auto m_ref = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), - mat1.get(), mat2.get()}); + mat1.get(), mat2.get()}, + mat1->get_num_stored_elements_per_row()); auto m2 = gko::batch::duplicate>( - this->exec, 3, m.get()); + 
this->exec, 3, m.get(), mat1->get_num_stored_elements_per_row()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); } @@ -326,15 +352,16 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices) { using value_type = typename TestFixture::value_type; + using index_type = int; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, + auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, this->exec); auto mat2 = - gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec); + gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec); auto ell_mats = gko::batch::unbatch>( - this->mtx.get()); + this->sp_mtx.get()); GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.); GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.); @@ -344,55 +371,83 @@ TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices) TYPED_TEST(Ell, CanBeListConstructed) { using value_type = typename TestFixture::value_type; + using index_type = int; auto m = gko::batch::initialize>( - {{1.0, 2.0}, {1.0, 3.0}}, this->exec); + {{0.0, -1.0}, {1.0, 0.0}}, this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); - EXPECT_EQ(m->at(0, 0), value_type{1}); - EXPECT_EQ(m->at(0, 1), value_type{2}); - EXPECT_EQ(m->at(1, 0), value_type{1}); - EXPECT_EQ(m->at(1, 1), value_type{3}); + ASSERT_EQ(m->get_num_stored_elements(), 4); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 1); + EXPECT_EQ(m->get_values()[0], value_type{0.0}); + EXPECT_EQ(m->get_values()[1], value_type{-1.0}); + EXPECT_EQ(m->get_values()[2], value_type{1.0}); + EXPECT_EQ(m->get_values()[3], value_type{0.0}); + EXPECT_EQ(m->get_col_idxs()[0], index_type{0}); + EXPECT_EQ(m->get_col_idxs()[1], index_type{-1}); } TYPED_TEST(Ell, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; + using index_type = int; auto m = gko::batch::initialize>( - 2, I({1.0, 2.0}), this->exec); + 2, I({0.0, -1.0}), this->exec, 1); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 0, 1), value_type{2.0}); + ASSERT_EQ(m->get_num_stored_elements(), 4); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 1); + EXPECT_EQ(m->get_values()[0], value_type{0.0}); + EXPECT_EQ(m->get_values()[1], value_type{-1.0}); + EXPECT_EQ(m->get_values()[2], value_type{0.0}); + EXPECT_EQ(m->get_values()[3], value_type{-1.0}); + EXPECT_EQ(m->get_col_idxs()[0], index_type{-1}); + EXPECT_EQ(m->get_col_idxs()[1], index_type{0}); } TYPED_TEST(Ell, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; + using index_type = int; using T = value_type; auto m = gko::batch::initialize>( - {{I{1.0, 1.0, 0.0}, I{2.0, 4.0, 3.0}, I{3.0, 6.0, 1.0}}, - {I{1.0, 2.0, -1.0}, I{3.0, 4.0, -2.0}, I{5.0, 6.0, -3.0}}}, - this->exec); + // clang-format off + {{I{1.0, 0.0, 0.0}, + I{2.0, 0.0, 3.0}, + I{3.0, 6.0, 0.0}}, + {I{1.0, 0.0, 0.0}, + I{3.0, 0.0, -2.0}, + I{5.0, 8.0, 0.0}}}, + // clang-format on + this->exec, 2); + ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3)); - EXPECT_EQ(m->at(0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 1), value_type{1.0}); - EXPECT_EQ(m->at(0, 2), value_type{0.0}); - ASSERT_EQ(m->at(0, 3), value_type{2.0}); - 
EXPECT_EQ(m->at(0, 4), value_type{4.0}); - EXPECT_EQ(m->at(1, 0), value_type{1.0}); - EXPECT_EQ(m->at(1, 1), value_type{2.0}); - EXPECT_EQ(m->at(1, 2), value_type{-1.0}); - ASSERT_EQ(m->at(1, 3), value_type{3.0}); - EXPECT_EQ(m->at(1, 4), value_type{4.0}); + ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3)); + ASSERT_EQ(m->get_num_stored_elements_per_row(), 2); + EXPECT_EQ(m->get_values()[0], value_type{1.0}); + EXPECT_EQ(m->get_values()[1], value_type{2.0}); + EXPECT_EQ(m->get_values()[2], value_type{3.0}); + EXPECT_EQ(m->get_values()[3], value_type{0.0}); + EXPECT_EQ(m->get_values()[4], value_type{3.0}); + EXPECT_EQ(m->get_values()[5], value_type{6.0}); + EXPECT_EQ(m->get_values()[6], value_type{1.0}); + EXPECT_EQ(m->get_values()[7], value_type{3.0}); + EXPECT_EQ(m->get_values()[8], value_type{5.0}); + EXPECT_EQ(m->get_values()[9], value_type{0.0}); + EXPECT_EQ(m->get_values()[10], value_type{-2.0}); + EXPECT_EQ(m->get_values()[11], value_type{8.0}); + EXPECT_EQ(m->get_col_idxs()[0], index_type{0}); + EXPECT_EQ(m->get_col_idxs()[1], index_type{0}); + EXPECT_EQ(m->get_col_idxs()[2], index_type{0}); + EXPECT_EQ(m->get_col_idxs()[3], index_type{-1}); + EXPECT_EQ(m->get_col_idxs()[4], index_type{2}); + EXPECT_EQ(m->get_col_idxs()[5], index_type{1}); } @@ -400,52 +455,17 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; using index_type = int; - auto vec_data = std::vector>{}; vec_data.emplace_back(gko::matrix_data( - {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}})); + {2, 3}, {{0, 0, -1.0}, {1, 1, 2.5}, {1, 2, 3.5}})); vec_data.emplace_back(gko::matrix_data( - {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})); + {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}})); auto m = gko::batch::read>(this->exec, - vec_data); - - ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); - EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); - EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); -} - - -TYPED_TEST(Ell, CanBeReadFromSparseMatrixData) -{ - using value_type = typename TestFixture::value_type; - using index_type = int; - auto vec_data = std::vector>{}; - vec_data.emplace_back(gko::matrix_data( - {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}})); - vec_data.emplace_back(gko::matrix_data( - {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})); + vec_data, 2); - auto m = gko::batch::read>(this->exec, - vec_data); - - ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2)); - EXPECT_EQ(m->at(0, 0, 0), value_type{1.0}); - EXPECT_EQ(m->at(0, 0, 1), value_type{3.0}); - EXPECT_EQ(m->at(0, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(0, 1, 1), value_type{5.0}); - EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0}); - EXPECT_EQ(m->at(1, 0, 1), value_type{0.5}); - EXPECT_EQ(m->at(1, 1, 0), value_type{0.0}); - EXPECT_EQ(m->at(1, 1, 1), value_type{9.0}); + this->assert_equal_to_original_sparse_mtx(m.get()); } @@ -455,24 +475,18 @@ TYPED_TEST(Ell, GeneratesCorrectMatrixData) using index_type = int; using tpl = typename gko::matrix_data::nonzero_type; - auto data = - gko::batch::write>(this->mtx.get()); + auto data = gko::batch::write>( + this->sp_mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); - ASSERT_EQ(data[0].nonzeros.size(), 6); + ASSERT_EQ(data[0].nonzeros.size(), 3); 
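// Illustrative usage sketch of the generalized batch utilities these tests
// exercise, assuming a reference executor and <ginkgo/ginkgo.hpp>: the
// trailing argument of gko::batch::initialize is forwarded to Ell::create as
// num_elems_per_row, so the sparsity pattern shared by all batch items needs
// only two stored entries per row here.
{
    auto exec = gko::ReferenceExecutor::create();
    auto mtx = gko::batch::initialize<
        gko::batch::matrix::Ell<double, gko::int32>>(
        {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
         {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}},
        exec, 2);
    // each batch item can then be viewed as a regular gko::matrix::Ell
    auto item = mtx->create_const_view_for_item(1);
}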
EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0})); - EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0})); - EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0})); - EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5})); - EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5})); - EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5})); + EXPECT_EQ(data[0].nonzeros[1], tpl(1, 1, value_type{2.5})); + EXPECT_EQ(data[0].nonzeros[2], tpl(1, 2, value_type{3.5})); ASSERT_EQ(data[1].size, gko::dim<2>(2, 3)); - ASSERT_EQ(data[1].nonzeros.size(), 6); + ASSERT_EQ(data[1].nonzeros.size(), 3); EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0})); - EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5})); - EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0})); - EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0})); - EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0})); - EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0})); + EXPECT_EQ(data[1].nonzeros[1], tpl(1, 1, value_type{2.0})); + EXPECT_EQ(data[1].nonzeros[2], tpl(1, 2, value_type{3.0})); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 61dffba3193..45ba0686468 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -212,8 +212,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); @@ -462,224 +462,6 @@ class MultiVector }; -/** - * Creates and initializes a batch of single column-vectors. - * - * This function first creates a temporary MultiVector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param vals values used to initialize the batch vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup MultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr initialize( - std::initializer_list> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - using batch_multi_vector = MultiVector; - size_type num_batch_items = vals.size(); - GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); - auto vals_begin = begin(vals); - size_type common_num_rows = vals_begin ? 
vals_begin->size() : 0; - auto common_size = dim<2>(common_num_rows, 1); - for (auto& val : vals) { - GKO_ASSERT_EQ(common_num_rows, val.size()); - } - auto b_size = batch_dim<2>(num_batch_items, common_size); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - size_type batch = 0; - for (const auto& b : vals) { - size_type idx = 0; - for (const auto& elem : b) { - tmp->at(batch, idx) = elem; - ++idx; - } - ++batch; - } - auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx); - return mtx; -} - - -/** - * Creates and initializes a batch of multi-vectors. - * - * This function first creates a temporary MultiVector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup MultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr initialize( - std::initializer_list>> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - using batch_multi_vector = MultiVector; - size_type num_batch_items = vals.size(); - GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); - auto vals_begin = begin(vals); - size_type common_num_rows = vals_begin ? vals_begin->size() : 0; - size_type common_num_cols = - vals_begin->begin() ? vals_begin->begin()->size() : 0; - auto common_size = dim<2>(common_num_rows, common_num_cols); - for (const auto& b : vals) { - auto num_rows = b.size(); - auto num_cols = begin(b)->size(); - auto b_size = dim<2>(num_rows, num_cols); - GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); - } - - auto b_size = batch_dim<2>(num_batch_items, common_size); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - size_type batch = 0; - for (const auto& b : vals) { - size_type ridx = 0; - for (const auto& row : b) { - size_type cidx = 0; - for (const auto& elem : row) { - tmp->at(batch, ridx, cidx) = elem; - ++cidx; - } - ++ridx; - } - ++batch; - } - auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx); - return mtx; -} - - -/** - * Creates and initializes a batch single column-vector by making copies of the - * single input column vector. - * - * This function first creates a temporary batch multi-vector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param num_vectors The number of times the input vector is to be duplicated - * @param vals values used to initialize each vector in the temp. batch - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup MultiVector - * @ingroup mat_formats - */ -template -std::unique_ptr initialize( - const size_type num_vectors, - std::initializer_list vals, - std::shared_ptr exec, TArgs&&... 
create_args) -{ - using batch_multi_vector = MultiVector; - size_type num_batch_items = num_vectors; - GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, - "Input data is empty"); - auto b_size = - batch_dim<2>(num_batch_items, dim<2>(begin(vals) ? vals.size() : 0, 1)); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - for (size_type batch = 0; batch < num_vectors; batch++) { - size_type idx = 0; - for (const auto& elem : vals) { - tmp->at(batch, idx) = elem; - ++idx; - } - } - auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx); - return mtx; -} - - -/** - * Creates and initializes a matrix from copies of a given matrix. - * - * This function first creates a temporary batch multi-vector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) - * @tparam TArgs argument types for Matrix::create method - * (not including the implied Executor as the first argument) - * - * @param num_batch_items The number of times the input matrix is duplicated - * @param vals values used to initialize each vector in the temp. batch - * @param exec Executor associated to the vector - * @param create_args additional arguments passed to Matrix::create, not - * including the Executor, which is passed as the first - * argument - * - * @ingroup LinOp - * @ingroup mat_formats - */ -template -std::unique_ptr initialize( - const size_type num_batch_items, - std::initializer_list> - vals, - std::shared_ptr exec, TArgs&&... create_args) -{ - using batch_multi_vector = MultiVector; - GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, - "Input data is empty"); - auto common_size = dim<2>(begin(vals) ? vals.size() : 0, - begin(vals) ? begin(vals)->size() : 0); - batch_dim<2> b_size(num_batch_items, common_size); - auto tmp = batch_multi_vector::create(exec->get_master(), b_size); - for (size_type batch = 0; batch < num_batch_items; batch++) { - size_type ridx = 0; - for (const auto& row : vals) { - size_type cidx = 0; - for (const auto& elem : row) { - tmp->at(batch, ridx, cidx) = elem; - ++cidx; - } - ++ridx; - } - } - auto mtx = Matrix::create(exec, std::forward(create_args)...); - tmp->move_to(mtx); - return mtx; -} - - } // namespace batch } // namespace gko diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index af77fc1e390..490f7a7d4b0 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -198,8 +198,8 @@ class Ell final } /** - * Returns a pointer to the array of col_idxs of the matrix for a - * specific batch item. + * Returns a pointer to the array of col_idxs of the matrix. This is shared + * across all batch items. * * @param batch_id the id of the batch item. 
* @@ -208,8 +208,7 @@ class Ell final index_type* get_col_idxs_for_item(size_type batch_id) noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return col_idxs_.get_data() + - batch_id * this->get_num_elements_per_item(); + return col_idxs_.get_data(); } /** @@ -223,8 +222,7 @@ class Ell final noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); - return col_idxs_.get_const_data() + - batch_id * this->get_num_elements_per_item(); + return col_idxs_.get_const_data(); } /** @@ -312,7 +310,8 @@ class Ell final size_type compute_num_elems(const batch_dim<2>& size, IndexType num_elems_per_row) { - return size.get_common_size()[0] * num_elems_per_row; + return size.get_num_batch_items() * size.get_common_size()[0] * + num_elems_per_row; } @@ -356,8 +355,9 @@ class Ell final // Ensure that the value and col_idxs arrays have the correct size auto num_elems = this->get_common_size()[0] * num_elems_per_row * this->get_num_batch_items(); - GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1); - GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1); + GKO_ASSERT_EQ(num_elems, values_.get_num_elems()); + GKO_ASSERT_EQ(this->get_num_elements_per_item(), + col_idxs_.get_num_elems()); } /** From 3622b20f1d318cf009d38dd409a6cc5b241a8cf9 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 8 Oct 2023 10:26:50 +0200 Subject: [PATCH 388/583] Add OMP, CUDA, HIP kernels and tests Co-authored-by: Aditya Kashi --- .../matrix/batch_ell_kernel_launcher.hpp.inc | 29 +- .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 155 +++++++++++ core/matrix/batch_struct.hpp | 5 +- cuda/matrix/batch_ell_kernels.cu | 2 +- cuda/matrix/batch_struct.hpp | 34 +++ hip/matrix/batch_dense_kernels.hip.cpp | 1 - hip/matrix/batch_ell_kernels.hip.cpp | 27 +- hip/matrix/batch_struct.hip.hpp | 34 +++ reference/matrix/batch_ell_kernels.hpp.inc | 6 +- reference/matrix/batch_struct.hpp | 4 +- reference/test/matrix/CMakeLists.txt | 1 + reference/test/matrix/batch_ell_kernels.cpp | 248 ++++++++++++++++++ test/matrix/CMakeLists.txt | 1 + test/matrix/batch_ell_kernels.cpp | 128 +++++++++ 14 files changed, 650 insertions(+), 25 deletions(-) create mode 100644 common/cuda_hip/matrix/batch_ell_kernels.hpp.inc create mode 100644 reference/test/matrix/batch_ell_kernels.cpp create mode 100644 test/matrix/batch_ell_kernels.cpp diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc index 263e911c31a..f8da432aa4d 100644 --- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc @@ -34,7 +34,18 @@ template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, const batch::MultiVector* b, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -47,7 +58,21 @@ void advanced_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, const batch::MultiVector* b, const batch::MultiVector* beta, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); 
+ const auto mat_ub = get_batch_struct(mat); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, + beta_ub, x_ub); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc new file mode 100644 index 00000000000..e55e7a60471 --- /dev/null +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -0,0 +1,155 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +__device__ __forceinline__ void simple_apply( + const gko::batch::matrix::batch_ell::batch_item& mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x) +{ + const auto num_rows = mat.num_rows; + const auto num_stored_elements_per_row = mat.num_stored_elems_per_row; + const auto stride = mat.stride; + const auto val = mat.values; + const auto col = mat.col_idxs; + for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx < idx) { + break; + } else { + temp += val[ind] * b[col_idx]; + } + } + x[tidx] = temp; + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: + batch_ell::uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + simple_apply(mat_b, b_b.values, x_b.values); + } +} + + +template +__device__ __forceinline__ void advanced_apply( + const ValueType alpha, + const gko::batch::matrix::batch_ell::batch_item& mat, + const ValueType* const __restrict__ b, const ValueType beta, + ValueType* const __restrict__ x) +{ + const auto num_rows = mat.num_rows; + const auto num_stored_elements_per_row = mat.num_stored_elems_per_row; + const auto stride = mat.stride; + const auto val = mat.values; + const auto col = mat.col_idxs; + for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx < idx) { + break; + } else { + temp += alpha * val[ind] * b[col_idx]; + } + } + x[tidx] = temp + beta * x[tidx]; + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void advanced_apply_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::matrix:: + batch_ell:: + uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + beta, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto beta_b = gko::batch::extract_batch_item(beta, batch_id); + advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values); + } +} diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 272bb506df2..2eed40882bc 100644 --- a/core/matrix/batch_struct.hpp +++ 
b/core/matrix/batch_struct.hpp @@ -188,8 +188,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( { return {batch.values + batch_idx * batch.num_stored_elems_per_row * batch.num_rows, - batch.col_idxs + - batch_idx * batch.num_stored_elems_per_row * batch.num_rows, + batch.col_idxs, batch.stride, batch.num_rows, batch.num_cols, @@ -203,7 +202,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( const size_type batch_idx) { return {batch_values + batch_idx * num_elems_per_row * num_rows, - batch_col_idxs + batch_idx * num_elems_per_row * num_rows, + batch_col_idxs, stride, num_rows, num_cols, diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 567d863d95c..ee6a99f04ca 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 73712a7b81b..7a6a4ac7f00 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -87,6 +87,40 @@ get_batch_struct(batch::matrix::Dense* const op) } +/** + * Generates an immutable uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch> +get_batch_struct(const batch::matrix::Ell* const op) +{ + return {as_cuda_type(op->get_const_values()), + op->get_const_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + +/** + * Generates a uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch> +get_batch_struct(batch::matrix::Ell* const op) +{ + return {as_cuda_type(op->get_values()), + op->get_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index eb3da83760a..3361feeb8b8 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include "core/base/batch_struct.hpp" diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index 567d863d95c..fdd52c38f57 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/batch_ell_kernels.hpp" +#include #include #include @@ -42,21 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace hip { /** * @brief The Ell matrix format namespace. * @ref Ell @@ -72,7 +73,7 @@ constexpr int sm_oversubscription = 4; // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES -// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" @@ -81,6 +82,6 @@ constexpr int sm_oversubscription = 4; } // namespace batch_ell -} // namespace cuda +} // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 4670cf0988b..a43d7d058b0 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -87,6 +87,40 @@ get_batch_struct(batch::matrix::Dense* const op) } +/** + * Generates an immutable uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch> +get_batch_struct(const batch::matrix::Ell* const op) +{ + return {as_hip_type(op->get_const_values()), + op->get_const_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + +/** + * Generates a uniform batch struct from a batch of ell matrices. 
+ */ +template +inline batch::matrix::batch_ell::uniform_batch> +get_batch_struct(batch::matrix::Ell* const op) +{ + return {as_hip_type(op->get_values()), + op->get_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc index 37370261d44..41d0a00ddcd 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp.inc @@ -36,14 +36,14 @@ inline void simple_apply_kernel( const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) { - for (int row = 0; row < a.num_rows; ++row) { - for (int j = 0; j < b.num_rhs; ++j) { + for (int row = 0; row < c.num_rows; ++row) { + for (int j = 0; j < c.num_rhs; ++j) { c.values[row * c.stride + j] = zero(); } for (auto k = 0; k < a.num_stored_elems_per_row; ++k) { auto val = a.values[row + k * a.stride]; auto col = a.col_idxs[row + k * a.stride]; - for (int j = 0; j < b.num_rhs; ++j) { + for (int j = 0; j < c.num_rhs; ++j) { c.values[row * c.stride + j] += val * b.values[col * b.stride + j]; } diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index b5eacd80d18..3b562450ee0 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -101,7 +101,7 @@ get_batch_struct(const batch::matrix::Ell* const op) return {op->get_const_values(), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1]), static_cast(op->get_num_stored_elements_per_row())}; @@ -118,7 +118,7 @@ inline batch::matrix::batch_ell::uniform_batch get_batch_struct( return {op->get_values(), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), + static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1]), static_cast(op->get_num_stored_elements_per_row())}; diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt index 18634de662d..05498cbadc4 100644 --- a/reference/test/matrix/CMakeLists.txt +++ b/reference/test/matrix/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_test(batch_dense_kernels) +ginkgo_create_test(batch_ell_kernels) ginkgo_create_test(coo_kernels) ginkgo_create_test(csr_kernels) ginkgo_create_test(dense_kernels) diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp new file mode 100644 index 00000000000..76b681c69f7 --- /dev/null +++ b/reference/test/matrix/batch_ell_kernels.cpp @@ -0,0 +1,248 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. 
+ +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/matrix/batch_ell_kernels.hpp" +#include "core/test/utils.hpp" + + +template +class Ell : public ::testing::Test { +protected: + using value_type = T; + using size_type = gko::size_type; + using Mtx = gko::batch::matrix::Ell; + using MVec = gko::batch::MultiVector; + using EllMtx = gko::matrix::Ell; + using DenseMtx = gko::matrix::Dense; + using ComplexMtx = gko::to_complex; + using RealMtx = gko::remove_complex; + Ell() + : exec(gko::ReferenceExecutor::create()), + mtx_0(gko::batch::initialize( + {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, + {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, + exec)), + mtx_00(gko::initialize( + {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), + mtx_01(gko::initialize( + {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), + b_0(gko::batch::initialize( + {{I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), + I({1.0, 0.0, 2.0})}, + {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), + I({1.0, 0.0, 2.0})}}, + exec)), + b_00(gko::initialize( + {I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), + I({1.0, 0.0, 2.0})}, + exec)), + b_01(gko::initialize( + {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), + I({1.0, 0.0, 2.0})}, + exec)), + x_0(gko::batch::initialize( + {{I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, + {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}}, + exec)), + x_00(gko::initialize( + {I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, exec)), + x_01(gko::initialize( + {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}, exec)) + {} + + std::shared_ptr exec; + std::unique_ptr mtx_0; + std::unique_ptr mtx_00; + std::unique_ptr mtx_01; + std::unique_ptr b_0; + std::unique_ptr b_00; + std::unique_ptr b_01; + std::unique_ptr x_0; + std::unique_ptr x_00; + std::unique_ptr x_01; + + std::ranlux48 rand_engine; +}; + + +TYPED_TEST_SUITE(Ell, gko::test::ValueTypes); + + +TYPED_TEST(Ell, AppliesToBatchMultiVector) +{ + using T = typename TestFixture::value_type; + + this->mtx_0->apply(this->b_0.get(), this->x_0.get()); + this->mtx_00->apply(this->b_00.get(), this->x_00.get()); + this->mtx_01->apply(this->b_01.get(), this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + +TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) +{ + using Mtx = typename TestFixture::Mtx; + using MVec = typename TestFixture::MVec; + 
using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch::initialize(2, {1.5}, this->exec); + auto beta = gko::batch::initialize(2, {-4.0}, this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({1.5}, this->exec); + auto beta0 = gko::initialize({-4.0}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), + this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), + this->x_00.get()); + this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), + this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + +TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) +{ + using Mtx = typename TestFixture::Mtx; + using MVec = typename TestFixture::MVec; + using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({-1.0}, this->exec); + auto beta0 = gko::initialize({2.5}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), + this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), + this->x_00.get()); + this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), + this->x_01.get()); + + auto res = gko::batch::unbatch>(this->x_0.get()); + + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); +} + + +TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols) +{ + using MVec = typename TestFixture::MVec; + auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); + + ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows) +{ + using MVec = typename TestFixture::MVec; + auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); + + ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + + ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + auto alpha = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + auto beta = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + + ASSERT_THROW( + this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), + gko::DimensionMismatch); +} + + +TYPED_TEST(Ell, AdvancedApplyFailsOnWrongAlphaDimension) +{ + using MVec = typename TestFixture::MVec; + auto res = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); + auto alpha = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); + auto beta = + MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 
1}}); + + ASSERT_THROW( + this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), + gko::DimensionMismatch); +} diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index 9f3b17cd858..f1c91e615e7 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_common_test(batch_dense_kernels) +ginkgo_create_common_test(batch_ell_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_device_test(csr_kernels) ginkgo_create_common_test(csr_kernels2) ginkgo_create_common_test(coo_kernels) diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp new file mode 100644 index 00000000000..9629a2263ff --- /dev/null +++ b/test/matrix/batch_ell_kernels.cpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/batch_ell_kernels.hpp" + + +#include +#include + + +#include + + +#include +#include +#include +#include + + +#include "core/base/batch_utilities.hpp" +#include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" +#include "core/test/utils/batch_helpers.hpp" +#include "test/utils/executor.hpp" + + +class Ell : public CommonTestFixture { +protected: + using Mtx = gko::batch::matrix::Ell; + using MVec = gko::batch::MultiVector; + + Ell() : rand_engine(15) {} + + template + std::unique_ptr gen_mtx(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) + { + return gko::test::generate_random_batch_matrix( + num_batch_items, num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(gko::size_type num_vecs = 1) + { + const int num_rows = 252; + const int num_cols = 32; + x = gen_mtx(batch_size, num_rows, num_cols); + y = gen_mtx(batch_size, num_cols, num_vecs); + alpha = gen_mtx(batch_size, 1, 1); + beta = gen_mtx(batch_size, 1, 1); + dx = gko::clone(exec, x); + dy = gko::clone(exec, y); + dalpha = gko::clone(exec, alpha); + dbeta = gko::clone(exec, beta); + expected = MVec::create( + ref, + gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs})); + expected->fill(gko::one()); + dresult = gko::clone(exec, expected); + } + + std::ranlux48 rand_engine; + + const size_t batch_size = 11; + std::unique_ptr x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; +}; + + +TEST_F(Ell, SingleVectorApplyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Ell, SingleVectorAdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(1); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); +} From 301fa6ea283f4a8b5e654d1cdcc1ecda2d8ed859 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 8 Oct 2023 12:03:36 +0200 Subject: [PATCH 389/583] Add DPCPP kernels and tests Co-authored-by: Phuong Nguyen --- core/test/utils/batch_helpers.hpp | 2 - dpcpp/matrix/batch_ell_kernels.dp.cpp | 84 ++++++++++++++++++++++++-- dpcpp/matrix/batch_ell_kernels.hpp.inc | 79 ++++++++++++++++++++++++ dpcpp/matrix/batch_struct.hpp | 34 +++++++++++ test/matrix/CMakeLists.txt | 2 +- test/matrix/batch_ell_kernels.cpp | 26 ++++++-- 6 files changed, 213 insertions(+), 14 deletions(-) create mode 100644 dpcpp/matrix/batch_ell_kernels.hpp.inc diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index 4cf9d4973e2..b040691999e 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -83,8 +83,6 @@ std::unique_ptr generate_random_batch_matrix( exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)), std::forward(args)...); - // TODO: Need to preserve sparsity pattern across batch items for batched - // sparse matrix formats for (size_type b = 0; b < num_batch_items; b++) { auto rand_mat = generate_random_matrix( diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp 
b/dpcpp/matrix/batch_ell_kernels.dp.cpp index cdcd5abd024..1ed83d79630 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_ell_kernels.hpp" #include @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include +#include #include "core/base/batch_struct.hpp" @@ -71,14 +71,48 @@ namespace dpcpp { namespace batch_ell { -// #include "dpcpp/matrix/batch_dense_kernels.hpp.inc" +#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, const batch::MultiVector* b, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const size_type num_rows = mat->get_common_size()[0]; + const size_type num_cols = mat->get_common_size()[1]; + + const auto num_batch_items = mat->get_num_batch_items(); + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batch_items); + const auto x_ub = get_batch_struct(x); + const auto b_ub = get_batch_struct(b); + const auto mat_ub = get_batch_struct(mat); + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + + // Launch a kernel that has nbatches blocks, each block has max group size + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); @@ -90,7 +124,47 @@ void advanced_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, const batch::MultiVector* b, const batch::MultiVector* beta, - batch::MultiVector* x) GKO_NOT_IMPLEMENTED; + batch::MultiVector* x) +{ + const auto mat_ub = get_batch_struct(mat); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + + if (b_ub.num_rhs > 1) { + GKO_NOT_IMPLEMENTED; + } + + const auto num_batch_items = mat_ub.num_batch_items; + auto device = exec->get_queue()->get_device(); + auto group_size = + device.get_info(); + + const dim3 block(group_size); + const dim3 grid(num_batch_items); + + // Launch a kernel that has nbatches blocks, each block has max group size + (exec->get_queue())->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + 
batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc new file mode 100644 index 00000000000..1048f2f8ff8 --- /dev/null +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -0,0 +1,79 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +__dpct_inline__ void simple_apply_kernel( + const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& x, + sycl::nd_item<3>& item_ct1) +{ + for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; + tidx += item_ct1.get_local_range().size()) { + auto temp = zero(); + for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { + const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; + if (col_idx < idx) + break; + else + temp += mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; + } + x.values[tidx * x.stride] = temp; + } +} + + +template +__dpct_inline__ void advanced_apply_kernel( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& beta, + const gko::batch::multi_vector::batch_item& x, + sycl::nd_item<3>& item_ct1) +{ + for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; + tidx += item_ct1.get_local_range().size()) { + auto temp = zero(); + for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { + const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; + if (col_idx < idx) + break; + else + temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; + } + x.values[tidx * x.stride] = + temp + beta.values[0] * x.values[tidx * x.stride]; + } +} diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index b0393daf55d..35ff1148dd5 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -86,6 +86,40 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( } +/** + * Generates an immutable uniform batch struct from a batch of ell matrices. + */ +template +inline batch::matrix::batch_ell::uniform_batch +get_batch_struct(const batch::matrix::Ell* const op) +{ + return {op->get_const_values(), + op->get_const_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + +/** + * Generates a uniform batch struct from a batch of ell matrices. 
+ */ +template +inline batch::matrix::batch_ell::uniform_batch get_batch_struct( + batch::matrix::Ell* const op) +{ + return {op->get_values(), + op->get_col_idxs(), + op->get_num_batch_items(), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; +} + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index f1c91e615e7..a03a0a0bb4e 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -1,5 +1,5 @@ ginkgo_create_common_test(batch_dense_kernels) -ginkgo_create_common_test(batch_ell_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_test(batch_ell_kernels) ginkgo_create_common_device_test(csr_kernels) ginkgo_create_common_test(csr_kernels2) ginkgo_create_common_test(coo_kernels) diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp index 9629a2263ff..bc1e0c7fb42 100644 --- a/test/matrix/batch_ell_kernels.cpp +++ b/test/matrix/batch_ell_kernels.cpp @@ -63,22 +63,36 @@ class Ell : public CommonTestFixture { template std::unique_ptr gen_mtx(const gko::size_type num_batch_items, gko::size_type num_rows, - gko::size_type num_cols) + gko::size_type num_cols, + int num_elems_per_row) { return gko::test::generate_random_batch_matrix( + num_batch_items, num_rows, num_cols, + std::uniform_int_distribution<>(num_elems_per_row, + num_elems_per_row), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref, + num_elems_per_row); + } + + std::unique_ptr gen_mvec(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) + { + return gko::test::generate_random_batch_matrix( num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } - void set_up_apply_data(gko::size_type num_vecs = 1) + void set_up_apply_data(gko::size_type num_vecs = 1, + int num_elems_per_row = 5) { const int num_rows = 252; const int num_cols = 32; - x = gen_mtx(batch_size, num_rows, num_cols); - y = gen_mtx(batch_size, num_cols, num_vecs); - alpha = gen_mtx(batch_size, 1, 1); - beta = gen_mtx(batch_size, 1, 1); + x = gen_mtx(batch_size, num_rows, num_cols, num_elems_per_row); + y = gen_mvec(batch_size, num_cols, num_vecs); + alpha = gen_mvec(batch_size, 1, 1); + beta = gen_mvec(batch_size, 1, 1); dx = gko::clone(exec, x); dy = gko::clone(exec, y); dalpha = gko::clone(exec, alpha); From 715235cc5f5f4785dff9657dc1543fcca74d2024 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sun, 8 Oct 2023 12:15:00 +0200 Subject: [PATCH 390/583] Update docs --- include/ginkgo/core/matrix/batch_ell.hpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 490f7a7d4b0..48a3a6d9831 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -55,19 +55,15 @@ namespace matrix { /** - * Ell is a batch matrix format which explicitly stores all values of the - * matrix in each of the batches. + * Ell is a sparse matrix format that stores the same number of nonzeros in each + * row, enabling coalesced accesses. It is suitable for sparsity patterns that + * have a similar number of nonzeros in every row. The values are stored in a + * column-major fashion similar to the monolithic gko::matrix::Ell class. 
It is + * also assumed that the sparsity pattern of all the items in the batch is the + * same and therefore only a single copy of the sparsity pattern is stored. * - * The values in each of the batches are stored in row-major format (values - * belonging to the same row appear consecutive in the memory and the values of - * each batch item are also stored consecutively in memory). - * - * @note Though the storage layout is similar to the multi-vector object, the - * class semantics and the operations it aims to provide is different. Hence it - * is recommended to create multi-vector objects if the user means to view the - * data as a set of vectors. - * - * @tparam ValueType precision of matrix elements + * @tparam ValueType value precision of matrix elements + * @tparam IndexType index precision of matrix elements * * @ingroup batch_ell * @ingroup mat_formats From 5cdcedef194ee25d2ca60627bfca7e076aa08e70 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Sun, 8 Oct 2023 10:49:57 +0000 Subject: [PATCH 391/583] Format files Co-authored-by: Pratik Nayak --- dpcpp/matrix/batch_ell_kernels.dp.cpp | 54 +++++++++---------- .../ginkgo/core/base/batch_multi_vector.hpp | 4 +- include/ginkgo/core/matrix/batch_ell.hpp | 8 +-- include/ginkgo/ginkgo.hpp | 1 + 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index 1ed83d79630..1d1210cc270 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -100,17 +100,17 @@ void simple_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -147,22 +147,22 @@ void advanced_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size (exec->get_queue())->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> 
item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 45ba0686468..9a4b8d5cf1d 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -212,8 +212,8 @@ class MultiVector * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 48a3a6d9831..5cb5f73dec5 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -214,8 +214,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const index_type* get_const_col_idxs_for_item(size_type batch_id) const - noexcept + const index_type* get_const_col_idxs_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data(); @@ -243,8 +243,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 8bb29242e88..ad90e264189 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -109,6 +109,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include From a2c8e6551912ddbbe4ad0f7cabae3a2567e9b455 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 10 Oct 2023 17:15:48 +0200 Subject: [PATCH 392/583] Some general fixes. 
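This cleanup mostly tidies up the batch Ell infrastructure (see the diffs
below): the matrix struct namespace is renamed from batch_ell to ell and its
index_type is fixed to int32, Ell::create_with_config_of builds the result
directly from the other matrix's executor, size and stored-elements-per-row
instead of going through create_with_same_config, and the apply_impl overloads
delegate their dimension checks to validate_application_parameters instead of
repeating the assertions by hand.

As a rough usage sketch only (not part of this patch): a host-side caller could
walk the renamed structs roughly as follows, where inspect_items is a
hypothetical helper and everything else comes from the structs touched below.

    #include "core/matrix/batch_struct.hpp"

    // Illustrative sketch: iterate over the items of an ell uniform batch view.
    template <typename ValueType>
    void inspect_items(
        const gko::batch::matrix::ell::uniform_batch<const ValueType>& mat)
    {
        for (gko::size_type b = 0; b < mat.num_batch_items; ++b) {
            auto item = gko::batch::matrix::extract_batch_item(mat, b);
            // item.values is offset per batch item, while item.col_idxs points
            // at the single shared copy of the sparsity pattern.
            (void)item;
        }
    }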
--- .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 13 ++- core/matrix/batch_ell.cpp | 32 +------ core/matrix/batch_struct.hpp | 20 ++--- core/test/matrix/batch_ell.cpp | 8 +- cuda/matrix/batch_dense_kernels.cu | 3 +- cuda/matrix/batch_ell_kernels.cu | 6 +- cuda/matrix/batch_struct.hpp | 23 ++--- dpcpp/matrix/batch_ell_kernels.dp.cpp | 62 +++++++------ dpcpp/matrix/batch_ell_kernels.hpp.inc | 4 +- dpcpp/matrix/batch_struct.hpp | 23 ++--- hip/matrix/batch_ell_kernels.hip.cpp | 6 +- hip/matrix/batch_struct.hip.hpp | 23 ++--- include/ginkgo/core/matrix/batch_ell.hpp | 8 -- omp/matrix/batch_dense_kernels.cpp | 4 +- omp/matrix/batch_ell_kernels.cpp | 4 +- reference/matrix/batch_dense_kernels.cpp | 5 +- reference/matrix/batch_ell_kernels.cpp | 5 +- reference/matrix/batch_ell_kernels.hpp.inc | 4 +- reference/matrix/batch_struct.hpp | 22 ++--- reference/test/matrix/batch_ell_kernels.cpp | 87 ++++++------------- test/matrix/batch_ell_kernels.cpp | 59 ++++++------- test/test_install/test_install.cpp | 9 +- 22 files changed, 184 insertions(+), 246 deletions(-) diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc index e55e7a60471..5c00358c5a0 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template __device__ __forceinline__ void simple_apply( - const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const ValueType* const __restrict__ b, ValueType* const __restrict__ x) { const auto num_rows = mat.num_rows; @@ -60,7 +60,7 @@ template __global__ __launch_bounds__( default_block_size, sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - batch_ell::uniform_batch< + ell::uniform_batch< const ValueType> mat, const gko::batch:: @@ -88,7 +88,7 @@ __global__ __launch_bounds__( template __device__ __forceinline__ void advanced_apply( const ValueType alpha, - const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const ValueType* const __restrict__ b, const ValueType beta, ValueType* const __restrict__ x) { @@ -121,10 +121,9 @@ __global__ __launch_bounds__( const ValueType> alpha, const gko::batch::matrix:: - batch_ell:: - uniform_batch< - const ValueType> - mat, + ell::uniform_batch< + const ValueType> + mat, const gko::batch:: multi_vector:: uniform_batch< diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 0d903b10968..f421fdf2b49 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -104,22 +104,10 @@ template std::unique_ptr> Ell::create_with_config_of( ptr_param> other) -{ - // De-referencing `other` before calling the functions (instead of - // using operator `->`) is currently required to be compatible with - // CUDA 10.1. - // Otherwise, it results in a compile error. 
- return (*other).create_with_same_config(); -} - - -template -std::unique_ptr> -Ell::create_with_same_config() const { return Ell::create( - this->get_executor(), this->get_size(), - this->get_num_stored_elements_per_row()); + other->get_executor(), other->get_size(), + other->get_num_stored_elements_per_row()); } @@ -163,12 +151,7 @@ template void Ell::apply_impl(const MultiVector* b, MultiVector* x) const { - GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); + this->validate_application_parameters(b, x); this->get_executor()->run(ell::make_simple_apply(this, b, x)); } @@ -179,14 +162,7 @@ void Ell::apply_impl(const MultiVector* alpha, const MultiVector* beta, MultiVector* x) const { - GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); - GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items()); - - GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size()); - GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size()); - GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1)); - GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1)); + this->validate_application_parameters(alpha, b, beta, x); this->get_executor()->run( ell::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index 2eed40882bc..eeeeebd53d6 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -83,7 +83,7 @@ struct uniform_batch { } // namespace dense -namespace batch_ell { +namespace ell { /** @@ -109,7 +109,7 @@ struct batch_item { template struct uniform_batch { using value_type = ValueType; - using index_type = int; + using index_type = int32; using entry_type = batch_item; ValueType* values; @@ -127,7 +127,7 @@ struct uniform_batch { }; -} // namespace batch_ell +} // namespace ell template @@ -165,8 +165,8 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item extract_batch_item( template -GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item to_const( - const batch_ell::batch_item& b) +GKO_ATTRIBUTES GKO_INLINE ell::batch_item to_const( + const ell::batch_item& b) { return {b.values, b.col_idxs, b.stride, b.num_rows, b.num_cols, b.num_stored_elems_per_row}; @@ -174,8 +174,8 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item to_const( template -GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch to_const( - const batch_ell::uniform_batch& ub) +GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch to_const( + const ell::uniform_batch& ub) { return {ub.values, ub.col_idxs, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row}; @@ -183,8 +183,8 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch to_const( template -GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( - const batch_ell::uniform_batch& batch, const size_type batch_idx) +GKO_ATTRIBUTES GKO_INLINE ell::batch_item extract_batch_item( + const ell::uniform_batch& batch, const size_type batch_idx) { return {batch.values + batch_idx * batch.num_stored_elems_per_row * batch.num_rows, @@ -196,7 +196,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( } template 
-GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item extract_batch_item( +GKO_ATTRIBUTES GKO_INLINE ell::batch_item extract_batch_item( ValueType* const batch_values, int* const batch_col_idxs, const int stride, const int num_rows, const int num_cols, int num_elems_per_row, const size_type batch_idx) diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index 2830705bf5f..e4dcab23917 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -144,6 +144,7 @@ TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues) TYPED_TEST(Ell, CanBeEmpty) { auto empty = gko::batch::matrix::Ell::create(this->exec); + this->assert_empty(empty.get()); } @@ -151,6 +152,7 @@ TYPED_TEST(Ell, CanBeEmpty) TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty) { auto empty = gko::batch::matrix::Ell::create(this->exec); + ASSERT_EQ(empty->get_const_values(), nullptr); } @@ -284,7 +286,6 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices) using value_type = typename TestFixture::value_type; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, this->exec); auto mat2 = @@ -304,15 +305,14 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) using index_type = int; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec); - auto bat_m = gko::batch::create_from_item>( this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}, mat1->get_num_stored_elements_per_row()); + auto m = gko::batch::create_from_item>( this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row()); @@ -326,7 +326,6 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) using index_type = int; using EllMtx = typename TestFixture::EllMtx; using size_type = gko::size_type; - auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}}, this->exec); auto mat2 = @@ -372,6 +371,7 @@ TYPED_TEST(Ell, CanBeListConstructed) { using value_type = typename TestFixture::value_type; using index_type = int; + auto m = gko::batch::initialize>( {{0.0, -1.0}, {1.0, 0.0}}, this->exec); diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index dd82e15b8cc..c693a3ae861 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -36,7 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include +#include +#include #include "core/base/batch_struct.hpp" diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index ee6a99f04ca..6dd268a2d8e 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -34,18 +34,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 7a6a4ac7f00..e2db1ea6e97 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "core/base/batch_struct.hpp" @@ -91,16 +92,16 @@ get_batch_struct(batch::matrix::Dense* const op) * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch> +inline batch::matrix::ell::uniform_batch> get_batch_struct(const batch::matrix::Ell* const op) { return {as_cuda_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } @@ -108,16 +109,16 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch> -get_batch_struct(batch::matrix::Ell* const op) +inline batch::matrix::ell::uniform_batch> get_batch_struct( + batch::matrix::Ell* const op) { return {as_cuda_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index 1d1210cc270..fca265eceb0 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -39,17 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include #include -#include #include #include "core/base/batch_struct.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "dpcpp/base/batch_struct.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" @@ -98,19 +94,19 @@ void simple_apply(std::shared_ptr exec, } // Launch a kernel that has nbatches blocks, each block has max group size - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -145,24 +141,24 @@ void advanced_apply(std::shared_ptr exec, const dim3 grid(num_batch_items); // Launch a kernel that has nbatches blocks, each block has max group size - (exec->get_queue())->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc index 1048f2f8ff8..7500ae9e060 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template __dpct_inline__ void simple_apply_kernel( - const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1) @@ -56,7 +56,7 @@ __dpct_inline__ void simple_apply_kernel( template __dpct_inline__ void advanced_apply_kernel( const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::matrix::batch_ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& beta, const gko::batch::multi_vector::batch_item& x, diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index 35ff1148dd5..f857653e05e 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "core/base/batch_struct.hpp" @@ -90,16 +91,16 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch -get_batch_struct(const batch::matrix::Ell* const op) +inline batch::matrix::ell::uniform_batch get_batch_struct( + const batch::matrix::Ell* const op) { return {op->get_const_values(), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } @@ -107,16 +108,16 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch get_batch_struct( +inline batch::matrix::ell::uniform_batch get_batch_struct( batch::matrix::Ell* const op) { return {op->get_values(), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index fdd52c38f57..5c6d5179a21 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -35,18 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index a43d7d058b0..6f15b2d966a 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "core/base/batch_struct.hpp" @@ -91,16 +92,16 @@ get_batch_struct(batch::matrix::Dense* const op) * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch> +inline batch::matrix::ell::uniform_batch> get_batch_struct(const batch::matrix::Ell* const op) { return {as_hip_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } @@ -108,16 +109,16 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch> -get_batch_struct(batch::matrix::Ell* const op) +inline batch::matrix::ell::uniform_batch> get_batch_struct( + batch::matrix::Ell* const op) { return {as_hip_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 5cb5f73dec5..6f3db1bb96b 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -356,14 +356,6 @@ class Ell final col_idxs_.get_num_elems()); } - /** - * Creates a Ell matrix with the same configuration as the callers - * matrix. - * - * @returns a Ell matrix with the same configuration as the caller. - */ - std::unique_ptr create_with_same_config() const; - void apply_impl(const MultiVector* b, MultiVector* x) const; diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index 2d0b7ed4d40..b91a4133dba 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -36,8 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
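All of these get_batch_struct overloads (HIP here, CUDA above, DPC++ and reference further down) populate the same non-owning view type declared in core/matrix/batch_struct.hpp. The host-side sketch below spells out the field mapping for a hypothetical non-const operator op of type gko::batch::matrix::Ell<float, gko::int32>*; the header is internal to Ginkgo, so this is illustrative only.

    // Aggregate order follows ell::uniform_batch: values, col_idxs,
    // num_batch_items, stride, num_rows, num_cols, num_stored_elems_per_row.
    // Note that the stride is set to the number of rows, matching the
    // column-major per-item storage of the batch Ell format.
    gko::batch::matrix::ell::uniform_batch<float, gko::int32> view{
        op->get_values(),    // one flat value array covering all batch items
        op->get_col_idxs(),  // single column-index array shared by all items
        op->get_num_batch_items(),
        static_cast<gko::int32>(op->get_common_size()[0]),  // stride
        static_cast<gko::int32>(op->get_common_size()[0]),  // num_rows
        static_cast<gko::int32>(op->get_common_size()[1]),  // num_cols
        static_cast<gko::int32>(op->get_num_stored_elements_per_row())};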
#include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp index 20ea4614e7d..17710a97366 100644 --- a/omp/matrix/batch_ell_kernels.cpp +++ b/omp/matrix/batch_ell_kernels.cpp @@ -36,8 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index 3d7ef03a3bd..87d73bb8e34 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -36,9 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index a3f69827c02..1d3a0e1ef94 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -36,9 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include -#include +#include +#include #include "core/base/batch_struct.hpp" diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc index 41d0a00ddcd..44de2a57af9 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp.inc @@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template inline void simple_apply_kernel( - const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) { @@ -55,7 +55,7 @@ inline void simple_apply_kernel( template inline void advanced_apply_kernel( const ValueType alpha, - const gko::batch::matrix::batch_ell::batch_item& a, + const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const ValueType beta, const gko::batch::multi_vector::batch_item& c) diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 3b562450ee0..fb0e08c16f5 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -95,16 +95,16 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::batch_ell::uniform_batch -get_batch_struct(const batch::matrix::Ell* const op) +inline batch::matrix::ell::uniform_batch get_batch_struct( + const batch::matrix::Ell* const op) { return {op->get_const_values(), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } @@ -112,16 +112,16 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. 
*/ template -inline batch::matrix::batch_ell::uniform_batch get_batch_struct( +inline batch::matrix::ell::uniform_batch get_batch_struct( batch::matrix::Ell* const op) { return {op->get_values(), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp index 76b681c69f7..8a5806a9513 100644 --- a/reference/test/matrix/batch_ell_kernels.cpp +++ b/reference/test/matrix/batch_ell_kernels.cpp @@ -58,15 +58,13 @@ class Ell : public ::testing::Test { protected: using value_type = T; using size_type = gko::size_type; - using Mtx = gko::batch::matrix::Ell; - using MVec = gko::batch::MultiVector; + using BMtx = gko::batch::matrix::Ell; + using BMVec = gko::batch::MultiVector; using EllMtx = gko::matrix::Ell; using DenseMtx = gko::matrix::Dense; - using ComplexMtx = gko::to_complex; - using RealMtx = gko::remove_complex; Ell() : exec(gko::ReferenceExecutor::create()), - mtx_0(gko::batch::initialize( + mtx_0(gko::batch::initialize( {{I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}}, exec)), @@ -74,7 +72,7 @@ class Ell : public ::testing::Test { {I({1.0, -1.0, 1.5}), I({-2.0, 2.0, 3.0})}, exec)), mtx_01(gko::initialize( {I({1.0, -2.0, -0.5}), I({1.0, -2.5, 4.0})}, exec)), - b_0(gko::batch::initialize( + b_0(gko::batch::initialize( {{I({1.0, 0.0, 1.0}), I({2.0, 0.0, 1.0}), I({1.0, 0.0, 2.0})}, {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), @@ -88,7 +86,7 @@ class Ell : public ::testing::Test { {I({-1.0, 1.0, 1.0}), I({1.0, -1.0, 1.0}), I({1.0, 0.0, 2.0})}, exec)), - x_0(gko::batch::initialize( + x_0(gko::batch::initialize( {{I({2.0, 0.0, 1.0}), I({2.0, 0.0, 2.0})}, {I({-2.0, 1.0, 1.0}), I({1.0, -1.0, -1.0})}}, exec)), @@ -99,13 +97,13 @@ class Ell : public ::testing::Test { {} std::shared_ptr exec; - std::unique_ptr mtx_0; + std::unique_ptr mtx_0; std::unique_ptr mtx_00; std::unique_ptr mtx_01; - std::unique_ptr b_0; + std::unique_ptr b_0; std::unique_ptr b_00; std::unique_ptr b_01; - std::unique_ptr x_0; + std::unique_ptr x_0; std::unique_ptr x_00; std::unique_ptr x_01; @@ -121,38 +119,10 @@ TYPED_TEST(Ell, AppliesToBatchMultiVector) using T = typename TestFixture::value_type; this->mtx_0->apply(this->b_0.get(), this->x_0.get()); + this->mtx_00->apply(this->b_00.get(), this->x_00.get()); this->mtx_01->apply(this->b_01.get(), this->x_01.get()); - - auto res = gko::batch::unbatch>(this->x_0.get()); - - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); -} - - -TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) -{ - using Mtx = typename TestFixture::Mtx; - using MVec = typename TestFixture::MVec; - using DenseMtx = typename TestFixture::DenseMtx; - using T = typename TestFixture::value_type; - auto alpha = gko::batch::initialize(2, {1.5}, this->exec); - auto beta = gko::batch::initialize(2, {-4.0}, this->exec); - auto alpha0 = gko::initialize({1.5}, this->exec); - auto alpha1 = gko::initialize({1.5}, this->exec); - auto beta0 = gko::initialize({-4.0}, this->exec); - auto beta1 = gko::initialize({-4.0}, this->exec); - - 
this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), - this->x_0.get()); - this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), - this->x_00.get()); - this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), - this->x_01.get()); - auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); } @@ -160,12 +130,12 @@ TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector) TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) { - using Mtx = typename TestFixture::Mtx; - using MVec = typename TestFixture::MVec; + using BMtx = typename TestFixture::BMtx; + using BMVec = typename TestFixture::BMVec; using DenseMtx = typename TestFixture::DenseMtx; using T = typename TestFixture::value_type; - auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); - auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); + auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); auto alpha0 = gko::initialize({1.5}, this->exec); auto alpha1 = gko::initialize({-1.0}, this->exec); auto beta0 = gko::initialize({2.5}, this->exec); @@ -173,13 +143,12 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(), this->x_0.get()); + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), this->x_00.get()); this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), this->x_01.get()); - auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); } @@ -187,8 +156,8 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols) { - using MVec = typename TestFixture::MVec; - auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); + using BMVec = typename TestFixture::BMVec; + auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}}); ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), gko::DimensionMismatch); @@ -197,8 +166,8 @@ TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols) TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows) { - using MVec = typename TestFixture::MVec; - auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); + using BMVec = typename TestFixture::BMVec; + auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}}); ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()), gko::DimensionMismatch); @@ -207,9 +176,9 @@ TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows) TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()), gko::DimensionMismatch); @@ -218,13 +187,13 @@ TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}}); auto alpha = - MVec::create(this->exec, 
gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); auto beta = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); ASSERT_THROW( this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), @@ -234,13 +203,13 @@ TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension) TYPED_TEST(Ell, AdvancedApplyFailsOnWrongAlphaDimension) { - using MVec = typename TestFixture::MVec; + using BMVec = typename TestFixture::BMVec; auto res = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}}); auto alpha = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}}); auto beta = - MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); + BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}}); ASSERT_THROW( this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()), diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp index bc1e0c7fb42..083af0a0938 100644 --- a/test/matrix/batch_ell_kernels.cpp +++ b/test/matrix/batch_ell_kernels.cpp @@ -55,18 +55,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. class Ell : public CommonTestFixture { protected: - using Mtx = gko::batch::matrix::Ell; - using MVec = gko::batch::MultiVector; + using BMtx = gko::batch::matrix::Ell; + using BMVec = gko::batch::MultiVector; Ell() : rand_engine(15) {} - template - std::unique_ptr gen_mtx(const gko::size_type num_batch_items, - gko::size_type num_rows, - gko::size_type num_cols, - int num_elems_per_row) + template + std::unique_ptr gen_mtx(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols, + int num_elems_per_row) { - return gko::test::generate_random_batch_matrix( + return gko::test::generate_random_batch_matrix( num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_elems_per_row, num_elems_per_row), @@ -74,11 +74,11 @@ class Ell : public CommonTestFixture { num_elems_per_row); } - std::unique_ptr gen_mvec(const gko::size_type num_batch_items, - gko::size_type num_rows, - gko::size_type num_cols) + std::unique_ptr gen_mvec(const gko::size_type num_batch_items, + gko::size_type num_rows, + gko::size_type num_cols) { - return gko::test::generate_random_batch_matrix( + return gko::test::generate_random_batch_matrix( num_batch_items, num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); @@ -89,15 +89,16 @@ class Ell : public CommonTestFixture { { const int num_rows = 252; const int num_cols = 32; - x = gen_mtx(batch_size, num_rows, num_cols, num_elems_per_row); + GKO_ASSERT(num_elems_per_row <= num_cols); + mat = gen_mtx(batch_size, num_rows, num_cols, num_elems_per_row); y = gen_mvec(batch_size, num_cols, num_vecs); alpha = gen_mvec(batch_size, 1, 1); beta = gen_mvec(batch_size, 1, 1); - dx = gko::clone(exec, x); + dmat = gko::clone(exec, mat); dy = gko::clone(exec, y); dalpha = gko::clone(exec, alpha); dbeta = gko::clone(exec, beta); - expected = MVec::create( + expected = BMVec::create( ref, gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs})); expected->fill(gko::one()); @@ -107,16 +108,16 @@ class Ell : public CommonTestFixture { std::ranlux48 rand_engine; const size_t batch_size 
= 11; - std::unique_ptr x; - std::unique_ptr y; - std::unique_ptr alpha; - std::unique_ptr beta; - std::unique_ptr expected; - std::unique_ptr dresult; - std::unique_ptr dx; - std::unique_ptr dy; - std::unique_ptr dalpha; - std::unique_ptr dbeta; + std::unique_ptr mat; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr dresult; + std::unique_ptr dmat; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; }; @@ -124,8 +125,8 @@ TEST_F(Ell, SingleVectorApplyIsEquivalentToRef) { set_up_apply_data(1); - x->apply(y.get(), expected.get()); - dx->apply(dy.get(), dresult.get()); + mat->apply(y.get(), expected.get()); + dmat->apply(dy.get(), dresult.get()); GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } @@ -135,8 +136,8 @@ TEST_F(Ell, SingleVectorAdvancedApplyIsEquivalentToRef) { set_up_apply_data(1); - x->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + mat->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmat->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r::value); } diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 7e53ea8f165..c00bb594ecd 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -219,13 +219,20 @@ int main() auto test = batch_multi_vector_type::create(exec); } - // core/base/batch_dense.hpp + // core/matrix/batch_dense.hpp { using type1 = float; using batch_dense_type = gko::batch::matrix::Dense; auto test = batch_dense_type::create(exec); } + // core/matrix/batch_ell.hpp + { + using type1 = float; + using batch_ell_type = gko::batch::matrix::Ell; + auto test = batch_ell_type::create(exec); + } + // core/base/combination.hpp { using type1 = int; From b4d877fbe4e6e49371f6d682d87b2c5127f9fffa Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 11 Oct 2023 14:07:13 +0200 Subject: [PATCH 393/583] Kernel updates and batch_random_matrix gen --- .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 4 +- core/matrix/batch_ell.cpp | 7 -- core/test/utils/batch_helpers.hpp | 17 +++- core/test/utils/matrix_generator.hpp | 90 +++++++++++++++++++ cuda/matrix/batch_ell_kernels.cu | 1 + dpcpp/matrix/batch_ell_kernels.hpp.inc | 57 ++++++------ hip/matrix/batch_ell_kernels.hip.cpp | 1 + include/ginkgo/core/matrix/batch_ell.hpp | 19 ++-- test/matrix/batch_ell_kernels.cpp | 2 +- 9 files changed, 149 insertions(+), 49 deletions(-) diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc index 5c00358c5a0..19c29f14aa8 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -46,7 +46,7 @@ __device__ __forceinline__ void simple_apply( for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; - if (col_idx < idx) { + if (col_idx == invalid_index()) { break; } else { temp += val[ind] * b[col_idx]; @@ -102,7 +102,7 @@ __device__ __forceinline__ void advanced_apply( for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; - if (col_idx < idx) { + if (col_idx == invalid_index()) { break; } else { temp += alpha * val[ind] * b[col_idx]; diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 
f421fdf2b49..c9dbe6d51c9 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -128,13 +128,6 @@ Ell::create_const( } -inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) -{ - return batch_dim<2>(sizes.get_num_batch_items(), - dim<2>(1, sizes.get_common_size()[1])); -} - - template Ell::Ell(std::shared_ptr exec, const batch_dim<2>& size, diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index b040691999e..0b6197b5062 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -82,11 +82,22 @@ std::unique_ptr generate_random_batch_matrix( auto result = MatrixType::create( exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)), std::forward(args)...); + auto sp_mat = generate_random_device_matrix_data( + num_rows, num_cols, nonzero_dist, value_dist, engine, + exec->get_master()); + auto row_idxs = gko::array::const_view( + exec->get_master(), sp_mat.get_num_elems(), + sp_mat.get_const_row_idxs()) + .copy_to_array(); + auto col_idxs = gko::array::const_view( + exec->get_master(), sp_mat.get_num_elems(), + sp_mat.get_const_col_idxs()) + .copy_to_array(); for (size_type b = 0; b < num_batch_items; b++) { - auto rand_mat = - generate_random_matrix( - num_rows, num_cols, nonzero_dist, value_dist, engine, exec); + auto rand_mat = fill_random_matrix_with_sparsity_pattern< + typename MatrixType::unbatch_type, index_type>( + num_rows, num_cols, row_idxs, col_idxs, value_dist, engine, exec); result->create_view_for_item(b)->copy_from(rand_mat.get()); } diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 6928c5424a5..8a82ae744e7 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -54,6 +55,49 @@ namespace gko { namespace test { +/** + * Fills matrix data for a random matrix given a sparsity pattern + * + * @tparam ValueType the type for matrix values + * @tparam IndexType the type for row and column indices + * @tparam ValueDistribution type of value distribution + * @tparam Engine type of random engine + * + * @param num_rows number of rows + * @param num_cols number of columns + * @param row_idxs the row indices of the matrix + * @param col_idxs the column indices of the matrix + * @param value_dist distribution of matrix values + * @param engine a random engine + * + * @return the generated matrix_data with entries according to the given + * dimensions and nonzero count and value distributions. + */ +template +matrix_data fill_random_matrix_data( + size_type num_rows, size_type num_cols, + const gko::array& row_indices, + const gko::array& col_indices, ValueDistribution&& value_dist, + Engine&& engine) +{ + matrix_data data{gko::dim<2>{num_rows, num_cols}, {}}; + auto host_exec = row_indices.get_executor()->get_master(); + auto host_row_indices = make_temporary_clone(host_exec, &row_indices); + auto host_col_indices = make_temporary_clone(host_exec, &col_indices); + + for (int nnz = 0; nnz < row_indices.get_num_elems(); ++nnz) { + data.nonzeros.emplace_back( + host_row_indices->get_const_data()[nnz], + host_col_indices->get_const_data()[nnz], + detail::get_rand_value(value_dist, engine)); + } + + data.ensure_row_major_order(); + return data; +} + + /** * Generates matrix data for a random matrix. 
* @@ -156,6 +200,48 @@ generate_random_device_matrix_data(gko::size_type num_rows, } +/** + * Fills a random matrix with given sparsity pattern. + * + * @tparam MatrixType type of matrix to generate (must implement + * the interface `ReadableFromMatrixData<>` and provide + * matching `value_type` and `index_type` type aliases) + * + * @param num_rows number of rows + * @param num_cols number of columns + * @param value_dist distribution of matrix values + * @param row_idxs the row indices of the matrix + * @param col_idxs the column indices of the matrix + * @param exec executor where the matrix should be allocated + * @param args additional arguments for the matrix constructor + * + * The other (template) parameters match generate_random_matrix_data. + * + * @return the unique pointer of MatrixType + */ +template , + typename IndexType = typename MatrixType::index_type, + typename ValueDistribution, typename Engine, typename... MatrixArgs> +std::unique_ptr fill_random_matrix_with_sparsity_pattern( + size_type num_rows, size_type num_cols, + const gko::array& row_idxs, + const gko::array& col_idxs, ValueDistribution&& value_dist, + Engine&& engine, std::shared_ptr exec, MatrixArgs&&... args) +{ + using value_type = typename MatrixType::value_type; + using index_type = IndexType; + + GKO_ASSERT(row_idxs.get_num_elems() == col_idxs.get_num_elems()); + GKO_ASSERT(row_idxs.get_num_elems() < (num_rows * num_cols)); + auto result = MatrixType::create(exec, std::forward(args)...); + result->read(fill_random_matrix_data( + num_rows, num_cols, row_idxs, col_idxs, + std::forward(value_dist), + std::forward(engine))); + return result; +} + + /** * Generates a random matrix. * @@ -163,6 +249,10 @@ generate_random_device_matrix_data(gko::size_type num_rows, * the interface `ReadableFromMatrixData<>` and provide * matching `value_type` and `index_type` type aliases) * + * @param num_rows number of rows + * @param num_cols number of columns + * @param nonzero_dist distribution of nonzeros per row + * @param value_dist distribution of matrix values * @param exec executor where the matrix should be allocated * @param args additional arguments for the matrix constructor * diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 6dd268a2d8e..5cadd7755a2 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
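With the two helpers above, generate_random_batch_matrix (see core/test/utils/batch_helpers.hpp earlier in this patch) draws the sparsity pattern once and only re-randomizes the values for each batch item, which is what the shared-pattern batch Ell format requires. The usage sketch below mirrors the gen_mtx helper in the tests further down; the executor exec, the fixed five stored elements per row, and the double/gko::int32 instantiation are assumptions, and these helpers live in Ginkgo's internal test utilities rather than the public API.

    #include <random>

    // Sketch: all 11 batch items share one pattern with 5 stored elements per
    // row, but their values are drawn independently.
    using BMtx = gko::batch::matrix::Ell<double, gko::int32>;

    std::ranlux48 engine(15);
    const int num_elems_per_row = 5;
    auto batch_mat = gko::test::generate_random_batch_matrix<BMtx>(
        11, 252, 32,
        std::uniform_int_distribution<>(num_elems_per_row, num_elems_per_row),
        std::normal_distribution<>(-1.0, 1.0), engine, exec,
        // additional argument forwarded to BMtx::create:
        num_elems_per_row);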
#include +#include #include diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc index 7500ae9e060..e6501bafaba 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -42,38 +42,37 @@ __dpct_inline__ void simple_apply_kernel( auto temp = zero(); for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx < idx) + if (col_idx == invalid_index()) { break; - else - temp += mat.values[tidx + idx * mat.stride] * - b.values[col_idx * b.stride]; + else temp += mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; + } + x.values[tidx * x.stride] = temp; } - x.values[tidx * x.stride] = temp; } -} -template -__dpct_inline__ void advanced_apply_kernel( - const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::matrix::ell::batch_item& mat, - const gko::batch::multi_vector::batch_item& b, - const gko::batch::multi_vector::batch_item& beta, - const gko::batch::multi_vector::batch_item& x, - sycl::nd_item<3>& item_ct1) -{ - for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; - tidx += item_ct1.get_local_range().size()) { - auto temp = zero(); - for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { - const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx < idx) - break; - else - temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] * - b.values[col_idx * b.stride]; + template + __dpct_inline__ void advanced_apply_kernel( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::matrix::ell::batch_item& mat, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& beta, + const gko::batch::multi_vector::batch_item& x, + sycl::nd_item<3>& item_ct1) + { + for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; + tidx += item_ct1.get_local_range().size()) { + auto temp = zero(); + for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { + const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; + if (col_idx == invalid_index()) { + break; + else temp += alpha.values[0] * + mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; + } + x.values[tidx * x.stride] = + temp + beta.values[0] * x.values[tidx * x.stride]; + } } - x.values[tidx * x.stride] = - temp + beta.values[0] * x.values[tidx * x.stride]; - } -} diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index 5c6d5179a21..96e7cdb298e 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 6f3db1bb96b..be49e2cff41 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -58,9 +58,14 @@ namespace matrix { * Ell is a sparse matrix format that stores the same number of nonzeros in each * row, enabling coalesced accesses. It is suitable for sparsity patterns that * have a similar number of nonzeros in every row. The values are stored in a - * column-major fashion similar to the monolithic gko::matrix::Ell class. It is - * also assumed that the sparsity pattern of all the items in the batch is the - * same and therefore only a single copy of the sparsity pattern is stored. 
+ * column-major fashion similar to the monolithic gko::matrix::Ell class. + * + * Similar to the monolithic gko::matrix::Ell class, invalid_index is + * used as the column index for padded zero entries. + * + * @note It is also assumed that the sparsity pattern of all the items in the + * batch is the same and therefore only a single copy of the sparsity pattern is + * stored. * * @tparam ValueType value precision of matrix elements * @tparam IndexType index precision of matrix elements @@ -253,13 +258,13 @@ class Ell final /** * Creates a constant (immutable) batch ell matrix from a constant - * array. + * array. The column indices array needs to be the same for all batch items. * * @param exec the executor to create the matrix on * @param size the dimensions of the matrix * @param num_elems_per_row the number of elements to be stored in each row * @param values the value array of the matrix - * @param col_idxs the col_idxs array of the matrix + * @param col_idxs the col_idxs array of a single batch item of the matrix. * * @return A smart pointer to the constant matrix wrapping the input * array (if it resides on the same executor as the matrix) or a copy of the @@ -325,7 +330,7 @@ class Ell final /** * Creates a Ell matrix from an already allocated (and initialized) - * array. + * array. The column indices array needs to be the same for all batch items. * * @tparam ValuesArray type of array of values * @@ -333,7 +338,7 @@ class Ell final * @param size size of the matrix * @param num_elems_per_row the number of elements to be stored in each row * @param values array of matrix values - * @param col_idxs the col_idxs array of the matrix + * @param col_idxs the col_idxs array of a single batch item of the matrix. * * @note If `values` is not an rvalue, not an array of ValueType, or is on * the wrong executor, an internal copy will be created, and the diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp index 083af0a0938..572f47ba47d 100644 --- a/test/matrix/batch_ell_kernels.cpp +++ b/test/matrix/batch_ell_kernels.cpp @@ -55,7 +55,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
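To make the padding convention concrete: for the two-item, 2x3 batch used by the core unit tests (two stored elements per row), the column-major, shared-pattern storage looks as follows. Row 0 of each item has only one nonzero, so its second slot holds a zero value and the invalid_index sentinel, which is -1 for gko::int32. This mirrors the data in core/test/matrix/batch_ell.cpp and is shown purely for illustration.

    // Item 0: [[-1.0, 0.0, 0.0], [0.0, 2.5, 3.5]]
    // Item 1: [[ 1.0, 0.0, 0.0], [0.0, 2.0, 3.0]]
    double values[] = {-1.0, 2.5,   // item 0, slot 0 (rows 0 and 1)
                        0.0, 3.5,   // item 0, slot 1 (row 0 is padding)
                        1.0, 2.0,   // item 1, slot 0
                        0.0, 3.0};  // item 1, slot 1 (row 0 is padding)
    gko::int32 col_idxs[] = {0, 1,    // slot 0: shared by both items
                             -1, 2};  // slot 1: -1 == invalid_index<gko::int32>()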
class Ell : public CommonTestFixture { protected: - using BMtx = gko::batch::matrix::Ell; + using BMtx = gko::batch::matrix::Ell; using BMVec = gko::batch::MultiVector; Ell() : rand_engine(15) {} From ea785065fbb5ef33a3f6061a4e2db6b904214a8a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 11 Oct 2023 15:41:55 +0200 Subject: [PATCH 394/583] Review updates Co-authored-by: Marcel Koch Co-authored-by: Yu-Hsiang Tsai --- .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 22 +-- core/base/batch_multi_vector.cpp | 21 --- core/base/batch_utilities.hpp | 47 ++--- core/matrix/batch_struct.hpp | 40 ++--- core/test/matrix/batch_ell.cpp | 160 ++++++++---------- core/test/utils/matrix_generator.hpp | 2 +- cuda/matrix/batch_struct.hpp | 28 +-- dpcpp/matrix/batch_ell_kernels.dp.cpp | 2 + dpcpp/matrix/batch_ell_kernels.hpp.inc | 61 +++---- dpcpp/matrix/batch_struct.hpp | 28 +-- hip/matrix/batch_struct.hip.hpp | 28 +-- .../ginkgo/core/base/batch_multi_vector.hpp | 18 +- include/ginkgo/core/matrix/batch_dense.hpp | 2 - include/ginkgo/core/matrix/batch_ell.hpp | 7 +- reference/matrix/batch_ell_kernels.hpp.inc | 24 +-- reference/matrix/batch_struct.hpp | 28 +-- reference/test/matrix/batch_ell_kernels.cpp | 8 +- 17 files changed, 235 insertions(+), 291 deletions(-) diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc index 19c29f14aa8..de6ca879890 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -31,9 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -template +template __device__ __forceinline__ void simple_apply( - const gko::batch::matrix::ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const ValueType* const __restrict__ b, ValueType* const __restrict__ x) { const auto num_rows = mat.num_rows; @@ -46,7 +46,7 @@ __device__ __forceinline__ void simple_apply( for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { + if (col_idx == invalid_index()) { break; } else { temp += val[ind] * b[col_idx]; @@ -56,12 +56,13 @@ __device__ __forceinline__ void simple_apply( } } -template +template __global__ __launch_bounds__( default_block_size, sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: ell::uniform_batch< - const ValueType> + const ValueType, + IndexType> mat, const gko::batch:: multi_vector:: @@ -85,10 +86,10 @@ __global__ __launch_bounds__( } -template +template __device__ __forceinline__ void advanced_apply( const ValueType alpha, - const gko::batch::matrix::ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const ValueType* const __restrict__ b, const ValueType beta, ValueType* const __restrict__ x) { @@ -102,7 +103,7 @@ __device__ __forceinline__ void advanced_apply( for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { + if (col_idx == invalid_index()) { break; } else { temp += alpha * val[ind] * b[col_idx]; @@ -112,7 +113,7 @@ __device__ __forceinline__ void advanced_apply( } } -template +template __global__ __launch_bounds__( default_block_size, sm_oversubscription) void advanced_apply_kernel(const gko::batch:: @@ -122,7 +123,8 @@ __global__ __launch_bounds__( alpha, const 
gko::batch::matrix:: ell::uniform_batch< - const ValueType> + const ValueType, + IndexType> mat, const gko::batch:: multi_vector:: diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 6a14919bf2f..6dcf8dd90b5 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -291,27 +291,6 @@ void MultiVector::move_to( } -template -void MultiVector::convert_to(matrix::Dense* result) const -{ - auto exec = result->get_executor() == nullptr ? this->get_executor() - : result->get_executor(); - auto tmp = gko::batch::matrix::Dense::create_const( - exec, this->get_size(), - make_const_array_view(this->get_executor(), - this->get_num_stored_elements(), - this->get_const_values())); - result->copy_from(tmp); -} - - -template -void MultiVector::move_to(matrix::Dense* result) -{ - this->convert_to(result); -} - - #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index c37c0cae721..7204c78a552 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -165,12 +165,8 @@ std::vector> write( /** * Creates and initializes a batch of single column-vectors. * - * This function first creates a temporary MultiVector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * @@ -180,7 +176,6 @@ std::vector> write( * including the Executor, which is passed as the first * argument * - * @ingroup MultiVector * @ingroup mat_formats */ template @@ -220,23 +215,19 @@ std::unique_ptr initialize( /** - * Creates and initializes a batch of multi-vectors. - * - * This function first creates a temporary MultiVector, fills it with - * passed in values, and then converts the vector to the requested type. + * Creates and initializes a batch of matrices. * - * @tparam Matrix matrix type to initialize - * (Dense has to implement the ConvertibleTo interface) + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param vals values used to initialize the vector - * @param exec Executor associated to the vector + * @param vals values used to initialize the matrix + * @param exec Executor associated with the matrix * @param create_args additional arguments passed to Matrix::create, not * including the Executor, which is passed as the first * argument * - * @ingroup MultiVector * @ingroup mat_formats */ template @@ -290,23 +281,18 @@ std::unique_ptr initialize( * Creates and initializes a batch single column-vector by making copies of the * single input column vector. * - * This function first creates a temporary batch multi-vector, fills it with - * passed in values, and then converts the vector to the requested type. 
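Stripped of the CUDA/HIP thread and block indexing, the per-item apply in these kernels reduces to the following sequential sketch. It is a restatement for clarity, not code from the patch, and the function and parameter names are invented.

    // Sequential sketch of x = A * b for one batch item in ELL storage, where
    // stride == num_rows and a column index equal to invalid_index<IndexType>()
    // marks a padded slot.
    template <typename ValueType, typename IndexType>
    void ell_item_spmv(const ValueType* values, const IndexType* col_idxs,
                       int num_rows, int stride, int num_stored_elems_per_row,
                       const ValueType* b, ValueType* x)
    {
        for (int row = 0; row < num_rows; row++) {
            auto temp = ValueType{};
            for (int k = 0; k < num_stored_elems_per_row; k++) {
                const auto ind = row + k * stride;
                const auto col = col_idxs[ind];
                if (col == gko::invalid_index<IndexType>()) {
                    break;  // the remaining slots of this row are padding
                }
                temp += values[ind] * b[col];
            }
            x[row] = temp;
        }
    }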
- * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * * @param num_vectors The number of times the input vector is to be duplicated * @param vals values used to initialize each vector in the temp. batch - * @param exec Executor associated to the vector + * @param exec Executor associated with the matrix * @param create_args additional arguments passed to Matrix::create, not * including the Executor, which is passed as the first * argument * - * @ingroup MultiVector * @ingroup mat_formats */ template @@ -343,23 +329,18 @@ std::unique_ptr initialize( /** * Creates and initializes a matrix from copies of a given matrix. * - * This function first creates a temporary batch multi-vector, fills it with - * passed in values, and then converts the vector to the requested type. - * - * @tparam Matrix matrix type to initialize - * (MultiVector has to implement the ConvertibleTo - * interface) + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * * @param num_batch_items The number of times the input matrix is duplicated - * @param vals values used to initialize each vector in the temp. batch - * @param exec Executor associated to the vector + * @param vals values used to initialize each matrix in the temp. batch + * @param exec Executor associated to the matrix * @param create_args additional arguments passed to Matrix::create, not * including the Executor, which is passed as the first * argument * - * @ingroup LinOp * @ingroup mat_formats */ template diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp index eeeeebd53d6..f208f5ff078 100644 --- a/core/matrix/batch_struct.hpp +++ b/core/matrix/batch_struct.hpp @@ -89,10 +89,10 @@ namespace ell { /** * Encapsulates one matrix from a batch of ell matrices. */ -template +template struct batch_item { using value_type = ValueType; - using index_type = int32; + using index_type = IndexType; ValueType* values; const index_type* col_idxs; @@ -106,11 +106,11 @@ struct batch_item { /** * A 'simple' structure to store a global uniform batch of ell matrices. 
*/ -template +template struct uniform_batch { using value_type = ValueType; - using index_type = int32; - using entry_type = batch_item; + using index_type = IndexType; + using entry_type = batch_item; ValueType* values; const index_type* col_idxs; @@ -164,27 +164,28 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item extract_batch_item( } -template -GKO_ATTRIBUTES GKO_INLINE ell::batch_item to_const( - const ell::batch_item& b) +template +GKO_ATTRIBUTES GKO_INLINE ell::batch_item to_const( + const ell::batch_item& b) { return {b.values, b.col_idxs, b.stride, b.num_rows, b.num_cols, b.num_stored_elems_per_row}; } -template -GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch to_const( - const ell::uniform_batch& ub) +template +GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch +to_const(const ell::uniform_batch& ub) { return {ub.values, ub.col_idxs, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row}; } -template -GKO_ATTRIBUTES GKO_INLINE ell::batch_item extract_batch_item( - const ell::uniform_batch& batch, const size_type batch_idx) +template +GKO_ATTRIBUTES GKO_INLINE ell::batch_item +extract_batch_item(const ell::uniform_batch& batch, + const size_type batch_idx) { return {batch.values + batch_idx * batch.num_stored_elems_per_row * batch.num_rows, @@ -195,11 +196,12 @@ GKO_ATTRIBUTES GKO_INLINE ell::batch_item extract_batch_item( batch.num_stored_elems_per_row}; } -template -GKO_ATTRIBUTES GKO_INLINE ell::batch_item extract_batch_item( - ValueType* const batch_values, int* const batch_col_idxs, const int stride, - const int num_rows, const int num_cols, int num_elems_per_row, - const size_type batch_idx) +template +GKO_ATTRIBUTES GKO_INLINE ell::batch_item +extract_batch_item(ValueType* const batch_values, + IndexType* const batch_col_idxs, const int stride, + const int num_rows, const int num_cols, + int num_elems_per_row, const size_type batch_idx) { return {batch_values + batch_idx * num_elems_per_row * num_rows, batch_col_idxs, diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index e4dcab23917..c36a877ac14 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
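Since col_idxs is shared by all batch items, extract_batch_item only advances the value pointer. The small host-side helper below spells out the same offset arithmetic for a single value lookup (batch item b, row r, stored-element slot k); the helper name and signature are invented for illustration.

    // values[ell_value_offset(b, r, k, ...)] pairs with col_idxs[r + k * stride];
    // the column lookup is independent of the batch index.
    inline gko::size_type ell_value_offset(gko::size_type batch_idx,
                                           gko::size_type row,
                                           gko::size_type slot,
                                           gko::size_type num_rows,
                                           gko::size_type num_stored_elems_per_row,
                                           gko::size_type stride)
    {
        const auto item_offset = batch_idx * num_stored_elems_per_row * num_rows;
        return item_offset + row + slot * stride;
    }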
#include #include -#include #include @@ -52,26 +51,26 @@ class Ell : public ::testing::Test { protected: using value_type = T; using index_type = gko::int32; - using EllMtx = gko::matrix::Ell; + using BatchEllMtx = gko::batch::matrix::Ell; + using EllMtx = gko::matrix::Ell; using size_type = gko::size_type; Ell() : exec(gko::ReferenceExecutor::create()), - mtx(gko::batch::initialize>( + mtx(gko::batch::initialize( {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}}, exec, 3)), - sp_mtx(gko::batch::initialize>( + sp_mtx(gko::batch::initialize( {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}}, exec, 2)), - ell_mtx(gko::initialize>( - {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 3)), - sp_ell_mtx(gko::initialize>( - {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 2)) + ell_mtx(gko::initialize({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, + exec, gko::dim<2>(2, 3), 3)), + sp_ell_mtx(gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, + exec, gko::dim<2>(2, 3), 2)) {} - static void assert_equal_to_original_sparse_mtx( - const gko::batch::matrix::Ell* m) + static void assert_equal_to_original_sparse_mtx(const BatchEllMtx* m) { ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); @@ -91,8 +90,7 @@ class Ell : public ::testing::Test { ASSERT_EQ(m->get_const_col_idxs()[3], index_type{2}); } - static void assert_equal_to_original_mtx( - const gko::batch::matrix::Ell* m) + static void assert_equal_to_original_mtx(const BatchEllMtx* m) { ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3)); @@ -112,7 +110,7 @@ class Ell : public ::testing::Test { ASSERT_EQ(m->get_const_values()[11], value_type{3.0}); } - static void assert_empty(gko::batch::matrix::Ell* m) + static void assert_empty(BatchEllMtx* m) { ASSERT_EQ(m->get_num_batch_items(), 0); ASSERT_EQ(m->get_num_stored_elements(), 0); @@ -120,10 +118,10 @@ class Ell : public ::testing::Test { } std::shared_ptr exec; - std::unique_ptr> mtx; - std::unique_ptr> sp_mtx; - std::unique_ptr> ell_mtx; - std::unique_ptr> sp_ell_mtx; + std::unique_ptr mtx; + std::unique_ptr sp_mtx; + std::unique_ptr ell_mtx; + std::unique_ptr sp_ell_mtx; }; TYPED_TEST_SUITE(Ell, gko::test::ValueTypes); @@ -143,16 +141,11 @@ TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues) TYPED_TEST(Ell, CanBeEmpty) { - auto empty = gko::batch::matrix::Ell::create(this->exec); + using BatchEllMtx = typename TestFixture::BatchEllMtx; - this->assert_empty(empty.get()); -} - - -TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty) -{ - auto empty = gko::batch::matrix::Ell::create(this->exec); + auto empty = BatchEllMtx::create(this->exec); + this->assert_empty(empty.get()); ASSERT_EQ(empty->get_const_values(), nullptr); } @@ -180,7 +173,9 @@ TYPED_TEST(Ell, CanCreateSpEllItemView) TYPED_TEST(Ell, CanBeCopied) { - auto mtx_copy = gko::batch::matrix::Ell::create(this->exec); + using BatchEllMtx = typename TestFixture::BatchEllMtx; + + auto mtx_copy = BatchEllMtx::create(this->exec); mtx_copy->copy_from(this->mtx.get()); @@ -192,7 +187,9 @@ TYPED_TEST(Ell, CanBeCopied) TYPED_TEST(Ell, CanBeMoved) { - auto mtx_copy = gko::batch::matrix::Ell::create(this->exec); + using BatchEllMtx = typename TestFixture::BatchEllMtx; + + auto mtx_copy = BatchEllMtx::create(this->exec); this->mtx->move_to(mtx_copy); @@ -219,10 +216,10 @@ TYPED_TEST(Ell, CanBeCleared) TYPED_TEST(Ell, CanBeConstructedWithSize) { - using size_type = gko::size_type; + using BatchEllMtx = typename 
TestFixture::BatchEllMtx; - auto m = gko::batch::matrix::Ell::create( - this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2); + auto m = BatchEllMtx::create(this->exec, + gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3)); @@ -235,19 +232,19 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingData) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using size_type = gko::size_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; // clang-format off value_type values[] = { -1.0, 2.5, - 0.0, 3.5, - 1.0, 2.0, - 0.0, 3.0}; + 0.0, 3.5, + 1.0, 2.0, + 0.0, 3.0}; index_type col_idxs[] = { - 0, 1, + 0, 1, -1, 2}; // clang-format on - auto m = gko::batch::matrix::Ell::create( + auto m = BatchEllMtx::create( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2, gko::array::view(this->exec, 8, values), gko::array::view(this->exec, 4, col_idxs)); @@ -260,19 +257,19 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingConstData) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using size_type = gko::size_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; // clang-format off value_type values[] = { -1.0, 2.5, - 0.0, 3.5, - 1.0, 2.0, - 0.0, 3.0}; + 0.0, 3.5, + 1.0, 2.0, + 0.0, 3.0}; index_type col_idxs[] = { - 0, 1, + 0, 1, -1, 2}; // clang-format on - auto m = gko::batch::matrix::Ell::create_const( + auto m = BatchEllMtx::create_const( this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2, gko::array::const_view(this->exec, 8, values), gko::array::const_view(this->exec, 4, col_idxs)); @@ -283,15 +280,14 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingConstData) TYPED_TEST(Ell, CanBeConstructedFromEllMatrices) { - using value_type = typename TestFixture::value_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; using EllMtx = typename TestFixture::EllMtx; - using size_type = gko::size_type; auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec); - auto m = gko::batch::create_from_item>( + auto m = gko::batch::create_from_item( this->exec, std::vector{mat1.get(), mat2.get()}, mat1->get_num_stored_elements_per_row()); @@ -301,19 +297,15 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices) TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) { - using value_type = typename TestFixture::value_type; - using index_type = int; + using BatchEllMtx = typename TestFixture::BatchEllMtx; using EllMtx = typename TestFixture::EllMtx; - using size_type = gko::size_type; auto mat1 = gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec); - auto bat_m = - gko::batch::create_from_item>( - this->exec, - std::vector{mat1.get(), mat1.get(), mat1.get()}, - mat1->get_num_stored_elements_per_row()); + auto bat_m = gko::batch::create_from_item( + this->exec, std::vector{mat1.get(), mat1.get(), mat1.get()}, + mat1->get_num_stored_elements_per_row()); - auto m = gko::batch::create_from_item>( + auto m = gko::batch::create_from_item( this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row()); GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14); @@ -322,26 +314,23 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication) TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) { - using value_type = typename TestFixture::value_type; - using index_type = int; + using 
BatchEllMtx = typename TestFixture::BatchEllMtx; using EllMtx = typename TestFixture::EllMtx; - using size_type = gko::size_type; auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}}, this->exec); auto mat2 = gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec); - auto m = gko::batch::create_from_item>( + auto m = gko::batch::create_from_item( this->exec, std::vector{mat1.get(), mat2.get()}, mat1->get_num_stored_elements_per_row()); - auto m_ref = - gko::batch::create_from_item>( - this->exec, - std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), - mat1.get(), mat2.get()}, - mat1->get_num_stored_elements_per_row()); - - auto m2 = gko::batch::duplicate>( + auto m_ref = gko::batch::create_from_item( + this->exec, + std::vector{mat1.get(), mat2.get(), mat1.get(), mat2.get(), + mat1.get(), mat2.get()}, + mat1->get_num_stored_elements_per_row()); + + auto m2 = gko::batch::duplicate( this->exec, 3, m.get(), mat1->get_num_stored_elements_per_row()); GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14); @@ -350,17 +339,14 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices) TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices) { - using value_type = typename TestFixture::value_type; - using index_type = int; + using BatchEllMtx = typename TestFixture::BatchEllMtx; using EllMtx = typename TestFixture::EllMtx; - using size_type = gko::size_type; auto mat1 = gko::initialize({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}}, this->exec); auto mat2 = gko::initialize({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec); - auto ell_mats = gko::batch::unbatch>( - this->sp_mtx.get()); + auto ell_mats = gko::batch::unbatch(this->sp_mtx.get()); GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.); GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.); @@ -370,10 +356,12 @@ TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices) TYPED_TEST(Ell, CanBeListConstructed) { using value_type = typename TestFixture::value_type; - using index_type = int; + using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; + using EllMtx = typename TestFixture::EllMtx; - auto m = gko::batch::initialize>( - {{0.0, -1.0}, {1.0, 0.0}}, this->exec); + auto m = gko::batch::initialize({{0.0, -1.0}, {1.0, 0.0}}, + this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); @@ -391,10 +379,11 @@ TYPED_TEST(Ell, CanBeListConstructed) TYPED_TEST(Ell, CanBeListConstructedByCopies) { using value_type = typename TestFixture::value_type; - using index_type = int; + using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; - auto m = gko::batch::initialize>( - 2, I({0.0, -1.0}), this->exec, 1); + auto m = gko::batch::initialize(2, I({0.0, -1.0}), + this->exec, 1); ASSERT_EQ(m->get_num_batch_items(), 2); ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1)); @@ -412,10 +401,11 @@ TYPED_TEST(Ell, CanBeListConstructedByCopies) TYPED_TEST(Ell, CanBeDoubleListConstructed) { using value_type = typename TestFixture::value_type; - using index_type = int; + using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; using T = value_type; - auto m = gko::batch::initialize>( + auto m = gko::batch::initialize( // clang-format off {{I{1.0, 0.0, 0.0}, I{2.0, 0.0, 3.0}, @@ -454,15 +444,15 @@ TYPED_TEST(Ell, CanBeDoubleListConstructed) TYPED_TEST(Ell, CanBeReadFromMatrixData) { using value_type = typename TestFixture::value_type; - using index_type = int; + 
using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; auto vec_data = std::vector>{}; vec_data.emplace_back(gko::matrix_data( {2, 3}, {{0, 0, -1.0}, {1, 1, 2.5}, {1, 2, 3.5}})); vec_data.emplace_back(gko::matrix_data( {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}})); - auto m = gko::batch::read>(this->exec, + auto m = gko::batch::read(this->exec, vec_data, 2); this->assert_equal_to_original_sparse_mtx(m.get()); @@ -472,11 +462,11 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData) TYPED_TEST(Ell, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; - using index_type = int; + using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; using tpl = typename gko::matrix_data::nonzero_type; - auto data = gko::batch::write>( + auto data = gko::batch::write( this->sp_mtx.get()); ASSERT_EQ(data[0].size, gko::dim<2>(2, 3)); diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 8a82ae744e7..7490a24bbe5 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -232,7 +232,7 @@ std::unique_ptr fill_random_matrix_with_sparsity_pattern( using index_type = IndexType; GKO_ASSERT(row_idxs.get_num_elems() == col_idxs.get_num_elems()); - GKO_ASSERT(row_idxs.get_num_elems() < (num_rows * num_cols)); + GKO_ASSERT(row_idxs.get_num_elems() <= (num_rows * num_cols)); auto result = MatrixType::create(exec, std::forward(args)...); result->read(fill_random_matrix_data( num_rows, num_cols, row_idxs, col_idxs, diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index e2db1ea6e97..4a2a1835961 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -91,34 +91,34 @@ get_batch_struct(batch::matrix::Dense* const op) /** * Generates an immutable uniform batch struct from a batch of ell matrices. */ -template -inline batch::matrix::ell::uniform_batch> -get_batch_struct(const batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch, IndexType> +get_batch_struct(const batch::matrix::Ell* const op) { return {as_cuda_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } /** * Generates a uniform batch struct from a batch of ell matrices. 
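The batch construction paths exercised by the tests above can be summarized in a small standalone sketch. It is illustrative only: the executor, the double value type and the spelled-out Ell template arguments are assumptions made for this example, and all items handed to the batch utilities must share one sparsity pattern.

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    using EllBatch = gko::batch::matrix::Ell<double, gko::int32>;

    // Two 2x3 items with identical sparsity; the trailing 3 is forwarded to
    // Ell::create as the number of stored elements per row.
    auto batch = gko::batch::initialize<EllBatch>(
        {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
         {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
        exec, 3);

    // Split the batch back into independent gko::matrix::Ell items.
    auto items = gko::batch::unbatch<EllBatch>(batch.get());
    return items.size() == 2 ? 0 : 1;
}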
*/ -template -inline batch::matrix::ell::uniform_batch> get_batch_struct( - batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch, IndexType> +get_batch_struct(batch::matrix::Ell* const op) { return {as_cuda_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index fca265eceb0..e4d2421a42f 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -81,6 +81,7 @@ void simple_apply(std::shared_ptr exec, const auto num_batch_items = mat->get_num_batch_items(); auto device = exec->get_queue()->get_device(); + // TODO: use runtime selection of group size based on num_rows. auto group_size = device.get_info(); @@ -134,6 +135,7 @@ void advanced_apply(std::shared_ptr exec, const auto num_batch_items = mat_ub.num_batch_items; auto device = exec->get_queue()->get_device(); + // TODO: use runtime selection of group size based on num_rows. auto group_size = device.get_info(); diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc index e6501bafaba..553e0aa1f3c 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -template +template __dpct_inline__ void simple_apply_kernel( - const gko::batch::matrix::ell::batch_item& mat, + const gko::batch::matrix::ell::batch_item& mat, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& x, sycl::nd_item<3>& item_ct1) @@ -42,37 +42,38 @@ __dpct_inline__ void simple_apply_kernel( auto temp = zero(); for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx == invalid_index()) { + if (col_idx != invalid_index()) { break; - else temp += mat.values[tidx + idx * mat.stride] * - b.values[col_idx * b.stride]; - } - x.values[tidx * x.stride] = temp; + } else + temp += mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; } + x.values[tidx * x.stride] = temp; } +} - template - __dpct_inline__ void advanced_apply_kernel( - const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::matrix::ell::batch_item& mat, - const gko::batch::multi_vector::batch_item& b, - const gko::batch::multi_vector::batch_item& beta, - const gko::batch::multi_vector::batch_item& x, - sycl::nd_item<3>& item_ct1) - { - for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; - tidx += item_ct1.get_local_range().size()) { - auto temp = zero(); - for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { - const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx == invalid_index()) { - break; - else temp += alpha.values[0] * - mat.values[tidx + idx * mat.stride] * - b.values[col_idx * b.stride]; - } - x.values[tidx * x.stride] = - temp + beta.values[0] * x.values[tidx * x.stride]; - } +template +__dpct_inline__ void advanced_apply_kernel( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::matrix::ell::batch_item& mat, + const gko::batch::multi_vector::batch_item& b, + const gko::batch::multi_vector::batch_item& beta, + const gko::batch::multi_vector::batch_item& x, + sycl::nd_item<3>& item_ct1) +{ + for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows; + tidx += item_ct1.get_local_range().size()) { + auto temp = zero(); + for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { + const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; + if (col_idx != invalid_index()) { + break; + } else + temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] * + b.values[col_idx * b.stride]; } + x.values[tidx * x.stride] = + temp + beta.values[0] * x.values[tidx * x.stride]; + } +} diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index f857653e05e..fe04407d82d 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -90,34 +90,34 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( /** * Generates an immutable uniform batch struct from a batch of ell matrices. 
*/ -template -inline batch::matrix::ell::uniform_batch get_batch_struct( - const batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch +get_batch_struct(const batch::matrix::Ell* const op) { return {op->get_const_values(), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } /** * Generates a uniform batch struct from a batch of ell matrices. */ -template -inline batch::matrix::ell::uniform_batch get_batch_struct( - batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch get_batch_struct( + batch::matrix::Ell* const op) { return {op->get_values(), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 6f15b2d966a..e35f13f1249 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -91,34 +91,34 @@ get_batch_struct(batch::matrix::Dense* const op) /** * Generates an immutable uniform batch struct from a batch of ell matrices. */ -template -inline batch::matrix::ell::uniform_batch> -get_batch_struct(const batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch, IndexType> +get_batch_struct(const batch::matrix::Ell* const op) { return {as_hip_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } /** * Generates a uniform batch struct from a batch of ell matrices. */ -template -inline batch::matrix::ell::uniform_batch> get_batch_struct( - batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch, IndexType> +get_batch_struct(batch::matrix::Ell* const op) { return {as_hip_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 9a4b8d5cf1d..405603269ff 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -52,14 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
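For orientation, the view that these get_batch_struct overloads assemble is a plain set of raw pointers and sizes. Below is a rough sketch of such a view; only the names that also appear in the kernels of this series (values, col_idxs, stride, num_rows, num_stored_elems_per_row) are taken from the patch, while the remaining names and the exact member order are made up for illustration.

#include <cstddef>

// Illustrative only: a flat, non-owning view over a batch of ELL matrices.
template <typename ValueType, typename IndexType>
struct ell_batch_view {
    ValueType* values;          // one block of num_rows * num_stored_elems_per_row
                                // entries per batch item, zero-padded
    const IndexType* col_idxs;  // shared by all items, since every item has the
                                // same sparsity pattern; padded slots hold -1
    std::size_t num_batch_items;
    IndexType stride;           // leading dimension of a block; the overloads pass
                                // common_size()[0] here, so it equals num_rows
    IndexType num_rows;
    IndexType num_cols;
    IndexType num_stored_elems_per_row;
};

// A per-item view then only needs to offset the values pointer, e.g.
//   item_values = values + item_id * num_rows * num_stored_elems_per_row;
// while col_idxs is reused unchanged for every item.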
namespace gko { namespace batch { -namespace matrix { - - -template -class Dense; - - -} /** @@ -90,21 +82,17 @@ class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, public EnableCreateMethod>, - public ConvertibleTo>>, - public ConvertibleTo> { + public ConvertibleTo>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; friend class MultiVector>; - friend class matrix::Dense; public: using EnablePolymorphicAssignment::convert_to; using EnablePolymorphicAssignment::move_to; using ConvertibleTo>>::convert_to; using ConvertibleTo>>::move_to; - using ConvertibleTo>::convert_to; - using ConvertibleTo>::move_to; using value_type = ValueType; using index_type = int32; @@ -126,10 +114,6 @@ class MultiVector void move_to(MultiVector>* result) override; - void convert_to(matrix::Dense* result) const override; - - void move_to(matrix::Dense* result) override; - /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. Does not perform any deep copies, but only returns a diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 7f3ce5890e4..cbec04482a3 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -306,7 +306,6 @@ class Dense final : public EnableBatchLinOp>, size.get_common_size()[1]; } -protected: /** * Creates an uninitialized Dense matrix of the specified size. * @@ -362,7 +361,6 @@ class Dense final : public EnableBatchLinOp>, idx % this->get_common_size()[1]); } -private: array values_; }; diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index be49e2cff41..943f63bfdd7 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -67,6 +67,8 @@ namespace matrix { * batch is the same and therefore only a single copy of the sparsity pattern is * stored. * + * @note Currently only IndexType of int32 is supported. + * * @tparam ValueType value precision of matrix elements * @tparam IndexType index precision of matrix elements * @@ -83,6 +85,8 @@ class Ell final friend class EnablePolymorphicObject; friend class Ell, IndexType>; friend class Ell, IndexType>; + static_assert(std::is_same::value, + "IndexType must be a 32 bit integer"); public: using EnableBatchLinOp::convert_to; @@ -315,8 +319,6 @@ class Ell final num_elems_per_row; } - -protected: /** * Creates an uninitialized Ell matrix of the specified size. * @@ -369,7 +371,6 @@ class Ell final const MultiVector* beta, MultiVector* x) const; -private: index_type num_elems_per_row_; array values_; array col_idxs_; diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc index 44de2a57af9..979df1a19bd 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp.inc @@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ -template +template inline void simple_apply_kernel( - const gko::batch::matrix::ell::batch_item& a, + const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) { @@ -43,19 +43,21 @@ inline void simple_apply_kernel( for (auto k = 0; k < a.num_stored_elems_per_row; ++k) { auto val = a.values[row + k * a.stride]; auto col = a.col_idxs[row + k * a.stride]; - for (int j = 0; j < c.num_rhs; ++j) { - c.values[row * c.stride + j] += - val * b.values[col * b.stride + j]; + if (col != invalid_index()) { + for (int j = 0; j < c.num_rhs; ++j) { + c.values[row * c.stride + j] += + val * b.values[col * b.stride + j]; + } } } } } -template +template inline void advanced_apply_kernel( const ValueType alpha, - const gko::batch::matrix::ell::batch_item& a, + const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const ValueType beta, const gko::batch::multi_vector::batch_item& c) @@ -67,9 +69,11 @@ inline void advanced_apply_kernel( for (auto k = 0; k < a.num_stored_elems_per_row; ++k) { auto val = a.values[row + k * a.stride]; auto col = a.col_idxs[row + k * a.stride]; - for (int j = 0; j < b.num_rhs; ++j) { - c.values[row * c.stride + j] += - alpha * val * b.values[col * b.stride + j]; + if (col != invalid_index()) { + for (int j = 0; j < b.num_rhs; ++j) { + c.values[row * c.stride + j] += + alpha * val * b.values[col * b.stride + j]; + } } } } diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index fb0e08c16f5..bb7680d1493 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -94,34 +94,34 @@ inline batch::matrix::dense::uniform_batch get_batch_struct( /** * Generates an immutable uniform batch struct from a batch of ell matrices. */ -template -inline batch::matrix::ell::uniform_batch get_batch_struct( - const batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch +get_batch_struct(const batch::matrix::Ell* const op) { return {op->get_const_values(), op->get_const_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } /** * Generates a uniform batch struct from a batch of ell matrices. 
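The reference kernels above skip every stored slot whose column index is the invalid sentinel. As a self-contained illustration of that access pattern, here is a plain single-item, single right-hand-side version written for this description; it is not the library kernel, and the sentinel value -1 matches the padding used in the core tests.

#include <cstdint>
#include <vector>

// y = A * x for one ELL item: values and col_idxs use leading dimension
// num_rows, i.e. stored entry k of row r sits at index r + k * num_rows.
std::vector<double> ell_spmv(const std::vector<double>& values,
                             const std::vector<std::int32_t>& col_idxs,
                             int num_rows, int num_stored_elems_per_row,
                             const std::vector<double>& x)
{
    const std::int32_t invalid = -1;  // padding marker for short rows
    std::vector<double> y(num_rows, 0.0);
    for (int row = 0; row < num_rows; ++row) {
        for (int k = 0; k < num_stored_elems_per_row; ++k) {
            const auto col = col_idxs[row + k * num_rows];
            if (col == invalid) {
                continue;  // padded slot, contributes nothing
            }
            y[row] += values[row + k * num_rows] * x[col];
        }
    }
    return y;
}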
*/ -template -inline batch::matrix::ell::uniform_batch get_batch_struct( - batch::matrix::Ell* const op) +template +inline batch::matrix::ell::uniform_batch get_batch_struct( + batch::matrix::Ell* const op) { return {op->get_values(), op->get_col_idxs(), op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[0]), + static_cast(op->get_common_size()[1]), + static_cast(op->get_num_stored_elements_per_row())}; } diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp index 8a5806a9513..81f189c3e02 100644 --- a/reference/test/matrix/batch_ell_kernels.cpp +++ b/reference/test/matrix/batch_ell_kernels.cpp @@ -123,8 +123,8 @@ TYPED_TEST(Ell, AppliesToBatchMultiVector) this->mtx_00->apply(this->b_00.get(), this->x_00.get()); this->mtx_01->apply(this->b_01.get(), this->x_01.get()); auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r::value); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r::value); } @@ -149,8 +149,8 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), this->x_01.get()); auto res = gko::batch::unbatch>(this->x_0.get()); - GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.); - GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r::value); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r::value); } From 0532b2b30ddec5546062da327983bb41ab2706e2 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 12 Oct 2023 12:05:08 +0200 Subject: [PATCH 395/583] Add apply temp clone, review updates Co-authored-by: Tobias Ribizel --- core/matrix/batch_dense.cpp | 64 +++++++++++++++++++++- core/matrix/batch_ell.cpp | 62 +++++++++++++++++++++ dpcpp/matrix/batch_ell_kernels.hpp.inc | 4 +- include/ginkgo/core/matrix/batch_dense.hpp | 38 ++++++++----- include/ginkgo/core/matrix/batch_ell.hpp | 55 +++++++++++-------- 5 files changed, 182 insertions(+), 41 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 758635cea7f..8390d43fd7d 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -124,11 +124,72 @@ Dense::Dense(std::shared_ptr exec, {} +template +Dense* Dense::apply( + ptr_param> b, + ptr_param> x) +{ + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +const Dense* Dense::apply( + ptr_param> b, + ptr_param> x) const +{ + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +Dense* Dense::apply( + ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) +{ + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, 
beta).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +const Dense* Dense::apply( + ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) const +{ + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + template void Dense::apply_impl(const MultiVector* b, MultiVector* x) const { - this->validate_application_parameters(b, x); this->get_executor()->run(dense::make_simple_apply(this, b, x)); } @@ -139,7 +200,6 @@ void Dense::apply_impl(const MultiVector* alpha, const MultiVector* beta, MultiVector* x) const { - this->validate_application_parameters(alpha, b, beta, x); this->get_executor()->run( dense::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index c9dbe6d51c9..a50b2f3e23a 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -140,6 +140,68 @@ Ell::Ell(std::shared_ptr exec, {} +template +Ell* Ell::apply( + ptr_param> b, + ptr_param> x) +{ + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +const Ell* Ell::apply( + ptr_param> b, + ptr_param> x) const +{ + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +Ell* Ell::apply( + ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) +{ + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + +template +const Ell* Ell::apply( + ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) const +{ + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); + return this; +} + + template void Ell::apply_impl(const MultiVector* b, MultiVector* x) const diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc index 553e0aa1f3c..8cdb8daa273 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -42,7 +42,7 @@ __dpct_inline__ void simple_apply_kernel( auto temp = zero(); for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx != invalid_index()) { + if (col_idx == invalid_index()) { break; } else temp += mat.values[tidx + idx * mat.stride] * @@ -67,7 +67,7 @@ __dpct_inline__ void advanced_apply_kernel( auto temp = zero(); for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) { const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; - if (col_idx != invalid_index()) { + if (col_idx == invalid_index()) { break; } 
else temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] * diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index cbec04482a3..07b862ef484 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -233,8 +233,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); @@ -275,11 +275,8 @@ class Dense final : public EnableBatchLinOp>, * @param b the multi-vector to be applied to * @param x the output multi-vector */ - void apply(const MultiVector* b, - MultiVector* x) const - { - this->apply_impl(b, x); - } + Dense* apply(ptr_param> b, + ptr_param> x); /** * Apply the matrix to a multi-vector with a linear combination of the given @@ -291,13 +288,26 @@ class Dense final : public EnableBatchLinOp>, * @param beta the scalar to scale the x vector with * @param x the output multi-vector */ - void apply(const MultiVector* alpha, - const MultiVector* b, - const MultiVector* beta, - MultiVector* x) const - { - this->apply_impl(alpha, b, beta, x); - } + Dense* apply(ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x); + + /** + * @copydoc apply(const MultiVector*, MultiVector*) + */ + const Dense* apply(ptr_param> b, + ptr_param> x) const; + + /** + * @copydoc apply(const MultiVector*, const + * MultiVector*, const MultiVector*, + * MultiVector*) + */ + const Dense* apply(ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) const; private: inline size_type compute_num_elems(const batch_dim<2>& size) diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 943f63bfdd7..5be94f1035e 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -85,7 +85,7 @@ class Ell final friend class EnablePolymorphicObject; friend class Ell, IndexType>; friend class Ell, IndexType>; - static_assert(std::is_same::value, + static_assert(std::is_same::value, "IndexType must be a 32 bit integer"); public: @@ -94,8 +94,7 @@ class Ell final using value_type = ValueType; using index_type = IndexType; - using transposed_type = Ell; - using unbatch_type = gko::matrix::Ell; + using unbatch_type = gko::matrix::Ell; using absolute_type = remove_complex; using complex_type = to_complex; @@ -223,8 +222,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const index_type* get_const_col_idxs_for_item( - size_type batch_id) const noexcept + const index_type* get_const_col_idxs_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data(); @@ -252,8 +251,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. 
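A short caller-side sketch of the reworked apply interface; it assumes the batch Ell and MultiVector types from this series and operands of compatible sizes. Because the arguments are taken as ptr_param and wrapped in temporary clones internally, raw or smart pointers both work and the operands may live on a different executor than the matrix.

#include <ginkgo/ginkgo.hpp>

// x = A * b for every item of the batch; when the executors differ, x is
// written back through the temporary clone on return.
template <typename ValueType>
void apply_batched(const gko::batch::matrix::Ell<ValueType, gko::int32>* mat,
                   const gko::batch::MultiVector<ValueType>* b,
                   gko::batch::MultiVector<ValueType>* x)
{
    mat->apply(b, x);
    // The four-argument overload computes x = alpha * A * b + beta * x:
    // mat->apply(alpha, b, beta, x);
}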
*/ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + @@ -277,8 +276,8 @@ class Ell final static std::unique_ptr create_const( std::shared_ptr exec, const batch_dim<2>& sizes, const index_type num_elems_per_row, - gko::detail::const_array_view&& values, - gko::detail::const_array_view&& col_idxs); + gko::detail::const_array_view&& values, + gko::detail::const_array_view&& col_idxs); /** * Apply the matrix to a multi-vector. Represents the matrix vector @@ -287,29 +286,39 @@ class Ell final * @param b the multi-vector to be applied to * @param x the output multi-vector */ - void apply(const MultiVector* b, - MultiVector* x) const - { - this->apply_impl(b, x); - } + Ell* apply(ptr_param> b, + ptr_param> x); /** * Apply the matrix to a multi-vector with a linear combination of the given - * input vector. Represents the matrix vector multiplication, x = alpha* A * - * b + beta * x, where x and b are both multi-vectors. + * input vector. Represents the matrix vector multiplication, x = alpha * A + * * b + beta * x, where x and b are both multi-vectors. * * @param alpha the scalar to scale the matrix-vector product with * @param b the multi-vector to be applied to * @param beta the scalar to scale the x vector with * @param x the output multi-vector */ - void apply(const MultiVector* alpha, - const MultiVector* b, - const MultiVector* beta, - MultiVector* x) const - { - this->apply_impl(alpha, b, beta, x); - } + Ell* apply(ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x); + + /** + * @copydoc apply(const MultiVector*, MultiVector*) + */ + const Ell* apply(ptr_param> b, + ptr_param> x) const; + + /** + * @copydoc apply(const MultiVector*, const + * MultiVector*, const MultiVector*, + * MultiVector*) + */ + const Ell* apply(ptr_param> alpha, + ptr_param> b, + ptr_param> beta, + ptr_param> x) const; private: size_type compute_num_elems(const batch_dim<2>& size, From ca459356009f5556493e39c0073ebf8a2ab60bab Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 12 Oct 2023 10:56:01 +0000 Subject: [PATCH 396/583] Format files Co-authored-by: Pratik Nayak --- dpcpp/matrix/batch_ell_kernels.dp.cpp | 54 +++++++++++----------- include/ginkgo/core/matrix/batch_dense.hpp | 4 +- include/ginkgo/core/matrix/batch_ell.hpp | 8 ++-- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index e4d2421a42f..5a69bbd3d5d 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -97,17 +97,17 @@ void simple_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b, x_b, item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = 
item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + simple_apply_kernel(mat_b, b_b, x_b, item_ct1); + }); }); } @@ -145,22 +145,22 @@ void advanced_apply(std::shared_ptr exec, // Launch a kernel that has nbatches blocks, each block has max group size exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto mat_b = - batch::matrix::extract_batch_item(mat_ub, group_id); - const auto b_b = batch::extract_batch_item(b_ub, group_id); - const auto x_b = batch::extract_batch_item(x_ub, group_id); - const auto alpha_b = - batch::extract_batch_item(alpha_ub, group_id); - const auto beta_b = - batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, - item_ct1); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto mat_b = + batch::matrix::extract_batch_item(mat_ub, group_id); + const auto b_b = batch::extract_batch_item(b_ub, group_id); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto alpha_b = + batch::extract_batch_item(alpha_ub, group_id); + const auto beta_b = + batch::extract_batch_item(beta_ub, group_id); + advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b, + item_ct1); + }); }); } diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 07b862ef484..0b2bcc49166 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -233,8 +233,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index 5be94f1035e..a6381f90f10 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -222,8 +222,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const index_type* get_const_col_idxs_for_item(size_type batch_id) const - noexcept + const index_type* get_const_col_idxs_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data(); @@ -251,8 +251,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. 
*/ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From a74b018fae4c15c04145dbe55e6d75d7215d542f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 12 Oct 2023 16:06:05 +0200 Subject: [PATCH 397/583] Fix sparsity issues and review updates Co-authored-by: Marcel Koch Co-authored-by: Yu-Hsiang Tsai --- core/base/batch_utilities.hpp | 55 ++++++++++++++++++++++---- core/matrix/batch_ell.cpp | 2 - core/test/matrix/batch_ell.cpp | 32 ++++++++++++--- core/test/utils/batch_helpers.hpp | 7 ++-- core/test/utils/matrix_generator.hpp | 9 +++-- dpcpp/matrix/batch_ell_kernels.hpp.inc | 10 +++-- 6 files changed, 89 insertions(+), 26 deletions(-) diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index 7204c78a552..3117b35d0f4 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -46,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -126,6 +128,36 @@ auto unbatch(const InputType* batch_object) } +namespace detail { + + +template +void assert_same_sparsity_in_batched_data( + const std::vector>& data) +{ + auto num_nnz = data[0].nonzeros.size(); + auto base_data = data[0]; + base_data.ensure_row_major_order(); + for (int b = 0; b < data.size(); ++b) { + if (data[b].nonzeros.size() != num_nnz) { + GKO_NOT_IMPLEMENTED; + } + auto temp_data = data[b]; + temp_data.ensure_row_major_order(); + for (int nnz = 0; nnz < num_nnz; ++nnz) { + if (temp_data.nonzeros[nnz].row != base_data.nonzeros[nnz].row || + temp_data.nonzeros[nnz].column != + base_data.nonzeros[nnz].column) { + GKO_NOT_IMPLEMENTED; + } + } + } +} + + +} // namespace detail + + template std::unique_ptr read( @@ -134,6 +166,12 @@ std::unique_ptr read( TArgs&&... create_args) { auto num_batch_items = data.size(); + // Throw if all the items in the batch dont have same sparsity. + if (!std::is_same>::value && + !std::is_same>::value) { + detail::assert_same_sparsity_in_batched_data(data); + } auto tmp = OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size), std::forward(create_args)...); @@ -163,7 +201,8 @@ std::vector> write( /** - * Creates and initializes a batch of single column-vectors. + * Creates and initializes a batch of the specified Matrix type with a single + * column-vector. * * @tparam Matrix matrix type to initialize (It has to implement the * read function) @@ -278,15 +317,16 @@ std::unique_ptr initialize( /** - * Creates and initializes a batch single column-vector by making copies of the - * single input column vector. + * Creates and initializes a batch of specified Matrix type with a single + * column-vector by making copies of the single input column vector. * * @tparam Matrix matrix type to initialize (It has to implement the * read function) * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param num_vectors The number of times the input vector is to be duplicated + * @param num_batch_items The number of times the input vector is to be + * duplicated * @param vals values used to initialize each vector in the temp. 
batch * @param exec Executor associated with the matrix * @param create_args additional arguments passed to Matrix::create, not @@ -297,21 +337,20 @@ std::unique_ptr initialize( */ template std::unique_ptr initialize( - const size_type num_vectors, + const size_type num_batch_items, std::initializer_list vals, std::shared_ptr exec, TArgs&&... create_args) { using value_type = typename Matrix::value_type; using index_type = typename Matrix::index_type; using mat_data = gko::matrix_data; - size_type num_batch_items = num_vectors; GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, "Input data is empty"); auto num_rows = begin(vals) ? vals.size() : 0; auto common_size = dim<2>(num_rows, 1); auto b_size = batch_dim<2>(num_batch_items, common_size); std::vector input_mat_data(num_batch_items, common_size); - for (size_type batch = 0; batch < num_vectors; batch++) { + for (size_type batch = 0; batch < num_batch_items; batch++) { input_mat_data[batch].nonzeros.reserve(num_rows); size_type idx = 0; for (const auto& elem : vals) { @@ -334,7 +373,7 @@ std::unique_ptr initialize( * @tparam TArgs argument types for Matrix::create method * (not including the implied Executor as the first argument) * - * @param num_batch_items The number of times the input matrix is duplicated + * @param num_batch_items The number of times the input matrix is duplicated * @param vals values used to initialize each matrix in the temp. batch * @param exec Executor associated to the matrix * @param create_args additional arguments passed to Matrix::create, not diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index a50b2f3e23a..5626860e7ee 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -206,7 +206,6 @@ template void Ell::apply_impl(const MultiVector* b, MultiVector* x) const { - this->validate_application_parameters(b, x); this->get_executor()->run(ell::make_simple_apply(this, b, x)); } @@ -217,7 +216,6 @@ void Ell::apply_impl(const MultiVector* alpha, const MultiVector* beta, MultiVector* x) const { - this->validate_application_parameters(alpha, b, beta, x); this->get_executor()->run( ell::make_advanced_apply(alpha, this, b, beta, x)); } diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index c36a877ac14..e04ed96bf4c 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -360,7 +360,7 @@ TYPED_TEST(Ell, CanBeListConstructed) using BatchEllMtx = typename TestFixture::BatchEllMtx; using EllMtx = typename TestFixture::EllMtx; - auto m = gko::batch::initialize({{0.0, -1.0}, {1.0, 0.0}}, + auto m = gko::batch::initialize({{0.0, -1.0}, {0.0, -5.0}}, this->exec); ASSERT_EQ(m->get_num_batch_items(), 2); @@ -369,10 +369,10 @@ TYPED_TEST(Ell, CanBeListConstructed) ASSERT_EQ(m->get_num_stored_elements_per_row(), 1); EXPECT_EQ(m->get_values()[0], value_type{0.0}); EXPECT_EQ(m->get_values()[1], value_type{-1.0}); - EXPECT_EQ(m->get_values()[2], value_type{1.0}); - EXPECT_EQ(m->get_values()[3], value_type{0.0}); - EXPECT_EQ(m->get_col_idxs()[0], index_type{0}); - EXPECT_EQ(m->get_col_idxs()[1], index_type{-1}); + EXPECT_EQ(m->get_values()[2], value_type{0.0}); + EXPECT_EQ(m->get_values()[3], value_type{-5.0}); + EXPECT_EQ(m->get_col_idxs()[0], index_type{-1}); + EXPECT_EQ(m->get_col_idxs()[1], index_type{0}); } @@ -459,6 +459,28 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData) } +TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename 
TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; + auto vec_data = std::vector>{}; + vec_data.emplace_back( + gko::matrix_data({2, 3}, { + {0, 0, -1.0}, + {1, 1, 2.5}, + {1, 2, 0.5}, + {2, 2, -3.0}, + })); + vec_data.emplace_back(gko::matrix_data( + {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}})); + + EXPECT_THROW( + gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data), + gko::NotImplemented); +} + + TYPED_TEST(Ell, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index 0b6197b5062..5b1fa60ed36 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -95,9 +95,10 @@ std::unique_ptr generate_random_batch_matrix( .copy_to_array(); for (size_type b = 0; b < num_batch_items; b++) { - auto rand_mat = fill_random_matrix_with_sparsity_pattern< - typename MatrixType::unbatch_type, index_type>( - num_rows, num_cols, row_idxs, col_idxs, value_dist, engine, exec); + auto rand_mat = + fill_random_matrix( + num_rows, num_cols, row_idxs, col_idxs, value_dist, engine, + exec); result->create_view_for_item(b)->copy_from(rand_mat.get()); } diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 7490a24bbe5..d5370c6ef6a 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -206,23 +206,24 @@ generate_random_device_matrix_data(gko::size_type num_rows, * @tparam MatrixType type of matrix to generate (must implement * the interface `ReadableFromMatrixData<>` and provide * matching `value_type` and `index_type` type aliases) + * @tparam IndexType the type for row and column indices + * @tparam ValueDistribution type of value distribution + * @tparam Engine type of random engine * * @param num_rows number of rows * @param num_cols number of columns - * @param value_dist distribution of matrix values * @param row_idxs the row indices of the matrix * @param col_idxs the column indices of the matrix + * @param value_dist distribution of matrix values * @param exec executor where the matrix should be allocated * @param args additional arguments for the matrix constructor * - * The other (template) parameters match generate_random_matrix_data. - * * @return the unique pointer of MatrixType */ template , typename IndexType = typename MatrixType::index_type, typename ValueDistribution, typename Engine, typename... 
MatrixArgs> -std::unique_ptr fill_random_matrix_with_sparsity_pattern( +std::unique_ptr fill_random_matrix( size_type num_rows, size_type num_cols, const gko::array& row_idxs, const gko::array& col_idxs, ValueDistribution&& value_dist, diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc index 8cdb8daa273..64d71710dbb 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc @@ -44,9 +44,10 @@ __dpct_inline__ void simple_apply_kernel( const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; if (col_idx == invalid_index()) { break; - } else + } else { temp += mat.values[tidx + idx * mat.stride] * b.values[col_idx * b.stride]; + } } x.values[tidx * x.stride] = temp; } @@ -69,11 +70,12 @@ __dpct_inline__ void advanced_apply_kernel( const auto col_idx = mat.col_idxs[tidx + idx * mat.stride]; if (col_idx == invalid_index()) { break; - } else - temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] * + } else { + temp += mat.values[tidx + idx * mat.stride] * b.values[col_idx * b.stride]; + } } x.values[tidx * x.stride] = - temp + beta.values[0] * x.values[tidx * x.stride]; + alpha.values[0] * temp + beta.values[0] * x.values[tidx * x.stride]; } } From 9f077180d3db898ded7075d89cef1a2f216a3d46 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 16 Oct 2023 16:28:44 +0200 Subject: [PATCH 398/583] vector mat data with duplication --- core/base/batch_utilities.hpp | 39 ++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index 3117b35d0f4..e6a52250565 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -349,17 +349,16 @@ std::unique_ptr initialize( auto num_rows = begin(vals) ? vals.size() : 0; auto common_size = dim<2>(num_rows, 1); auto b_size = batch_dim<2>(num_batch_items, common_size); - std::vector input_mat_data(num_batch_items, common_size); - for (size_type batch = 0; batch < num_batch_items; batch++) { - input_mat_data[batch].nonzeros.reserve(num_rows); - size_type idx = 0; - for (const auto& elem : vals) { - if (elem != zero()) { - input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem); - } - ++idx; + mat_data single_mat_data(common_size); + single_mat_data.nonzeros.reserve(num_rows); + size_type idx = 0; + for (const auto& elem : vals) { + if (elem != zero()) { + single_mat_data.nonzeros.emplace_back(idx, 0, elem); } + ++idx; } + std::vector input_mat_data(num_batch_items, single_mat_data); return read( exec, input_mat_data, std::forward(create_args)...); } @@ -397,21 +396,19 @@ std::unique_ptr initialize( auto common_size = dim<2>(begin(vals) ? vals.size() : 0, begin(vals) ? 
begin(vals)->size() : 0); batch_dim<2> b_size(num_batch_items, common_size); - std::vector input_mat_data(num_batch_items, common_size); - for (size_type batch = 0; batch < num_batch_items; batch++) { - size_type ridx = 0; - for (const auto& row : vals) { - size_type cidx = 0; - for (const auto& elem : row) { - if (elem != zero()) { - input_mat_data[batch].nonzeros.emplace_back(ridx, cidx, - elem); - } - ++cidx; + mat_data single_mat_data(common_size); + size_type ridx = 0; + for (const auto& row : vals) { + size_type cidx = 0; + for (const auto& elem : row) { + if (elem != zero()) { + single_mat_data.nonzeros.emplace_back(ridx, cidx, elem); } - ++ridx; + ++cidx; } + ++ridx; } + std::vector input_mat_data(num_batch_items, single_mat_data); return read( exec, input_mat_data, std::forward(create_args)...); } From 70044e458d5f0b5cb8e40e7aa1b39b024638369a Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 16 Oct 2023 21:49:43 +0200 Subject: [PATCH 399/583] Review updates Co-authored-by: Yu-Hsiang Tsai Co-authored-by: Marcel Koch --- core/base/batch_utilities.hpp | 44 +++++++++++++++++----- core/matrix/batch_dense.cpp | 8 ---- core/matrix/batch_ell.cpp | 11 ------ core/test/matrix/batch_ell.cpp | 23 ++++++++++- include/ginkgo/core/matrix/batch_dense.hpp | 13 +------ include/ginkgo/core/matrix/batch_ell.hpp | 17 ++------- 6 files changed, 62 insertions(+), 54 deletions(-) diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index e6a52250565..febfd59b636 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -54,6 +54,9 @@ namespace gko { namespace batch { +/** + * Duplicate a given input batch object. + */ template std::unique_ptr duplicate(std::shared_ptr exec, size_type num_duplications, @@ -78,6 +81,9 @@ std::unique_ptr duplicate(std::shared_ptr exec, } +/** + * Duplicate a monolithic matrix and create a batch object. + */ template std::unique_ptr create_from_item( std::shared_ptr exec, const size_type num_duplications, @@ -96,6 +102,13 @@ std::unique_ptr create_from_item( } +/** + * Create a batch object from a vector of monolithic object that share the same + * sparsity pattern. + * + * @note The sparsity of the elements in the input vector of matrices needs to + * be the same. TODO: Check for same sparsity among the different input items + */ template std::unique_ptr create_from_item( std::shared_ptr exec, @@ -115,6 +128,9 @@ std::unique_ptr create_from_item( } +/** + * Unbatch a batched object into a vector of items of its unbatch_type. 
+ */ template auto unbatch(const InputType* batch_object) { @@ -135,19 +151,20 @@ template void assert_same_sparsity_in_batched_data( const std::vector>& data) { - auto num_nnz = data[0].nonzeros.size(); - auto base_data = data[0]; + auto num_nnz = data.at(0).nonzeros.size(); + auto base_data = data.at(0); base_data.ensure_row_major_order(); - for (int b = 0; b < data.size(); ++b) { + for (int b = 1; b < data.size(); ++b) { if (data[b].nonzeros.size() != num_nnz) { GKO_NOT_IMPLEMENTED; } - auto temp_data = data[b]; + auto temp_data = data.at(b); temp_data.ensure_row_major_order(); for (int nnz = 0; nnz < num_nnz; ++nnz) { - if (temp_data.nonzeros[nnz].row != base_data.nonzeros[nnz].row || - temp_data.nonzeros[nnz].column != - base_data.nonzeros[nnz].column) { + if (temp_data.nonzeros.at(nnz).row != + base_data.nonzeros.at(nnz).row || + temp_data.nonzeros.at(nnz).column != + base_data.nonzeros.at(nnz).column) { GKO_NOT_IMPLEMENTED; } } @@ -158,6 +175,10 @@ void assert_same_sparsity_in_batched_data( } // namespace detail +/** + * Create a batch object from a vector of gko::matrix_data objects. Each item of + * the vector needs to store the same sparsity pattern. + */ template std::unique_ptr read( @@ -173,7 +194,7 @@ std::unique_ptr read( detail::assert_same_sparsity_in_batched_data(data); } auto tmp = - OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size), + OutputType::create(exec, batch_dim<2>(num_batch_items, data.at(0).size), std::forward(create_args)...); for (size_type b = 0; b < num_batch_items; ++b) { @@ -184,6 +205,9 @@ std::unique_ptr read( } +/** + * Write a vector of matrix data objects from an input batch object. + */ template std::vector> write( const OutputType* mvec) @@ -201,8 +225,8 @@ std::vector> write( /** - * Creates and initializes a batch of the specified Matrix type with a single - * column-vector. + * Creates and initializes a batch of the specified Matrix type from a series of + * single column-vectors. 
* * @tparam Matrix matrix type to initialize (It has to implement the * read function) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 8390d43fd7d..58c7fa25cea 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -96,14 +96,6 @@ Dense::create_const_view_for_item(size_type item_id) const } -template -std::unique_ptr> Dense::create_with_config_of( - ptr_param> other) -{ - return Dense::create(other->get_executor(), other->get_size()); -} - - template std::unique_ptr> Dense::create_const( std::shared_ptr exec, const batch_dim<2>& sizes, diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 5626860e7ee..88863a05dd4 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -100,17 +100,6 @@ Ell::create_const_view_for_item(size_type item_id) const } -template -std::unique_ptr> -Ell::create_with_config_of( - ptr_param> other) -{ - return Ell::create( - other->get_executor(), other->get_size(), - other->get_num_stored_elements_per_row()); -} - - template std::unique_ptr> Ell::create_const( diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index e04ed96bf4c..2c8166aa023 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -459,7 +459,7 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData) } -TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity) +TYPED_TEST(Ell, ThrowsForDataWithDifferentNnz) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; @@ -481,6 +481,27 @@ TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity) } +TYPED_TEST(Ell, ThrowsForDataWithDifferentSparsity) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using BatchEllMtx = typename TestFixture::BatchEllMtx; + auto vec_data = std::vector>{}; + vec_data.emplace_back( + gko::matrix_data({2, 3}, { + {0, 0, -1.0}, + {1, 1, 2.5}, + {2, 2, -3.0}, + })); + vec_data.emplace_back(gko::matrix_data( + {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}})); + + EXPECT_THROW( + gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data), + gko::NotImplemented); +} + + TYPED_TEST(Ell, GeneratesCorrectMatrixData) { using value_type = typename TestFixture::value_type; diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 0b2bcc49166..5a1697afec4 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -93,15 +93,6 @@ class Dense final : public EnableBatchLinOp>, using absolute_type = remove_complex; using complex_type = to_complex; - /** - * Creates a Dense matrix with the configuration of another Dense - * matrix. - * - * @param other The other matrix whose configuration needs to copied. - */ - static std::unique_ptr create_with_config_of( - ptr_param other); - void convert_to(Dense>* result) const override; void move_to(Dense>* result) override; @@ -233,8 +224,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. 
*/ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index a6381f90f10..a02d6c81fe8 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -98,15 +98,6 @@ class Ell final using absolute_type = remove_complex; using complex_type = to_complex; - /** - * Creates a Ell matrix with the configuration of another Ell - * matrix. - * - * @param other The other matrix whose configuration needs to copied. - */ - static std::unique_ptr create_with_config_of( - ptr_param other); - void convert_to( Ell, IndexType>* result) const override; @@ -222,8 +213,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const index_type* get_const_col_idxs_for_item( - size_type batch_id) const noexcept + const index_type* get_const_col_idxs_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data(); @@ -251,8 +242,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item( - size_type batch_id) const noexcept + const value_type* get_const_values_for_item(size_type batch_id) const + noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From e13b07b551e08c9831b07c993ec5b26064835d08 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 16 Oct 2023 19:52:38 +0000 Subject: [PATCH 400/583] Format files Co-authored-by: Pratik Nayak --- include/ginkgo/core/matrix/batch_dense.hpp | 4 ++-- include/ginkgo/core/matrix/batch_ell.hpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 5a1697afec4..47230c24e32 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -224,8 +224,8 @@ class Dense final : public EnableBatchLinOp>, * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + this->get_cumulative_offset(batch_id); diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index a02d6c81fe8..fa00a0631fd 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -213,8 +213,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. */ - const index_type* get_const_col_idxs_for_item(size_type batch_id) const - noexcept + const index_type* get_const_col_idxs_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return col_idxs_.get_const_data(); @@ -242,8 +242,8 @@ class Ell final * significantly more memory efficient than the non-constant version, * so always prefer this version. 
*/ - const value_type* get_const_values_for_item(size_type batch_id) const - noexcept + const value_type* get_const_values_for_item( + size_type batch_id) const noexcept { GKO_ASSERT(batch_id < this->get_num_batch_items()); return values_.get_const_data() + From b2f7b473288a0f818f1c3237ce16c450672b006f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 17 Oct 2023 10:11:59 +0200 Subject: [PATCH 401/583] Review updates Co-authored-by: Marcel Koch --- core/base/batch_utilities.hpp | 3 +++ core/matrix/batch_ell.cpp | 13 ++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index febfd59b636..b4e380a4162 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -151,6 +151,9 @@ template void assert_same_sparsity_in_batched_data( const std::vector>& data) { + if (data.empty()) { + return; + } auto num_nnz = data.at(0).nonzeros.size(); auto base_data = data.at(0); base_data.ensure_row_major_order(); diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 88863a05dd4..b2987e741d9 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -147,10 +147,7 @@ const Ell* Ell::apply( ptr_param> b, ptr_param> x) const { - this->validate_application_parameters(b.get(), x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, x).get()); + this->apply(b, x); return this; } @@ -180,13 +177,7 @@ const Ell* Ell::apply( ptr_param> beta, ptr_param> x) const { - this->validate_application_parameters(alpha.get(), b.get(), beta.get(), - x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, beta).get(), - make_temporary_clone(exec, x).get()); + this->apply(alpha, b, beta, x); return this; } From e256261db28c62c65d9ad63cc6041e963da28702 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sun, 12 Feb 2023 20:01:04 +0100 Subject: [PATCH 402/583] Add vec mean implementation Add vec mean tests --- .../matrix/dense_kernels.instantiate.cpp | 2 + .../unified/matrix/dense_kernels.template.cpp | 16 +++++++ core/device_hooks/common_kernels.inc.cpp | 1 + core/distributed/vector.cpp | 45 +++++++++++++++++-- core/matrix/dense.cpp | 33 ++++++++++++++ core/matrix/dense_kernels.hpp | 7 +++ include/ginkgo/core/distributed/vector.hpp | 26 +++++++++++ include/ginkgo/core/matrix/dense.hpp | 29 ++++++++++++ reference/matrix/dense_kernels.cpp | 22 +++++++++ reference/test/matrix/dense_kernels.cpp | 16 +++++++ test/mpi/vector.cpp | 21 +++++++++ 11 files changed, 215 insertions(+), 3 deletions(-) diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp index bf20c8a19b6..f34d05954c4 100644 --- a/common/unified/matrix/dense_kernels.instantiate.cpp +++ b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -99,6 +99,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( // split GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); // end diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index b6ed5fb37e0..d7e1c08f38c 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -278,6 +278,22 @@ void 
compute_norm1(std::shared_ptr exec, } +template +void compute_mean(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense* result, array& tmp) +{ + using ValueType_nc = gko::remove_complex; + run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto total_size) { + return x(i, j) / static_cast(total_size); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + tmp, x, x->get_size()[0]); +} + + template void compute_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense* source, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 462675c15db..7f7b1b473a2 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -347,6 +347,7 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 001cf75b76d..f05a2df73fd 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -30,10 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include - - #include +#include #include "core/distributed/vector_kernels.hpp" @@ -573,12 +571,52 @@ void Vector::compute_squared_norm2(ptr_param result, } +template +void Vector::compute_mean(LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_mean(result, tmp); +} + + +void Vector::compute_mean(LinOp* result, array& tmp) const +{ + using MeanVector = local_vector_type; + const auto global_size = this->get_size()[0]; + const auto local_size = this->get_local_vector()->get_size()[0]; + const auto num_vecs = static_cast(this->get_size()[1]); + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, num_vecs)); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = make_temporary_clone(exec, as(result)); + this->get_local_vector()->compute_mean(dense_res.get()); + + // scale by its weight ie ratio of local to global size + auto weight = initialize>>( + 1, {static_cast>(local_size) / global_size}, + this->get_executor()); + dense_res->scale(weight.get()); + + exec->synchronize(); + if (mpi::requires_host_buffer(exec, comm)) { + host_reduction_buffer_.init(exec->get_master(), dense_res->get_size()); + host_reduction_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), + host_reduction_buffer_->get_values(), num_vecs, + MPI_SUM); + dense_res->copy_from(host_reduction_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), num_vecs, MPI_SUM); + } +} + template ValueType& Vector::at_local(size_type row, size_type col) noexcept { return local_.at(row, col); } + template ValueType Vector::at_local(size_type row, size_type col) const noexcept @@ -586,6 +624,7 @@ ValueType Vector::at_local(size_type row, return local_.at(row, col); } + template ValueType& Vector::at_local(size_type idx) noexcept { diff --git 
a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 17dec93c234..babb1919040 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -80,6 +80,7 @@ GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch); GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch); GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); +GKO_REGISTER_OPERATION(compute_mean, dense::compute_mean); GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2); GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt); GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); @@ -235,6 +236,14 @@ void Dense::compute_squared_norm2(ptr_param result) const } +template +void Dense::compute_mean(LinOp* result) const +{ + auto exec = this->get_executor(); + this->compute_mean_impl(make_temporary_output_clone(exec, result).get()); +} + + template void Dense::inv_scale_impl(const LinOp* alpha) { @@ -496,6 +505,20 @@ void Dense::compute_squared_norm2(ptr_param result, } +template +void Dense::compute_mean(LinOp* result, array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } + auto dense_res = make_temporary_conversion(result); + exec->run(dense::make_compute_mean(this, dense_res.get(), tmp)); +} + + template void Dense::compute_squared_norm2_impl(LinOp* result) const { @@ -505,6 +528,16 @@ void Dense::compute_squared_norm2_impl(LinOp* result) const tmp); } +template +void Dense::compute_mean_impl(LinOp* result) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto dense_res = make_temporary_conversion(result); + array tmp{exec}; + exec->run(dense::make_compute_mean(this, dense_res.get(), tmp)); +} + template Dense& Dense::operator=(const Dense& other) diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 9a487fadeda..a352aa8d7c1 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -146,6 +146,11 @@ namespace kernels { matrix::Dense>* result, \ array& tmp) +#define GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(_type) \ + void compute_mean(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense<_type>* result, array& tmp) + #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \ void fill_in_matrix_data(std::shared_ptr exec, \ const device_matrix_data<_type, _prec>& data, \ @@ -349,6 +354,8 @@ namespace kernels { GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 61ceab8e380..e86c2ec3e61 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -404,6 +404,32 @@ class Vector */ void compute_norm1(ptr_param result, array& tmp) const; + /** + * Computes the column-wise mean of this (multi-)vector using a global + * reduction. 
+ * + * @param result a Dense row matrix, used to store the mean + * (the number of columns in result must match the number + * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. + */ + void compute_mean(LinOp* result) const; + + /** + * Computes the column-wise mean of this (multi-)vector using a global + * reduction. + * + * @param result a Dense row matrix, used to store the mean + * (the number of columns in result must match the number + * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. + */ + void compute_mean(LinOp* result, array& tmp) const; + /** * Returns a single element of the multi-vector. * diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index ae738d49b93..1cba8622fce 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -917,6 +917,27 @@ class Dense */ void compute_squared_norm2(ptr_param result, array& tmp) const; + /** + * Computes the column-wise mean of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_mean(LinOp* result) const; + + /** + * Computes the column-wise mean of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the + * number of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. + */ + void compute_mean(LinOp* result, array& tmp) const; + /** * Create a submatrix from the original matrix. * Warning: defining stride for this create_submatrix method might cause @@ -1215,6 +1236,14 @@ class Dense */ virtual void compute_squared_norm2_impl(LinOp* result) const; + /** + * @copydoc compute_mean(LinOp*) const + * + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. + */ + virtual void compute_mean_impl(LinOp* result) const; + /** * Resizes the matrix to the given size. 
* diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index ba399b0f445..df86aedd047 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -397,6 +397,28 @@ void compute_norm1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +template +void compute_mean(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense* result, array&) +{ + using ValueType_nc = gko::remove_complex; + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) = zero(); + } + + for (size_type i = 0; i < x->get_size()[0]; ++i) { + const ValueType_nc alpha = static_cast(i) / (i + 1); + const ValueType_nc beta = static_cast(1) / (i + 1); + for (size_type j = 0; j < x->get_size()[1]; ++j) { + result->at(0, j) = alpha * result->at(0, j) + beta * x->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); + + template void fill_in_matrix_data(std::shared_ptr exec, const device_matrix_data& data, diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 60713c815de..763cf1b6321 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -698,6 +698,22 @@ TYPED_TEST(Dense, ComputesNorm1Mixed) } +TYPED_TEST(Dense, ComputesMean) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + using MeanVector = gko::matrix::Dense; + auto mtx(gko::initialize( + {I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}, I{-1.0, -1.0}}, + this->exec)); + auto result = MeanVector::create(this->exec, gko::dim<2>{1, 2}); + + mtx->compute_mean(result.get()); + + GKO_ASSERT_MTX_NEAR(result, l({{1.0, 1.5}}), 1e-2); +} + + TYPED_TEST(Dense, ComputeDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index a7ad735458c..414f8197f57 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -675,6 +675,27 @@ TYPED_TEST(VectorReductions, ComputeSquaredNorm2WithTmpIsSameAsDense) r::value); } +TYPED_TEST(VectorReductions, ComputesMeanIsSameAsDense) +{ + using value_type = typename TestFixture::value_type; + this->init_result(); + + this->x->compute_mean(this->res.get()); + this->dense_x->compute_mean(this->dense_res.get()); + + GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); +} + +TYPED_TEST(VectorReductions, ComputesMeanWithTmpIsSameAsDense) +{ + using value_type = typename TestFixture::value_type; + this->init_result(); + + this->x->compute_mean(this->res.get(), this->tmp); + this->dense_x->compute_mean(this->dense_res.get(), this->dense_tmp); + + GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); +} TYPED_TEST(VectorReductions, ComputeDotCopiesToHostOnlyIfNecessary) { From 01205e54d6e35b65f1e2eb1226d841cd32673bce Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Fri, 13 Oct 2023 20:44:03 +0200 Subject: [PATCH 403/583] use ptr_param --- core/distributed/vector.cpp | 5 +++-- include/ginkgo/core/distributed/vector.hpp | 12 +++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index f05a2df73fd..23a6774ccd2 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -572,14 +572,15 @@ void Vector::compute_squared_norm2(ptr_param result, template -void Vector::compute_mean(LinOp* result) const +void Vector::compute_mean(ptr_param result) const { array tmp{this->get_executor()}; 
this->compute_mean(result, tmp); } -void Vector::compute_mean(LinOp* result, array& tmp) const +void Vector::compute_mean(ptr_param result, + array& tmp) const { using MeanVector = local_vector_type; const auto global_size = this->get_size()[0]; diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index e86c2ec3e61..86a82a2f7da 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -404,6 +404,16 @@ class Vector */ void compute_norm1(ptr_param result, array& tmp) const; + /** + * Computes the column-wise mean of this (multi-)vector using a global + * reduction. + * + * @param result a Dense row matrix, used to store the mean + * (the number of columns in result must match the number + * of columns of this) + */ + void compute_mean(ptr_param result) const; + /** * Computes the column-wise mean of this (multi-)vector using a global * reduction. @@ -415,7 +425,7 @@ class Vector * reduction computation. It may be resized and/or reset to the * correct executor. */ - void compute_mean(LinOp* result) const; + void compute_mean(ptr_param result, array& tmp) const; /** * Computes the column-wise mean of this (multi-)vector using a global From e7feed393c3b265eebf2332d3b9d485fea4fd4f7 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sat, 14 Oct 2023 08:16:02 +0200 Subject: [PATCH 404/583] fix documentation --- include/ginkgo/core/matrix/dense.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 1cba8622fce..fb9773e1247 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -920,7 +920,7 @@ class Dense /** * Computes the column-wise mean of this matrix. * - * @param result a Dense row vector, used to store the norm + * @param result a Dense row vector, used to store the mean * (the number of columns in the vector must match the number * of columns of this) */ @@ -929,7 +929,7 @@ class Dense /** * Computes the column-wise mean of this matrix. * - * @param result a Dense row vector, used to store the norm + * @param result a Dense row vector, used to store the mean * (the number of columns in the vector must match the * number of columns of this) * @param tmp the temporary storage to use for partial sums during the From 61b7c803adf3757b341df446be7ca64e24b359c3 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sat, 14 Oct 2023 14:56:03 +0200 Subject: [PATCH 405/583] update documentation and tests --- include/ginkgo/core/matrix/dense.hpp | 4 ++-- reference/test/matrix/dense_kernels.cpp | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index fb9773e1247..912e857611c 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -918,7 +918,7 @@ class Dense void compute_squared_norm2(ptr_param result, array& tmp) const; /** - * Computes the column-wise mean of this matrix. + * Computes the column-wise arithmetic mean of this matrix. * * @param result a Dense row vector, used to store the mean * (the number of columns in the vector must match the number @@ -927,7 +927,7 @@ class Dense void compute_mean(LinOp* result) const; /** - * Computes the column-wise mean of this matrix. + * Computes the column-wise arithmetic mean of this matrix. 
* * @param result a Dense row vector, used to store the mean * (the number of columns in the vector must match the diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 763cf1b6321..9e31410fc49 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -702,6 +702,7 @@ TYPED_TEST(Dense, ComputesMean) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; + using T_nc = gko::remove_complex; using MeanVector = gko::matrix::Dense; auto mtx(gko::initialize( {I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}, I{-1.0, -1.0}}, @@ -710,7 +711,8 @@ TYPED_TEST(Dense, ComputesMean) mtx->compute_mean(result.get()); - GKO_ASSERT_MTX_NEAR(result, l({{1.0, 1.5}}), 1e-2); + EXPECT_EQ(result->at(0, 0), T_nc{1.0}); + EXPECT_EQ(result->at(0, 1), T_nc{1.5}); } From 52ef32d5ce5f00d2a769513c3be4d3640900dfb3 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sat, 14 Oct 2023 15:24:05 +0200 Subject: [PATCH 406/583] use GKO_EXPECT_NEAR --- core/matrix/dense.cpp | 20 +++++++++++--------- include/ginkgo/core/matrix/dense.hpp | 4 ++-- reference/test/matrix/dense_kernels.cpp | 14 +++++--------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index babb1919040..a50ab6b260b 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -236,14 +236,6 @@ void Dense::compute_squared_norm2(ptr_param result) const } -template -void Dense::compute_mean(LinOp* result) const -{ - auto exec = this->get_executor(); - this->compute_mean_impl(make_temporary_output_clone(exec, result).get()); -} - - template void Dense::inv_scale_impl(const LinOp* alpha) { @@ -506,7 +498,16 @@ void Dense::compute_squared_norm2(ptr_param result, template -void Dense::compute_mean(LinOp* result, array& tmp) const +void Dense::compute_mean(ptr_param result) const +{ + auto exec = this->get_executor(); + this->compute_mean_impl(make_temporary_output_clone(exec, result).get()); +} + + +template +void Dense::compute_mean(ptr_param result, + array& tmp) const { GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); @@ -528,6 +529,7 @@ void Dense::compute_squared_norm2_impl(LinOp* result) const tmp); } + template void Dense::compute_mean_impl(LinOp* result) const { diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 912e857611c..9edf55d2e4c 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -924,7 +924,7 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_mean(LinOp* result) const; + void compute_mean(ptr_param result) const; /** * Computes the column-wise arithmetic mean of this matrix. @@ -936,7 +936,7 @@ class Dense * reduction computation. It may be resized and/or reset to the * correct executor. */ - void compute_mean(LinOp* result, array& tmp) const; + void compute_mean(ptr_param result, array& tmp) const; /** * Create a submatrix from the original matrix. 
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 9e31410fc49..a2527a31d3e 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -702,17 +702,13 @@ TYPED_TEST(Dense, ComputesMean) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using T_nc = gko::remove_complex; - using MeanVector = gko::matrix::Dense; - auto mtx(gko::initialize( - {I{1.0, 0.0}, I{2.0, 3.0}, I{2.0, 4.0}, I{-1.0, -1.0}}, - this->exec)); - auto result = MeanVector::create(this->exec, gko::dim<2>{1, 2}); + auto result = Mtx::create(this->exec, gko::dim<2>{1, 3}); - mtx->compute_mean(result.get()); + this->mtx4->compute_mean(result.get()); - EXPECT_EQ(result->at(0, 0), T_nc{1.0}); - EXPECT_EQ(result->at(0, 1), T_nc{1.5}); + GKO_EXPECT_NEAR(result->at(0, 0), T{0.5}, 1e-6); + GKO_EXPECT_NEAR(result->at(0, 1), T{4.0}, 1e-6); + GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, 1e-6); } From f5e3c9eb2a84b7da0c4384554b7acbe9c38adb2e Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sat, 14 Oct 2023 17:28:06 +0200 Subject: [PATCH 407/583] fix documentation --- include/ginkgo/core/distributed/vector.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 86a82a2f7da..1e3b9571b19 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -415,8 +415,8 @@ class Vector void compute_mean(ptr_param result) const; /** - * Computes the column-wise mean of this (multi-)vector using a global - * reduction. + * Computes the column-wise arithmetic mean of this (multi-)vector using a + * global reduction. * * @param result a Dense row matrix, used to store the mean * (the number of columns in result must match the number @@ -428,8 +428,8 @@ class Vector void compute_mean(ptr_param result, array& tmp) const; /** - * Computes the column-wise mean of this (multi-)vector using a global - * reduction. + * Computes the column-wise arithmetic mean of this (multi-)vector using a + * global reduction. 
* * @param result a Dense row matrix, used to store the mean * (the number of columns in result must match the number From fd9ca82fd79db77d1a03d646aca1f233935f2a7d Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sun, 15 Oct 2023 08:10:54 +0200 Subject: [PATCH 408/583] Fixup missing template declaration, add more tests --- core/distributed/vector.cpp | 1 + reference/test/matrix/dense_kernels.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 23a6774ccd2..387e792c147 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -579,6 +579,7 @@ void Vector::compute_mean(ptr_param result) const } +template void Vector::compute_mean(ptr_param result, array& tmp) const { diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index a2527a31d3e..3a382371635 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -712,6 +712,16 @@ TYPED_TEST(Dense, ComputesMean) } +TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = Mtx::create(this->exec, gko::dim<2>{1, 2}); + + ASSERT_THROW(this->mtx4->compute_mean(result), gko::DimensionMismatch); +} + + TYPED_TEST(Dense, ComputeDotFailsOnWrongInputSize) { using Mtx = typename TestFixture::Mtx; From 569e1f67137e13828f7caf1a1d52b02ddf3f0124 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Sun, 15 Oct 2023 12:31:48 +0200 Subject: [PATCH 409/583] Fixup call with ptr_param --- test/mpi/vector.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 414f8197f57..43d18aad6c5 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -680,8 +680,8 @@ TYPED_TEST(VectorReductions, ComputesMeanIsSameAsDense) using value_type = typename TestFixture::value_type; this->init_result(); - this->x->compute_mean(this->res.get()); - this->dense_x->compute_mean(this->dense_res.get()); + this->x->compute_mean(this->res); + this->dense_x->compute_mean(this->dense_res); GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); } @@ -691,8 +691,8 @@ TYPED_TEST(VectorReductions, ComputesMeanWithTmpIsSameAsDense) using value_type = typename TestFixture::value_type; this->init_result(); - this->x->compute_mean(this->res.get(), this->tmp); - this->dense_x->compute_mean(this->dense_res.get(), this->dense_tmp); + this->x->compute_mean(this->res, this->tmp); + this->dense_x->compute_mean(this->dense_res, this->dense_tmp); GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); } From bed88788718a75c6a7cdc81abb99e2c987a604f9 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 16 Oct 2023 15:04:10 +0200 Subject: [PATCH 410/583] Add review suggestions Co-authored-by: Marcel Koch --- common/unified/matrix/dense_kernels.template.cpp | 6 +++--- core/distributed/vector.cpp | 4 ++-- core/matrix/dense.cpp | 6 ++---- include/ginkgo/core/distributed/vector.hpp | 13 ------------- reference/test/matrix/dense_kernels.cpp | 6 +++--- 5 files changed, 10 insertions(+), 25 deletions(-) diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index d7e1c08f38c..e8751a896a0 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -286,11 +286,11 @@ void compute_mean(std::shared_ptr exec, using ValueType_nc = 
gko::remove_complex; run_kernel_col_reduction_cached( exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto total_size) { - return x(i, j) / static_cast(total_size); + [] GKO_KERNEL(auto i, auto j, auto x, auto inv_total_size) { + return x(i, j) * inv_total_size; }, GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - tmp, x, x->get_size()[0]); + tmp, x, 1. / x->get_size()[0]); } diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 387e792c147..b828a44bd6d 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -587,7 +587,7 @@ void Vector::compute_mean(ptr_param result, const auto global_size = this->get_size()[0]; const auto local_size = this->get_local_vector()->get_size()[0]; const auto num_vecs = static_cast(this->get_size()[1]); - GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, num_vecs)); + GKO_ASSERT_EQUAL_COLS(result, dim<2>(1, num_vecs)); auto exec = this->get_executor(); const auto comm = this->get_communicator(); auto dense_res = make_temporary_clone(exec, as(result)); @@ -595,7 +595,7 @@ void Vector::compute_mean(ptr_param result, // scale by its weight ie ratio of local to global size auto weight = initialize>>( - 1, {static_cast>(local_size) / global_size}, + {static_cast>(local_size) / global_size}, this->get_executor()); dense_res->scale(weight.get()); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index a50ab6b260b..68a26c5bd87 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -509,7 +509,7 @@ template void Dense::compute_mean(ptr_param result, array& tmp) const { - GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + GKO_ASSERT_EQUAL_COLS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); if (tmp.get_executor() != exec) { tmp.clear(); @@ -533,11 +533,9 @@ void Dense::compute_squared_norm2_impl(LinOp* result) const template void Dense::compute_mean_impl(LinOp* result) const { - GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); - auto dense_res = make_temporary_conversion(result); array tmp{exec}; - exec->run(dense::make_compute_mean(this, dense_res.get(), tmp)); + this->compute_mean(make_temporary_output_clone(exec, result).get(), tmp); } diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 1e3b9571b19..87afa3a01b5 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -427,19 +427,6 @@ class Vector */ void compute_mean(ptr_param result, array& tmp) const; - /** - * Computes the column-wise arithmetic mean of this (multi-)vector using a - * global reduction. - * - * @param result a Dense row matrix, used to store the mean - * (the number of columns in result must match the number - * of columns of this) - * @param tmp the temporary storage to use for partial sums during the - * reduction computation. It may be resized and/or reset to the - * correct executor. - */ - void compute_mean(LinOp* result, array& tmp) const; - /** * Returns a single element of the multi-vector. 
* diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 3a382371635..532bd14ec95 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -706,9 +706,9 @@ TYPED_TEST(Dense, ComputesMean) this->mtx4->compute_mean(result.get()); - GKO_EXPECT_NEAR(result->at(0, 0), T{0.5}, 1e-6); - GKO_EXPECT_NEAR(result->at(0, 1), T{4.0}, 1e-6); - GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, 1e-6); + GKO_EXPECT_NEAR(result->at(0, 0), T{0.5}, r::value * 10); + GKO_EXPECT_NEAR(result->at(0, 1), T{4.0}, r::value * 10); + GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r::value * 10); } From a192d9466f27d580d6c67c6f1e0f7def45ef2020 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 06:37:09 +0200 Subject: [PATCH 411/583] Update test/mpi/vector.cpp Co-authored-by: Yu-Hsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com> --- test/mpi/vector.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 43d18aad6c5..515c8e59a7b 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -675,6 +675,7 @@ TYPED_TEST(VectorReductions, ComputeSquaredNorm2WithTmpIsSameAsDense) r::value); } + TYPED_TEST(VectorReductions, ComputesMeanIsSameAsDense) { using value_type = typename TestFixture::value_type; From 927d9e7ebe85117bfe9c93a0a5069501a2ac9fcc Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 06:37:24 +0200 Subject: [PATCH 412/583] Update test/mpi/vector.cpp Co-authored-by: Yu-Hsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com> --- common/unified/matrix/dense_kernels.template.cpp | 2 +- test/mpi/vector.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index e8751a896a0..9bd5c04f861 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -290,7 +290,7 @@ void compute_mean(std::shared_ptr exec, return x(i, j) * inv_total_size; }, GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - tmp, x, 1. 
/ x->get_size()[0]); + tmp, x, ValueType_nc{1.} / x->get_size()[0]); } diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 515c8e59a7b..2b00de19bda 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -687,6 +687,7 @@ TYPED_TEST(VectorReductions, ComputesMeanIsSameAsDense) GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); } + TYPED_TEST(VectorReductions, ComputesMeanWithTmpIsSameAsDense) { using value_type = typename TestFixture::value_type; From 393ea41733b8602da2fc2570ab5ad60d6a19a37a Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 06:39:03 +0200 Subject: [PATCH 413/583] Add review suggestions Co-authored-by: Marcel Koch --- core/distributed/vector.cpp | 2 +- core/matrix/dense.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index b828a44bd6d..b61d5c36328 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -587,7 +587,7 @@ void Vector::compute_mean(ptr_param result, const auto global_size = this->get_size()[0]; const auto local_size = this->get_local_vector()->get_size()[0]; const auto num_vecs = static_cast(this->get_size()[1]); - GKO_ASSERT_EQUAL_COLS(result, dim<2>(1, num_vecs)); + GKO_ASSERT_EQUAL_COLS(result, this); auto exec = this->get_executor(); const auto comm = this->get_communicator(); auto dense_res = make_temporary_clone(exec, as(result)); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 68a26c5bd87..9f7dff96aab 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -509,7 +509,7 @@ template void Dense::compute_mean(ptr_param result, array& tmp) const { - GKO_ASSERT_EQUAL_COLS(result, dim<2>(1, this->get_size()[1])); + GKO_ASSERT_EQUAL_COLS(result, this); auto exec = this->get_executor(); if (tmp.get_executor() != exec) { tmp.clear(); From b3dbc679b66382b236e508153b5f058aff08f2b8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 08:53:50 +0200 Subject: [PATCH 414/583] Doc fixes and format --- include/ginkgo/core/matrix/dense.hpp | 3 --- test/mpi/vector.cpp | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 9edf55d2e4c..0db8f7697a5 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -1238,9 +1238,6 @@ class Dense /** * @copydoc compute_mean(LinOp*) const - * - * @deprecated This function will be removed in the future, - * we will instead always use Ginkgo's implementation. 
*/ virtual void compute_mean_impl(LinOp* result) const; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 2b00de19bda..ac75a461465 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -699,6 +699,7 @@ TYPED_TEST(VectorReductions, ComputesMeanWithTmpIsSameAsDense) GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r::value); } + TYPED_TEST(VectorReductions, ComputeDotCopiesToHostOnlyIfNecessary) { this->init_result(); From 452af109676e3b00cf77469308065b5a3c22abf8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 08:54:30 +0200 Subject: [PATCH 415/583] Use simpler implementation for reference --- reference/matrix/dense_kernels.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index df86aedd047..ff69dcf2684 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -408,11 +408,10 @@ void compute_mean(std::shared_ptr exec, } for (size_type i = 0; i < x->get_size()[0]; ++i) { - const ValueType_nc alpha = static_cast(i) / (i + 1); - const ValueType_nc beta = static_cast(1) / (i + 1); for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = alpha * result->at(0, j) + beta * x->at(i, j); + result->at(0, i) += x->at(i, j); } + result->at(0, i) /= static_cast(x->get_size()[1]); } } From c364dd81f90479e3459abb206f6754fe2a3f9cf1 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 18 Oct 2023 10:09:42 +0200 Subject: [PATCH 416/583] Fix reference compute mean impl, add test --- reference/matrix/dense_kernels.cpp | 8 ++++---- reference/test/matrix/dense_kernels.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index ff69dcf2684..47df46b3c86 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -407,11 +407,11 @@ void compute_mean(std::shared_ptr exec, result->at(0, j) = zero(); } - for (size_type i = 0; i < x->get_size()[0]; ++i) { - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, i) += x->at(i, j); + for (size_type i = 0; i < x->get_size()[1]; ++i) { + for (size_type j = 0; j < x->get_size()[0]; ++j) { + result->at(0, i) += x->at(j, i); } - result->at(0, i) /= static_cast(x->get_size()[1]); + result->at(0, i) /= static_cast(x->get_size()[0]); } } diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 532bd14ec95..b776f426794 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include @@ -702,6 +703,13 @@ TYPED_TEST(Dense, ComputesMean) { using Mtx = typename TestFixture::Mtx; using T = typename TestFixture::value_type; + + auto iota = Mtx::create(this->exec, gko::dim<2>{10, 1}); + std::iota(iota->get_values(), iota->get_values() + 10, 1); + auto iota_result = Mtx::create(this->exec, gko::dim<2>{1, 1}); + iota->compute_mean(iota_result.get()); + GKO_EXPECT_NEAR(iota_result->at(0, 0), T{5.5}, r::value * 10); + auto result = Mtx::create(this->exec, gko::dim<2>{1, 3}); this->mtx4->compute_mean(result.get()); From 3e1c8316e544de399960535d35751b2987339406 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 19 Oct 2023 11:11:38 +0200 Subject: [PATCH 417/583] add the const apply check --- reference/test/matrix/batch_ell_kernels.cpp | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp index 81f189c3e02..d0e70bf5552 100644 --- a/reference/test/matrix/batch_ell_kernels.cpp +++ b/reference/test/matrix/batch_ell_kernels.cpp @@ -128,6 +128,21 @@ TYPED_TEST(Ell, AppliesToBatchMultiVector) } +TYPED_TEST(Ell, ConstAppliesToBatchMultiVector) +{ + using T = typename TestFixture::value_type; + using BMtx = typename TestFixture::BMtx; + + static_cast(this->mtx_0.get())->apply(this->b_0, this->x_0); + + this->mtx_00->apply(this->b_00.get(), this->x_00.get()); + this->mtx_01->apply(this->b_01.get(), this->x_01.get()); + auto res = gko::batch::unbatch>(this->x_0.get()); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r::value); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r::value); +} + + TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) { using BMtx = typename TestFixture::BMtx; @@ -154,6 +169,32 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector) } +TYPED_TEST(Ell, ConstAppliesLinearCombinationToBatchMultiVector) +{ + using BMtx = typename TestFixture::BMtx; + using BMVec = typename TestFixture::BMVec; + using DenseMtx = typename TestFixture::DenseMtx; + using T = typename TestFixture::value_type; + auto alpha = gko::batch::initialize({{1.5}, {-1.0}}, this->exec); + auto beta = gko::batch::initialize({{2.5}, {-4.0}}, this->exec); + auto alpha0 = gko::initialize({1.5}, this->exec); + auto alpha1 = gko::initialize({-1.0}, this->exec); + auto beta0 = gko::initialize({2.5}, this->exec); + auto beta1 = gko::initialize({-4.0}, this->exec); + + static_cast(this->mtx_0.get()) + ->apply(alpha.get(), this->b_0.get(), beta.get(), this->x_0.get()); + + this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(), + this->x_00.get()); + this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(), + this->x_01.get()); + auto res = gko::batch::unbatch>(this->x_0.get()); + GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r::value); + GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r::value); +} + + TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols) { using BMVec = typename TestFixture::BMVec; From 4ef1159e00fcccdf954eac6268db1460291f7f12 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 19 Oct 2023 11:16:28 +0200 Subject: [PATCH 418/583] fix batch ell infinite loop --- core/matrix/batch_ell.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index b2987e741d9..19b2dcae5c3 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -134,10 +134,7 @@ Ell* Ell::apply( ptr_param> b, ptr_param> x) { - this->validate_application_parameters(b.get(), x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, x).get()); + static_cast(this)->apply(b, x); return this; } @@ -147,7 +144,10 @@ const Ell* Ell::apply( ptr_param> b, ptr_param> x) const { - this->apply(b, x); + this->validate_application_parameters(b.get(), x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, x).get()); return this; } @@ -159,13 +159,7 @@ Ell* Ell::apply( ptr_param> beta, ptr_param> x) { - this->validate_application_parameters(alpha.get(), b.get(), beta.get(), - x.get()); - auto exec = this->get_executor(); - this->apply_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, beta).get(), - make_temporary_clone(exec, x).get()); + static_cast(this)->apply(alpha, b, beta, x); return this; } @@ -177,7 +171,13 @@ const Ell* Ell::apply( ptr_param> beta, ptr_param> x) const { - this->apply(alpha, b, beta, x); + this->validate_application_parameters(alpha.get(), b.get(), beta.get(), + x.get()); + auto exec = this->get_executor(); + this->apply_impl(make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), + make_temporary_clone(exec, beta).get(), + make_temporary_clone(exec, x).get()); return this; } From 53a006a8bde8d49407a35e689d2e393947d694c4 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Fri, 13 Oct 2023 22:09:37 +0200 Subject: [PATCH 419/583] Add pregenerated local solver as factory param --- core/distributed/preconditioner/schwarz.cpp | 13 +++++++++++-- .../core/distributed/preconditioner/schwarz.hpp | 5 +++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 0d1267bc0b4..2b2c33d23e7 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -102,14 +102,23 @@ template void Schwarz::generate( std::shared_ptr system_matrix) { - if (parameters_.local_solver) { + if (parameters_.local_solver && !parameters_.generated_local_solvers) { this->local_solver_ = parameters_.local_solver->generate( as>( system_matrix) ->get_local_matrix()); + } else if (parameters_.generated_local_solvers && + !parameters_.local_solver) { + this->local_solver_ = parameters_.generated_local_solvers; + } else if (!parameters_.generated_local_ && !parameters_.local_solver) { + throw ::gko::InvalidStateError( + __FILE__, __LINE__, __func__, + "Requires either a generated solver or an solver factory"); } else { - GKO_NOT_IMPLEMENTED; + throw ::gko::InvalidStateError( + __FILE__, __LINE__, __func__, + "Provided both a generated solver and a solver factory"); } } diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index f31bd96aa2e..5bce97fb414 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ 
b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -95,6 +95,11 @@ class Schwarz * Local solver factory. */ GKO_DEFERRED_FACTORY_PARAMETER(local_solver, LinOpFactory); + /** + * Generated Inner solvers. + */ + std::shared_ptr GKO_FACTORY_PARAMETER( + generated_local_solver, nullptr); }; GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); From dc36bf897fd269d1e64be85e126e52f1a7b395d8 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Fri, 13 Oct 2023 23:12:57 +0200 Subject: [PATCH 420/583] Add unit test --- test/mpi/preconditioner/schwarz.cpp | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 8d07ba44046..f3269b1d237 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -217,6 +217,36 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) this->non_dist_x); } +TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + using cg = typename TestFixture::solver_type; + using prec = typename TestFixture::dist_prec_type; + constexpr double tolerance = 1e-20; + auto iter_stop = gko::share( + gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); + auto tol_stop = gko::share( + gko::stop::ResidualNorm::build() + .with_reduction_factor( + static_cast>(tolerance)) + .on(this->exec)); + this->non_dist_solver_factory = + cg::build() + .with_preconditioner(this->local_solver_factory) + .with_criteria(iter_stop, tol_stop) + .on(this->exec); + auto local_solver = + this->non_dist_solver_factory->generate(this->non_dist_mat); + this->dist_solver_factory = + cg::build() + .with_preconditioner(prec::build() + .with_generated_local_solver(local_solver) + .on(this->exec)) + .with_criteria(iter_stop, tol_stop) + .on(this->exec); +} + TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner) { From 79aaaa168b88b19f90087e2b533520aa9bc22416 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 16 Oct 2023 11:02:48 +0200 Subject: [PATCH 421/583] Test if generate fails for invalid solver states --- core/distributed/preconditioner/schwarz.cpp | 9 +-- test/mpi/preconditioner/schwarz.cpp | 69 +++++++++++++-------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 2b2c33d23e7..90adf384cce 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -102,16 +102,17 @@ template void Schwarz::generate( std::shared_ptr system_matrix) { - if (parameters_.local_solver && !parameters_.generated_local_solvers) { + if (parameters_.local_solver && !parameters_.generated_local_solver) { this->local_solver_ = parameters_.local_solver->generate( as>( system_matrix) ->get_local_matrix()); - } else if (parameters_.generated_local_solvers && + } else if (parameters_.generated_local_solver && + !parameters_.local_solver) { + this->local_solver_ = parameters_.generated_local_solver; + } else if (!parameters_.generated_local_solver && !parameters_.local_solver) { - this->local_solver_ = parameters_.generated_local_solvers; - } else if (!parameters_.generated_local_ && !parameters_.local_solver) { throw ::gko::InvalidStateError( __FILE__, __LINE__, __func__, "Requires either a generated solver or an solver factory"); diff 
--git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index f3269b1d237..7a1f69a59a3 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -217,37 +217,56 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) this->non_dist_x); } -TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) + +TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfPregenSolverAndSolverFactoryArePresent) { - using value_type = typename TestFixture::value_type; - using csr = typename TestFixture::local_matrix_type; - using cg = typename TestFixture::solver_type; using prec = typename TestFixture::dist_prec_type; - constexpr double tolerance = 1e-20; - auto iter_stop = gko::share( - gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); - auto tol_stop = gko::share( - gko::stop::ResidualNorm::build() - .with_reduction_factor( - static_cast>(tolerance)) - .on(this->exec)); - this->non_dist_solver_factory = - cg::build() - .with_preconditioner(this->local_solver_factory) - .with_criteria(iter_stop, tol_stop) - .on(this->exec); auto local_solver = - this->non_dist_solver_factory->generate(this->non_dist_mat); - this->dist_solver_factory = - cg::build() - .with_preconditioner(prec::build() - .with_generated_local_solver(local_solver) - .on(this->exec)) - .with_criteria(iter_stop, tol_stop) - .on(this->exec); + gko::share(this->non_dist_solver_factory->generate(this->non_dist_mat)); + + auto schwarz = prec::build() + .with_local_solver(this->local_solver_factory) + .with_generated_local_solver(local_solver) + .on(this->exec); + + ASSERT_THROW(schwarz->generate(this->dist_mat), gko::InvalidStateError); + + auto schwarz_no_solver = prec::build().on(this->exec); + ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat), gko::InvalidStateError); } +// TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) +// { +// using value_type = typename TestFixture::value_type; +// using csr = typename TestFixture::local_matrix_type; +// using cg = typename TestFixture::solver_type; +// using prec = typename TestFixture::dist_prec_type; +// constexpr double tolerance = 1e-20; +// auto iter_stop = gko::share( +// gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); +// auto tol_stop = gko::share( +// gko::stop::ResidualNorm::build() +// .with_reduction_factor( +// static_cast>(tolerance)) +// .on(this->exec)); +// this->non_dist_solver_factory = +// cg::build() +// .with_preconditioner(this->local_solver_factory) +// .with_criteria(iter_stop, tol_stop) +// .on(this->exec); +// auto local_solver = +// this->non_dist_solver_factory->generate(this->non_dist_mat); +// this->dist_solver_factory = +// cg::build() +// .with_preconditioner(prec::build() +// .with_generated_local_solver(local_solver.get()) +// .on(this->exec)) +// .with_criteria(iter_stop, tol_stop) +// .on(this->exec); +// } + + TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner) { using value_type = typename TestFixture::value_type; From 1ff74f58700c364feae3544c83f338d81345d09c Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 14:58:08 +0200 Subject: [PATCH 422/583] refactor build method a bit, add unit tests --- core/distributed/preconditioner/schwarz.cpp | 28 +++-- test/mpi/preconditioner/schwarz.cpp | 121 +++++++++++--------- 2 files changed, 81 insertions(+), 68 deletions(-) diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 
90adf384cce..7dfdfd3b4a7 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -102,24 +102,28 @@ template void Schwarz::generate( std::shared_ptr system_matrix) { - if (parameters_.local_solver && !parameters_.generated_local_solver) { + if (parameters_.local_solver != nullptr && + parameters_.generated_local_solver != nullptr) { + throw ::gko::InvalidStateError( + __FILE__, __LINE__, __func__, + "Provided both a generated solver and a solver factory"); + } + + if (parameters_.local_solver == nullptr && + parameters_.generated_local_solver == nullptr) { + throw ::gko::InvalidStateError( + __FILE__, __LINE__, __func__, + "Requires either a generated solver or an solver factory"); + } + + if (parameters_.local_solver) { this->local_solver_ = parameters_.local_solver->generate( as>( system_matrix) ->get_local_matrix()); - } else if (parameters_.generated_local_solver && - !parameters_.local_solver) { - this->local_solver_ = parameters_.generated_local_solver; - } else if (!parameters_.generated_local_solver && - !parameters_.local_solver) { - throw ::gko::InvalidStateError( - __FILE__, __LINE__, __func__, - "Requires either a generated solver or an solver factory"); } else { - throw ::gko::InvalidStateError( - __FILE__, __LINE__, __func__, - "Provided both a generated solver and a solver factory"); + this->local_solver_ = parameters_.generated_local_solver; } } diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 7a1f69a59a3..42a043d2e51 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -178,65 +178,32 @@ class SchwarzPreconditioner : public CommonMpiTestFixture { TYPED_TEST_SUITE(SchwarzPreconditioner, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); - -TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) +TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState) { using value_type = typename TestFixture::value_type; - using csr = typename TestFixture::local_matrix_type; - using cg = typename TestFixture::solver_type; - using prec = typename TestFixture::dist_prec_type; - constexpr double tolerance = 1e-20; - auto iter_stop = gko::share( - gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); - auto tol_stop = gko::share( - gko::stop::ResidualNorm::build() - .with_reduction_factor( - static_cast>(tolerance)) - .on(this->exec)); - this->dist_solver_factory = - cg::build() - .with_preconditioner( - prec::build() - .with_local_solver(this->local_solver_factory) - .on(this->exec)) - .with_criteria(iter_stop, tol_stop) - .on(this->exec); - auto dist_solver = this->dist_solver_factory->generate(this->dist_mat); - this->non_dist_solver_factory = - cg::build() - .with_preconditioner(this->local_solver_factory) - .with_criteria(iter_stop, tol_stop) - .on(this->exec); - auto non_dist_solver = - this->non_dist_solver_factory->generate(this->non_dist_mat); - - dist_solver->apply(this->dist_b.get(), this->dist_x.get()); - non_dist_solver->apply(this->non_dist_b.get(), this->non_dist_x.get()); - - this->assert_equal_to_non_distributed_vector(this->dist_x, - this->non_dist_x); -} - - -TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfPregenSolverAndSolverFactoryArePresent) -{ + using local_index_type = typename TestFixture::local_index_type; + using local_prec_type = + gko::preconditioner::Jacobi; using prec = typename TestFixture::dist_prec_type; - auto local_solver = - 
gko::share(this->non_dist_solver_factory->generate(this->non_dist_mat)); + auto local_solver = gko::share(local_prec_type::build() + .with_max_block_size(1u) + .on(this->exec) + ->generate(this->non_dist_mat)); auto schwarz = prec::build() - .with_local_solver(this->local_solver_factory) - .with_generated_local_solver(local_solver) - .on(this->exec); + .with_local_solver(this->local_solver_factory) + .with_generated_local_solver(local_solver) + .on(this->exec); ASSERT_THROW(schwarz->generate(this->dist_mat), gko::InvalidStateError); auto schwarz_no_solver = prec::build().on(this->exec); - ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat), gko::InvalidStateError); + ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat), + gko::InvalidStateError); } -// TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) +// TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) // { // using value_type = typename TestFixture::value_type; // using csr = typename TestFixture::local_matrix_type; @@ -250,30 +217,72 @@ TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfPregenSolverAndSolverFactoryAre // .with_reduction_factor( // static_cast>(tolerance)) // .on(this->exec)); -// this->non_dist_solver_factory = +// this->dist_solver_factory = // cg::build() -// .with_preconditioner(this->local_solver_factory) +// .with_preconditioner( +// prec::build() +// .with_local_solver(this->local_solver_factory) +// .on(this->exec)) // .with_criteria(iter_stop, tol_stop) // .on(this->exec); -// auto local_solver = -// this->non_dist_solver_factory->generate(this->non_dist_mat); -// this->dist_solver_factory = +// auto dist_solver = this->dist_solver_factory->generate(this->dist_mat); +// this->non_dist_solver_factory = // cg::build() -// .with_preconditioner(prec::build() -// .with_generated_local_solver(local_solver.get()) -// .on(this->exec)) +// .with_preconditioner(this->local_solver_factory) // .with_criteria(iter_stop, tol_stop) // .on(this->exec); +// auto non_dist_solver = +// this->non_dist_solver_factory->generate(this->non_dist_mat); +// +// dist_solver->apply(this->dist_b.get(), this->dist_x.get()); +// dist_solver->apply(this->non_dist_b.get(), this->non_dist_x.get()); +// +// this->assert_equal_to_non_distributed_vector(this->dist_x, +// this->non_dist_x); // } -TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner) +TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) { using value_type = typename TestFixture::value_type; + using local_index_type = typename TestFixture::local_index_type; + using local_prec_type = + gko::preconditioner::Jacobi; using csr = typename TestFixture::local_matrix_type; using cg = typename TestFixture::solver_type; using prec = typename TestFixture::dist_prec_type; + auto local_solver = gko::share(local_prec_type::build() + .with_max_block_size(1u) + .on(this->exec) + ->generate(this->non_dist_mat)); + auto precond = prec::build() + .with_local_solver(this->local_solver_factory) + .on(this->exec) + ->generate(this->dist_mat); + + auto precond_pregen = prec::build() + .with_generated_local_solver(local_solver) + .on(this->exec) + ->generate(this->dist_mat); + + auto dist_x = gko::share(this->dist_x->clone()); + auto dist_x_pregen = gko::share(this->dist_x->clone()); + + precond->apply(this->dist_b.get(), dist_x.get()); + precond->apply(this->dist_b.get(), dist_x_pregen.get()); + + GKO_ASSERT_MTX_NEAR( + dist_x->get_local_vector(), dist_x_pregen->get_local_vector(), + r::value); +} + + 
+TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner) +{ + using value_type = typename TestFixture::value_type; + using prec = typename TestFixture::dist_prec_type; + auto precond_factory = prec::build() .with_local_solver(this->local_solver_factory) .on(this->exec); From 7ff372885d31e7726c1ed9fcf95906138e22bb75 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 17 Oct 2023 16:20:14 +0200 Subject: [PATCH 423/583] add missing test --- test/mpi/preconditioner/schwarz.cpp | 74 ++++++++++++++--------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 42a043d2e51..2241be8f535 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -203,43 +203,43 @@ TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState) } -// TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) -// { -// using value_type = typename TestFixture::value_type; -// using csr = typename TestFixture::local_matrix_type; -// using cg = typename TestFixture::solver_type; -// using prec = typename TestFixture::dist_prec_type; -// constexpr double tolerance = 1e-20; -// auto iter_stop = gko::share( -// gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); -// auto tol_stop = gko::share( -// gko::stop::ResidualNorm::build() -// .with_reduction_factor( -// static_cast>(tolerance)) -// .on(this->exec)); -// this->dist_solver_factory = -// cg::build() -// .with_preconditioner( -// prec::build() -// .with_local_solver(this->local_solver_factory) -// .on(this->exec)) -// .with_criteria(iter_stop, tol_stop) -// .on(this->exec); -// auto dist_solver = this->dist_solver_factory->generate(this->dist_mat); -// this->non_dist_solver_factory = -// cg::build() -// .with_preconditioner(this->local_solver_factory) -// .with_criteria(iter_stop, tol_stop) -// .on(this->exec); -// auto non_dist_solver = -// this->non_dist_solver_factory->generate(this->non_dist_mat); -// -// dist_solver->apply(this->dist_b.get(), this->dist_x.get()); -// dist_solver->apply(this->non_dist_b.get(), this->non_dist_x.get()); -// -// this->assert_equal_to_non_distributed_vector(this->dist_x, -// this->non_dist_x); -// } +TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + using cg = typename TestFixture::solver_type; + using prec = typename TestFixture::dist_prec_type; + constexpr double tolerance = 1e-20; + auto iter_stop = gko::share( + gko::stop::Iteration::build().with_max_iters(200u).on(this->exec)); + auto tol_stop = gko::share( + gko::stop::ResidualNorm::build() + .with_reduction_factor( + static_cast>(tolerance)) + .on(this->exec)); + this->dist_solver_factory = + cg::build() + .with_preconditioner( + prec::build() + .with_local_solver(this->local_solver_factory) + .on(this->exec)) + .with_criteria(iter_stop, tol_stop) + .on(this->exec); + auto dist_solver = this->dist_solver_factory->generate(this->dist_mat); + this->non_dist_solver_factory = + cg::build() + .with_preconditioner(this->local_solver_factory) + .with_criteria(iter_stop, tol_stop) + .on(this->exec); + auto non_dist_solver = + this->non_dist_solver_factory->generate(this->non_dist_mat); + + dist_solver->apply(this->dist_b.get(), this->dist_x.get()); + non_dist_solver->apply(this->non_dist_b.get(), this->non_dist_x.get()); + + this->assert_equal_to_non_distributed_vector(this->dist_x, + this->non_dist_x); +} 
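The two configuration paths exercised by these tests can be summarized in a minimal usage sketch. It assumes the fixture aliases used above (`prec` for the distributed Schwarz type, `local_prec_type` for the block-Jacobi local preconditioner) together with `exec`, `dist_mat` and `local_solver_factory` from the test fixture; exactly one of the two local-solver parameters may be set, otherwise generate() throws gko::InvalidStateError.

    // 1) let Schwarz generate the local solver from a factory:
    auto schwarz = prec::build()
                       .with_local_solver(local_solver_factory)
                       .on(exec)
                       ->generate(dist_mat);

    // 2) pass an already generated local solver instead:
    auto local_solver = gko::share(local_prec_type::build()
                                       .with_max_block_size(1u)
                                       .on(exec)
                                       ->generate(dist_mat->get_local_matrix()));
    auto schwarz_pregen = prec::build()
                              .with_generated_local_solver(local_solver)
                              .on(exec)
                              ->generate(dist_mat);

    // setting both options, or neither, is the invalid state checked above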
TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) From fc17c0b8df56dcbf21b11ad60320592187e5115d Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 18 Oct 2023 10:21:46 +0200 Subject: [PATCH 424/583] Implement review comments Co-authored-by: Pratik Nayak --- core/distributed/preconditioner/schwarz.cpp | 12 ++++-------- .../core/distributed/preconditioner/schwarz.hpp | 1 + test/mpi/preconditioner/schwarz.cpp | 13 ++++++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 7dfdfd3b4a7..dd3f86a1cd9 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -102,17 +102,13 @@ template void Schwarz::generate( std::shared_ptr system_matrix) { - if (parameters_.local_solver != nullptr && - parameters_.generated_local_solver != nullptr) { - throw ::gko::InvalidStateError( - __FILE__, __LINE__, __func__, + if (parameters_.local_solver && parameters_.generated_local_solver) { + GKO_INVALID_STATE( "Provided both a generated solver and a solver factory"); } - if (parameters_.local_solver == nullptr && - parameters_.generated_local_solver == nullptr) { - throw ::gko::InvalidStateError( - __FILE__, __LINE__, __func__, + if (!parameters_.local_solver && !parameters_.generated_local_solver) { + GKO_INVALID_STATE( "Requires either a generated solver or an solver factory"); } diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index 5bce97fb414..1b34faff7c4 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -95,6 +95,7 @@ class Schwarz * Local solver factory. */ GKO_DEFERRED_FACTORY_PARAMETER(local_solver, LinOpFactory); + /** * Generated Inner solvers. 
*/ diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 2241be8f535..506a8d1320f 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -196,7 +196,12 @@ TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState) .on(this->exec); ASSERT_THROW(schwarz->generate(this->dist_mat), gko::InvalidStateError); +} + +TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfNoSolverProvided) +{ + using prec = typename TestFixture::dist_prec_type; auto schwarz_no_solver = prec::build().on(this->exec); ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat), gko::InvalidStateError); @@ -260,21 +265,19 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) .with_local_solver(this->local_solver_factory) .on(this->exec) ->generate(this->dist_mat); - auto precond_pregen = prec::build() .with_generated_local_solver(local_solver) .on(this->exec) ->generate(this->dist_mat); - auto dist_x = gko::share(this->dist_x->clone()); auto dist_x_pregen = gko::share(this->dist_x->clone()); precond->apply(this->dist_b.get(), dist_x.get()); precond->apply(this->dist_b.get(), dist_x_pregen.get()); - GKO_ASSERT_MTX_NEAR( - dist_x->get_local_vector(), dist_x_pregen->get_local_vector(), - r::value); + GKO_ASSERT_MTX_NEAR(dist_x->get_local_vector(), + dist_x_pregen->get_local_vector(), + r::value); } From b345caa5df30b10d97044229a7dc9afff872e8e9 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 18 Oct 2023 14:52:41 +0200 Subject: [PATCH 425/583] Add review suggestions Co-authored-by: Yuhsiang Tsai --- core/distributed/preconditioner/schwarz.cpp | 26 ++++++++++++++----- .../distributed/preconditioner/schwarz.hpp | 10 +++++-- test/mpi/preconditioner/schwarz.cpp | 5 ++-- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index dd3f86a1cd9..45536c9df87 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -98,6 +98,20 @@ void Schwarz::apply_impl( } +template +void Schwarz::set_solver( + std::shared_ptr new_solver) +{ + auto exec = this->get_executor(); + if (new_solver) { + if (new_solver->get_executor() != exec) { + new_solver = gko::clone(exec, new_solver); + } + } + this->local_solver_ = new_solver; +} + + template void Schwarz::generate( std::shared_ptr system_matrix) @@ -113,13 +127,13 @@ void Schwarz::generate( } if (parameters_.local_solver) { - this->local_solver_ = parameters_.local_solver->generate( - as>( - system_matrix) - ->get_local_matrix()); + this->set_solver(gko::share(parameters_.local_solver->generate( + as>(system_matrix) + ->get_local_matrix()))); + } else { - this->local_solver_ = parameters_.generated_local_solver; + this->set_solver(parameters_.generated_local_solver); } } diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index 1b34faff7c4..e7cd2b1d471 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -99,7 +99,7 @@ class Schwarz /** * Generated Inner solvers. 
*/ - std::shared_ptr GKO_FACTORY_PARAMETER( + std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( generated_local_solver, nullptr); }; GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); @@ -136,7 +136,6 @@ class Schwarz */ void generate(std::shared_ptr system_matrix); - void apply_impl(const LinOp* b, LinOp* x) const override; template @@ -146,6 +145,13 @@ class Schwarz LinOp* x) const override; private: + /** + * Sets the solver operator used as the local solver. + * + * @param new_solver the new local solver + */ + void set_solver(std::shared_ptr new_solver); + std::shared_ptr local_solver_; }; diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 506a8d1320f..f0181cad39a 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -203,6 +203,7 @@ TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfNoSolverProvided) { using prec = typename TestFixture::dist_prec_type; auto schwarz_no_solver = prec::build().on(this->exec); + ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat), gko::InvalidStateError); } @@ -260,7 +261,7 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) auto local_solver = gko::share(local_prec_type::build() .with_max_block_size(1u) .on(this->exec) - ->generate(this->non_dist_mat)); + ->generate(this->dist_mat->get_local_matrix())); auto precond = prec::build() .with_local_solver(this->local_solver_factory) .on(this->exec) @@ -273,7 +274,7 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) auto dist_x_pregen = gko::share(this->dist_x->clone()); precond->apply(this->dist_b.get(), dist_x.get()); - precond->apply(this->dist_b.get(), dist_x_pregen.get()); + precond_pregen->apply(this->dist_b.get(), dist_x_pregen.get()); GKO_ASSERT_MTX_NEAR(dist_x->get_local_vector(), dist_x_pregen->get_local_vector(), From bd5fc17f80cfa2a965c60784ab6e73c063e04dfd Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 23 Oct 2023 08:05:19 +0000 Subject: [PATCH 426/583] Format files Co-authored-by: Gregor Olenik --- test/mpi/preconditioner/schwarz.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index f0181cad39a..3c6dbf33a52 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -258,10 +258,11 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver) using cg = typename TestFixture::solver_type; using prec = typename TestFixture::dist_prec_type; - auto local_solver = gko::share(local_prec_type::build() - .with_max_block_size(1u) - .on(this->exec) - ->generate(this->dist_mat->get_local_matrix())); + auto local_solver = + gko::share(local_prec_type::build() + .with_max_block_size(1u) + .on(this->exec) + ->generate(this->dist_mat->get_local_matrix())); auto precond = prec::build() .with_local_solver(this->local_solver_factory) .on(this->exec) From a5942fac01104400524ff1aecf57401671d6515e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 18 Oct 2023 10:15:26 +0200 Subject: [PATCH 427/583] add dpcpp csr diagonal missing components - check_diagonal_entries - add_scaled_identity --- dpcpp/matrix/csr_kernels.dp.cpp | 102 +++++++++++++++++++++++++++++++- test/matrix/csr_kernels2.cpp | 6 -- 2 files changed, 99 insertions(+), 9 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 11309b67b9b..c5a8e3ef4d4 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -871,6 +871,74 @@ void extract_diagonal(size_type diag_size, size_type nnz, GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal); +template +void check_diagonal_entries(const IndexType num_min_rows_cols, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + bool* const __restrict__ has_all_diags, + sycl::nd_item<3> item_ct1) +{ + constexpr int subgroup_size = config::warp_size; + auto tile_grp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto row = + thread::get_subwarp_id_flat(item_ct1); + if (row < num_min_rows_cols) { + const auto tid_in_warp = tile_grp.thread_rank(); + const auto row_start = row_ptrs[row]; + const auto num_nz = row_ptrs[row + 1] - row_start; + bool row_has_diag_local{false}; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { + if (col_idxs[iz + row_start] == row) { + row_has_diag_local = true; + break; + } + } + auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); + if (!row_has_diag) { + if (tile_grp.thread_rank() == 0) { + *has_all_diags = false; + } + return; + } + } +} + +GKO_ENABLE_DEFAULT_HOST(check_diagonal_entries, check_diagonal_entries); + + +template +void add_scaled_identity(const ValueType* const __restrict__ alpha, + const ValueType* const __restrict__ beta, + const IndexType num_rows, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + ValueType* const __restrict__ values, + sycl::nd_item<3> item_ct1) +{ + constexpr int subgroup_size = config::warp_size; + auto tile_grp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto row = + thread::get_subwarp_id_flat(item_ct1); + const auto num_warps = + thread::get_subwarp_num_flat(item_ct1); + if (row < num_rows) { + const auto tid_in_warp = tile_grp.thread_rank(); + const auto row_start = row_ptrs[row]; + const auto num_nz = row_ptrs[row + 1] - row_start; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { + values[iz + row_start] *= beta[0]; + if (col_idxs[iz + row_start] == row) { + values[iz + row_start] += alpha[0]; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(add_scaled_identity, add_scaled_identity); + + } // namespace kernel @@ -2364,8 +2432,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); template void check_diagonal_entries_exist( std::shared_ptr exec, - const matrix::Csr* const mtx, - bool& has_all_diags) GKO_NOT_IMPLEMENTED; + const matrix::Csr* const mtx, bool& has_all_diags) +{ + const size_type num_subgroup = mtx->get_size()[0]; + if (num_subgroup > 0) { + const size_type num_blocks = + num_subgroup / (default_block_size / config::warp_size); + array has_diags(exec, {true}); + kernel::check_diagonal_entries( + num_blocks, default_block_size, 0, exec->get_queue(), + static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])), + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + has_diags.get_data()); + has_all_diags = 
exec->copy_val_to_host(has_diags.get_const_data()); + } else { + has_all_diags = true; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); @@ -2376,7 +2460,19 @@ void add_scaled_identity(std::shared_ptr exec, const matrix::Dense* const alpha, const matrix::Dense* const beta, matrix::Csr* const mtx) - GKO_NOT_IMPLEMENTED; +{ + const auto nrows = mtx->get_size()[0]; + if (nrows == 0) { + return; + } + const auto nthreads = nrows * config::warp_size; + const auto nblocks = ceildiv(nthreads, default_block_size); + kernel::add_scaled_identity( + nblocks, default_block_size, 0, exec->get_queue(), + alpha->get_const_values(), beta->get_const_values(), + static_cast(nrows), mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), mtx->get_values()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 4d3ffa61323..412f9a41158 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -1311,9 +1311,6 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } -#ifndef GKO_COMPILING_DPCPP - - TEST_F(Csr, CanDetectMissingDiagonalEntry) { using T = double; @@ -1359,6 +1356,3 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) GKO_ASSERT_MTX_NEAR(mtx, dmtx, r::value); } - - -#endif // GKO_COMPILING_DPCPP From f78461435be15346d54c534cdf4c92bbc3b35469 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 23 Oct 2023 22:50:16 +0200 Subject: [PATCH 428/583] refine the kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Pratik Nayak Co-authored-by: Thomas Grützmacher --- common/cuda_hip/matrix/csr_common.hpp.inc | 1 - common/cuda_hip/matrix/csr_kernels.hpp.inc | 12 ++++++++---- dpcpp/matrix/csr_kernels.dp.cpp | 14 ++++++++------ omp/matrix/csr_kernels.cpp | 11 ++++++++--- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc index 0fce02aecfa..35718464c42 100644 --- a/common/cuda_hip/matrix/csr_common.hpp.inc +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -102,7 +102,6 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries( if (tile_grp.thread_rank() == 0) { *has_all_diags = false; } - return; } } } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 3f02337747e..4bc601c5067 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -826,15 +826,19 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( auto tile_grp = group::tiled_partition(group::this_thread_block()); const auto warpid = thread::get_subwarp_id_flat(); - const auto num_warps = thread::get_subwarp_num_flat(); if (warpid < num_rows) { const auto tid_in_warp = tile_grp.thread_rank(); const IndexType row_start = row_ptrs[warpid]; const IndexType num_nz = row_ptrs[warpid + 1] - row_start; + const auto beta_val = beta[0]; + const auto alpha_val = alpha[0]; for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - values[iz + row_start] *= beta[0]; - if (col_idxs[iz + row_start] == warpid) { - values[iz + row_start] += alpha[0]; + if (beta_val != one()) { + values[iz + row_start] *= beta_val; + } + if (col_idxs[iz + row_start] == warpid && + alpha_val != zero()) { + values[iz + row_start] += alpha_val; } } } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp 
b/dpcpp/matrix/csr_kernels.dp.cpp index c5a8e3ef4d4..915e2027a26 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -899,7 +899,6 @@ void check_diagonal_entries(const IndexType num_min_rows_cols, if (tile_grp.thread_rank() == 0) { *has_all_diags = false; } - return; } } } @@ -921,16 +920,19 @@ void add_scaled_identity(const ValueType* const __restrict__ alpha, group::this_thread_block(item_ct1)); const auto row = thread::get_subwarp_id_flat(item_ct1); - const auto num_warps = - thread::get_subwarp_num_flat(item_ct1); if (row < num_rows) { const auto tid_in_warp = tile_grp.thread_rank(); const auto row_start = row_ptrs[row]; const auto num_nz = row_ptrs[row + 1] - row_start; + const auto beta_val = beta[0]; + const auto alpha_val = alpha[0]; for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { - values[iz + row_start] *= beta[0]; - if (col_idxs[iz + row_start] == row) { - values[iz + row_start] += alpha[0]; + if (beta_val != one()) { + values[iz + row_start] *= beta_val; + } + if (col_idxs[iz + row_start] == row && + alpha_val != zero()) { + values[iz + row_start] += alpha_val; } } } diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 7d4a5a7ebd1..1757b4b8a25 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1134,12 +1134,17 @@ void add_scaled_identity(std::shared_ptr exec, const auto nrows = static_cast(mtx->get_size()[0]); const auto row_ptrs = mtx->get_const_row_ptrs(); const auto vals = mtx->get_values(); + const auto beta_val = beta->get_const_values()[0]; + const auto alpha_val = alpha->get_const_values()[0]; #pragma omp parallel for for (IndexType row = 0; row < nrows; row++) { for (IndexType iz = row_ptrs[row]; iz < row_ptrs[row + 1]; iz++) { - vals[iz] *= beta->get_const_values()[0]; - if (row == mtx->get_const_col_idxs()[iz]) { - vals[iz] += alpha->get_const_values()[0]; + if (beta_val != one()) { + vals[iz] *= beta_val; + } + if (row == mtx->get_const_col_idxs()[iz] && + alpha_val != zero()) { + vals[iz] += alpha_val; } } } From 6444d965fcf1e9b3ea3262cbede49281f314d1d6 Mon Sep 17 00:00:00 2001 From: "Jayesh Badwaik (FZ Juelich)" Date: Wed, 30 Aug 2023 12:46:38 +0200 Subject: [PATCH 429/583] Fix memory_order invocations to be inline with C++20 changes --- omp/reorder/rcm_kernels.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp index c0042224b3c..4de58456cc1 100644 --- a/omp/reorder/rcm_kernels.cpp +++ b/omp/reorder/rcm_kernels.cpp @@ -235,8 +235,8 @@ struct UbfsLinearQueue { #define GKO_CMPXCHG_IMPL(ptr, ptr_expected, replace_with) \ return __atomic_compare_exchange_n( \ ptr, ptr_expected, replace_with, true, \ - std::memory_order::memory_order_acq_rel, \ - std::memory_order::memory_order_acquire); + static_cast(std::memory_order_acq_rel), \ + static_cast(std::memory_order_acquire)); #endif /** From cec594be8dfb14ea9b740c8290d31477196bcca8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 24 Oct 2023 21:46:40 +0200 Subject: [PATCH 430/583] failed: missing diag in tail and ensure all diag --- core/test/utils/matrix_utils_test.cpp | 41 ++++++++++++++++----------- core/utils/matrix_utils.hpp | 7 +++-- test/matrix/csr_kernels2.cpp | 6 ++-- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 31a6072270e..5c1653f22dc 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -355,29 +355,38 @@ TEST(MatrixUtils, ModifyToEnsureAllDiagonalEntries) using T = float; using Csr = gko::matrix::Csr; auto exec = gko::ReferenceExecutor::create(); + auto check_all_diag = [](const Csr* csr) { + const auto rowptrs = csr->get_const_row_ptrs(); + const auto colidxs = csr->get_const_col_idxs(); + const auto ndiag = + static_cast(std::min(csr->get_size()[0], csr->get_size()[1])); + bool all_diags = true; + for (int i = 0; i < ndiag; i++) { + bool has_diag = false; + for (int j = rowptrs[i]; j < rowptrs[i + 1]; j++) { + if (colidxs[j] == i) { + has_diag = true; + break; + } + } + if (!has_diag) { + all_diags = false; + break; + } + } + return all_diags; + }; auto b = gko::initialize( {I{2.0, 0.0, 1.1, 0.0}, I{1.0, 2.4, 0.0, -1.0}, I{0.0, -4.0, 2.2, -2.0}, I{0.0, -3.0, 1.5, 1.0}}, exec); + // ensure it misses some diag + bool prev_check = check_all_diag(b.get()); gko::utils::ensure_all_diagonal_entries(b.get()); - const auto rowptrs = b->get_const_row_ptrs(); - const auto colidxs = b->get_const_col_idxs(); - bool all_diags = true; - for (int i = 0; i < 3; i++) { - bool has_diag = false; - for (int j = rowptrs[i]; j < rowptrs[i + 1]; j++) { - if (colidxs[j] == i) { - has_diag = true; - } - } - if (!has_diag) { - all_diags = false; - break; - } - } - ASSERT_TRUE(all_diags); + ASSERT_FALSE(prev_check); + ASSERT_TRUE(check_all_diag(b.get())); } diff --git a/core/utils/matrix_utils.hpp b/core/utils/matrix_utils.hpp index fed92ad73ef..65b610d1a1d 100644 --- a/core/utils/matrix_utils.hpp +++ b/core/utils/matrix_utils.hpp @@ -301,9 +301,10 @@ void ensure_all_diagonal_entries(MtxType* mtx) using index_type = typename MtxType::index_type; matrix_data mdata; mtx->write(mdata); - const auto nrows = static_cast(mtx->get_size()[0]); - mdata.nonzeros.reserve(mtx->get_num_stored_elements() + nrows); - for (index_type i = 0; i < nrows; i++) { + const auto ndiag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + mdata.nonzeros.reserve(mtx->get_num_stored_elements() + ndiag); + for (index_type i = 0; i < ndiag; i++) { mdata.nonzeros.push_back({i, i, zero()}); } mdata.sum_duplicates(); diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 412f9a41158..84b1335c675 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -1315,10 +1315,12 @@ TEST_F(Csr, CanDetectMissingDiagonalEntry) { using T = double; using Csr = Mtx; - auto ref_mtx = gen_mtx(103, 98, 10); + auto ref_mtx = gen_mtx(103, 104, 10); const auto rowptrs = ref_mtx->get_row_ptrs(); const auto colidxs = ref_mtx->get_col_idxs(); - const int testrow = 15; + gko::utils::ensure_all_diagonal_entries(ref_mtx.get()); + // Choose the last row to ensure that kernel assign enough work + const int testrow = 102; gko::utils::remove_diagonal_entry_from_row(ref_mtx.get(), testrow); auto mtx = gko::clone(exec, ref_mtx); bool has_diags = true; From 04449bb90a6f4b33cf8f6913b9edadf516dd290b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 24 Oct 2023 22:06:06 +0200 Subject: [PATCH 431/583] enough work for check and initial non-full-diag ex --- core/test/utils/matrix_utils_test.cpp | 7 +++---- cuda/matrix/csr_kernels.template.cu | 13 ++++++------- dpcpp/matrix/csr_kernels.dp.cpp | 13 ++++++------- hip/matrix/csr_kernels.template.hip.cpp | 13 ++++++------- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 5c1653f22dc..cc5ed70966d 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -376,10 +376,9 @@ TEST(MatrixUtils, ModifyToEnsureAllDiagonalEntries) } return all_diags; }; - auto b = gko::initialize( - {I{2.0, 0.0, 1.1, 0.0}, I{1.0, 2.4, 0.0, -1.0}, - I{0.0, -4.0, 2.2, -2.0}, I{0.0, -3.0, 1.5, 1.0}}, - exec); + auto b = gko::initialize({I{2.0, 0.0, 1.1}, I{1.0, 0.0, 0.0}, + I{0.0, -4.0, 2.2}, I{0.0, -3.0, 1.5}}, + exec); // ensure it misses some diag bool prev_check = check_all_diag(b.get()); diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 1b4b20a1e75..803cb530262 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -1322,16 +1322,15 @@ void check_diagonal_entries_exist( std::shared_ptr exec, const matrix::Csr* const mtx, bool& has_all_diags) { - const size_type num_warps = mtx->get_size()[0]; - if (num_warps > 0) { - const size_type num_blocks = - num_warps / (default_block_size / config::warp_size); + const auto num_diag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + if (num_diag > 0) { + const IndexType num_blocks = + ceildiv(num_diag, default_block_size / config::warp_size); array has_diags(exec, {true}); kernel::check_diagonal_entries<<get_stream()>>>( - static_cast( - std::min(mtx->get_size()[0], mtx->get_size()[1])), - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), has_diags.get_data()); has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); } else { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 915e2027a26..46e8894fdac 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2436,15 +2436,14 @@ void check_diagonal_entries_exist( std::shared_ptr exec, const matrix::Csr* const mtx, bool& has_all_diags) { - const size_type num_subgroup = mtx->get_size()[0]; - if (num_subgroup > 0) { - const size_type num_blocks = - num_subgroup / (default_block_size / config::warp_size); + const auto num_diag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + if (num_diag > 0) { + const IndexType num_blocks = + ceildiv(num_diag, default_block_size / config::warp_size); array has_diags(exec, {true}); kernel::check_diagonal_entries( - num_blocks, default_block_size, 0, exec->get_queue(), - static_cast( - std::min(mtx->get_size()[0], mtx->get_size()[1])), + num_blocks, default_block_size, 0, exec->get_queue(), num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), has_diags.get_data()); has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index e6a4fb64041..5e4de7b9699 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -1119,16 +1119,15 @@ void check_diagonal_entries_exist( std::shared_ptr exec, const matrix::Csr* const mtx, bool& 
has_all_diags) { - const size_type num_warps = mtx->get_size()[0]; - if (num_warps > 0) { - const size_type num_blocks = - num_warps / (default_block_size / config::warp_size); + const auto num_diag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + if (num_diag > 0) { + const IndexType num_blocks = + ceildiv(num_diag, default_block_size / config::warp_size); array has_diags(exec, {true}); kernel::check_diagonal_entries<<get_stream()>>>( - static_cast( - std::min(mtx->get_size()[0], mtx->get_size()[1])), - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), has_diags.get_data()); has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); } else { From ad4d2bbe6598e31f6f44e168504ad2580f0f2d10 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 3 Sep 2023 08:12:10 +0200 Subject: [PATCH 432/583] improve-doc --- include/ginkgo/core/base/lin_op.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 407fafda0d1..531163e6c94 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -520,6 +520,9 @@ class Permutable { * In the resulting LinOp, the entry at location `(i,j)` contains the input * value `(perm[i],perm[j])`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$P A P^{-1}\f$. + * * @param permutation_indices the array of indices containing the * permutation order. * @@ -538,6 +541,9 @@ class Permutable { * In the resulting LinOp, the entry at location `(perm[i],perm[j])` * contains the input value `(i,j)`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$P^{-1} A P\f$. + * * @param permutation_indices the array of indices containing the * permutation order. * @@ -555,6 +561,9 @@ class Permutable { * object. * In the resulting LinOp, the row `i` contains the input row `perm[i]`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$P A\f$. + * * @param permutation_indices the array of indices containing the * permutation order. * @@ -569,6 +578,9 @@ class Permutable { * In the resulting LinOp, the column `i` contains the input column * `perm[i]`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$A P^{-1}\f$. + * * @param permutation_indices the array of indices containing the * permutation order `perm`. * @@ -582,6 +594,9 @@ class Permutable { * object. * In the resulting LinOp, the row `perm[i]` contains the input row `i`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$P^{-1} A\f$. + * * @param permutation_indices the array of indices containing the * permutation order `perm`. * @@ -596,6 +611,9 @@ class Permutable { * In the resulting LinOp, the column `perm[i]` contains the input column * `i`. * + * From the linear algebra perspective, with \f$P_{ij} = \delta_{i + * \pi(i)}\f$, this represents the operation \f$A P\f$. + * * @param permutation_indices the array of indices containing the * permutation order `perm`. 
* From 03022434789ecdde574ef1e455899301729d13a2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 3 Sep 2023 08:13:39 +0200 Subject: [PATCH 433/583] doc-and-new-interface --- include/ginkgo/core/base/lin_op.hpp | 28 ++-- include/ginkgo/core/matrix/csr.hpp | 84 ++++++++++++ include/ginkgo/core/matrix/dense.hpp | 191 +++++++++++++++++++++++++++ 3 files changed, 289 insertions(+), 14 deletions(-) diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 531163e6c94..e40b0500bde 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -520,8 +520,8 @@ class Permutable { * In the resulting LinOp, the entry at location `(i,j)` contains the input * value `(perm[i],perm[j])`. * - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$P A P^{-1}\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $P A P^T$. * * @param permutation_indices the array of indices containing the * permutation order. @@ -533,7 +533,7 @@ class Permutable { { return as(this->row_permute(permutation_indices)) ->column_permute(permutation_indices); - }; + } /** * Returns a LinOp representing the symmetric inverse row and column @@ -541,8 +541,8 @@ class Permutable { * In the resulting LinOp, the entry at location `(perm[i],perm[j])` * contains the input value `(i,j)`. * - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$P^{-1} A P\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $P^{-1} A P^{-T}$. * * @param permutation_indices the array of indices containing the * permutation order. @@ -554,15 +554,15 @@ class Permutable { { return as(this->inverse_row_permute(permutation_indices)) ->inverse_column_permute(permutation_indices); - }; + } /** * Returns a LinOp representing the row permutation of the Permutable * object. * In the resulting LinOp, the row `i` contains the input row `perm[i]`. * - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$P A\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $P A$. * * @param permutation_indices the array of indices containing the * permutation order. @@ -578,8 +578,8 @@ class Permutable { * In the resulting LinOp, the column `i` contains the input column * `perm[i]`. * - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$A P^{-1}\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $A P^T$. * * @param permutation_indices the array of indices containing the * permutation order `perm`. @@ -594,8 +594,8 @@ class Permutable { * object. * In the resulting LinOp, the row `perm[i]` contains the input row `i`. * - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$P^{-1} A\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $P^{-1} A$. * * @param permutation_indices the array of indices containing the * permutation order `perm`. @@ -611,8 +611,8 @@ class Permutable { * In the resulting LinOp, the column `perm[i]` contains the input column * `i`. 
* - * From the linear algebra perspective, with \f$P_{ij} = \delta_{i - * \pi(i)}\f$, this represents the operation \f$A P\f$. + * From the linear algebra perspective, with $P_{ij} = \delta_{i + * \pi(i)}$, this represents the operation $A P^{-T}$. * * @param permutation_indices the array of indices containing the * permutation order `perm`. diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 611e5d33c64..834208c4322 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -59,6 +59,12 @@ class Ell; template class Hybrid; +template +class Permutation; + +template +class ScaledPermutation; + template class Sellp; @@ -763,6 +769,84 @@ class Csr : public EnableLinOp>, std::unique_ptr conj_transpose() const override; + /** + * Creates a permuted copy $A'$ of this matrix $A$ with the given + * permutation $P$. By default, this computes a symmetric permutation + * (permute_mode::symmetric). For the effect of the different permutation + * modes, see the following table. + * + * mode | entry mapping | matrix representation + * ------------------|----------------------------|---------------------- + * none | $A'(i, j) = A(i, j)$ | $A' = A$ + * rows | $A'(i, j) = A(p[i], j)$ | $A' = P A$ + * columns | $A'(i, j) = A(i, p[j])$ | $A' = A P^T$ + * inverse_rows | $A'(p[i], j) = A(i, j)$ | $A' = P^{-1} A$ + * inverse_columns | $A'(p[i], j) = A(i, j)$ | $A' = A P^{-T}$ + * symmetric | $A'(i, j) = A(p[i], p[j])$ | $A' = P A P^T$ + * inverse_symmetric | $A'(p[i], p[j]) = A(i, j)$ | $A' = P^{-1} A P^{-T}$ + * + * @param permutation The input permutation. + * @param mode The permutation mode. If permute_mode::inverse is set, we + * use the inverse permutation $P^{-1}$ instead of $P$. + * If permute_mode::rows is set, the rows will be permuted. + * If permute_mode::columns is set, the columns will be + * permuted. + * @return The permuted matrix. + */ + std::unique_ptr permute( + ptr_param> permutation, + permute_mode mode = permute_mode::symmetric) const; + + /** + * Creates a non-symmetrically permuted copy $A'$ of this matrix $A$ with + * the given row and column permutations $P$ and $Q$. The operation will + * compute $A'(i, j) = A(p[i], q[j])$, or $A' = P A Q^T$ if `invert` is + * `false`, and $A'(p[i], q[j]) = A(i,j)$, or $A' = P^{-1} A Q^{-T}$ if + * `invert` is `true`. + * + * @param row_permutation The permutation $P$ to apply to the rows + * @param column_permutation The permutation $Q$ to apply to the columns + * @param invert If set to `false`, uses the input permutations, otherwise + * uses their inverses $P^{-1}, Q^{-1}$ + * @return The permuted matrix. + */ + std::unique_ptr permute( + ptr_param> row_permutation, + ptr_param> column_permutation, + bool invert = false) const; + + /** + * Creates a scaled and permuted copy of this matrix. + * For an explanation of the permutation modes, see + * @ref permute(ptr_param>, permute_mode) + * + * @param permutation The scaled permutation. + * @param mode The permutation mode. + * @return The permuted matrix. + */ + std::unique_ptr scale_permute( + ptr_param> permutation, + permute_mode = permute_mode::symmetric) const; + + /** + * Creates a scaled and permuted copy of this matrix. + * For an explanation of the parameters, see + * @ref permute(ptr_param>, ptr_param>, permute_mode) + * + * @param row_permutation The scaled row permutation. + * @param column_permutation The scaled column permutation. 
+ * @param invert If set to `false`, uses the input permutations, otherwise + * uses their inverses $P^{-1}, Q^{-1}$ + * @return The permuted matrix. + */ + std::unique_ptr scale_permute( + ptr_param> + row_permutation, + ptr_param> + column_permutation, + bool invert = false) const; + std::unique_ptr permute( const array* permutation_indices) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 0db8f7697a5..9c4799951f2 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -81,6 +81,12 @@ class Fbcsr; template class Hybrid; +template +class Permutation; + +template +class ScaledPermutation; + template class Sellp; @@ -401,6 +407,191 @@ class Dense */ void fill(const ValueType value); + /** + * Creates a permuted copy $A'$ of this matrix $A$ with the given + * permutation $P$. By default, this computes a symmetric permutation + * (permute_mode::symmetric). For the effect of the different permutation + * modes, see the following table. + * + * mode | entry mapping | matrix representation + * ------------------|----------------------------|---------------------- + * none | $A'(i, j) = A(i, j)$ | $A' = A$ + * rows | $A'(i, j) = A(p[i], j)$ | $A' = P A$ + * columns | $A'(i, j) = A(i, p[j])$ | $A' = A P^T$ + * inverse_rows | $A'(p[i], j) = A(i, j)$ | $A' = P^{-1} A$ + * inverse_columns | $A'(p[i], j) = A(i, j)$ | $A' = A P^{-T}$ + * symmetric | $A'(i, j) = A(p[i], p[j])$ | $A' = P A P^T$ + * inverse_symmetric | $A'(p[i], p[j]) = A(i, j)$ | $A' = P^{-1} A P^{-T}$ + * + * @param permutation The input permutation. + * @param mode The permutation mode. If permute_mode::inverse is set, we + * use the inverse permutation $P^{-1}$ instead of $P$. + * If permute_mode::rows is set, the rows will be permuted. + * If permute_mode::columns is set, the columns will be + * permuted. + * @return The permuted matrix. + */ + std::unique_ptr permute( + ptr_param> permutation, + permute_mode mode = permute_mode::symmetric) const; + + /** + * @copydoc permute(ptr_param>, permute_mode) + */ + std::unique_ptr permute( + ptr_param> permutation, + permute_mode mode = permute_mode::symmetric) const; + + /** + * Overload of permute(ptr_param>, permute_mode) + * that writes the permuted copy into an existing Dense matrix. + * @param output the output matrix. + */ + void permute(ptr_param> permutation, + ptr_param output, permute_mode mode) const; + + /** + * @copydoc permute(ptr_param>, ptr_param, + * permute_mode) + */ + void permute(ptr_param> permutation, + ptr_param output, permute_mode mode) const; + + /** + * Creates a non-symmetrically permuted copy $A'$ of this matrix $A$ with + * the given row and column permutations $P$ and $Q$. The operation will + * compute $A'(i, j) = A(p[i], q[j])$, or $A' = P A Q^T$ if `invert` is + * `false`, and $A'(p[i], q[j]) = A(i,j)$, or $A' = P^{-1} A Q^{-T}$ if + * `invert` is `true`. + * + * @param row_permutation The permutation $P$ to apply to the rows + * @param column_permutation The permutation $Q$ to apply to the columns + * @param invert If set to `false`, uses the input permutations, otherwise + * uses their inverses $P^{-1}, Q^{-1}$ + * @return The permuted matrix. 
+ */ + std::unique_ptr permute( + ptr_param> row_permutation, + ptr_param> column_permutation, + bool invert = false) const; + + /** + * @copydoc permute(ptr_param>, ptr_param>, permute_mode) + */ + std::unique_ptr permute( + ptr_param> row_permutation, + ptr_param> column_permutation, + bool invert = false) const; + + /** + * Overload of permute(ptr_param>, ptr_param>, permute_mode) that writes the permuted copy into an + * existing Dense matrix. + * @param output the output matrix. + */ + void permute(ptr_param> row_permutation, + ptr_param> column_permutation, + ptr_param output, bool invert) const; + + /** + * @copydoc permute(ptr_param>, ptr_param>, ptr_param, permute_mode) + */ + void permute(ptr_param> row_permutation, + ptr_param> column_permutation, + ptr_param output, bool invert) const; + + /** + * Creates a scaled and permuted copy of this matrix. + * For an explanation of the permutation modes, see + * @ref permute(ptr_param>, permute_mode) + * + * @param permutation The scaled permutation. + * @param mode The permutation mode. + * @return The permuted matrix. + */ + std::unique_ptr scale_permute( + ptr_param> permutation, + permute_mode mode = permute_mode::symmetric) const; + + /** + * @copydoc scale_permute(ptr_param>, permute_mode) + */ + std::unique_ptr scale_permute( + ptr_param> permutation, + permute_mode mode = permute_mode::symmetric) const; + + /** + * Overload of scale_permute(ptr_param>, permute_mode) that writes the permuted copy into an + * existing Dense matrix. + * @param output the output matrix. + */ + void scale_permute( + ptr_param> permutation, + ptr_param output, permute_mode mode) const; + + /** + * @copydoc scale_permute(ptr_param>, ptr_param, permute_mode) + */ + std::unique_ptr scale_permute( + ptr_param> permutation, + ptr_param output, permute_mode mode) const; + + /** + * Creates a scaled and permuted copy of this matrix. + * For an explanation of the parameters, see + * @ref permute(ptr_param>, ptr_param>, permute_mode) + * + * @param row_permutation The scaled row permutation. + * @param column_permutation The scaled column permutation. + * @param invert If set to `false`, uses the input permutations, otherwise + * uses their inverses $P^{-1}, Q^{-1}$ + * @return The permuted matrix. + */ + std::unique_ptr scale_permute( + ptr_param> row_permutation, + ptr_param> + column_permutation, + bool invert = false) const; + + /** + * @copydoc scale_permute(ptr_param>, ptr_param>, bool) + */ + std::unique_ptr scale_permute( + ptr_param> row_permutation, + ptr_param> + column_permutation, + bool invert = false) const; + + /** + * Overload of scale_permute(ptr_param>, ptr_param>, bool) + * that writes the permuted copy into an existing Dense matrix. + * @param output the output matrix. 
+ */ + std::unique_ptr scale_permute( + ptr_param> row_permutation, + ptr_param> + column_permutation, + ptr_param output, bool invert) const; + + /** + * @copydoc scale_permute(ptr_param>, ptr_param>, + * ptr_param, bool) + */ + std::unique_ptr scale_permute( + ptr_param> row_permutation, + ptr_param> + column_permutation, + ptr_param output, bool invert) const; + std::unique_ptr permute( const array* permutation_indices) const override; From f669e50df1814a9249543d63f719c6f48b136377 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 6 Oct 2023 20:54:13 +0200 Subject: [PATCH 434/583] add new interface and kernels --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 631 +++++++++++++++++ common/unified/CMakeLists.txt | 2 + common/unified/matrix/csr_kernels.cpp | 56 +- .../matrix/dense_kernels.instantiate.cpp | 24 +- .../unified/matrix/dense_kernels.template.cpp | 225 +++++- common/unified/matrix/permutation_kernels.cpp | 67 ++ .../matrix/scaled_permutation_kernels.cpp | 72 ++ core/CMakeLists.txt | 3 +- core/device_hooks/common_kernels.inc.cpp | 46 +- core/matrix/csr.cpp | 317 ++++++--- core/matrix/csr_kernels.hpp | 216 +++--- core/matrix/dense.cpp | 439 +++++++++--- core/matrix/dense_kernels.hpp | 336 +++++---- core/matrix/permutation.cpp | 38 ++ core/matrix/permutation_kernels.hpp | 82 +++ core/matrix/scaled_permutation.cpp | 142 ++++ core/matrix/scaled_permutation_kernels.hpp | 68 ++ core/test/matrix/permutation.cpp | 99 +-- cuda/matrix/csr_kernels.instantiate.cu | 12 +- cuda/matrix/csr_kernels.template.cu | 365 +--------- dpcpp/matrix/csr_kernels.dp.cpp | 85 ++- .../distributed-solver/distributed-solver.cpp | 5 +- hip/matrix/csr_kernels.instantiate.hip.cpp | 12 +- hip/matrix/csr_kernels.template.hip.cpp | 336 --------- include/ginkgo/core/base/exception.hpp | 20 + include/ginkgo/core/matrix/csr.hpp | 4 +- include/ginkgo/core/matrix/dense.hpp | 45 +- include/ginkgo/core/matrix/permutation.hpp | 94 ++- .../ginkgo/core/matrix/scaled_permutation.hpp | 177 +++++ include/ginkgo/ginkgo.hpp | 1 + omp/matrix/csr_kernels.cpp | 171 ++++- omp/test/reorder/rcm_kernels.cpp | 7 +- reference/CMakeLists.txt | 2 + reference/matrix/csr_kernels.cpp | 219 +++++- reference/matrix/dense_kernels.cpp | 230 ++++++- reference/matrix/permutation_kernels.cpp | 58 ++ .../matrix/scaled_permutation_kernels.cpp | 64 ++ reference/test/matrix/CMakeLists.txt | 1 + reference/test/matrix/csr_kernels.cpp | 461 ++++++++++++- reference/test/matrix/dense_kernels.cpp | 641 ++++++++++++++++++ reference/test/matrix/permutation.cpp | 415 +----------- reference/test/matrix/scaled_permutation.cpp | 116 ++++ reference/test/reorder/rcm_kernels.cpp | 2 +- test/matrix/CMakeLists.txt | 2 + test/matrix/csr_kernels2.cpp | 227 ++++++- test/matrix/dense_kernels.cpp | 216 ++++++ test/matrix/permutation_kernels.cpp | 73 ++ test/matrix/scaled_permutation_kernels.cpp | 77 +++ 48 files changed, 5305 insertions(+), 1696 deletions(-) create mode 100644 common/unified/matrix/permutation_kernels.cpp create mode 100644 common/unified/matrix/scaled_permutation_kernels.cpp create mode 100644 core/matrix/permutation_kernels.hpp create mode 100644 core/matrix/scaled_permutation.cpp create mode 100644 core/matrix/scaled_permutation_kernels.hpp create mode 100644 include/ginkgo/core/matrix/scaled_permutation.hpp create mode 100644 reference/matrix/permutation_kernels.cpp create mode 100644 reference/matrix/scaled_permutation_kernels.cpp create mode 100644 reference/test/matrix/scaled_permutation.cpp create mode 100644 test/matrix/permutation_kernels.cpp 
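The entry mappings from the permute_mode table documented above can be made concrete with a small standalone sketch. It uses plain C++ on a row-major n-by-n array rather than the Ginkgo classes, so the only assumption is that `perm` holds a valid permutation of 0..n-1: the `rows` mode gathers rows through the permutation (A' = P A), while `inverse_rows` scatters them (A' = P^{-1} A).

    #include <cstddef>
    #include <vector>

    // permute_mode::rows:  out(i, j) = in(perm[i], j)  <=>  out = P * in
    std::vector<double> permute_rows(const std::vector<double>& in,
                                     const std::vector<int>& perm, std::size_t n)
    {
        std::vector<double> out(n * n);
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < n; ++j) {
                out[i * n + j] = in[perm[i] * n + j];
            }
        }
        return out;
    }

    // permute_mode::inverse_rows:  out(perm[i], j) = in(i, j)  <=>  out = P^{-1} * in
    std::vector<double> inverse_permute_rows(const std::vector<double>& in,
                                             const std::vector<int>& perm,
                                             std::size_t n)
    {
        std::vector<double> out(n * n);
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < n; ++j) {
                out[perm[i] * n + j] = in[i * n + j];
            }
        }
        return out;
    }

The `columns` and `inverse_columns` modes apply the same gather/scatter to the second index, and the symmetric modes combine the row and column variants.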
create mode 100644 test/matrix/scaled_permutation_kernels.cpp diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 4bc601c5067..3a762ad5ad1 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -764,6 +764,147 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute( } +template +__global__ __launch_bounds__(default_block_size) void inv_nonsymm_permute( + size_type num_rows, const IndexType* __restrict__ row_permutation, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + auto lane = threadIdx.x % subwarp_size; + auto in_row = tid; + auto out_row = row_permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = col_permutation[in_cols[in_begin + i]]; + out_vals[out_begin + i] = in_vals[in_begin + i]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void row_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + auto lane = threadIdx.x % subwarp_size; + auto in_row = permutation[tid]; + auto out_row = tid; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] * scale[out_row]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_row_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + auto lane = threadIdx.x % subwarp_size; + auto in_row = tid; + auto out_row = permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] / scale[in_row]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_symm_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ 
in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + auto lane = threadIdx.x % subwarp_size; + auto in_row = tid; + auto out_row = permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + const auto in_col = in_cols[in_begin + i]; + out_cols[out_begin + i] = permutation[in_col]; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (scale[in_row] * scale[in_col]); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_nonsymm_scale_permute( + size_type num_rows, const ValueType* __restrict__ row_scale, + const IndexType* __restrict__ row_permutation, + const ValueType* __restrict__ col_scale, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + auto lane = threadIdx.x % subwarp_size; + auto in_row = tid; + auto out_row = row_permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + const auto in_col = in_cols[in_begin + i]; + out_cols[out_begin + i] = col_permutation[in_col]; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (row_scale[in_row] * col_scale[in_col]); + } +} + + template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( @@ -1120,6 +1261,408 @@ void build_lookup(std::shared_ptr exec, } +namespace { + + +template +void spgeam(syn::value_list, + std::shared_ptr exec, const ValueType* alpha, + const IndexType* a_row_ptrs, const IndexType* a_col_idxs, + const ValueType* a_vals, const ValueType* beta, + const IndexType* b_row_ptrs, const IndexType* b_col_idxs, + const ValueType* b_vals, matrix::Csr* c) +{ + auto m = static_cast(c->get_size()[0]); + auto c_row_ptrs = c->get_row_ptrs(); + // count nnz for alpha * A + beta * B + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(m, subwarps_per_block); + if (num_blocks > 0) { + kernel::spgeam_nnz + <<get_stream()>>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + } + + // build row pointers + components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1); + + // accumulate non-zeros for alpha * A + beta * B + matrix::CsrBuilder c_builder{c}; + auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); + c_builder.get_col_idx_array().resize_and_reset(c_nnz); + c_builder.get_value_array().resize_and_reset(c_nnz); + auto c_col_idxs = c->get_col_idxs(); + auto c_vals = c->get_values(); + if (num_blocks > 0) { + kernel::spgeam + <<get_stream()>>>( + as_device_type(alpha), a_row_ptrs, a_col_idxs, + as_device_type(a_vals), as_device_type(beta), b_row_ptrs, + b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs, + as_device_type(c_vals)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); + + +} // namespace + + +template +void spgeam(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Csr* a, + 
const matrix::Dense* beta, + const matrix::Csr* b, + matrix::Csr* c) +{ + auto total_nnz = + a->get_num_stored_elements() + b->get_num_stored_elements(); + auto nnz_per_row = total_nnz / a->get_size()[0]; + select_spgeam( + spgeam_kernels(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= nnz_per_row || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, + alpha->get_const_values(), a->get_const_row_ptrs(), + a->get_const_col_idxs(), a->get_const_values(), + beta->get_const_values(), b->get_const_row_ptrs(), + b->get_const_col_idxs(), b->get_const_values(), c); +} + + +template +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto stride = result->get_stride(); + const auto row_ptrs = source->get_const_row_ptrs(); + const auto col_idxs = source->get_const_col_idxs(); + const auto vals = source->get_const_values(); + + auto grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::fill_in_dense<<get_stream()>>>( + num_rows, as_device_type(row_ptrs), as_device_type(col_idxs), + as_device_type(vals), stride, as_device_type(result->get_values())); + } +} + + +template +void inv_symm_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_symm_permute + <<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, row_perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_nonsymm_permute + <<get_stream()>>>( + num_rows, row_perm, col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, 
row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::row_permute + <<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_row_permute + <<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_symm_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, row_perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_nonsymm_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(row_scale), row_perm, + as_device_type(col_scale), col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto 
count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::row_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_row_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void calculate_nonzeros_per_row_in_span( + std::shared_ptr exec, + const matrix::Csr* source, const span& row_span, + const span& col_span, array* row_nnz) +{ + const auto num_rows = source->get_size()[0]; + auto row_ptrs = source->get_const_row_ptrs(); + auto col_idxs = source->get_const_col_idxs(); + auto grid_dim = ceildiv(row_span.length(), default_block_size); + if (grid_dim > 0) { + kernel::calculate_nnz_per_row_in_span<<get_stream()>>>( + row_span, col_span, as_device_type(row_ptrs), + as_device_type(col_idxs), as_device_type(row_nnz->get_data())); + } +} + + +template +void compute_submatrix(std::shared_ptr exec, + const matrix::Csr* source, + gko::span row_span, gko::span col_span, + matrix::Csr* result) +{ + auto row_offset = row_span.begin; + auto col_offset = col_span.begin; + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto row_ptrs = source->get_const_row_ptrs(); + auto grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::compute_submatrix_idxs_and_vals<<get_stream()>>>( + num_rows, num_cols, row_offset, col_offset, + as_device_type(source->get_const_row_ptrs()), + as_device_type(source->get_const_col_idxs()), + as_device_type(source->get_const_values()), + as_device_type(result->get_const_row_ptrs()), + as_device_type(result->get_col_idxs()), + as_device_type(result->get_values())); + } +} + + +template +void calculate_nonzeros_per_row_in_index_set( + std::shared_ptr exec, + const matrix::Csr* source, + const gko::index_set& row_index_set, + const gko::index_set& col_index_set, + IndexType* row_nnz) GKO_NOT_IMPLEMENTED; + + +template +void compute_submatrix_from_index_set( + std::shared_ptr exec, + const matrix::Csr* source, + const gko::index_set& row_index_set, + const gko::index_set& col_index_set, + matrix::Csr* result) GKO_NOT_IMPLEMENTED; 
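The host wrappers above all follow the same two-phase pattern: a first kernel scatters each input row's length to its permuted position, a prefix sum turns the lengths into the output row pointers, and a second subwarp-per-row kernel copies column indices and values into the gathered rows. The following sequential reference is a minimal sketch of the symmetric inverse permutation (B[perm[i]][perm[j]] = A[i][j]); the struct and function names are illustrative and not part of the library API.

#include <cstdint>
#include <numeric>
#include <vector>

struct csr_ref {
    std::vector<std::int64_t> row_ptrs;
    std::vector<std::int64_t> cols;
    std::vector<double> vals;
};

csr_ref inv_symm_permute_ref(const csr_ref& in,
                             const std::vector<std::int64_t>& perm)
{
    const auto n = static_cast<std::int64_t>(perm.size());
    csr_ref out;
    out.row_ptrs.assign(n + 1, 0);
    // phase 1: the length of input row i becomes the length of output row perm[i]
    for (std::int64_t i = 0; i < n; ++i) {
        out.row_ptrs[perm[i] + 1] = in.row_ptrs[i + 1] - in.row_ptrs[i];
    }
    std::partial_sum(out.row_ptrs.begin(), out.row_ptrs.end(),
                     out.row_ptrs.begin());
    out.cols.resize(in.cols.size());
    out.vals.resize(in.vals.size());
    // phase 2: copy each row into its permuted position, relabeling columns
    for (std::int64_t i = 0; i < n; ++i) {
        auto out_pos = out.row_ptrs[perm[i]];
        for (auto k = in.row_ptrs[i]; k < in.row_ptrs[i + 1]; ++k) {
            out.cols[out_pos] = perm[in.cols[k]];
            out.vals[out_pos] = in.vals[k];
            ++out_pos;
        }
    }
    return out;
}

Note that the relabeled column indices are generally no longer sorted within each row, which is why the Csr::permute implementations further down call sort_by_column_index whenever columns are permuted.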
+ + template void fallback_transpose(std::shared_ptr exec, const matrix::Csr* input, @@ -1169,3 +1712,91 @@ void fallback_sort(std::shared_ptr exec, thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, col_val_it); } + + +template +void is_sorted_by_column_index( + std::shared_ptr exec, + const matrix::Csr* to_check, bool* is_sorted) +{ + *is_sorted = true; + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); + auto gpu_array = array{exec, cpu_array}; + auto block_size = default_block_size; + auto num_rows = static_cast(to_check->get_size()[0]); + auto num_blocks = ceildiv(num_rows, block_size); + if (num_blocks > 0) { + kernel:: + check_unsorted<<get_stream()>>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + } + cpu_array = gpu_array; +} + + +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Csr* orig, + matrix::Diagonal* diag) +{ + const auto nnz = orig->get_num_stored_elements(); + const auto diag_size = diag->get_size()[0]; + const auto num_blocks = + ceildiv(config::warp_size * diag_size, default_block_size); + + const auto orig_values = orig->get_const_values(); + const auto orig_row_ptrs = orig->get_const_row_ptrs(); + const auto orig_col_idxs = orig->get_const_col_idxs(); + auto diag_values = diag->get_values(); + if (num_blocks > 0) { + kernel::extract_diagonal<<get_stream()>>>( + diag_size, nnz, as_device_type(orig_values), + as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs), + as_device_type(diag_values)); + } +} + + +template +void check_diagonal_entries_exist( + std::shared_ptr exec, + const matrix::Csr* const mtx, bool& has_all_diags) +{ + const auto num_diag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + if (num_diag > 0) { + const IndexType num_blocks = + ceildiv(num_diag, default_block_size / config::warp_size); + array has_diags(exec, {true}); + kernel::check_diagonal_entries<<get_stream()>>>( + num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + has_diags.get_data()); + has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); + } else { + has_all_diags = true; + } +} + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::Dense* const alpha, + const matrix::Dense* const beta, + matrix::Csr* const mtx) +{ + const auto nrows = mtx->get_size()[0]; + if (nrows == 0) { + return; + } + const auto nthreads = nrows * config::warp_size; + const auto nblocks = ceildiv(nthreads, default_block_size); + kernel::add_scaled_identity<<get_stream()>>>( + as_device_type(alpha->get_const_values()), + as_device_type(beta->get_const_values()), static_cast(nrows), + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_device_type(mtx->get_values())); +} diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt index 67fc839d6a7..7ac6b3df40c 100644 --- a/common/unified/CMakeLists.txt +++ b/common/unified/CMakeLists.txt @@ -12,6 +12,8 @@ set(UNIFIED_SOURCES matrix/csr_kernels.cpp matrix/ell_kernels.cpp matrix/hybrid_kernels.cpp + matrix/permutation_kernels.cpp + matrix/scaled_permutation_kernels.cpp matrix/sellp_kernels.cpp matrix/sparsity_csr_kernels.cpp matrix/diagonal_kernels.cpp diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 1704fdd1f9c..4746f88ddfe 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -54,53 +54,71 @@ namespace GKO_DEVICE_NAMESPACE { namespace csr { -template -void 
invert_permutation(std::shared_ptr exec, - size_type size, const IndexType* permutation_indices, - IndexType* inv_permutation) +template +void inv_col_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) { + auto num_rows = orig->get_size()[0]; + auto nnz = orig->get_num_stored_elements(); + auto size = std::max(num_rows, nnz); run_kernel( exec, - [] GKO_KERNEL(auto tid, auto permutation, auto inv_permutation) { - inv_permutation[permutation[tid]] = tid; + [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, + auto permutation, auto in_row_ptrs, auto in_col_idxs, + auto in_vals, auto out_row_ptrs, auto out_col_idxs, + auto out_vals) { + if (tid < num_nonzeros) { + out_col_idxs[tid] = permutation[in_col_idxs[tid]]; + out_vals[tid] = in_vals[tid]; + } + if (tid <= num_rows) { + out_row_ptrs[tid] = in_row_ptrs[tid]; + } }, - size, permutation_indices, inv_permutation); + size, num_rows, nnz, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), orig->get_const_values(), + col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(), + col_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); template -void inverse_column_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* column_permuted) +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) { auto num_rows = orig->get_size()[0]; auto nnz = orig->get_num_stored_elements(); auto size = std::max(num_rows, nnz); run_kernel( exec, - [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, + [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, auto scale, auto permutation, auto in_row_ptrs, auto in_col_idxs, auto in_vals, auto out_row_ptrs, auto out_col_idxs, auto out_vals) { if (tid < num_nonzeros) { - out_col_idxs[tid] = permutation[in_col_idxs[tid]]; - out_vals[tid] = in_vals[tid]; + const auto in_col = in_col_idxs[tid]; + out_col_idxs[tid] = permutation[in_col]; + out_vals[tid] = in_vals[tid] / scale[in_col]; } if (tid <= num_rows) { out_row_ptrs[tid] = in_row_ptrs[tid]; } }, - size, num_rows, nnz, perm, orig->get_const_row_ptrs(), + size, num_rows, nnz, scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), orig->get_const_values(), - column_permuted->get_row_ptrs(), column_permuted->get_col_idxs(), - column_permuted->get_values()); + col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(), + col_permuted->get_values()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); template diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp index f34d05954c4..73e06385f54 100644 --- a/common/unified/matrix/dense_kernels.instantiate.cpp +++ b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -59,16 +59,36 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); 
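A pure column permutation leaves the row structure untouched, so the unified kernels above avoid the two-phase scheme entirely: they launch one flat index range that covers both the nonzeros and the num_rows + 1 row pointers, copy the row pointers verbatim, relabel every column index through the permutation, and, in the scaled variant, divide each value by the scale factor of its original column. A sequential sketch with illustrative names and assumed double/int64 types:

#include <cstddef>
#include <cstdint>

void inv_col_scale_permute_ref(std::size_t num_rows, std::size_t nnz,
                               const double* scale, const std::int64_t* perm,
                               const std::int64_t* in_row_ptrs,
                               const std::int64_t* in_cols,
                               const double* in_vals,
                               std::int64_t* out_row_ptrs,
                               std::int64_t* out_cols, double* out_vals)
{
    // row pointers are unchanged by a column permutation
    for (std::size_t i = 0; i <= num_rows; ++i) {
        out_row_ptrs[i] = in_row_ptrs[i];
    }
    // relabel columns and undo the scaling attached to the original column
    for (std::size_t k = 0; k < nnz; ++k) {
        const auto in_col = in_cols[k];
        out_cols[k] = perm[in_col];
        out_vals[k] = in_vals[k] / scale[in_col];
    }
}

As in the row-permuting kernels, the relabeled column indices end up unsorted, so the caller re-sorts the result afterwards.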
GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index 9bd5c04f861..f3723ae8aad 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -383,7 +383,7 @@ void compute_sqrt(std::shared_ptr exec, template void symm_permute(std::shared_ptr exec, - const array* permutation_indices, + const IndexType* permutation_indices, const matrix::Dense* orig, matrix::Dense* permuted) { @@ -392,13 +392,13 @@ void symm_permute(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, col) = orig(perm[row], perm[col]); }, - orig->get_size(), orig, *permutation_indices, permuted); + orig->get_size(), orig, permutation_indices, permuted); } template void inv_symm_permute(std::shared_ptr exec, - const array* permutation_indices, + const IndexType* permutation_indices, const matrix::Dense* orig, matrix::Dense* permuted) { @@ -407,14 +407,49 @@ void inv_symm_permute(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(perm[row], perm[col]) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, permuted); + orig->get_size(), orig, permutation_indices, permuted); +} + + +template +void nonsymm_permute(std::shared_ptr exec, + const IndexType* row_permutation_indices, + const IndexType* column_permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm, + auto col_perm, auto permuted) { + permuted(row, col) = orig(row_perm[row], col_perm[col]); + }, + orig->get_size(), orig, row_permutation_indices, + column_permutation_indices, permuted); +} + + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_permutation_indices, + const IndexType* column_permutation_indices, + const matrix::Dense* orig, + 
matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm, + auto col_perm, auto permuted) { + permuted(row_perm[row], col_perm[col]) = orig(row, col); + }, + orig->get_size(), orig, row_permutation_indices, + column_permutation_indices, permuted); } template void row_gather(std::shared_ptr exec, - const array* row_idxs, - const matrix::Dense* orig, + const IndexType* row_idxs, const matrix::Dense* orig, matrix::Dense* row_collection) { run_kernel( @@ -422,15 +457,14 @@ void row_gather(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { gathered(row, col) = orig(rows[row], col); }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, - row_collection); + row_collection->get_size(), orig, row_idxs, row_collection); } template void advanced_row_gather(std::shared_ptr exec, const matrix::Dense* alpha, - const array* row_idxs, + const IndexType* row_idxs, const matrix::Dense* orig, const matrix::Dense* beta, matrix::Dense* row_collection) @@ -445,54 +479,191 @@ void advanced_row_gather(std::shared_ptr exec, static_cast(beta[0]) * static_cast(gathered(row, col)); }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, - alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), - row_collection); + row_collection->get_size(), alpha->get_const_values(), orig, row_idxs, + beta->get_const_values(), row_collection); } template -void column_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) +void col_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* col_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, col) = orig(row, perm[col]); }, - orig->get_size(), orig, *permutation_indices, column_permuted); + orig->get_size(), orig, permutation_indices, col_permuted); } template -void inverse_row_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* row_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(perm[row], col) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, row_permuted); + orig->get_size(), orig, permutation_indices, row_permuted); } template -void inverse_column_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) +void inv_col_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* col_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, perm[col]) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, column_permuted); + orig->get_size(), orig, permutation_indices, col_permuted); +} + + +template +void symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(i, j) = scale[i] * scale[j] * orig(perm[i], perm[j]); + }, + orig->get_size(), scale, perm, orig, 
permuted); +} + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(perm[i], perm[j]) = orig(i, j) / (scale[i] * scale[j]); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, + auto col_scale, auto col_perm, auto orig, auto permuted) { + permuted(i, j) = + row_scale[i] * col_scale[j] * orig(row_perm[i], col_perm[j]); + }, + orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, + permuted); +} + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, + auto col_scale, auto col_perm, auto orig, auto permuted) { + permuted(row_perm[i], row_perm[j]) = + orig(i, j) / (row_scale[i] * col_scale[j]); + }, + orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, + permuted); +} + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(i, j) = scale[i] * orig(perm[i], j); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(perm[i], j) = orig(i, j) / scale[i]; + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(i, j) = scale[j] * orig(i, perm[j]); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + permuted(i, perm[j]) = orig(i, j) / scale[j]; + }, + orig->get_size(), scale, perm, orig, permuted); } diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp new file mode 100644 index 00000000000..58b82c1602e --- /dev/null +++ b/common/unified/matrix/permutation_kernels.cpp @@ -0,0 +1,67 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/permutation_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace permutation { + + +template +void invert(std::shared_ptr exec, + const IndexType* permutation_indices, size_type size, + IndexType* inv_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto permutation, auto inv_permutation) { + inv_permutation[permutation[i]] = i; + }, + size, permutation_indices, inv_permutation); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); + + +} // namespace permutation +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp new file mode 100644 index 00000000000..7bebe4c4778 --- /dev/null +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -0,0 +1,72 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/scaled_permutation_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace scaled_permutation { + + +template +void invert(std::shared_ptr exec, + const IndexType* input_permutation, const ValueType* input_scale, + size_type size, IndexType* output_permutation, + ValueType* output_scale) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto input_permutation, auto input_scale, + auto output_permutation, auto output_scale) { + output_permutation[input_permutation[i]] = i; + output_scale[input_permutation[i]] = + one(input_scale[i]) / input_scale[i]; + }, + size, input_permutation, input_scale, output_permutation, output_scale); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); + + +} // namespace scaled_permutation +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index ae8035bcbf9..014a94c0369 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -51,9 +51,10 @@ target_sources(ginkgo matrix/hybrid.cpp matrix/identity.cpp matrix/permutation.cpp + matrix/row_gatherer.cpp + matrix/scaled_permutation.cpp matrix/sellp.cpp matrix/sparsity_csr.cpp - matrix/row_gatherer.cpp multigrid/pgm.cpp multigrid/fixed_coarsening.cpp preconditioner/isai.cpp diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 7f7b1b473a2..3f5d097abac 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -67,6 +67,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/fbcsr_kernels.hpp" #include "core/matrix/fft_kernels.hpp" #include "core/matrix/hybrid_kernels.hpp" +#include "core/matrix/permutation_kernels.hpp" +#include "core/matrix/scaled_permutation_kernels.hpp" #include "core/matrix/sellp_kernels.hpp" #include "core/matrix/sparsity_csr_kernels.hpp" #include "core/multigrid/pgm_kernels.hpp" @@ -372,9 +374,20 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2( GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); @@ -591,11 +604,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); -GKO_STUB_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); @@ -708,6 +726,24 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); } // namespace hybrid 
+namespace permutation { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); + + +} // namespace permutation + + +namespace scaled_permutation { + + +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); + + +} // namespace scaled_permutation + + namespace sellp { diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 9a4697c1195..e669f4d4718 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -93,9 +94,15 @@ GKO_REGISTER_OPERATION(transpose, csr::transpose); GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose); GKO_REGISTER_OPERATION(inv_symm_permute, csr::inv_symm_permute); GKO_REGISTER_OPERATION(row_permute, csr::row_permute); -GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute); -GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute); -GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation); +GKO_REGISTER_OPERATION(inv_row_permute, csr::inv_row_permute); +GKO_REGISTER_OPERATION(inv_col_permute, csr::inv_col_permute); +GKO_REGISTER_OPERATION(inv_nonsymm_permute, csr::inv_nonsymm_permute); +GKO_REGISTER_OPERATION(inv_symm_scale_permute, csr::inv_symm_scale_permute); +GKO_REGISTER_OPERATION(row_scale_permute, csr::row_scale_permute); +GKO_REGISTER_OPERATION(inv_row_scale_permute, csr::inv_row_scale_permute); +GKO_REGISTER_OPERATION(inv_col_scale_permute, csr::inv_col_scale_permute); +GKO_REGISTER_OPERATION(inv_nonsymm_scale_permute, + csr::inv_nonsymm_scale_permute); GKO_REGISTER_OPERATION(convert_ptrs_to_sizes, components::convert_ptrs_to_sizes); GKO_REGISTER_OPERATION(sort_by_column_index, csr::sort_by_column_index); @@ -520,26 +527,226 @@ std::unique_ptr Csr::conj_transpose() const } +template +std::unique_ptr> Csr::permute( + ptr_param> permutation, + permute_mode mode) const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size(); + const auto nnz = this->get_num_stored_elements(); + if ((mode & permute_mode::symmetric) == permute_mode::none) { + return this->clone(); + } + if ((mode & permute_mode::symmetric) == permute_mode::symmetric) { + GKO_ASSERT_IS_SQUARE_MATRIX(this); + } + if ((mode & permute_mode::rows) == permute_mode::rows) { + GKO_ASSERT_EQ(size[0], permutation->get_size()[0]); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + GKO_ASSERT_EQ(size[1], permutation->get_size()[0]); + } + auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy()); + auto local_permutation = make_temporary_clone(exec, permutation); + std::unique_ptr> inv_permutation; + const auto perm_idxs = local_permutation->get_const_permutation(); + const IndexType* inv_perm_idxs{}; + // to permute columns, we need to know the inverse permutation + bool needs_inverse = + (mode & permute_mode::inverse_columns) == permute_mode::columns; + if (needs_inverse) { + inv_permutation = local_permutation->invert(); + inv_perm_idxs = inv_permutation->get_const_permutation(); + } + switch (mode) { + case permute_mode::rows: + exec->run(csr::make_row_permute(perm_idxs, this, result.get())); + break; + case permute_mode::columns: + exec->run(csr::make_inv_col_permute(inv_perm_idxs, this, result.get())); + break; + case permute_mode::inverse_rows: + exec->run(csr::make_inv_row_permute(perm_idxs, this, result.get())); + break; + case permute_mode::inverse_columns: + exec->run(csr::make_inv_col_permute(perm_idxs, 
this, result.get())); + break; + case permute_mode::symmetric: + exec->run( + csr::make_inv_symm_permute(inv_perm_idxs, this, result.get())); + break; + case permute_mode::inverse_symmetric: + exec->run(csr::make_inv_symm_permute(perm_idxs, this, result.get())); + break; + default: + GKO_ASSERT(false); + } + result->make_srow(); + if ((mode & permute_mode::columns) == permute_mode::columns) { + result->sort_by_column_index(); + } + return result; +} + + +template +std::unique_ptr> Csr::permute( + ptr_param> row_permutation, + ptr_param> col_permutation, bool invert) const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size(); + const auto nnz = this->get_num_stored_elements(); + GKO_ASSERT_EQ(size[0], row_permutation->get_size()[0]); + GKO_ASSERT_EQ(size[1], col_permutation->get_size()[0]); + auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy()); + auto local_row_permutation = make_temporary_clone(exec, row_permutation); + auto local_col_permutation = make_temporary_clone(exec, col_permutation); + if (invert) { + exec->run(csr::make_inv_nonsymm_permute( + local_row_permutation->get_const_permutation(), + local_col_permutation->get_const_permutation(), this, + result.get())); + } else { + const auto inv_row_perm = local_row_permutation->invert(); + const auto inv_col_perm = local_col_permutation->invert(); + exec->run(csr::make_inv_nonsymm_permute( + inv_row_perm->get_const_permutation(), + inv_col_perm->get_const_permutation(), this, result.get())); + } + result->make_srow(); + result->sort_by_column_index(); + return result; +} + + +template +std::unique_ptr> +Csr::scale_permute( + ptr_param> permutation, + permute_mode mode) const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size(); + const auto nnz = this->get_num_stored_elements(); + if ((mode & permute_mode::symmetric) == permute_mode::none) { + return this->clone(); + } + if ((mode & permute_mode::symmetric) == permute_mode::symmetric) { + GKO_ASSERT_IS_SQUARE_MATRIX(this); + } + if ((mode & permute_mode::rows) == permute_mode::rows) { + GKO_ASSERT_EQ(size[0], permutation->get_size()[0]); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + GKO_ASSERT_EQ(size[1], permutation->get_size()[0]); + } + auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy()); + auto local_permutation = make_temporary_clone(exec, permutation); + std::unique_ptr> + inv_permutation; + const auto perm_idxs = local_permutation->get_const_permutation(); + const auto scale_factors = local_permutation->get_const_scale(); + const ValueType* inv_scale_factors{}; + const IndexType* inv_perm_idxs{}; + // to permute columns, we need to know the inverse permutation + bool needs_inverse = + (mode & permute_mode::inverse_columns) == permute_mode::columns; + if (needs_inverse) { + inv_permutation = local_permutation->invert(); + inv_scale_factors = inv_permutation->get_const_scale(); + inv_perm_idxs = inv_permutation->get_const_permutation(); + } + switch (mode) { + case permute_mode::rows: + exec->run(csr::make_row_scale_permute(scale_factors, perm_idxs, this, + result.get())); + break; + case permute_mode::columns: + exec->run(csr::make_inv_col_scale_permute( + inv_scale_factors, inv_perm_idxs, this, result.get())); + break; + case permute_mode::inverse_rows: + exec->run(csr::make_inv_row_scale_permute(scale_factors, perm_idxs, + this, result.get())); + break; + case permute_mode::inverse_columns: + exec->run(csr::make_inv_col_scale_permute(scale_factors, 
perm_idxs, + this, result.get())); + break; + case permute_mode::symmetric: + exec->run(csr::make_inv_symm_scale_permute( + inv_scale_factors, inv_perm_idxs, this, result.get())); + break; + case permute_mode::inverse_symmetric: + exec->run(csr::make_inv_symm_scale_permute(scale_factors, perm_idxs, + this, result.get())); + break; + default: + GKO_ASSERT(false); + } + result->make_srow(); + if ((mode & permute_mode::columns) == permute_mode::columns) { + result->sort_by_column_index(); + } + return result; +} + + +template +std::unique_ptr> +Csr::scale_permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + bool invert) const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size(); + const auto nnz = this->get_num_stored_elements(); + GKO_ASSERT_EQ(size[0], row_permutation->get_size()[0]); + GKO_ASSERT_EQ(size[1], col_permutation->get_size()[0]); + auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy()); + auto local_row_permutation = make_temporary_clone(exec, row_permutation); + auto local_col_permutation = make_temporary_clone(exec, col_permutation); + if (invert) { + exec->run(csr::make_inv_nonsymm_scale_permute( + local_row_permutation->get_const_scale(), + local_row_permutation->get_const_permutation(), + local_col_permutation->get_const_scale(), + local_col_permutation->get_const_permutation(), this, + result.get())); + } else { + const auto inv_row_perm = local_row_permutation->invert(); + const auto inv_col_perm = local_col_permutation->invert(); + exec->run(csr::make_inv_nonsymm_scale_permute( + inv_row_perm->get_const_scale(), + inv_row_perm->get_const_permutation(), + inv_col_perm->get_const_scale(), + inv_col_perm->get_const_permutation(), this, result.get())); + } + result->make_srow(); + result->sort_by_column_index(); + return result; +} + + +template +std::unique_ptr> create_permutation_view( + const array& indices) +{ + return Permutation::create_const(indices.get_executor(), + indices.get_num_elems(), + indices.as_const_view()); +} + + template std::unique_ptr Csr::permute( const array* permutation_indices) const { - GKO_ASSERT_IS_SQUARE_MATRIX(this); - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - auto exec = this->get_executor(); - auto permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - array inv_permutation(exec, this->get_size()[1]); - - exec->run(csr::make_invert_permutation( - this->get_size()[1], - make_temporary_clone(exec, permutation_indices)->get_const_data(), - inv_permutation.get_data())); - exec->run(csr::make_inv_symm_permute(inv_permutation.get_const_data(), this, - permute_cpy.get())); - permute_cpy->make_srow(); - return std::move(permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::symmetric); } @@ -547,18 +754,8 @@ template std::unique_ptr Csr::inverse_permute( const array* permutation_indices) const { - GKO_ASSERT_IS_SQUARE_MATRIX(this); - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - auto exec = this->get_executor(); - auto permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - - exec->run(csr::make_inv_symm_permute( - make_temporary_clone(exec, permutation_indices)->get_const_data(), this, - permute_cpy.get())); - permute_cpy->make_srow(); - return std::move(permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::inverse_symmetric); } @@ 
-566,17 +763,8 @@ template std::unique_ptr Csr::row_permute( const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - auto exec = this->get_executor(); - auto permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - - exec->run(csr::make_row_permute( - make_temporary_clone(exec, permutation_indices)->get_const_data(), this, - permute_cpy.get())); - permute_cpy->make_srow(); - return std::move(permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::rows); } @@ -584,22 +772,8 @@ template std::unique_ptr Csr::column_permute( const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); - auto exec = this->get_executor(); - auto permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - array inv_permutation(exec, this->get_size()[1]); - - exec->run(csr::make_invert_permutation( - this->get_size()[1], - make_temporary_clone(exec, permutation_indices)->get_const_data(), - inv_permutation.get_data())); - exec->run(csr::make_inverse_column_permute(inv_permutation.get_const_data(), - this, permute_cpy.get())); - permute_cpy->make_srow(); - permute_cpy->sort_by_column_index(); - return std::move(permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::columns); } @@ -607,17 +781,8 @@ template std::unique_ptr Csr::inverse_row_permute( const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - auto exec = this->get_executor(); - auto inverse_permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - - exec->run(csr::make_inverse_row_permute( - make_temporary_clone(exec, permutation_indices)->get_const_data(), this, - inverse_permute_cpy.get())); - inverse_permute_cpy->make_srow(); - return std::move(inverse_permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::inverse_rows); } @@ -625,18 +790,8 @@ template std::unique_ptr Csr::inverse_column_permute( const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); - auto exec = this->get_executor(); - auto inverse_permute_cpy = - Csr::create(exec, this->get_size(), this->get_num_stored_elements(), - this->get_strategy()); - - exec->run(csr::make_inverse_column_permute( - make_temporary_clone(exec, permutation_indices)->get_const_data(), this, - inverse_permute_cpy.get())); - inverse_permute_cpy->make_srow(); - inverse_permute_cpy->sort_by_column_index(); - return std::move(inverse_permute_cpy); + return permute(create_permutation_view(*permutation_indices), + permute_mode::inverse_columns); } diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 42a92ca1b84..26d80f93b8b 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -146,23 +146,61 @@ namespace kernels { const matrix::Csr* orig, \ matrix::Csr* row_permuted) -#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType) \ - void inverse_row_permute(std::shared_ptr exec, \ - const IndexType* permutation_indices, \ +#define GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_row_permute(std::shared_ptr exec, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* row_permuted) + +#define 
GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_col_permute(std::shared_ptr exec, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* col_permuted) + +#define GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_nonsymm_permute(std::shared_ptr exec, \ + const IndexType* row_permutation_indices, \ + const IndexType* column_permutation_indices, \ const matrix::Csr* orig, \ - matrix::Csr* row_permuted) - -#define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \ - void inverse_column_permute( \ - std::shared_ptr exec, \ - const IndexType* permutation_indices, \ - const matrix::Csr* orig, \ - matrix::Csr* column_permuted) - -#define GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType) \ - void invert_permutation( \ - std::shared_ptr exec, size_type size, \ - const IndexType* permutation_indices, IndexType* inv_permutation) + matrix::Csr* permuted) + +#define GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_symm_scale_permute(std::shared_ptr exec, \ + const ValueType* scale, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* permuted) + +#define GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \ + void row_scale_permute(std::shared_ptr exec, \ + const ValueType* scale, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* row_permuted) + +#define GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_row_scale_permute( \ + std::shared_ptr exec, const ValueType* scale, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* row_permuted) + +#define GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_col_scale_permute( \ + std::shared_ptr exec, const ValueType* scale, \ + const IndexType* permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* col_permuted) + +#define GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \ + void inv_nonsymm_scale_permute( \ + std::shared_ptr exec, \ + const ValueType* row_scale, const IndexType* row_permutation_indices, \ + const ValueType* column_scale, \ + const IndexType* column_permutation_indices, \ + const matrix::Csr* orig, \ + matrix::Csr* col_permuted) #define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType) \ void calculate_nonzeros_per_row_in_span( \ @@ -251,74 +289,84 @@ namespace kernels { IndexType sample_size, IndexType* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ - OutputValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(MatrixValueType, InputValueType, \ - OutputValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - 
GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType); \ - template \ - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL(ValueType, \ - IndexType); \ - template \ - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL(ValueType, \ - IndexType); \ - template \ - GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ - template \ - GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \ + template 
\ + GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType); \ + template \ GKO_DECLARE_CSR_BENCHMARK_LOOKUP_KERNEL(IndexType) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 9f7dff96aab..b263357dc9b 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -50,6 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -58,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/matrix/hybrid_kernels.hpp" +#include "ginkgo/core/base/temporary_clone.hpp" namespace gko { @@ -96,11 +99,22 @@ GKO_REGISTER_OPERATION(transpose, dense::transpose); GKO_REGISTER_OPERATION(conj_transpose, dense::conj_transpose); GKO_REGISTER_OPERATION(symm_permute, dense::symm_permute); GKO_REGISTER_OPERATION(inv_symm_permute, dense::inv_symm_permute); +GKO_REGISTER_OPERATION(nonsymm_permute, dense::nonsymm_permute); +GKO_REGISTER_OPERATION(inv_nonsymm_permute, dense::inv_nonsymm_permute); GKO_REGISTER_OPERATION(row_gather, dense::row_gather); GKO_REGISTER_OPERATION(advanced_row_gather, dense::advanced_row_gather); -GKO_REGISTER_OPERATION(column_permute, dense::column_permute); -GKO_REGISTER_OPERATION(inverse_row_permute, dense::inverse_row_permute); -GKO_REGISTER_OPERATION(inverse_column_permute, dense::inverse_column_permute); +GKO_REGISTER_OPERATION(col_permute, dense::col_permute); +GKO_REGISTER_OPERATION(inverse_row_permute, dense::inv_row_permute); +GKO_REGISTER_OPERATION(inverse_col_permute, dense::inv_col_permute); +GKO_REGISTER_OPERATION(symm_scale_permute, dense::symm_scale_permute); +GKO_REGISTER_OPERATION(inv_symm_scale_permute, dense::inv_symm_scale_permute); +GKO_REGISTER_OPERATION(nonsymm_scale_permute, dense::nonsymm_scale_permute); +GKO_REGISTER_OPERATION(inv_nonsymm_scale_permute, + dense::inv_nonsymm_scale_permute); +GKO_REGISTER_OPERATION(row_scale_permute, dense::row_scale_permute); +GKO_REGISTER_OPERATION(col_scale_permute, dense::col_scale_permute); +GKO_REGISTER_OPERATION(inv_row_scale_permute, dense::inv_row_scale_permute); +GKO_REGISTER_OPERATION(inv_col_scale_permute, dense::inv_col_scale_permute); GKO_REGISTER_OPERATION(fill_in_matrix_data, dense::fill_in_matrix_data); GKO_REGISTER_OPERATION(convert_to_coo, dense::convert_to_coo); GKO_REGISTER_OPERATION(convert_to_csr, dense::convert_to_csr); @@ -1113,48 +1127,174 @@ void Dense::conj_transpose(ptr_param> output) const template template -void Dense::permute_impl(const array* permutation_indices, - Dense* output) const +void Dense::permute_impl(const Permutation* permutation, + permute_mode mode, Dense* output) 
const { - GKO_ASSERT_IS_SQUARE_MATRIX(this); + const auto exec = this->get_executor(); + const auto size = this->get_size(); GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - auto exec = this->get_executor(); - - exec->run(dense::make_symm_permute( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); + if ((mode & permute_mode::symmetric) == permute_mode::none) { + output->copy_from(this); + return; + } + if ((mode & permute_mode::symmetric) == permute_mode::symmetric) { + GKO_ASSERT_IS_SQUARE_MATRIX(this); + } + if ((mode & permute_mode::rows) == permute_mode::rows) { + GKO_ASSERT_EQ(size[0], permutation->get_size()[0]); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + GKO_ASSERT_EQ(size[1], permutation->get_size()[0]); + } + auto local_output = make_temporary_output_clone(exec, output); + auto local_perm = make_temporary_clone(exec, permutation); + switch (mode) { + case permute_mode::rows: + exec->run(dense::make_row_gather(local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::columns: + exec->run(dense::make_col_permute(local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::symmetric: + exec->run(dense::make_symm_permute(local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::inverse_rows: + exec->run(dense::make_inverse_row_permute( + local_perm->get_const_permutation(), this, local_output.get())); + break; + case permute_mode::inverse_columns: + exec->run(dense::make_inverse_col_permute( + local_perm->get_const_permutation(), this, local_output.get())); + break; + case permute_mode::inverse_symmetric: + exec->run(dense::make_inv_symm_permute( + local_perm->get_const_permutation(), this, local_output.get())); + break; + default: + GKO_ASSERT(false); // cannot happen + } } template template -void Dense::inverse_permute_impl( - const array* permutation_indices, Dense* output) const +void Dense::permute_impl( + const Permutation* row_permutation, + const Permutation* col_permutation, bool invert, + Dense* output) const { - GKO_ASSERT_IS_SQUARE_MATRIX(this); - GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); auto exec = this->get_executor(); - - exec->run(dense::make_inv_symm_permute( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); + auto size = this->get_size(); + GKO_ASSERT_EQUAL_DIMENSIONS(this, output); + GKO_ASSERT_EQ(size[0], row_permutation->get_size()[0]); + GKO_ASSERT_EQ(size[1], col_permutation->get_size()[0]); + auto local_output = make_temporary_output_clone(exec, output); + auto local_row_perm = make_temporary_clone(exec, row_permutation); + auto local_col_perm = make_temporary_clone(exec, col_permutation); + if (invert) { + exec->run(dense::make_inv_nonsymm_permute( + local_row_perm->get_const_permutation(), + local_col_perm->get_const_permutation(), this, local_output.get())); + } else { + exec->run(dense::make_nonsymm_permute( + local_row_perm->get_const_permutation(), + local_col_perm->get_const_permutation(), this, local_output.get())); + } } template template -void Dense::row_permute_impl( - const array* permutation_indices, Dense* output) const +void Dense::scale_permute_impl( + const ScaledPermutation* permutation, + permute_mode mode, Dense* output) const 
{ - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); + const auto exec = this->get_executor(); + const auto size = this->get_size(); GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - auto exec = this->get_executor(); + if ((mode & permute_mode::symmetric) == permute_mode::none) { + output->copy_from(this); + return; + } + if ((mode & permute_mode::symmetric) == permute_mode::symmetric) { + GKO_ASSERT_IS_SQUARE_MATRIX(this); + } + if ((mode & permute_mode::rows) == permute_mode::rows) { + GKO_ASSERT_EQ(size[0], permutation->get_size()[0]); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + GKO_ASSERT_EQ(size[1], permutation->get_size()[0]); + } + auto local_output = make_temporary_output_clone(exec, output); + auto local_perm = make_temporary_clone(exec, permutation); + switch (mode) { + case permute_mode::rows: + exec->run(dense::make_row_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::columns: + exec->run(dense::make_col_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::symmetric: + exec->run(dense::make_symm_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::inverse_rows: + exec->run(dense::make_inv_row_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::inverse_columns: + exec->run(dense::make_inv_col_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + case permute_mode::inverse_symmetric: + exec->run(dense::make_inv_symm_scale_permute( + local_perm->get_const_scale(), local_perm->get_const_permutation(), + this, local_output.get())); + break; + default: + GKO_ASSERT(false); // cannot happen + } +} - exec->run(dense::make_row_gather( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); + +template +template +void Dense::scale_permute_impl( + const ScaledPermutation* row_permutation, + const ScaledPermutation* col_permutation, bool invert, + Dense* output) const +{ + auto exec = this->get_executor(); + auto size = this->get_size(); + GKO_ASSERT_EQUAL_DIMENSIONS(this, output); + GKO_ASSERT_EQ(size[0], row_permutation->get_size()[0]); + GKO_ASSERT_EQ(size[1], col_permutation->get_size()[0]); + auto local_output = make_temporary_output_clone(exec, output); + auto local_row_perm = make_temporary_clone(exec, row_permutation); + auto local_col_perm = make_temporary_clone(exec, col_permutation); + if (invert) { + exec->run(dense::make_inv_nonsymm_scale_permute( + local_row_perm->get_const_scale(), + local_row_perm->get_const_permutation(), + local_col_perm->get_const_scale(), + local_col_perm->get_const_permutation(), this, local_output.get())); + } else { + exec->run(dense::make_nonsymm_scale_permute( + local_row_perm->get_const_scale(), + local_row_perm->get_const_permutation(), + local_col_perm->get_const_scale(), + local_col_perm->get_const_permutation(), this, local_output.get())); + } } @@ -1168,7 +1308,7 @@ void Dense::row_gather_impl(const array* row_idxs, GKO_ASSERT_EQUAL_DIMENSIONS(expected_dim, row_collection); exec->run(dense::make_row_gather( - make_temporary_clone(exec, row_idxs).get(), this, + make_temporary_clone(exec, 
row_idxs)->get_const_data(), this, make_temporary_output_clone(exec, row_collection).get())); } @@ -1185,82 +1325,130 @@ void Dense::row_gather_impl(const Dense* alpha, exec->run(dense::make_advanced_row_gather( make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, row_idxs).get(), this, + make_temporary_clone(exec, row_idxs)->get_const_data(), this, make_temporary_clone(exec, beta).get(), make_temporary_clone(exec, row_collection).get())); } template -template -void Dense::column_permute_impl( - const array* permutation_indices, Dense* output) const +std::unique_ptr Dense::permute( + const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); - GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - auto exec = this->get_executor(); - - exec->run(dense::make_column_permute( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); + auto result = Dense::create(this->get_executor(), this->get_size()); + this->permute(permutation_indices, result); + return result; } template -template -void Dense::inverse_row_permute_impl( - const array* permutation_indices, Dense* output) const +std::unique_ptr Dense::permute( + const array* permutation_indices) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]); - GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - auto exec = this->get_executor(); - - exec->run(dense::make_inverse_row_permute( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); + auto result = Dense::create(this->get_executor(), this->get_size()); + this->permute(permutation_indices, result); + return result; } template -template -void Dense::inverse_column_permute_impl( - const array* permutation_indices, Dense* output) const +std::unique_ptr> Dense::permute( + ptr_param> permutation, permute_mode mode) const { - GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]); - GKO_ASSERT_EQUAL_DIMENSIONS(this, output); - auto exec = this->get_executor(); + auto result = Dense::create(this->get_executor(), this->get_size()); + this->permute(permutation, result, mode); + return result; +} + - exec->run(dense::make_inverse_column_permute( - make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_output_clone(exec, output).get())); +template +std::unique_ptr> Dense::permute( + ptr_param> permutation, permute_mode mode) const +{ + auto result = Dense::create(this->get_executor(), this->get_size()); + this->permute(permutation, result, mode); + return result; } template -std::unique_ptr Dense::permute( - const array* permutation_indices) const +std::unique_ptr> Dense::permute( + ptr_param> row_permutation, + ptr_param> col_permutation, bool invert) const { auto result = Dense::create(this->get_executor(), this->get_size()); - this->permute(permutation_indices, result); + this->permute(row_permutation, col_permutation, result, invert); return result; } template -std::unique_ptr Dense::permute( - const array* permutation_indices) const +std::unique_ptr> Dense::permute( + ptr_param> row_permutation, + ptr_param> col_permutation, bool invert) const { auto result = Dense::create(this->get_executor(), this->get_size()); - this->permute(permutation_indices, result); + this->permute(row_permutation, col_permutation, result, invert); return result; } +template +void Dense::permute(ptr_param> permutation, + ptr_param> result, + permute_mode mode) const +{ + 
this->permute_impl(permutation.get(), mode, result.get()); +} + + +template +void Dense::permute(ptr_param> permutation, + ptr_param> result, + permute_mode mode) const +{ + this->permute_impl(permutation.get(), mode, result.get()); +} + + +template +void Dense::permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + ptr_param> result, bool invert) const +{ + this->permute_impl(row_permutation.get(), col_permutation.get(), invert, + result.get()); +} + + +template +void Dense::permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + ptr_param> result, bool invert) const +{ + this->permute_impl(row_permutation.get(), col_permutation.get(), invert, + result.get()); +} + + +template +std::unique_ptr> create_permutation_view( + const array& indices) +{ + return Permutation::create_const(indices.get_executor(), + indices.get_num_elems(), + indices.as_const_view()); +} + + template void Dense::permute(const array* permutation_indices, ptr_param> output) const { - this->permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::symmetric, output.get()); } @@ -1268,7 +1456,8 @@ template void Dense::permute(const array* permutation_indices, ptr_param> output) const { - this->permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::symmetric, output.get()); } @@ -1296,7 +1485,8 @@ template void Dense::inverse_permute(const array* permutation_indices, ptr_param> output) const { - this->inverse_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_symmetric, output.get()); } @@ -1304,7 +1494,8 @@ template void Dense::inverse_permute(const array* permutation_indices, ptr_param> output) const { - this->inverse_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_symmetric, output.get()); } @@ -1332,7 +1523,8 @@ template void Dense::row_permute(const array* permutation_indices, ptr_param> output) const { - this->row_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::rows, output.get()); } @@ -1340,7 +1532,8 @@ template void Dense::row_permute(const array* permutation_indices, ptr_param> output) const { - this->row_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::rows, output.get()); } @@ -1467,7 +1660,8 @@ template void Dense::column_permute(const array* permutation_indices, ptr_param> output) const { - this->column_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::columns, output.get()); } @@ -1475,7 +1669,8 @@ template void Dense::column_permute(const array* permutation_indices, ptr_param> output) const { - this->column_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::columns, output.get()); } @@ -1504,7 +1699,8 @@ void Dense::inverse_row_permute( const array* permutation_indices, ptr_param> output) const { - this->inverse_row_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_rows, 
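// Illustrative sketch of the Permutation-based Dense::permute overloads
// defined above; a minimal usage example, not part of the diff. It assumes
// gko::matrix::permute_mode and the Permutation<int32> factory shown in the
// test changes of this patch series, plus the umbrella header ginkgo/ginkgo.h.
#include <ginkgo/ginkgo.h>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    auto mtx = gko::initialize<gko::matrix::Dense<double>>(
        {{1.0, 2.0}, {3.0, 4.0}}, exec);
    auto perm = gko::matrix::Permutation<gko::int32>::create(
        exec, gko::dim<2>{2, 2}, gko::array<gko::int32>{exec, {1, 0}});
    // Single permutation plus a mode: rows, columns, symmetric or one of the
    // inverse_* values handled by the switch in permute_impl above.
    auto rows_swapped = mtx->permute(perm, gko::matrix::permute_mode::rows);
    // Separate row and column permutations; the flag requests the inverse.
    auto both = mtx->permute(perm, perm, /* invert = */ false);
}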
output.get()); } @@ -1513,7 +1709,8 @@ void Dense::inverse_row_permute( const array* permutation_indices, ptr_param> output) const { - this->inverse_row_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_rows, output.get()); } @@ -1542,7 +1739,8 @@ void Dense::inverse_column_permute( const array* permutation_indices, ptr_param> output) const { - this->inverse_column_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_columns, output.get()); } @@ -1551,7 +1749,94 @@ void Dense::inverse_column_permute( const array* permutation_indices, ptr_param> output) const { - this->inverse_column_permute_impl(permutation_indices, output.get()); + this->permute_impl(create_permutation_view(*permutation_indices).get(), + permute_mode::inverse_columns, output.get()); +} + + +template +std::unique_ptr> Dense::scale_permute( + ptr_param> permutation, + permute_mode mode) const +{ + auto result = Dense::create(this->get_executor(), this->get_size()); + this->scale_permute(permutation, result, mode); + return result; +} + + +template +std::unique_ptr> Dense::scale_permute( + ptr_param> permutation, + permute_mode mode) const +{ + auto result = Dense::create(this->get_executor(), this->get_size()); + this->scale_permute(permutation, result, mode); + return result; +} + + +template +void Dense::scale_permute( + ptr_param> permutation, + ptr_param output, permute_mode mode) const +{ + this->scale_permute_impl(permutation.get(), mode, output.get()); +} + + +template +void Dense::scale_permute( + ptr_param> permutation, + ptr_param output, permute_mode mode) const +{ + this->scale_permute_impl(permutation.get(), mode, output.get()); +} + + +template +std::unique_ptr> Dense::scale_permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + bool invert) const +{ + auto result = Dense::create(this->get_executor(), this->get_size()); + this->scale_permute(row_permutation, col_permutation, result, invert); + return result; +} + + +template +std::unique_ptr> Dense::scale_permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + bool invert) const +{ + auto result = Dense::create(this->get_executor(), this->get_size()); + this->scale_permute(row_permutation, col_permutation, result, invert); + return result; +} + + +template +void Dense::scale_permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + ptr_param output, bool invert) const +{ + this->scale_permute_impl(row_permutation.get(), col_permutation.get(), + invert, output.get()); +} + + +template +void Dense::scale_permute( + ptr_param> row_permutation, + ptr_param> col_permutation, + ptr_param output, bool invert) const +{ + this->scale_permute_impl(row_permutation.get(), col_permutation.get(), + invert, output.get()); } diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index a352aa8d7c1..f315a393712 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -237,50 +237,112 @@ namespace kernels { const matrix::Dense<_type>* orig, \ matrix::Dense<_type>* trans) +#define GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void symm_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void 
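// Illustrative sketch of the new Dense::scale_permute overloads defined
// above; a minimal usage example, not part of the diff. The
// ScaledPermutation<double, int32>::create(exec, scale, perm) factory is
// assumed from its use inside ScaledPermutation::invert() later in this patch.
#include <ginkgo/ginkgo.h>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    auto vec = gko::initialize<gko::matrix::Dense<double>>(
        {1.0, 2.0, 3.0}, exec);
    auto sp = gko::matrix::ScaledPermutation<double, gko::int32>::create(
        exec, gko::array<double>{exec, {2.0, 0.5, 1.0}},
        gko::array<gko::int32>{exec, {1, 0, 2}});
    // Applies the scaling and the row permutation in a single pass; the other
    // permute_mode values select column or symmetric scaled permutation,
    // given matching dimensions.
    auto scaled = vec->scale_permute(sp, gko::matrix::permute_mode::rows);
}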
row_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void col_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_symm_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_row_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_col_scale_permute( \ + std::shared_ptr exec, const _vtype* scale, \ + const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void nonsymm_scale_permute( \ + std::shared_ptr exec, const _vtype* row_scale, \ + const _itype* row_permutation_indices, const _vtype* column_scale, \ + const _itype* column_permutation_indices, \ + const matrix::Dense<_vtype>* orig, matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_nonsymm_scale_permute( \ + std::shared_ptr exec, const _vtype* row_scale, \ + const _itype* row_permutation_indices, const _vtype* column_scale, \ + const _itype* column_permutation_indices, \ + const matrix::Dense<_vtype>* orig, matrix::Dense<_vtype>* permuted) + #define GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(_vtype, _itype) \ void symm_permute(std::shared_ptr exec, \ - const array<_itype>* permutation_indices, \ + const _itype* permutation_indices, \ const matrix::Dense<_vtype>* orig, \ matrix::Dense<_vtype>* permuted) #define GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(_vtype, _itype) \ void inv_symm_permute(std::shared_ptr exec, \ - const array<_itype>* permutation_indices, \ + const _itype* permutation_indices, \ const matrix::Dense<_vtype>* orig, \ matrix::Dense<_vtype>* permuted) +#define GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL(_vtype, _itype) \ + void nonsymm_permute(std::shared_ptr exec, \ + const _itype* row_permutation_indices, \ + const _itype* column_permutation_indices, \ + const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + +#define GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_nonsymm_permute(std::shared_ptr exec, \ + const _itype* row_permutation_indices, \ + const _itype* column_permutation_indices, \ + const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* permuted) + #define GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(_vtype, _otype, _itype) \ void row_gather(std::shared_ptr exec, \ - const array<_itype>* gather_indices, \ + const _itype* gather_indices, \ const matrix::Dense<_vtype>* orig, \ matrix::Dense<_otype>* row_collection) - -#define GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(_vtype, _otype, _itype) \ - void advanced_row_gather(std::shared_ptr exec, \ - const matrix::Dense<_vtype>* alpha, \ - const array<_itype>* gather_indices, \ - const matrix::Dense<_vtype>* orig, \ - const 
matrix::Dense<_vtype>* beta, \ - matrix::Dense<_otype>* row_collection) - -#define GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL(_vtype, _itype) \ - void column_permute(std::shared_ptr exec, \ - const array<_itype>* permutation_indices, \ - const matrix::Dense<_vtype>* orig, \ - matrix::Dense<_vtype>* column_permuted) - -#define GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(_vtype, _itype) \ - void inverse_row_permute(std::shared_ptr exec, \ - const array<_itype>* permutation_indices, \ - const matrix::Dense<_vtype>* orig, \ - matrix::Dense<_vtype>* row_permuted) - -#define GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL(_vtype, _itype) \ - void inverse_column_permute(std::shared_ptr exec, \ - const array<_itype>* permutation_indices, \ - const matrix::Dense<_vtype>* orig, \ - matrix::Dense<_vtype>* column_permuted) +#define GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(_vtype, _otype, _itype) \ + void advanced_row_gather( \ + std::shared_ptr exec, \ + const matrix::Dense<_vtype>* alpha, const _itype* gather_indices, \ + const matrix::Dense<_vtype>* orig, const matrix::Dense<_vtype>* beta, \ + matrix::Dense<_otype>* row_collection) + +#define GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL(_vtype, _itype) \ + void col_permute(std::shared_ptr exec, \ + const _itype* permutation_indices, \ + const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* col_permuted) + +#define GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_row_permute(std::shared_ptr exec, \ + const _itype* permutation_indices, \ + const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* row_permuted) + +#define GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL(_vtype, _itype) \ + void inv_col_permute(std::shared_ptr exec, \ + const _itype* permutation_indices, \ + const matrix::Dense<_vtype>* orig, \ + matrix::Dense<_vtype>* col_permuted) #define GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(_vtype) \ void extract_diagonal(std::shared_ptr exec, \ @@ -319,104 +381,124 @@ namespace kernels { matrix::Dense<_vtype>* mtx) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType); \ - template \ - GKO_DECLARE_DENSE_FILL_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_SCALE_KERNEL(ValueType, ScalarType); \ - template \ - GKO_DECLARE_DENSE_INV_SCALE_KERNEL(ValueType, ScalarType); \ - template \ - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL(ValueType, ScalarType); \ - template \ - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL(ValueType, ScalarType); \ - template \ - GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType); \ - 
template \ - GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(ValueType, \ - IndexType); \ - template \ - GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(ValueType, OutputType, IndexType); \ - template \ - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(ValueType, OutputType, \ - IndexType); \ - template \ - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(ValueType); \ - template \ - GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL(ValueType); \ - template \ - GKO_DECLARE_MAKE_COMPLEX_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GET_REAL_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GET_IMAG_KERNEL(ValueType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType); \ + template \ + GKO_DECLARE_DENSE_FILL_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_SCALE_KERNEL(ValueType, ScalarType); \ + template \ + GKO_DECLARE_DENSE_INV_SCALE_KERNEL(ValueType, ScalarType); \ + template \ + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL(ValueType, ScalarType); \ + template \ + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL(ValueType, ScalarType); \ + template \ + GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType); \ + template \ + 
GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(ValueType, \ + IndexType); \ + template \ + GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(ValueType, OutputType, IndexType); \ + template \ + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(ValueType, OutputType, \ + IndexType); \ + template \ + GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(ValueType); \ + template \ + GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_MAKE_COMPLEX_KERNEL(ValueType); \ + template \ + GKO_DECLARE_GET_REAL_KERNEL(ValueType); \ + template \ + GKO_DECLARE_GET_IMAG_KERNEL(ValueType); \ + template \ GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType, ScalarType) diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index a641834f12c..cc58ced53d2 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -31,10 +31,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ #include +#include "core/matrix/permutation_kernels.hpp" +#include "ginkgo/core/base/executor.hpp" namespace gko { namespace matrix { +namespace permutation { + + +GKO_REGISTER_OPERATION(invert, permutation::invert); + + +} + + +template +std::unique_ptr> Permutation::invert() const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size()[0]; + array inv_permutation{exec, size}; + exec->run(permutation::make_invert(this->get_const_permutation(), size, + inv_permutation.get_data())); + return Permutation::create(exec, dim<2>{size, size}, + std::move(inv_permutation)); +} + + +template +void Permutation::write( + gko::matrix_data& data) const +{ + const auto host_this = + make_temporary_clone(this->get_executor()->get_master(), this); + data.size = this->get_size(); + data.nonzeros.clear(); + data.nonzeros.reserve(data.size[0]); + for (IndexType row = 0; row < this->get_size()[0]; row++) { + data.nonzeros.emplace_back(row, host_this->get_const_permutation()[row], + 1.0); + } +} #define GKO_DECLARE_PERMUTATION_MATRIX(_type) class Permutation<_type> diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp new file mode 100644 index 00000000000..a77e0c2f618 --- /dev/null +++ b/core/matrix/permutation_kernels.hpp @@ -0,0 +1,82 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
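// Illustrative sketch of the new Permutation::invert() defined above; a
// minimal usage example, not part of the diff. The invert kernel is assumed
// to produce the usual inverse permutation, i.e. inv[perm[i]] = i; the
// write() overload added above emits one entry (row, perm[row], 1.0) per row.
#include <ginkgo/ginkgo.h>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    auto perm = gko::matrix::Permutation<gko::int32>::create(
        exec, gko::dim<2>{3, 3}, gko::array<gko::int32>{exec, {2, 0, 1}});
    auto inv = perm->invert();
    // Under the assumption above, inv now holds the indices {1, 2, 0}.
}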
+*************************************************************/ + +#ifndef GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_ +#define GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_ + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" +#include "core/matrix/csr_lookup.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType) \ + void invert(std::shared_ptr exec, \ + const IndexType* permutation_indices, size_type size, \ + IndexType* inv_permutation) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(permutation, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_ diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp new file mode 100644 index 00000000000..d1ce00b521a --- /dev/null +++ b/core/matrix/scaled_permutation.cpp @@ -0,0 +1,142 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include "core/matrix/scaled_permutation_kernels.hpp" +#include "ginkgo/core/base/executor.hpp" +#include "ginkgo/core/base/precision_dispatch.hpp" + + +namespace gko { +namespace matrix { +namespace scaled_permutation { +namespace { + + +GKO_REGISTER_OPERATION(invert, scaled_permutation::invert); + + +} // namespace +} // namespace scaled_permutation + + +template +ScaledPermutation::ScaledPermutation( + std::shared_ptr exec, size_type size) + : ScaledPermutation{exec, array{exec, size}, + array{exec, size}} +{} + + +template +ScaledPermutation::ScaledPermutation( + std::shared_ptr exec, array scaling_factors, + array permutation_indices) + : EnableLinOp(exec, + dim<2>{scaling_factors.get_num_elems(), + scaling_factors.get_num_elems()}), + scale_{exec, std::move(scaling_factors)}, + permutation_{exec, std::move(permutation_indices)} +{ + GKO_ASSERT_EQ(scale_.get_num_elems(), permutation_.get_num_elems()); +} + + +template +std::unique_ptr> +ScaledPermutation::invert() const +{ + const auto exec = this->get_executor(); + const auto size = this->get_size()[0]; + array inv_permutation{exec, size}; + array inv_scale{exec, size}; + exec->run(scaled_permutation::make_invert( + this->get_const_permutation(), this->get_const_scale(), size, + inv_permutation.get_data(), inv_scale.get_data())); + return ScaledPermutation::create(exec, std::move(inv_scale), + std::move(inv_permutation)); +} + + +template +void ScaledPermutation::apply_impl(const LinOp* b, + LinOp* x) const +{ + precision_dispatch_real_complex( + [this](auto dense_b, auto dense_x) { + dense_b->scale_permute(this, dense_x, permute_mode::rows); + }, + b, x); +} + + +template +void ScaledPermutation::apply_impl(const LinOp* alpha, + const LinOp* b, + const LinOp* beta, + LinOp* x) const +{ + precision_dispatch_real_complex( + [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { + auto x_clone = dense_x->clone(); + dense_b->scale_permute(this, x_clone, permute_mode::rows); + dense_x->scale(dense_beta); + dense_x->add_scaled(dense_alpha, x_clone); + }, + alpha, b, beta, x); +} + + +template +void ScaledPermutation::write( + gko::matrix_data& data) const +{ + const auto host_this = + make_temporary_clone(this->get_executor()->get_master(), this); + data.size = this->get_size(); + data.nonzeros.clear(); + data.nonzeros.reserve(data.size[0]); + for (IndexType row = 0; row < this->get_size()[0]; row++) { + data.nonzeros.emplace_back(row, host_this->get_const_permutation()[row], + host_this->get_const_scale()[row]); + } +} + + +#define GKO_DECLARE_SCALED_PERMUTATION_MATRIX(ValueType, IndexType) \ + class ScaledPermutation +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_MATRIX); + + +} // namespace matrix +} // namespace gko diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp new file mode 100644 index 00000000000..905321ea885 --- /dev/null +++ b/core/matrix/scaled_permutation_kernels.hpp @@ -0,0 +1,68 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
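// Illustrative sketch of the ScaledPermutation class implemented above; a
// minimal usage example, not part of the diff. Its apply_impl forwards to
// Dense::scale_permute with permute_mode::rows, so applying it to a Dense
// vector scale-permutes that vector's rows; invert() is assumed to return the
// inverse permutation together with correspondingly inverted scaling factors.
#include <ginkgo/ginkgo.h>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    auto sp = gko::matrix::ScaledPermutation<double, gko::int32>::create(
        exec, gko::array<double>{exec, {2.0, 0.5, 1.0}},
        gko::array<gko::int32>{exec, {1, 0, 2}});
    auto x = gko::initialize<gko::matrix::Dense<double>>({1.0, 2.0, 3.0}, exec);
    auto y = gko::matrix::Dense<double>::create(exec, x->get_size());
    sp->apply(x, y);             // y = scaled, permuted copy of x
    auto sp_inv = sp->invert();  // inverse operator as a ScaledPermutation
}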
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_ +#define GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_ + +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType) \ + void invert(std::shared_ptr exec, \ + const IndexType* input_permutation, \ + const ValueType* input_scale, size_type size, \ + IndexType* output_permutation, ValueType* output_scale) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(scaled_permutation, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_ diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index 09ef5e4701a..166ff0cbcdb 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -51,39 +51,40 @@ namespace { template class Permutation : public ::testing::Test { protected: - using v_type = + using value_type = typename std::tuple_element<0, decltype(ValueIndexType())>::type; - using i_type = + using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; - using Vec = gko::matrix::Dense; - using Csr = gko::matrix::Csr; + using Vec = gko::matrix::Dense; + using Csr = gko::matrix::Csr; Permutation() : exec(gko::ReferenceExecutor::create()), - mtx(gko::matrix::Permutation::create( - exec, gko::dim<2>{4, 3}, gko::array{exec, {1, 0, 2, 3}})) + mtx(gko::matrix::Permutation::create( + exec, gko::dim<2>{4, 3}, + gko::array{exec, {1, 0, 2, 3}})) {} static void assert_equal_to_original_mtx( - gko::ptr_param> m) + gko::ptr_param> m) { auto perm = m->get_permutation(); ASSERT_EQ(m->get_size(), gko::dim<2>(4, 3)); - ASSERT_EQ(m->get_permutation_size(), 4); + ASSERT_EQ(m->get_size()[0], 4); ASSERT_EQ(perm[0], 1); ASSERT_EQ(perm[1], 0); ASSERT_EQ(perm[2], 2); ASSERT_EQ(perm[3], 3); } - static void assert_empty(gko::matrix::Permutation* m) + static void assert_empty(gko::matrix::Permutation* m) { ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0)); - ASSERT_EQ(m->get_permutation_size(), 0); + 
ASSERT_EQ(m->get_size()[0], 0); } std::shared_ptr exec; - std::unique_ptr> mtx; + std::unique_ptr> mtx; }; TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes, @@ -92,8 +93,8 @@ TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes, TYPED_TEST(Permutation, CanBeEmpty) { - using i_type = typename TestFixture::i_type; - auto empty = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto empty = gko::matrix::Permutation::create(this->exec); this->assert_empty(empty.get()); } @@ -101,8 +102,8 @@ TYPED_TEST(Permutation, CanBeEmpty) TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty) { - using i_type = typename TestFixture::i_type; - auto empty = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto empty = gko::matrix::Permutation::create(this->exec); ASSERT_EQ(empty->get_const_permutation(), nullptr); } @@ -110,19 +111,19 @@ TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty) TYPED_TEST(Permutation, CanBeConstructedWithSize) { - using i_type = typename TestFixture::i_type; - auto m = - gko::matrix::Permutation::create(this->exec, gko::dim<2>{2, 3}); + using index_type = typename TestFixture::index_type; + auto m = gko::matrix::Permutation::create(this->exec, + gko::dim<2>{2, 3}); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_permutation_size(), 2); + ASSERT_EQ(m->get_size()[0], 2); } TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask) { - using i_type = typename TestFixture::i_type; - auto m = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto m = gko::matrix::Permutation::create(this->exec); auto mask = m->get_permute_mask(); ASSERT_EQ(mask, gko::matrix::row_permute); @@ -131,10 +132,10 @@ TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask) TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) { - using i_type = typename TestFixture::i_type; - i_type data[] = {1, 0, 2}; + using index_type = typename TestFixture::index_type; + index_type data[] = {1, 0, 2}; - auto m = gko::matrix::Permutation::create( + auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, gko::make_array_view(this->exec, 3, data)); @@ -144,12 +145,12 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingConstData) { - using i_type = typename TestFixture::i_type; - using i_type = typename TestFixture::i_type; - const i_type data[] = {1, 0, 2}; + using index_type = typename TestFixture::index_type; + using index_type = typename TestFixture::index_type; + const index_type data[] = {1, 0, 2}; - auto m = gko::matrix::Permutation::create_const( - this->exec, 3, gko::array::const_view(this->exec, 3, data)); + auto m = gko::matrix::Permutation::create_const( + this->exec, 3, gko::array::const_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } @@ -157,20 +158,20 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingConstData) TYPED_TEST(Permutation, CanBeConstructedWithSizeAndMask) { - using i_type = typename TestFixture::i_type; - auto m = gko::matrix::Permutation::create( + using index_type = typename TestFixture::index_type; + auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_permutation_size(), 2); + ASSERT_EQ(m->get_size()[0], 2); ASSERT_EQ(m->get_permute_mask(), 
gko::matrix::column_permute); } TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask) { - using i_type = typename TestFixture::i_type; - auto m = gko::matrix::Permutation::create( + using index_type = typename TestFixture::index_type; + auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); auto mask = m->get_permute_mask(); @@ -186,10 +187,10 @@ TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask) TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) { - using i_type = typename TestFixture::i_type; - i_type data[] = {0, 2, 1}; + using index_type = typename TestFixture::index_type; + index_type data[] = {0, 2, 1}; - ASSERT_THROW(gko::matrix::Permutation::create( + ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{4, 2}, gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); @@ -198,10 +199,10 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) { - using i_type = typename TestFixture::i_type; - i_type data[] = {1, 0, 2}; + using index_type = typename TestFixture::index_type; + index_type data[] = {1, 0, 2}; - auto m = gko::matrix::Permutation::create( + auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, gko::make_array_view(this->exec, 3, data)); @@ -220,10 +221,10 @@ TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) { - using i_type = typename TestFixture::i_type; - i_type data[] = {0, 2, 1}; + using index_type = typename TestFixture::index_type; + index_type data[] = {0, 2, 1}; - ASSERT_THROW(gko::matrix::Permutation::create( + ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 4}, gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute), @@ -239,8 +240,8 @@ TYPED_TEST(Permutation, KnowsItsSizeAndValues) TYPED_TEST(Permutation, CanBeCopied) { - using i_type = typename TestFixture::i_type; - auto mtx_copy = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); mtx_copy->copy_from(this->mtx); @@ -252,8 +253,8 @@ TYPED_TEST(Permutation, CanBeCopied) TYPED_TEST(Permutation, CanBeMoved) { - using i_type = typename TestFixture::i_type; - auto mtx_copy = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); mtx_copy->move_from(this->mtx); @@ -263,8 +264,8 @@ TYPED_TEST(Permutation, CanBeMoved) TYPED_TEST(Permutation, CopyingPreservesMask) { - using i_type = typename TestFixture::i_type; - auto mtx_copy = gko::matrix::Permutation::create(this->exec); + using index_type = typename TestFixture::index_type; + auto mtx_copy = gko::matrix::Permutation::create(this->exec); mtx_copy->copy_from(this->mtx); diff --git a/cuda/matrix/csr_kernels.instantiate.cu b/cuda/matrix/csr_kernels.instantiate.cu index 75747bf074b..335d42d2ff9 100644 --- a/cuda/matrix/csr_kernels.instantiate.cu +++ b/cuda/matrix/csr_kernels.instantiate.cu @@ -69,12 +69,22 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 803cb530262..d5b577a6068 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -124,7 +124,7 @@ namespace { template void merge_path_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -204,7 +204,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); template -int compute_items_per_thread(std::shared_ptr exec) +int compute_items_per_thread(std::shared_ptr exec) { const int version = (exec->get_major_version() << 4) + exec->get_minor_version(); @@ -245,7 +245,7 @@ int compute_items_per_thread(std::shared_ptr exec) template void classical_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -298,7 +298,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); template -void load_balance_spmv(std::shared_ptr exec, +void load_balance_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -349,7 +349,7 @@ void load_balance_spmv(std::shared_ptr exec, template -bool try_general_sparselib_spmv(std::shared_ptr exec, +bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* alpha, const matrix::Csr* a, const matrix::Dense* b, @@ -441,7 +441,7 @@ template ::value || !std::is_same::value>> -bool try_sparselib_spmv(std::shared_ptr exec, +bool try_sparselib_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -453,7 +453,7 @@ bool try_sparselib_spmv(std::shared_ptr exec, } template -bool try_sparselib_spmv(std::shared_ptr exec, +bool try_sparselib_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -479,7 +479,7 @@ bool try_sparselib_spmv(std::shared_ptr exec, template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c) @@ -536,7 +536,7 @@ void spmv(std::shared_ptr exec, template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::Csr* a, const matrix::Dense* b, @@ -597,7 +597,7 @@ void advanced_spmv(std::shared_ptr exec, template -void spgemm(std::shared_ptr exec, +void spgemm(std::shared_ptr exec, const matrix::Csr* a, const matrix::Csr* b, matrix::Csr* c) @@ -719,56 +719,8 @@ void spgemm(std::shared_ptr exec, } -namespace { - - -template -void spgeam(syn::value_list, - std::shared_ptr exec, const ValueType* alpha, - const IndexType* a_row_ptrs, 
const IndexType* a_col_idxs, - const ValueType* a_vals, const ValueType* beta, - const IndexType* b_row_ptrs, const IndexType* b_col_idxs, - const ValueType* b_vals, matrix::Csr* c) -{ - auto m = static_cast(c->get_size()[0]); - auto c_row_ptrs = c->get_row_ptrs(); - // count nnz for alpha * A + beta * B - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(m, subwarps_per_block); - if (num_blocks > 0) { - kernel::spgeam_nnz - <<get_stream()>>>( - a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1); - - // accumulate non-zeros for alpha * A + beta * B - matrix::CsrBuilder c_builder{c}; - auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); - c_builder.get_col_idx_array().resize_and_reset(c_nnz); - c_builder.get_value_array().resize_and_reset(c_nnz); - auto c_col_idxs = c->get_col_idxs(); - auto c_vals = c->get_values(); - if (num_blocks > 0) { - kernel::spgeam - <<get_stream()>>>( - as_device_type(alpha), a_row_ptrs, a_col_idxs, - as_device_type(a_vals), as_device_type(beta), b_row_ptrs, - b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs, - as_device_type(c_vals)); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); - - -} // namespace - - template -void advanced_spgemm(std::shared_ptr exec, +void advanced_spgemm(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::Csr* a, const matrix::Csr* b, @@ -914,54 +866,7 @@ void advanced_spgemm(std::shared_ptr exec, template -void spgeam(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Dense* beta, - const matrix::Csr* b, - matrix::Csr* c) -{ - auto total_nnz = - a->get_num_stored_elements() + b->get_num_stored_elements(); - auto nnz_per_row = total_nnz / a->get_size()[0]; - select_spgeam( - spgeam_kernels(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= nnz_per_row || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, - alpha->get_const_values(), a->get_const_row_ptrs(), - a->get_const_col_idxs(), a->get_const_values(), - beta->get_const_values(), b->get_const_row_ptrs(), - b->get_const_col_idxs(), b->get_const_values(), c); -} - - -template -void fill_in_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto stride = result->get_stride(); - const auto row_ptrs = source->get_const_row_ptrs(); - const auto col_idxs = source->get_const_col_idxs(); - const auto vals = source->get_const_values(); - - auto grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::fill_in_dense<<get_stream()>>>( - num_rows, as_device_type(row_ptrs), as_device_type(col_idxs), - as_device_type(vals), stride, as_device_type(result->get_values())); - } -} - - -template -void transpose(std::shared_ptr exec, +void transpose(std::shared_ptr exec, const matrix::Csr* orig, matrix::Csr* trans) { @@ -1010,7 +915,7 @@ void transpose(std::shared_ptr exec, template -void conj_transpose(std::shared_ptr exec, +void conj_transpose(std::shared_ptr exec, const matrix::Csr* orig, matrix::Csr* trans) { @@ -1067,160 +972,7 @@ void conj_transpose(std::shared_ptr exec, template -void inv_symm_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* permuted) -{ - auto num_rows = orig->get_size()[0]; - auto 
count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::inv_row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::inv_symm_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - permuted->get_row_ptrs(), permuted->get_col_idxs(), - as_device_type(permuted->get_values())); - } -} - - -template -void row_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) -{ - auto num_rows = orig->get_size()[0]; - auto count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::row_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_device_type(row_permuted->get_values())); - } -} - - -template -void inverse_row_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) -{ - auto num_rows = orig->get_size()[0]; - auto count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::inv_row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::inv_row_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_device_type(row_permuted->get_values())); - } -} - - -template -void calculate_nonzeros_per_row_in_span( - std::shared_ptr exec, - const matrix::Csr* source, const span& row_span, - const span& col_span, array* row_nnz) -{ - const auto num_rows = source->get_size()[0]; - auto row_ptrs = source->get_const_row_ptrs(); - auto col_idxs = source->get_const_col_idxs(); - auto grid_dim = ceildiv(row_span.length(), default_block_size); - if (grid_dim > 0) { - kernel::calculate_nnz_per_row_in_span<<get_stream()>>>( - row_span, col_span, as_device_type(row_ptrs), - as_device_type(col_idxs), as_device_type(row_nnz->get_data())); - } -} - - -template -void compute_submatrix(std::shared_ptr exec, - const matrix::Csr* source, - gko::span row_span, gko::span col_span, - matrix::Csr* result) -{ - auto row_offset = row_span.begin; - auto col_offset = col_span.begin; - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto row_ptrs = source->get_const_row_ptrs(); - auto grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::compute_submatrix_idxs_and_vals<<get_stream()>>>( - 
num_rows, num_cols, row_offset, col_offset, - as_device_type(source->get_const_row_ptrs()), - as_device_type(source->get_const_col_idxs()), - as_device_type(source->get_const_values()), - as_device_type(result->get_const_row_ptrs()), - as_device_type(result->get_col_idxs()), - as_device_type(result->get_values())); - } -} - - -template -void calculate_nonzeros_per_row_in_index_set( - std::shared_ptr exec, - const matrix::Csr* source, - const gko::index_set& row_index_set, - const gko::index_set& col_index_set, - IndexType* row_nnz) GKO_NOT_IMPLEMENTED; - - -template -void compute_submatrix_from_index_set( - std::shared_ptr exec, - const matrix::Csr* source, - const gko::index_set& row_index_set, - const gko::index_set& col_index_set, - matrix::Csr* result) GKO_NOT_IMPLEMENTED; - - -template -void sort_by_column_index(std::shared_ptr exec, +void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { if (cusparse::is_supported::value) { @@ -1271,95 +1023,6 @@ void sort_by_column_index(std::shared_ptr exec, } -template -void is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::Csr* to_check, bool* is_sorted) -{ - *is_sorted = true; - auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); - auto gpu_array = array{exec, cpu_array}; - auto block_size = default_block_size; - auto num_rows = static_cast(to_check->get_size()[0]); - auto num_blocks = ceildiv(num_rows, block_size); - if (num_blocks > 0) { - kernel:: - check_unsorted<<get_stream()>>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_rows, gpu_array.get_data()); - } - cpu_array = gpu_array; -} - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Diagonal* diag) -{ - const auto nnz = orig->get_num_stored_elements(); - const auto diag_size = diag->get_size()[0]; - const auto num_blocks = - ceildiv(config::warp_size * diag_size, default_block_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_row_ptrs = orig->get_const_row_ptrs(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - - if (num_blocks > 0) { - kernel::extract_diagonal<<get_stream()>>>( - diag_size, nnz, as_device_type(orig_values), - as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs), - as_device_type(diag_values)); - } -} - - -template -void check_diagonal_entries_exist( - std::shared_ptr exec, - const matrix::Csr* const mtx, bool& has_all_diags) -{ - const auto num_diag = static_cast( - std::min(mtx->get_size()[0], mtx->get_size()[1])); - if (num_diag > 0) { - const IndexType num_blocks = - ceildiv(num_diag, default_block_size / config::warp_size); - array has_diags(exec, {true}); - kernel::check_diagonal_entries<<get_stream()>>>( - num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - has_diags.get_data()); - has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); - } else { - has_all_diags = true; - } -} - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Dense* const beta, - matrix::Csr* const mtx) -{ - const auto nrows = mtx->get_size()[0]; - if (nrows == 0) { - return; - } - const auto nthreads = nrows * config::warp_size; - const auto nblocks = ceildiv(nthreads, default_block_size); - kernel::add_scaled_identity<<get_stream()>>>( - as_device_type(alpha->get_const_values()), - as_device_type(beta->get_const_values()), static_cast(nrows), - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - 
as_device_type(mtx->get_values())); -} - - } // namespace csr } // namespace cuda } // namespace kernels diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 46e8894fdac..f05692c1929 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1102,6 +1102,35 @@ void inv_symm_permute_kernel(size_type num_rows, } } + +template +void inv_nonsymm_permute_kernel(size_type num_rows, + const IndexType* __restrict__ row_permutation, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, + ValueType* __restrict__ out_vals, + sycl::nd_item<3> item_ct1) +{ + auto tid = thread::get_subwarp_id_flat(item_ct1); + if (tid >= num_rows) { + return; + } + auto lane = item_ct1.get_local_id(2) % subgroup_size; + auto in_row = tid; + auto out_row = row_permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subgroup_size) { + out_cols[out_begin + i] = col_permutation[in_cols[in_begin + i]]; + out_vals[out_begin + i] = in_vals[in_begin + i]; + } +} + template void inv_symm_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, @@ -1122,6 +1151,25 @@ void inv_symm_permute_kernel(dim3 grid, dim3 block, }); } +template +void inv_nonsymm_permute_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const IndexType* row_permutation, + const IndexType* col_permutation, const IndexType* in_row_ptrs, + const IndexType* in_cols, const ValueType* in_vals, + const IndexType* out_row_ptrs, IndexType* out_cols, ValueType* out_vals) +{ + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + inv_nonsymm_permute_kernel( + num_rows, row_permutation, col_permutation, + in_row_ptrs, in_cols, in_vals, out_row_ptrs, + out_cols, out_vals, item_ct1); + }); + }); +} + namespace host_kernel { @@ -2266,6 +2314,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + inv_row_ptr_permute_kernel( + count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + row_perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + inv_symm_permute_kernel( + copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + row_perm, col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), orig->get_const_values(), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + permuted->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); + + template void row_permute(std::shared_ptr exec, const IndexType* perm, @@ -2293,10 +2368,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void inverse_row_permute(std::shared_ptr exec, - const IndexType* 
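A note on the work distribution in the new dpcpp inv_nonsymm_permute_kernel above: one subgroup is assigned to each CSR row, and its lanes stride over that row's nonzeros. The following host-side sketch is illustrative only and not part of the patch; subgroup_size and row_nnz are made-up values, and it merely prints which entry indices each lane would copy:

    #include <cstdio>

    int main()
    {
        const int subgroup_size = 4;  // assumed subgroup width
        const int row_nnz = 10;       // assumed number of entries in one row
        for (int lane = 0; lane < subgroup_size; ++lane) {
            std::printf("lane %d:", lane);
            // same lane-strided loop shape as in the kernel above
            for (int i = lane; i < row_nnz; i += subgroup_size) {
                std::printf(" %d", i);
            }
            std::printf("\n");
        }
        // prints: lane 0: 0 4 8, lane 1: 1 5 9, lane 2: 2 6, lane 3: 3 7
    }
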
perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) { auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); @@ -2315,7 +2390,7 @@ void inverse_row_permute(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); template diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp index 1b758d186a4..7eafba783ee 100644 --- a/examples/distributed-solver/distributed-solver.cpp +++ b/examples/distributed-solver/distributed-solver.cpp @@ -119,15 +119,14 @@ int main(int argc, char* argv[]) int device_id = gko::experimental::mpi::map_rank_to_device_id( comm, gko::CudaExecutor::get_num_devices()); return gko::CudaExecutor::create( - device_id, gko::ReferenceExecutor::create(), false, - gko::allocation_mode::device); + device_id, gko::ReferenceExecutor::create()); }}, {"hip", [](MPI_Comm comm) { int device_id = gko::experimental::mpi::map_rank_to_device_id( comm, gko::HipExecutor::get_num_devices()); return gko::HipExecutor::create( - device_id, gko::ReferenceExecutor::create(), true); + device_id, gko::ReferenceExecutor::create()); }}, {"dpcpp", [](MPI_Comm comm) { int device_id = 0; diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index 9a6c29206de..156b170311f 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -117,12 +117,22 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 5e4de7b9699..52101385c92 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -629,54 +629,6 @@ void spgemm(std::shared_ptr exec, } -namespace { - - -template -void spgeam(syn::value_list, - std::shared_ptr exec, const ValueType* alpha, - const IndexType* a_row_ptrs, const IndexType* a_col_idxs, - const ValueType* a_vals, const ValueType* beta, - const IndexType* b_row_ptrs, const IndexType* b_col_idxs, - const ValueType* b_vals, matrix::Csr* c) -{ - auto m = static_cast(c->get_size()[0]); - auto c_row_ptrs = c->get_row_ptrs(); - 
// count nnz for alpha * A + beta * B - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(m, subwarps_per_block); - if (num_blocks > 0) { - kernel::spgeam_nnz - <<get_stream()>>>( - a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1); - - // accumulate non-zeros for alpha * A + beta * B - matrix::CsrBuilder c_builder{c}; - auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); - c_builder.get_col_idx_array().resize_and_reset(c_nnz); - c_builder.get_value_array().resize_and_reset(c_nnz); - auto c_col_idxs = c->get_col_idxs(); - auto c_vals = c->get_values(); - if (num_blocks > 0) { - kernel::spgeam - <<get_stream()>>>( - as_device_type(alpha), a_row_ptrs, a_col_idxs, - as_device_type(a_vals), as_device_type(beta), b_row_ptrs, - b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs, - as_device_type(c_vals)); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); - - -} // namespace - - template void advanced_spgemm(std::shared_ptr exec, const matrix::Dense* alpha, @@ -768,53 +720,6 @@ void advanced_spgemm(std::shared_ptr exec, } -template -void spgeam(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Dense* beta, - const matrix::Csr* b, - matrix::Csr* c) -{ - auto total_nnz = - a->get_num_stored_elements() + b->get_num_stored_elements(); - auto nnz_per_row = total_nnz / a->get_size()[0]; - select_spgeam( - spgeam_kernels(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= nnz_per_row || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, - alpha->get_const_values(), a->get_const_row_ptrs(), - a->get_const_col_idxs(), a->get_const_values(), - beta->get_const_values(), b->get_const_row_ptrs(), - b->get_const_col_idxs(), b->get_const_values(), c); -} - - -template -void fill_in_dense(std::shared_ptr exec, - const matrix::Csr* source, - matrix::Dense* result) -{ - const auto num_rows = result->get_size()[0]; - const auto num_cols = result->get_size()[1]; - const auto stride = result->get_stride(); - const auto row_ptrs = source->get_const_row_ptrs(); - const auto col_idxs = source->get_const_col_idxs(); - const auto vals = source->get_const_values(); - - auto grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::fill_in_dense<<get_stream()>>>( - num_rows, as_device_type(row_ptrs), as_device_type(col_idxs), - as_device_type(vals), stride, as_device_type(result->get_values())); - } -} - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -871,159 +776,6 @@ void conj_transpose(std::shared_ptr exec, } -template -void inv_symm_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* permuted) -{ - auto num_rows = orig->get_size()[0]; - auto count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::inv_row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::inv_symm_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - permuted->get_row_ptrs(), 
permuted->get_col_idxs(), - as_device_type(permuted->get_values())); - } -} - - -template -void row_permute(std::shared_ptr exec, const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) -{ - auto num_rows = orig->get_size()[0]; - auto count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::row_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_device_type(row_permuted->get_values())); - } -} - - -template -void inverse_row_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) -{ - auto num_rows = orig->get_size()[0]; - auto count_num_blocks = ceildiv(num_rows, default_block_size); - if (count_num_blocks > 0) { - kernel::inv_row_ptr_permute<<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - row_permuted->get_row_ptrs()); - } - components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), - num_rows + 1); - auto copy_num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - if (copy_num_blocks > 0) { - kernel::inv_row_permute - <<get_stream()>>>( - num_rows, perm, orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), - as_device_type(orig->get_const_values()), - row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), - as_device_type(row_permuted->get_values())); - } -} - - -template -void calculate_nonzeros_per_row_in_span( - std::shared_ptr exec, - const matrix::Csr* source, const span& row_span, - const span& col_span, array* row_nnz) -{ - const auto num_rows = source->get_size()[0]; - auto row_ptrs = source->get_const_row_ptrs(); - auto col_idxs = source->get_const_col_idxs(); - auto grid_dim = ceildiv(row_span.length(), default_block_size); - - if (grid_dim > 0) { - kernel::calculate_nnz_per_row_in_span<<get_stream()>>>( - row_span, col_span, as_device_type(row_ptrs), - as_device_type(col_idxs), as_device_type(row_nnz->get_data())); - } -} - - -template -void compute_submatrix(std::shared_ptr exec, - const matrix::Csr* source, - gko::span row_span, gko::span col_span, - matrix::Csr* result) -{ - auto row_offset = row_span.begin; - auto col_offset = col_span.begin; - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto row_ptrs = source->get_const_row_ptrs(); - auto grid_dim = ceildiv(num_rows, default_block_size); - if (grid_dim > 0) { - kernel::compute_submatrix_idxs_and_vals<<get_stream()>>>( - num_rows, num_cols, row_offset, col_offset, - as_device_type(source->get_const_row_ptrs()), - as_device_type(source->get_const_col_idxs()), - as_device_type(source->get_const_values()), - as_device_type(result->get_const_row_ptrs()), - as_device_type(result->get_col_idxs()), - as_device_type(result->get_values())); - } -} - - -template -void calculate_nonzeros_per_row_in_index_set( - std::shared_ptr exec, - const matrix::Csr* source, - const gko::index_set& row_index_set, - const gko::index_set& col_index_set, - IndexType* row_nnz) GKO_NOT_IMPLEMENTED; - - -template -void compute_submatrix_from_index_set( 
- std::shared_ptr exec, - const matrix::Csr* source, - const gko::index_set& row_index_set, - const gko::index_set& col_index_set, - matrix::Csr* result) GKO_NOT_IMPLEMENTED; - - template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) @@ -1069,94 +821,6 @@ void sort_by_column_index(std::shared_ptr exec, } -template -void is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::Csr* to_check, bool* is_sorted) -{ - *is_sorted = true; - auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); - auto gpu_array = array{exec, cpu_array}; - auto block_size = default_block_size; - auto num_rows = static_cast(to_check->get_size()[0]); - auto num_blocks = ceildiv(num_rows, block_size); - if (num_blocks > 0) { - kernel:: - check_unsorted<<get_stream()>>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_rows, gpu_array.get_data()); - } - cpu_array = gpu_array; -} - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Diagonal* diag) -{ - const auto nnz = orig->get_num_stored_elements(); - const auto diag_size = diag->get_size()[0]; - const auto num_blocks = - ceildiv(config::warp_size * diag_size, default_block_size); - - const auto orig_values = orig->get_const_values(); - const auto orig_row_ptrs = orig->get_const_row_ptrs(); - const auto orig_col_idxs = orig->get_const_col_idxs(); - auto diag_values = diag->get_values(); - if (num_blocks > 0) { - kernel::extract_diagonal<<get_stream()>>>( - diag_size, nnz, as_device_type(orig_values), - as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs), - as_device_type(diag_values)); - } -} - - -template -void check_diagonal_entries_exist( - std::shared_ptr exec, - const matrix::Csr* const mtx, bool& has_all_diags) -{ - const auto num_diag = static_cast( - std::min(mtx->get_size()[0], mtx->get_size()[1])); - if (num_diag > 0) { - const IndexType num_blocks = - ceildiv(num_diag, default_block_size / config::warp_size); - array has_diags(exec, {true}); - kernel::check_diagonal_entries<<get_stream()>>>( - num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - has_diags.get_data()); - has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); - } else { - has_all_diags = true; - } -} - - -template -void add_scaled_identity(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Dense* const beta, - matrix::Csr* const mtx) -{ - const auto nrows = mtx->get_size()[0]; - if (nrows == 0) { - return; - } - const auto nthreads = nrows * config::warp_size; - const auto nblocks = ceildiv(nthreads, default_block_size); - kernel::add_scaled_identity<<get_stream()>>>( - as_device_type(alpha->get_const_values()), - as_device_type(beta->get_const_values()), static_cast(nrows), - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - as_device_type(mtx->get_values())); -} - - } // namespace csr } // namespace hip } // namespace kernels diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 8b270ed7a98..1a52b93c0bd 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -683,6 +683,7 @@ class UnsupportedMatrixProperty : public Error { }; +/** Exception thrown if an object is in an invalid state. */ class InvalidStateError : public Error { public: /** @@ -701,6 +702,25 @@ class InvalidStateError : public Error { }; +/** Exception thrown if an invalid valid was passed to a function. 
*/ +class InvalidValueError : public Error { +public: + /** + * Initializes an invalid value error. + * + * @param file The name of the offending source file + * @param line The source code line number where the error occurred + * @param func The function name where the error occurred + * @param clarification A message describing the invalid value + */ + InvalidValueError(const std::string& file, int line, + const std::string& func, const std::string& clarification) + : Error(file, line, + func + ": Invalid value encountered : " + clarification) + {} +}; + + } // namespace gko diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 834208c4322..b73459c1175 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -59,9 +60,6 @@ class Ell; template class Hybrid; -template -class Permutation; - template class ScaledPermutation; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 9c4799951f2..539480934d1 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -81,9 +82,6 @@ class Fbcsr; template class Hybrid; -template -class Permutation; - template class ScaledPermutation; @@ -492,7 +490,7 @@ class Dense */ void permute(ptr_param> row_permutation, ptr_param> column_permutation, - ptr_param output, bool invert) const; + ptr_param output, bool invert = false) const; /** * @copydoc permute(ptr_param>, ptr_param> row_permutation, ptr_param> column_permutation, - ptr_param output, bool invert) const; + ptr_param output, bool invert = false) const; /** * Creates a scaled and permuted copy of this matrix. @@ -537,7 +535,7 @@ class Dense * @copydoc scale_permute(ptr_param>, ptr_param, permute_mode) */ - std::unique_ptr scale_permute( + void scale_permute( ptr_param> permutation, ptr_param output, permute_mode mode) const; @@ -575,22 +573,22 @@ class Dense * that writes the permuted copy into an existing Dense matrix. * @param output the output matrix. 
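The Dense declarations above change scale_permute from returning a unique_ptr to writing into a caller-provided output matrix, and give the two-permutation overloads an invert parameter that defaults to false. A minimal usage sketch, not part of the patch; the function name permute_into and the double/int32 instantiations are assumptions, and the output matrix is assumed to be allocated with the correct size:

    #include <ginkgo/ginkgo.hpp>

    // Illustrative: out-of-place row/column permutation of a Dense matrix.
    void permute_into(const gko::matrix::Dense<double>* mtx,
                      const gko::matrix::Permutation<gko::int32>* row_perm,
                      const gko::matrix::Permutation<gko::int32>* col_perm,
                      gko::matrix::Dense<double>* out)
    {
        // forward permutation; the new `invert` parameter defaults to false
        mtx->permute(row_perm, col_perm, out);
        // the same permutation pair applied as its inverse
        mtx->permute(row_perm, col_perm, out, true);
    }
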
*/ - std::unique_ptr scale_permute( + void scale_permute( ptr_param> row_permutation, ptr_param> column_permutation, - ptr_param output, bool invert) const; + ptr_param output, bool invert = false) const; /** * @copydoc scale_permute(ptr_param>, ptr_param>, * ptr_param, bool) */ - std::unique_ptr scale_permute( + void scale_permute( ptr_param> row_permutation, ptr_param> column_permutation, - ptr_param output, bool invert) const; + ptr_param output, bool invert = false) const; std::unique_ptr permute( const array* permutation_indices) const override; @@ -1469,19 +1467,24 @@ class Dense } template - void permute_impl(const array* permutation, Dense* output) const; + void permute_impl(const Permutation* permutation, + permute_mode mode, Dense* output) const; template - void inverse_permute_impl(const array* permutation, - Dense* output) const; + void permute_impl(const Permutation* row_permutation, + const Permutation* col_permutation, + bool invert, Dense* output) const; template - void row_permute_impl(const array* permutation, - Dense* output) const; + void scale_permute_impl( + const ScaledPermutation* permutation, + permute_mode mode, Dense* output) const; template - void inverse_row_permute_impl(const array* permutation, - Dense* output) const; + void scale_permute_impl( + const ScaledPermutation* row_permutation, + const ScaledPermutation* column_permutation, + bool invert, Dense* output) const; template void row_gather_impl(const array* row_idxs, @@ -1493,14 +1496,6 @@ class Dense const Dense* beta, Dense* row_collection) const; - template - void column_permute_impl(const array* permutation, - Dense* output) const; - - template - void inverse_column_permute_impl(const array* permutation, - Dense* output) const; - private: array values_; size_type stride_; diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index 163160a2af6..b577481345b 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -52,6 +52,78 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace matrix { + +/** Specifies how a permutation will be applied to a matrix. */ +enum class permute_mode { + /** Neither rows nor columns will be permuted. */ + none = 0b0, + /** The rows will be permuted. */ + rows = 0b1, + /** The columns will be permuted. */ + columns = 0b10, + /** + * The rows and columns will be permuted. This is equivalent to + * `permute_mode::rows | permute_mode::columns`. + */ + symmetric = 0b11, + /** The permutation will be inverted before being applied. */ + inverse = 0b100, + /** + * The rows will be permuted using the inverse permutation. This is + * equivalent to `permute_mode::rows | permute_mode::inverse`. + */ + inverse_rows = 0b101, + /** + * The columns will be permuted using the inverse permutation. This is + * equivalent to `permute_mode::columns | permute_mode::inverse`. + */ + inverse_columns = 0b110, + /** + * The rows and columns will be permuted using the inverse permutation. This + * is equivalent to `permute_mode::symmetric | permute_mode::inverse`. + */ + inverse_symmetric = 0b111 +}; + + +/** Combines two permutation modes. */ +inline permute_mode operator|(permute_mode a, permute_mode b) +{ + return static_cast(static_cast(a) | static_cast(b)); +} + + +/** Computes the intersection of two permutation modes. 
*/ +inline permute_mode operator&(permute_mode a, permute_mode b) +{ + return static_cast(static_cast(a) & static_cast(b)); +} + + +inline std::ostream& operator<<(std::ostream& stream, permute_mode mode) +{ + switch (mode) { + case permute_mode::none: + return stream << "none"; + case permute_mode::rows: + return stream << "rows"; + case permute_mode::columns: + return stream << "columns"; + case permute_mode::symmetric: + return stream << "symmetric"; + case permute_mode::inverse: + return stream << "inverse"; + case permute_mode::inverse_rows: + return stream << "inverse_rows"; + case permute_mode::inverse_columns: + return stream << "inverse_columns"; + case permute_mode::inverse_symmetric: + return stream << "inverse_symmetric"; + } + return stream; +} + + /** @internal std::bitset allows to store any number of bits */ using mask_type = gko::uint64; @@ -77,11 +149,14 @@ static constexpr mask_type inverse_permute = mask_type{1 << 3}; */ template class Permutation : public EnableLinOp>, - public EnableCreateMethod> { + public EnableCreateMethod>, + public WritableToMatrixData { friend class EnableCreateMethod; friend class EnablePolymorphicObject; public: + // value_type is only available to enable the usage of gko::write + using value_type = default_precision; using index_type = IndexType; /** @@ -110,7 +185,8 @@ class Permutation : public EnableLinOp>, * @return the number of elements explicitly stored in the permutation * array. */ - size_type get_permutation_size() const noexcept + [[deprecated("use get_size()[0] instead")]] size_type get_permutation_size() + const noexcept { return permutation_.get_num_elems(); } @@ -132,6 +208,16 @@ class Permutation : public EnableLinOp>, enabled_permute_ = permute_mask; } + /** + * Returns the inverse permutation. + * + * @return a newly created Permutation object storing the inverse + * permutation of this Permutation. + */ + std::unique_ptr invert() const; + + void write(gko::matrix_data& data) const override; + /** * Creates a constant (immutable) Permutation matrix from a constant array. * @@ -214,7 +300,7 @@ class Permutation : public EnableLinOp>, } } - void apply_impl(const LinOp* in, LinOp* out) const + void apply_impl(const LinOp* in, LinOp* out) const override { auto perm = as>(in); std::unique_ptr tmp{}; @@ -248,7 +334,7 @@ class Permutation : public EnableLinOp>, void apply_impl(const LinOp*, const LinOp* in, const LinOp*, - LinOp* out) const + LinOp* out) const override { // Ignores alpha and beta and just performs a normal permutation as an // advanced apply does not really make sense here. diff --git a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp new file mode 100644 index 00000000000..0a5a2d781e7 --- /dev/null +++ b/include/ginkgo/core/matrix/scaled_permutation.hpp @@ -0,0 +1,177 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
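The permute_mode values above are plain bit masks, so the composite modes are bitwise combinations of rows, columns and inverse, and operator& can be used to test individual bits. A small self-contained check (illustrative only, not part of the patch):

    #include <cassert>
    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        using gko::matrix::permute_mode;
        // rows = 0b001, columns = 0b010, inverse = 0b100
        assert((permute_mode::rows | permute_mode::columns) ==
               permute_mode::symmetric);
        assert((permute_mode::rows | permute_mode::inverse) ==
               permute_mode::inverse_rows);
        // extract a single bit from a combined mode
        auto mode = permute_mode::inverse_symmetric;
        assert((mode & permute_mode::rows) == permute_mode::rows);
    }

The new Permutation::invert() returns the functional inverse of the stored indices; for example, a permutation storing [1, 2, 0] inverts to [2, 0, 1].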
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_ + + +#include + + +#include +#include +#include +#include +#include + + +namespace gko { +namespace matrix { + + +/** + * ScaledPermutation is a matrix combining a permutation with scaling factors. + * It is a combination of Diagonal and Permutation, and can be read as + * $SP = S \cdot P$, i.e. the scaling gets applied after the permutation. + * + * @tparam IndexType index type of permutation indices + * @tparam ValueType value type of the scaling factors + * + * @ingroup permutation + * @ingroup mat_formats + * @ingroup LinOp + */ +template +class ScaledPermutation + : public EnableLinOp>, + public EnableCreateMethod>, + public WritableToMatrixData { + friend class EnableCreateMethod; + friend class EnablePolymorphicObject; + +public: + using value_type = ValueType; + using index_type = IndexType; + + /** + * Returns a pointer to the scaling factors. + * + * @return the pointer to the scaling factors. + */ + value_type* get_scale() noexcept { return scale_.get_data(); } + + /** + * @copydoc get_scale() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const value_type* get_const_scale() const noexcept + { + return scale_.get_const_data(); + } + + /** + * Returns a pointer to the permutation indices. + * + * @return the pointer to the permutation indices. + */ + index_type* get_permutation() noexcept { return permutation_.get_data(); } + + /** + * @copydoc get_permutation() + * + * @note This is the constant version of the function, which can be + * significantly more memory efficient than the non-constant version, + * so always prefer this version. + */ + const index_type* get_const_permutation() const noexcept + { + return permutation_.get_const_data(); + } + + /** + * Returns the inverse scaled permutation. + * + * @return a newly created ScaledPermutation object storing the inverse + * permutation and scaling factors of this ScalingPermutation. + */ + std::unique_ptr invert() const; + + void write(gko::matrix_data& data) const override; + + /** + * Creates a constant (immutable) ScaledPermutation matrix from constant + * arrays. 
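Following the documentation above, a ScaledPermutation applies the permutation first and the scaling second (SP = S * P). A tiny worked example of that convention; the gather-style indexing mirrors the Permutation class and is an assumption here, not taken from the patch, and all values are made up:

    #include <cstdio>

    int main()
    {
        // y = S * (P x) with perm = {1, 2, 0} and scale = {2, 3, 4}
        const int perm[] = {1, 2, 0};
        const double scale[] = {2.0, 3.0, 4.0};
        const double x[] = {10.0, 20.0, 30.0};
        double y[3];
        for (int i = 0; i < 3; ++i) {
            y[i] = scale[i] * x[perm[i]];
        }
        std::printf("%g %g %g\n", y[0], y[1], y[2]);  // 40 90 40
    }

The invert() member then returns the ScaledPermutation that undoes this combined operation.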
+ * + * @param exec the executor to create the object on + * @param perm_idxs the permutation index array of the matrix + * @param scale the scaling factor array + * @returns A smart pointer to the constant matrix wrapping the input arrays + * (if it resides on the same executor as the matrix) or a copy of + * the arrays on the correct executor. + */ + static std::unique_ptr create_const( + std::shared_ptr exec, + gko::detail::const_array_view&& scale, + gko::detail::const_array_view&& perm_idxs); + +protected: + /** + * Creates an uninitialized ScaledPermutation matrix. + * + * @param exec Executor associated to the matrix + * @param size dimensions of the (square) scaled permutation matrix + */ + ScaledPermutation(std::shared_ptr exec, size_type size = 0); + + /** + * Creates a ScaledPermutation matrix from already allocated (and + * initialized) arrays. + * + * @param exec Executor associated to the matrix + * @param permutation_indices array of permutation indices + * @param scaling_factors array of scaling factors + */ + ScaledPermutation(std::shared_ptr exec, + array scaling_factors, + array permutation_indices); + + void apply_impl(const LinOp* in, LinOp* out) const override; + + + void apply_impl(const LinOp*, const LinOp* in, const LinOp*, + LinOp* out) const override; + + +private: + array scale_; + array permutation_; +}; + + +} // namespace matrix +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index ad90e264189..baa5f5fd795 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -121,6 +121,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 1757b4b8a25..29459a264c4 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -909,6 +909,20 @@ void inv_symm_permute(std::shared_ptr exec, const IndexType* perm, const matrix::Csr* orig, matrix::Csr* permuted) +{ + inv_nonsymm_permute(exec, perm, perm, orig, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); + + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, + const IndexType* column_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) { auto in_row_ptrs = orig->get_const_row_ptrs(); auto in_col_idxs = orig->get_const_col_idxs(); @@ -921,26 +935,26 @@ void inv_symm_permute(std::shared_ptr exec, #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { auto src_row = row; - auto dst_row = perm[row]; + auto dst_row = row_perm[row]; p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row]; } components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1); #pragma omp parallel for for (size_type row = 0; row < num_rows; ++row) { auto src_row = row; - auto dst_row = perm[row]; + auto dst_row = row_perm[row]; auto src_begin = in_row_ptrs[src_row]; auto dst_begin = p_row_ptrs[dst_row]; auto row_size = in_row_ptrs[src_row + 1] - src_begin; for (IndexType i = 0; i < row_size; ++i) { - p_col_idxs[dst_begin + i] = perm[in_col_idxs[src_begin + i]]; + p_col_idxs[dst_begin + i] = column_perm[in_col_idxs[src_begin + i]]; p_vals[dst_begin + i] = in_vals[src_begin + i]; } } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); template @@ -982,10 
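The OpenMP inv_nonsymm_permute above realizes the mapping out(row_perm[i], col_perm[j]) = in(i, j) in two passes: scatter the row lengths into their permuted positions, prefix-sum them into row pointers, then scatter each row while relabeling its columns. A dense-view toy check of that index mapping (illustrative only, made-up data):

    int main()
    {
        // out[row_perm[i]][col_perm[j]] = in[i][j]
        const int row_perm[] = {1, 2, 0};
        const int col_perm[] = {2, 0, 1};
        const double in[3][3] = {{1, 0, 2}, {0, 3, 0}, {4, 0, 0}};
        double out[3][3] = {};
        for (int i = 0; i < 3; ++i) {
            for (int j = 0; j < 3; ++j) {
                out[row_perm[i]][col_perm[j]] = in[i][j];
            }
        }
        // out == {{0, 0, 4}, {0, 2, 1}, {3, 0, 0}}
    }

Note that within one output row the entries keep their source order, so the permuted matrix may need a subsequent sort_by_column_index if sorted columns are required.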
+996,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void inverse_row_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) { auto orig_row_ptrs = orig->get_const_row_ptrs(); auto orig_col_idxs = orig->get_const_col_idxs(); @@ -1017,7 +1031,146 @@ void inverse_row_permute(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto in_row_ptrs = orig->get_const_row_ptrs(); + auto in_col_idxs = orig->get_const_col_idxs(); + auto in_vals = orig->get_const_values(); + auto p_row_ptrs = permuted->get_row_ptrs(); + auto p_col_idxs = permuted->get_col_idxs(); + auto p_vals = permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = row_perm[row]; + p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = row_perm[row]; + auto src_begin = in_row_ptrs[src_row]; + auto dst_begin = p_row_ptrs[dst_row]; + auto row_size = in_row_ptrs[src_row + 1] - src_begin; + for (IndexType i = 0; i < row_size; ++i) { + const auto in_col = in_col_idxs[src_begin + i]; + p_col_idxs[dst_begin + i] = col_perm[in_col]; + p_vals[dst_begin + i] = in_vals[src_begin + i] / + (row_scale[src_row] * col_scale[in_col]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = perm[row]; + auto dst_row = row; + rp_row_ptrs[dst_row] = + orig_row_ptrs[src_row + 1] - orig_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = perm[row]; + auto dst_row = row; + auto src_begin = orig_row_ptrs[src_row]; + auto dst_begin = rp_row_ptrs[dst_row]; + auto row_size = orig_row_ptrs[src_row + 1] - src_begin; + std::copy_n(orig_col_idxs + src_begin, row_size, + 
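In the scale-permute kernels above, the inverse variants divide each value by the scaling factors of its source row and column, out(row_perm[i], col_perm[j]) = in(i, j) / (row_scale[i] * col_scale[j]), while the forward row_scale_permute multiplies, out(i, j) = in(perm[i], j) * scale[i]. A one-dimensional sketch showing that the two row variants undo each other (illustrative only, made-up values):

    int main()
    {
        const int perm[] = {1, 0};
        const double scale[] = {2.0, 5.0};
        const double x[] = {10.0, 40.0};
        double fwd[2], back[2];
        for (int i = 0; i < 2; ++i) {
            fwd[i] = x[perm[i]] * scale[i];  // row_scale_permute: {80, 50}
        }
        for (int i = 0; i < 2; ++i) {
            back[perm[i]] = fwd[i] / scale[i];  // inv_row_scale_permute
        }
        // back == {10, 40} again
    }
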
rp_col_idxs + dst_begin); + for (IndexType i = 0; i < row_size; i++) { + rp_vals[i + dst_begin] = orig_vals[i + src_begin] * scale[dst_row]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto orig_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = perm[row]; + rp_row_ptrs[dst_row] = + orig_row_ptrs[src_row + 1] - orig_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1); +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = perm[row]; + auto src_begin = orig_row_ptrs[src_row]; + auto dst_begin = rp_row_ptrs[dst_row]; + auto row_size = orig_row_ptrs[src_row + 1] - src_begin; + std::copy_n(orig_col_idxs + src_begin, row_size, + rp_col_idxs + dst_begin); + for (IndexType i = 0; i < row_size; i++) { + rp_vals[i + dst_begin] = orig_vals[i + src_begin] / scale[src_row]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); template diff --git a/omp/test/reorder/rcm_kernels.cpp b/omp/test/reorder/rcm_kernels.cpp index d2996ffb319..48698ac1b49 100644 --- a/omp/test/reorder/rcm_kernels.cpp +++ b/omp/test/reorder/rcm_kernels.cpp @@ -118,8 +118,7 @@ class Rcm : public ::testing::Test { return false; } - const auto n = gko::as(reorder->get_permutation()) - ->get_permutation_size(); + const auto n = reorder->get_permutation()->get_size()[0]; auto degrees = std::vector(n); for (gko::size_type i = 0; i < n; ++i) { degrees[i] = @@ -198,8 +197,8 @@ class Rcm : public ::testing::Test { static bool is_rcm_ordered(std::shared_ptr mtx, std::shared_ptr reorder) { - const auto n = gko::as(reorder->get_permutation()) - ->get_permutation_size(); + const auto n = + gko::as(reorder->get_permutation())->get_size()[0]; const auto row_ptrs = mtx->get_const_row_ptrs(); const auto col_idxs = mtx->get_const_col_idxs(); auto degrees = std::vector(n); diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 21dfc0dfb5a..44ee564c16f 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -35,6 +35,8 @@ target_sources(ginkgo_reference matrix/fbcsr_kernels.cpp matrix/fft_kernels.cpp matrix/hybrid_kernels.cpp + matrix/permutation_kernels.cpp + matrix/scaled_permutation_kernels.cpp matrix/sellp_kernels.cpp matrix/sparsity_csr_kernels.cpp multigrid/pgm_kernels.cpp diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 3a05a09cd45..d87e72bc5ab 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -834,24 +834,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); -template -void invert_permutation(std::shared_ptr exec, - size_type size, const IndexType* permutation_indices, - IndexType* inv_permutation) +template +void inv_symm_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* 
permuted) { - for (IndexType i = 0; i < static_cast(size); ++i) { - inv_permutation[permutation_indices[i]] = i; - } + inv_nonsymm_permute(exec, perm, perm, orig, permuted); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); template -void inv_symm_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* permuted) +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, + const IndexType* column_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) { auto in_row_ptrs = orig->get_const_row_ptrs(); auto in_col_idxs = orig->get_const_col_idxs(); @@ -863,25 +864,25 @@ void inv_symm_permute(std::shared_ptr exec, for (size_type row = 0; row < num_rows; ++row) { auto src_row = row; - auto dst_row = perm[row]; + auto dst_row = row_perm[row]; p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row]; } components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1); for (size_type row = 0; row < num_rows; ++row) { auto src_row = row; - auto dst_row = perm[row]; + auto dst_row = row_perm[row]; auto src_begin = in_row_ptrs[src_row]; auto dst_begin = p_row_ptrs[dst_row]; auto row_size = in_row_ptrs[src_row + 1] - src_begin; for (IndexType i = 0; i < row_size; ++i) { - p_col_idxs[dst_begin + i] = perm[in_col_idxs[src_begin + i]]; + p_col_idxs[dst_begin + i] = column_perm[in_col_idxs[src_begin + i]]; p_vals[dst_begin + i] = in_vals[src_begin + i]; } } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); template @@ -920,10 +921,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void inverse_row_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) { auto in_row_ptrs = orig->get_const_row_ptrs(); auto in_col_idxs = orig->get_const_col_idxs(); @@ -951,21 +952,21 @@ void inverse_row_permute(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); template -void inverse_column_permute(std::shared_ptr exec, - const IndexType* perm, - const matrix::Csr* orig, - matrix::Csr* column_permuted) +void inv_col_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) { auto in_row_ptrs = orig->get_const_row_ptrs(); auto in_col_idxs = orig->get_const_col_idxs(); auto in_vals = orig->get_const_values(); - auto cp_row_ptrs = column_permuted->get_row_ptrs(); - auto cp_col_idxs = column_permuted->get_col_idxs(); - auto cp_vals = column_permuted->get_values(); + auto cp_row_ptrs = col_permuted->get_row_ptrs(); + auto cp_col_idxs = col_permuted->get_col_idxs(); + auto cp_vals = col_permuted->get_values(); auto num_rows = orig->get_size()[0]; for (size_type row = 0; row < num_rows; ++row) { @@ -981,7 +982,167 @@ void inverse_column_permute(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + 
inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto in_row_ptrs = orig->get_const_row_ptrs(); + auto in_col_idxs = orig->get_const_col_idxs(); + auto in_vals = orig->get_const_values(); + auto p_row_ptrs = permuted->get_row_ptrs(); + auto p_col_idxs = permuted->get_col_idxs(); + auto p_vals = permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = row_perm[row]; + p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1); + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = row_perm[row]; + auto src_begin = in_row_ptrs[src_row]; + auto dst_begin = p_row_ptrs[dst_row]; + auto row_size = in_row_ptrs[src_row + 1] - src_begin; + for (IndexType i = 0; i < row_size; ++i) { + const auto in_col = in_col_idxs[src_begin + i]; + p_col_idxs[dst_begin + i] = col_perm[in_col]; + p_vals[dst_begin + i] = in_vals[src_begin + i] / + (row_scale[src_row] * col_scale[in_col]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto in_row_ptrs = orig->get_const_row_ptrs(); + auto in_col_idxs = orig->get_const_col_idxs(); + auto in_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = perm[row]; + auto dst_row = row; + rp_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1); + for (size_type row = 0; row < num_rows; ++row) { + const auto src_row = perm[row]; + const auto dst_row = row; + const auto src_begin = in_row_ptrs[src_row]; + const auto dst_begin = rp_row_ptrs[dst_row]; + const auto row_size = in_row_ptrs[src_row + 1] - src_begin; + std::copy_n(in_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin); + for (IndexType i = 0; i < row_size; i++) { + rp_vals[i + dst_begin] = in_vals[i + src_begin] * scale[dst_row]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto in_row_ptrs = orig->get_const_row_ptrs(); + auto in_col_idxs = orig->get_const_col_idxs(); + auto in_vals = orig->get_const_values(); + auto rp_row_ptrs = row_permuted->get_row_ptrs(); + auto rp_col_idxs = row_permuted->get_col_idxs(); + auto rp_vals = row_permuted->get_values(); + size_type num_rows = orig->get_size()[0]; + + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = perm[row]; + rp_row_ptrs[dst_row] = in_row_ptrs[src_row + 
1] - in_row_ptrs[src_row]; + } + components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1); + for (size_type row = 0; row < num_rows; ++row) { + auto src_row = row; + auto dst_row = perm[row]; + auto src_begin = in_row_ptrs[src_row]; + auto dst_begin = rp_row_ptrs[dst_row]; + auto row_size = in_row_ptrs[src_row + 1] - src_begin; + std::copy_n(in_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin); + for (IndexType i = 0; i < row_size; i++) { + rp_vals[i + dst_begin] = in_vals[i + src_begin] / scale[src_row]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); + + +template +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) +{ + auto in_row_ptrs = orig->get_const_row_ptrs(); + auto in_col_idxs = orig->get_const_col_idxs(); + auto in_vals = orig->get_const_values(); + auto cp_row_ptrs = col_permuted->get_row_ptrs(); + auto cp_col_idxs = col_permuted->get_col_idxs(); + auto cp_vals = col_permuted->get_values(); + auto num_rows = orig->get_size()[0]; + + for (size_type row = 0; row < num_rows; ++row) { + auto row_begin = in_row_ptrs[row]; + auto row_end = in_row_ptrs[row + 1]; + cp_row_ptrs[row] = in_row_ptrs[row]; + for (auto k = row_begin; k < row_end; ++k) { + const auto in_col = in_col_idxs[k]; + cp_col_idxs[k] = perm[in_col]; + cp_vals[k] = in_vals[k] / scale[in_col]; + } + } + cp_row_ptrs[num_rows] = in_row_ptrs[num_rows]; +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); template diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 47df46b3c86..3b28336db11 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -862,11 +862,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); template void symm_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, + const IndexType* perm, const matrix::Dense* orig, matrix::Dense* permuted) { - auto perm = permutation_indices->get_const_data(); auto size = orig->get_size()[0]; for (size_type i = 0; i < size; ++i) { for (size_type j = 0; j < size; ++j) { @@ -881,11 +879,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void inv_symm_permute(std::shared_ptr exec, - const array* permutation_indices, + const IndexType* perm, const matrix::Dense* orig, matrix::Dense* permuted) { - auto perm = permutation_indices->get_const_data(); auto size = orig->get_size()[0]; for (size_type i = 0; i < size; ++i) { for (size_type j = 0; j < size; ++j) { @@ -898,14 +895,46 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); +template +void nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + permuted->at(i, j) = orig->at(row_perm[i], col_perm[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); + + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + 
permuted->at(row_perm[i], col_perm[j]) = orig->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); + + template void row_gather(std::shared_ptr exec, - const array* row_idxs, - const matrix::Dense* orig, + const IndexType* rows, const matrix::Dense* orig, matrix::Dense* row_collection) { - auto rows = row_idxs->get_const_data(); - for (size_type i = 0; i < row_idxs->get_num_elems(); ++i) { + for (size_type i = 0; i < row_collection->get_size()[0]; ++i) { for (size_type j = 0; j < orig->get_size()[1]; ++j) { row_collection->at(i, j) = orig->at(rows[i], j); } @@ -919,16 +948,15 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( template void advanced_row_gather(std::shared_ptr exec, const matrix::Dense* alpha, - const array* row_idxs, + const IndexType* rows, const matrix::Dense* orig, const matrix::Dense* beta, matrix::Dense* row_collection) { using type = highest_precision; - auto rows = row_idxs->get_const_data(); auto scalar_alpha = alpha->at(0, 0); auto scalar_beta = beta->at(0, 0); - for (size_type i = 0; i < row_idxs->get_num_elems(); ++i) { + for (size_type i = 0; i < row_collection->get_size()[0]; ++i) { for (size_type j = 0; j < orig->get_size()[1]; ++j) { row_collection->at(i, j) = static_cast(scalar_alpha * orig->at(rows[i], j)) + @@ -943,30 +971,27 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( template -void column_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) +void col_permute(std::shared_ptr exec, + const IndexType* perm, const matrix::Dense* orig, + matrix::Dense* col_permuted) { - auto perm = permutation_indices->get_const_data(); for (size_type j = 0; j < orig->get_size()[1]; ++j) { for (size_type i = 0; i < orig->get_size()[0]; ++i) { - column_permuted->at(i, j) = orig->at(i, perm[j]); + col_permuted->at(i, j) = orig->at(i, perm[j]); } } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); template -void inverse_row_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* row_permuted) { - auto perm = permutation_indices->get_const_data(); for (size_type i = 0; i < orig->get_size()[0]; ++i) { for (size_type j = 0; j < orig->get_size()[1]; ++j) { row_permuted->at(perm[i], j) = orig->at(i, j); @@ -979,21 +1004,166 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void inverse_column_permute(std::shared_ptr exec, - const array* permutation_indices, +void inv_col_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* col_permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + col_permuted->at(i, perm[j]) = orig->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); + + +template +void symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(i, j) = + scale[i] * scale[j] * orig->at(perm[i], perm[j]); + } + } +} + 
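// --- Editor's note (not part of the patch): the dense scaled-permutation
// kernels around this hunk follow the pattern of symm_scale_permute above.
// With S = diag(scale) and P the permutation matrix defined by
// P[i, perm[i]] = 1, the forward symmetric kernel computes
//     permuted(i, j) = scale[i] * scale[j] * orig(perm[i], perm[j]),
// i.e. permuted = (S P) A (S P)^T, while the inverse variant further below
// scatters instead of gathering and divides by the scaling factors,
// i.e. permuted = (S P)^-1 A (S P)^-T.
//
// Hedged usage sketch; the template arguments and concrete values are
// assumptions (they mirror the fixtures added to the tests later in this
// patch), not something this hunk spells out:
//
//     auto scale_perm = gko::matrix::ScaledPermutation<double, int>::create(
//         exec, gko::array<double>{exec, {2.0, 3.0, 5.0}},
//         gko::array<int>{exec, {1, 2, 0}});
//     auto permuted = mtx->scale_permute(scale_perm,
//                                        gko::matrix::permute_mode::symmetric);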
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, const matrix::Dense* orig, - matrix::Dense* column_permuted) + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(perm[i], perm[j]) = + orig->at(i, j) / (scale[i] * scale[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); + + +template +void nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(i, j) = row_scale[i] * col_scale[j] * + orig->at(row_perm[i], col_perm[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(row_perm[i], col_perm[j]) = + orig->at(i, j) / (row_scale[i] * col_scale[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(i, j) = scale[i] * orig->at(perm[i], j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(perm[i], j) = orig->at(i, j) / scale[i]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); + + +template +void col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + permuted->at(i, j) = scale[j] * orig->at(i, perm[j]); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); + + +template +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) { - auto perm = permutation_indices->get_const_data(); for (size_type j = 0; j < orig->get_size()[1]; ++j) { for (size_type i = 0; i < orig->get_size()[0]; ++i) { - column_permuted->at(i, perm[j]) = orig->at(i, j); + permuted->at(i, perm[j]) = orig->at(i, j) / scale[j]; } } } 
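// --- Editor's note (not part of the patch): the kernels above come in
// forward/inverse pairs -- forward variants gather through `perm` and
// multiply by `scale`, inverse variants scatter through `perm` and divide
// by `scale` -- so applying a mode followed by the same mode combined with
// the inverse flag reproduces the original matrix, which is what the
// *Roundtrip tests further down verify.
//
// Sketch of how the permute_mode flags appear to combine, inferred from the
// bitwise operations in the test helpers below; the exact enum values are an
// assumption, not spelled out in this patch:
//
//     using gko::matrix::permute_mode;
//     // symmetric permutes rows and columns at once:
//     //   permute_mode::symmetric == permute_mode::rows | permute_mode::columns
//     // inverse_* variants add the inverse flag:
//     //   permute_mode::inverse_rows == permute_mode::rows | permute_mode::inverse
//     auto roundtrip =
//         mtx->permute(perm, permute_mode::rows)
//            ->permute(perm, permute_mode::rows | permute_mode::inverse);
//     // roundtrip now equals mtx (up to rounding for the scaled variants)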
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); template diff --git a/reference/matrix/permutation_kernels.cpp b/reference/matrix/permutation_kernels.cpp new file mode 100644 index 00000000000..cc7a81a1044 --- /dev/null +++ b/reference/matrix/permutation_kernels.cpp @@ -0,0 +1,58 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/permutation_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace permutation { + + +template +void invert(std::shared_ptr exec, + const IndexType* permutation, size_type size, + IndexType* output_permutation) +{ + for (size_type i = 0; i < size; i++) { + output_permutation[permutation[i]] = i; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); + + +} // namespace permutation +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp new file mode 100644 index 00000000000..54a68fbdf0a --- /dev/null +++ b/reference/matrix/scaled_permutation_kernels.cpp @@ -0,0 +1,64 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/scaled_permutation_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace reference { +namespace scaled_permutation { + + +template +void invert(std::shared_ptr exec, + const IndexType* input_permutation, const ValueType* input_scale, + size_type size, IndexType* output_permutation, + ValueType* output_scale) +{ + for (size_type i = 0; i < size; i++) { + output_permutation[input_permutation[i]] = i; + output_scale[input_permutation[i]] = one() / input_scale[i]; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); + + +} // namespace scaled_permutation +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt index 05498cbadc4..6f3348da432 100644 --- a/reference/test/matrix/CMakeLists.txt +++ b/reference/test/matrix/CMakeLists.txt @@ -10,6 +10,7 @@ ginkgo_create_test(fft_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(identity) ginkgo_create_test(permutation) +ginkgo_create_test(scaled_permutation) ginkgo_create_test(sellp_kernels) ginkgo_create_test(sparsity_csr) ginkgo_create_test(sparsity_csr_kernels) diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index d0265e462f2..f388922f05d 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -49,6 +49,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include +#include #include #include @@ -77,6 +79,8 @@ class Csr : public ::testing::Test { using Hybrid = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense>; + using Perm = gko::matrix::Permutation; + using ScaledPerm = gko::matrix::ScaledPermutation; Csr() : exec(gko::ReferenceExecutor::create()), @@ -88,7 +92,21 @@ class Csr : public ::testing::Test { std::make_shared())), mtx3_unsorted( Mtx::create(exec, gko::dim<2>(3, 3), 7, - std::make_shared())) + std::make_shared())), + perm3(Perm::create(exec, 3, gko::array{exec, {1, 2, 0}})), + perm3_rev(perm3->invert()), + perm2(Perm::create(exec, 2, gko::array{exec, {1, 0}})), + perm0(Perm::create(exec)), + scale_perm3(ScaledPerm::create( + exec, gko::array{this->exec, {2.0, 3.0, 5.0}}, + gko::array{exec, {1, 2, 0}})), + scale_perm3_rev(ScaledPerm::create( + exec, gko::array{this->exec, {7.0, 11.0, 13.0}}, + gko::array{exec, {1, 2, 0}})), + scale_perm2(ScaledPerm::create( + exec, gko::array{this->exec, {17.0, 19.0}}, + gko::array{exec, {1, 0}})), + scale_perm0(ScaledPerm::create(exec)) { this->create_mtx(mtx.get()); this->create_mtx2(mtx2.get()); @@ -350,6 +368,14 @@ class Csr : public ::testing::Test { std::unique_ptr mtx2; std::unique_ptr mtx3_sorted; std::unique_ptr mtx3_unsorted; + std::unique_ptr perm3; + std::unique_ptr perm3_rev; + std::unique_ptr perm2; + std::unique_ptr perm0; + std::unique_ptr scale_perm3; + std::unique_ptr scale_perm3_rev; + std::unique_ptr scale_perm2; + std::unique_ptr scale_perm0; index_type invalid_index = gko::invalid_index(); }; @@ -1285,6 +1311,439 @@ TYPED_TEST(Csr, NonSquareMtxIsTransposable) } +template +std::unique_ptr> csr_from_permutation( + gko::matrix::Permutation* perm, bool invert) +{ + gko::matrix_data double_data; + if (invert) { + perm->invert()->write(double_data); + } else { + perm->write(double_data); + } + gko::matrix_data data; + data.size = double_data.size; + for (auto entry : double_data.nonzeros) { + data.nonzeros.emplace_back(entry.row, entry.column, 1.0); + } + auto mtx = + gko::matrix::Csr::create(perm->get_executor()); + mtx->read(data); + return mtx; +} + + +template +std::unique_ptr> csr_from_permutation( + gko::matrix::ScaledPermutation* perm, bool invert) +{ + gko::matrix_data data; + if (invert) { + perm->invert()->write(data); + } else { + perm->write(data); + } + auto mtx = + gko::matrix::Csr::create(perm->get_executor()); + mtx->read(data); + return mtx; +} + + +template +std::unique_ptr> ref_permute( + gko::matrix::Csr* input, Permutation* permutation, + gko::matrix::permute_mode mode) +{ + using gko::matrix::permute_mode; + using Csr = gko::matrix::Csr; + auto result = input->clone(); + auto permutation_csr = csr_from_permutation( + permutation, (mode & permute_mode::inverse) == permute_mode::inverse); + if ((mode & permute_mode::rows) == permute_mode::rows) { + // compute P * A + permutation_csr->apply(input, result); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + // compute A * P^T = (P * A^T)^T + auto tmp = result->transpose(); + auto tmp2 = gko::as(tmp->clone()); + permutation_csr->apply(tmp, tmp2); + result = gko::as(tmp2->transpose()); + } + return result; +} + + +template +std::unique_ptr> ref_permute( + gko::matrix::Csr* input, Permutation* row_permutation, + Permutation* col_permutation, bool invert) +{ + using gko::matrix::permute_mode; + using Csr = gko::matrix::Csr; + auto result = input->clone(); + auto row_permutation_csr = + csr_from_permutation(row_permutation, invert); 
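// --- Editor's note (not part of the patch): this reference helper checks the
// new Csr permute kernels against explicit products with permutation matrices.
// With P_r and P_c the CSR forms of the (possibly inverted) row and column
// permutations built here, the statements below compute
//     result = P_r * A             (row permutation applied from the left)
//     result = (P_c * result^T)^T  (column permutation, i.e. result * P_c^T)
// so the reference result is P_r * A * P_c^T.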
+ auto col_permutation_csr = + csr_from_permutation(col_permutation, invert); + row_permutation_csr->apply(input, result); + auto tmp = result->transpose(); + auto tmp2 = gko::as(tmp->clone()); + col_permutation_csr->apply(tmp, tmp2); + return gko::as(tmp2->transpose()); +} + + +TYPED_TEST(Csr, Permute) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = this->mtx3_sorted->permute(this->perm3, mode); + auto ref_permuted = + ref_permute(this->mtx3_sorted.get(), this->perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); + } +} + + +TYPED_TEST(Csr, PermuteRoundtrip) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = + this->mtx3_sorted->permute(this->perm3, mode) + ->permute(this->perm3, mode | permute_mode::inverse); + + GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); + } +} + + +TYPED_TEST(Csr, PermuteRectangular) +{ + using gko::matrix::permute_mode; + + auto rpermuted = this->mtx2->permute(this->perm2, permute_mode::rows); + auto irpermuted = + this->mtx2->permute(this->perm2, permute_mode::inverse_rows); + auto cpermuted = this->mtx2->permute(this->perm3, permute_mode::columns); + auto icpermuted = + this->mtx2->permute(this->perm3, permute_mode::inverse_columns); + auto ref_rpermuted = + ref_permute(this->mtx2.get(), this->perm2.get(), permute_mode::rows); + auto ref_irpermuted = ref_permute(this->mtx2.get(), this->perm2.get(), + permute_mode::inverse_rows); + auto ref_cpermuted = + ref_permute(this->mtx2.get(), this->perm3.get(), permute_mode::columns); + auto ref_icpermuted = ref_permute(this->mtx2.get(), this->perm3.get(), + permute_mode::inverse_columns); + + GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(rpermuted, ref_rpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(irpermuted, ref_irpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(cpermuted, ref_cpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(icpermuted, ref_icpermuted); + ASSERT_TRUE(rpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(irpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(cpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(icpermuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, PermuteFailsWithIncorrectPermutationSize) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, mode), + gko::ValueMismatch); + } +} + + +TYPED_TEST(Csr, NonsymmPermute) +{ + auto permuted = this->mtx3_sorted->permute(this->perm3, this->perm3_rev); + auto ref_permuted = ref_permute(this->mtx3_sorted.get(), this->perm3.get(), + this->perm3_rev.get(), false); + + 
GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmPermuteInverse) +{ + auto permuted = + this->mtx3_sorted->permute(this->perm3, this->perm3_rev, true); + auto ref_permuted = ref_permute(this->mtx3_sorted.get(), this->perm3.get(), + this->perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmPermuteRectangular) +{ + auto permuted = this->mtx2->permute(this->perm2, this->perm3); + auto ref_permuted = ref_permute(this->mtx2.get(), this->perm2.get(), + this->perm3.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmPermuteInverseRectangular) +{ + auto permuted = this->mtx2->permute(this->perm2, this->perm3, true); + auto ref_permuted = ref_permute(this->mtx2.get(), this->perm2.get(), + this->perm3.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmPermuteRoundtrip) +{ + auto permuted = this->mtx3_sorted->permute(this->perm3, this->perm3_rev) + ->permute(this->perm3, this->perm3_rev, true); + + GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmPermuteFailsWithIncorrectPermutationSize) +{ + ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, this->perm3_rev), + gko::ValueMismatch); + ASSERT_THROW(this->mtx3_sorted->permute(this->perm3_rev, this->perm0), + gko::ValueMismatch); + ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, this->perm0), + gko::ValueMismatch); +} + + +TYPED_TEST(Csr, ScaledPermute) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = + this->mtx3_sorted->scale_permute(this->scale_perm3, mode); + auto ref_permuted = + ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); + } +} + + +TYPED_TEST(Csr, ScaledPermuteRoundtrip) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + for (auto mode : + {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = + this->mtx3_sorted->scale_permute(this->scale_perm3, mode) + ->scale_permute(this->scale_perm3, + mode | permute_mode::inverse); + + GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); + } +} + + +TYPED_TEST(Csr, ScaledPermuteRectangular) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + auto rpermuted = + 
this->mtx2->scale_permute(this->scale_perm2, permute_mode::rows); + auto irpermuted = this->mtx2->scale_permute(this->scale_perm2, + permute_mode::inverse_rows); + auto cpermuted = + this->mtx2->scale_permute(this->scale_perm3, permute_mode::columns); + auto icpermuted = this->mtx2->scale_permute(this->scale_perm3, + permute_mode::inverse_columns); + auto ref_rpermuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(), + permute_mode::rows); + auto ref_irpermuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(), + permute_mode::inverse_rows); + auto ref_cpermuted = ref_permute(this->mtx2.get(), this->scale_perm3.get(), + permute_mode::columns); + auto ref_icpermuted = ref_permute(this->mtx2.get(), this->scale_perm3.get(), + permute_mode::inverse_columns); + + GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, r::value); + GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, r::value); + GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, r::value); + GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(rpermuted, ref_rpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(irpermuted, ref_irpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(cpermuted, ref_cpermuted); + GKO_ASSERT_MTX_EQ_SPARSITY(icpermuted, ref_icpermuted); + ASSERT_TRUE(rpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(irpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(cpermuted->is_sorted_by_column_index()); + ASSERT_TRUE(icpermuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, ScaledPermuteFailsWithIncorrectPermutationSize) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm0, mode), + gko::ValueMismatch); + } +} + + +TYPED_TEST(Csr, NonsymmScaledPermute) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = this->mtx3_sorted->scale_permute(this->scale_perm3, + this->scale_perm3_rev); + auto ref_permuted = + ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmScaledPermuteInverse) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = this->mtx3_sorted->scale_permute( + this->scale_perm3, this->scale_perm3_rev, true); + auto ref_permuted = + ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmScaledPermuteRectangular) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx2->scale_permute(this->scale_perm2, this->scale_perm3); + auto ref_permuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(), + this->scale_perm3.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmScaledPermuteInverseRectangular) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + 
this->mtx2->scale_permute(this->scale_perm2, this->scale_perm3, true); + auto ref_permuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(), + this->scale_perm3.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmScaledPermuteRoundtrip) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx3_sorted + ->scale_permute(this->scale_perm3, this->scale_perm3_rev) + ->scale_permute(this->scale_perm3, this->scale_perm3_rev, true); + + GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted); + ASSERT_TRUE(permuted->is_sorted_by_column_index()); +} + + +TYPED_TEST(Csr, NonsymmScaledPermuteFailsWithIncorrectPermutationSize) +{ + ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm0, + this->scale_perm3_rev), + gko::ValueMismatch); + ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm3_rev, + this->scale_perm0), + gko::ValueMismatch); + ASSERT_THROW( + this->mtx3_sorted->scale_permute(this->scale_perm0, this->scale_perm0), + gko::ValueMismatch); +} + + TYPED_TEST(Csr, SquareMatrixIsPermutable) { using Csr = typename TestFixture::Mtx; diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index b776f426794..a95359a0ac8 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -51,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -1348,6 +1350,40 @@ class DenseWithIndexType typename std::tuple_element<0, decltype(ValueIndexType())>::type; using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Permutation = gko::matrix::Permutation; + using ScaledPermutation = + gko::matrix::ScaledPermutation; + + + DenseWithIndexType() + { + perm2 = Permutation::create(this->exec, + gko::array{this->exec, {1, 0}}); + perm3 = Permutation::create( + this->exec, gko::array{this->exec, {1, 2, 0}}); + perm3_rev = Permutation::create( + this->exec, gko::array{this->exec, {2, 0, 1}}); + perm0 = Permutation::create(this->exec, 0); + scale_perm2 = ScaledPermutation::create( + this->exec, gko::array{this->exec, {17.0, 19.0}}, + gko::array{this->exec, {1, 0}}); + scale_perm3 = ScaledPermutation::create( + this->exec, gko::array{this->exec, {2.0, 3.0, 5.0}}, + gko::array{this->exec, {1, 2, 0}}); + scale_perm3_rev = ScaledPermutation::create( + this->exec, gko::array{this->exec, {7.0, 11.0, 13.0}}, + gko::array{this->exec, {2, 0, 1}}); + scale_perm0 = ScaledPermutation::create(this->exec, 0); + } + + std::unique_ptr perm2; + std::unique_ptr perm3; + std::unique_ptr perm3_rev; + std::unique_ptr perm0; + std::unique_ptr scale_perm2; + std::unique_ptr scale_perm3; + std::unique_ptr scale_perm3_rev; + std::unique_ptr scale_perm0; }; TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes, @@ -2230,6 +2266,286 @@ TYPED_TEST(DenseWithIndexType, MovesEmptyToSellp) } +template +std::unique_ptr> ref_permute( + gko::matrix::Dense* input, + gko::matrix::Permutation* permutation, + gko::matrix::permute_mode mode) +{ + using gko::matrix::permute_mode; + auto result = input->clone(); + auto permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + gko::matrix_data permutation_data; + if ((mode & permute_mode::inverse) == 
permute_mode::inverse) { + permutation->invert()->write(permutation_data); + } else { + permutation->write(permutation_data); + } + permutation_dense->read(permutation_data); + if ((mode & permute_mode::rows) == permute_mode::rows) { + // compute P * A + permutation_dense->apply(input, result); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + // compute A * P^T = (P * A^T)^T + auto tmp = result->transpose(); + auto tmp2 = gko::as>(tmp->clone()); + permutation_dense->apply(tmp, tmp2); + tmp2->transpose(result); + } + return result; +} + + +template +std::unique_ptr> ref_permute( + gko::matrix::Dense* input, + gko::matrix::Permutation* row_permutation, + gko::matrix::Permutation* col_permutation, bool invert) +{ + using gko::matrix::permute_mode; + auto result = input->clone(); + auto row_permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + auto col_permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + gko::matrix_data row_permutation_data; + gko::matrix_data col_permutation_data; + if (invert) { + row_permutation->invert()->write(row_permutation_data); + col_permutation->invert()->write(col_permutation_data); + } else { + row_permutation->write(row_permutation_data); + col_permutation->write(col_permutation_data); + } + row_permutation_dense->read(row_permutation_data); + col_permutation_dense->read(col_permutation_data); + row_permutation_dense->apply(input, result); + auto tmp = result->transpose(); + auto tmp2 = gko::as>(tmp->clone()); + col_permutation_dense->apply(tmp, tmp2); + tmp2->transpose(result); + return result; +} + + +TYPED_TEST(DenseWithIndexType, Permute) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = this->mtx5->permute(this->perm3, mode); + auto ref_permuted = + ref_permute(this->mtx5.get(), this->perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + } +} + + +TYPED_TEST(DenseWithIndexType, PermuteRoundtrip) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = + this->mtx5->permute(this->perm3, mode) + ->permute(this->perm3, mode | permute_mode::inverse); + + GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, 0.0); + } +} + + +TYPED_TEST(DenseWithIndexType, PermuteStridedIntoDense) +{ + using gko::matrix::permute_mode; + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + mtx->copy_from(this->mtx5); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse, + permute_mode::inverse_rows, permute_mode::inverse_columns, + permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + + this->mtx5->permute(this->perm3, permuted, mode); + auto ref_permuted = + ref_permute(this->mtx5.get(), this->perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); + } +} + + +TYPED_TEST(DenseWithIndexType, PermuteRectangular) +{ + using gko::matrix::permute_mode; + + auto rpermuted = this->mtx1->permute(this->perm2, permute_mode::rows); + auto irpermuted = + 
this->mtx1->permute(this->perm2, permute_mode::inverse_rows); + auto cpermuted = this->mtx1->permute(this->perm3, permute_mode::columns); + auto icpermuted = + this->mtx1->permute(this->perm3, permute_mode::inverse_columns); + auto ref_rpermuted = + ref_permute(this->mtx1.get(), this->perm2.get(), permute_mode::rows); + auto ref_irpermuted = ref_permute(this->mtx1.get(), this->perm2.get(), + permute_mode::inverse_rows); + auto ref_cpermuted = + ref_permute(this->mtx1.get(), this->perm3.get(), permute_mode::columns); + auto ref_icpermuted = ref_permute(this->mtx1.get(), this->perm3.get(), + permute_mode::inverse_columns); + + GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, 0.0); + GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, PermuteFailsWithIncorrectPermutationSize) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx5->permute(this->perm0, mode), + gko::ValueMismatch); + } +} + + +TYPED_TEST(DenseWithIndexType, PermuteFailsWithIncorrectOutputSize) +{ + using gko::matrix::permute_mode; + using Mtx = typename TestFixture::Mtx; + auto output = Mtx::create(this->exec); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx5->permute(this->perm3, output, mode), + gko::DimensionMismatch); + } +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermute) +{ + auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev); + auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(), + this->perm3_rev.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverse) +{ + auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev, true); + auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(), + this->perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteRectangular) +{ + auto permuted = this->mtx1->permute(this->perm2, this->perm3); + auto ref_permuted = ref_permute(this->mtx1.get(), this->perm2.get(), + this->perm3.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverseRectangular) +{ + auto permuted = this->mtx1->permute(this->perm2, this->perm3, true); + auto ref_permuted = ref_permute(this->mtx1.get(), this->perm2.get(), + this->perm3.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteRoundtrip) +{ + auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev) + ->permute(this->perm3, this->perm3_rev, true); + + GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteStridedIntoDense) +{ + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + 
mtx->copy_from(this->mtx5); + + mtx->permute(this->perm3, this->perm3_rev, permuted); + auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(), + this->perm3_rev.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverseStridedIntoDense) +{ + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + mtx->copy_from(this->mtx5); + + mtx->permute(this->perm3, this->perm3_rev, permuted, true); + auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(), + this->perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmPermuteFailsWithIncorrectPermutationSize) +{ + ASSERT_THROW(this->mtx5->permute(this->perm0, this->perm3_rev), + gko::ValueMismatch); + ASSERT_THROW(this->mtx5->permute(this->perm3_rev, this->perm0), + gko::ValueMismatch); + ASSERT_THROW(this->mtx5->permute(this->perm0, this->perm0), + gko::ValueMismatch); +} + + TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRows) { using Mtx = typename TestFixture::Mtx; @@ -2907,6 +3223,331 @@ TYPED_TEST(DenseWithIndexType, } +template +std::unique_ptr> ref_scaled_permute( + gko::matrix::Dense* input, + gko::matrix::ScaledPermutation* permutation, + gko::matrix::permute_mode mode) +{ + using gko::matrix::permute_mode; + auto result = input->clone(); + auto permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + gko::matrix_data permutation_data; + if ((mode & permute_mode::inverse) == permute_mode::inverse) { + permutation->invert()->write(permutation_data); + } else { + permutation->write(permutation_data); + } + permutation_dense->read(permutation_data); + if ((mode & permute_mode::rows) == permute_mode::rows) { + // compute P * A + permutation_dense->apply(input, result); + } + if ((mode & permute_mode::columns) == permute_mode::columns) { + // compute A * P^T = (P * A^T)^T + auto tmp = result->transpose(); + auto tmp2 = gko::as>(tmp->clone()); + permutation_dense->apply(tmp, tmp2); + tmp2->transpose(result); + } + return result; +} + + +template +std::unique_ptr> ref_scaled_permute( + gko::matrix::Dense* input, + gko::matrix::ScaledPermutation* row_permutation, + gko::matrix::ScaledPermutation* col_permutation, + bool invert) +{ + using gko::matrix::permute_mode; + auto result = input->clone(); + auto row_permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + auto col_permutation_dense = + gko::matrix::Dense::create(input->get_executor()); + gko::matrix_data row_permutation_data; + gko::matrix_data col_permutation_data; + if (invert) { + row_permutation->invert()->write(row_permutation_data); + col_permutation->invert()->write(col_permutation_data); + } else { + row_permutation->write(row_permutation_data); + col_permutation->write(col_permutation_data); + } + row_permutation_dense->read(row_permutation_data); + col_permutation_dense->read(col_permutation_data); + row_permutation_dense->apply(input, result); + auto tmp = result->transpose(); + auto tmp2 = gko::as>(tmp->clone()); + col_permutation_dense->apply(tmp, tmp2); + tmp2->transpose(result); + return result; +} + + +TYPED_TEST(DenseWithIndexType, ScaledPermute) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + for (auto mode : + {permute_mode::none, 
permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = this->mtx5->scale_permute(this->scale_perm3, mode); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + } +} + + +TYPED_TEST(DenseWithIndexType, ScaledPermuteRoundtrip) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + for (auto mode : + {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) { + SCOPED_TRACE(mode); + + auto permuted = this->mtx5->scale_permute(this->scale_perm3, mode) + ->scale_permute(this->scale_perm3, + mode | permute_mode::inverse); + + GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, r::value); + } +} + + +TYPED_TEST(DenseWithIndexType, ScaledPermuteStridedIntoDense) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + mtx->copy_from(this->mtx5); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse, + permute_mode::inverse_rows, permute_mode::inverse_columns, + permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + + this->mtx5->scale_permute(this->scale_perm3, permuted, mode); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), mode); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); + } +} + + +TYPED_TEST(DenseWithIndexType, ScaledPermuteRectangular) +{ + using gko::matrix::permute_mode; + using value_type = typename TestFixture::value_type; + + auto rpermuted = + this->mtx1->scale_permute(this->scale_perm2, permute_mode::rows); + auto irpermuted = this->mtx1->scale_permute(this->scale_perm2, + permute_mode::inverse_rows); + auto cpermuted = + this->mtx1->scale_permute(this->scale_perm3, permute_mode::columns); + auto icpermuted = this->mtx1->scale_permute(this->scale_perm3, + permute_mode::inverse_columns); + auto ref_rpermuted = ref_scaled_permute( + this->mtx1.get(), this->scale_perm2.get(), permute_mode::rows); + auto ref_irpermuted = ref_scaled_permute( + this->mtx1.get(), this->scale_perm2.get(), permute_mode::inverse_rows); + auto ref_cpermuted = ref_scaled_permute( + this->mtx1.get(), this->scale_perm3.get(), permute_mode::columns); + auto ref_icpermuted = + ref_scaled_permute(this->mtx1.get(), this->scale_perm3.get(), + permute_mode::inverse_columns); + + GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, r::value); + GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, r::value); + GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, r::value); + GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, ScaledPermuteFailsWithIncorrectPermutationSize) +{ + using gko::matrix::permute_mode; + + for (auto mode : + {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx5->scale_permute(this->scale_perm0, mode), + gko::ValueMismatch); + } +} + + +TYPED_TEST(DenseWithIndexType, 
ScaledPermuteFailsWithIncorrectOutputSize) +{ + using gko::matrix::permute_mode; + using Mtx = typename TestFixture::Mtx; + auto output = Mtx::create(this->exec); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + + ASSERT_THROW(this->mtx5->scale_permute(this->scale_perm3, output, mode), + gko::DimensionMismatch); + } +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermute) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3_rev); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverse) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = this->mtx5->scale_permute(this->scale_perm3, + this->scale_perm3_rev, true); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteRectangular) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx1->scale_permute(this->scale_perm2, this->scale_perm3); + auto ref_permuted = + ref_scaled_permute(this->mtx1.get(), this->scale_perm2.get(), + this->scale_perm3.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverseRectangular) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx1->scale_permute(this->scale_perm2, this->scale_perm3, true); + auto ref_permuted = + ref_scaled_permute(this->mtx1.get(), this->scale_perm2.get(), + this->scale_perm3.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteRoundtrip) +{ + using value_type = typename TestFixture::value_type; + + auto permuted = + this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3_rev) + ->scale_permute(this->scale_perm3, this->scale_perm3_rev, true); + + GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteStridedIntoDense) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + mtx->copy_from(this->mtx5); + + mtx->scale_permute(this->scale_perm3, this->scale_perm3_rev, permuted); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), false); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverseStridedIntoDense) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + auto mtx = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 1); + auto permuted = Mtx::create(this->exec, this->mtx5->get_size(), + this->mtx5->get_size()[1] + 2); + mtx->copy_from(this->mtx5); + + 
mtx->scale_permute(this->scale_perm3, this->scale_perm3_rev, permuted, + true); + auto ref_permuted = + ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), + this->scale_perm3_rev.get(), true); + + GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r::value); +} + + +TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteFailsWithIncorrectOutputSize) +{ + ASSERT_THROW( + this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3, + TestFixture::Mtx::create(this->exec)), + gko::DimensionMismatch); +} + + +TYPED_TEST(DenseWithIndexType, + NonsymmScaledPermuteFailsWithIncorrectPermutationSize) +{ + ASSERT_THROW( + this->mtx5->scale_permute(this->scale_perm0, this->scale_perm3_rev), + gko::ValueMismatch); + ASSERT_THROW( + this->mtx5->scale_permute(this->scale_perm3_rev, this->scale_perm0), + gko::ValueMismatch); + ASSERT_THROW( + this->mtx5->scale_permute(this->scale_perm0, this->scale_perm0), + gko::ValueMismatch); +} + + template class DenseComplex : public ::testing::Test { protected: diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index 2bd2e3d9741..65e092dfcd5 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -37,8 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include @@ -51,12 +49,11 @@ namespace { template class Permutation : public ::testing::Test { protected: - using v_type = + using value_type = typename std::tuple_element<0, decltype(ValueIndexType())>::type; - using i_type = + using index_type = typename std::tuple_element<1, decltype(ValueIndexType())>::type; - using Vec = gko::matrix::Dense; - using Csr = gko::matrix::Csr; + using Vec = gko::matrix::Dense; Permutation() : exec(gko::ReferenceExecutor::create()) {} @@ -67,413 +64,53 @@ TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes, PairTypenameNameGenerator); -TYPED_TEST(Permutation, AppliesRowPermutationToDense) +TYPED_TEST(Permutation, Invert) { - using i_type = typename TestFixture::i_type; - using T = typename TestFixture::v_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize( - {I{2.0, 3.0}, - I{4.0, 2.5}}, this->exec); - // clang-format on - auto y = Vec::create(this->exec, gko::dim<2>{2}); - i_type rdata[] = {1, 0}; + using index_type = typename TestFixture::index_type; + auto perm = gko::matrix::Permutation::create( + this->exec, 3, gko::array{this->exec, {1, 2, 0}}); - auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); + auto inv = perm->invert(); - perm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{4.0, 2.5}, - {2.0, 3.0}}), - 0.0); - // clang-format on + EXPECT_EQ(inv->get_const_permutation()[0], 2); + EXPECT_EQ(inv->get_const_permutation()[1], 0); + EXPECT_EQ(inv->get_const_permutation()[2], 1); } -TYPED_TEST(Permutation, AppliesColPermutationToDense) +TYPED_TEST(Permutation, Write) { - using i_type = typename TestFixture::i_type; - using T = typename TestFixture::v_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize( - {I{2.0, 3.0}, - I{4.0, 2.5}}, this->exec); - // clang-format on - auto y = Vec::create(this->exec, gko::dim<2>{2}); - i_type rdata[] = {1, 0}; + using index_type = typename TestFixture::index_type; + auto perm = gko::matrix::Permutation::create( + this->exec, 3, gko::array{this->exec, {1, 2, 0}}); - auto perm = gko::matrix::Permutation::create( - this->exec, 
gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata), - gko::matrix::column_permute); - - perm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{3.0, 2.0}, - {2.5, 4.0}}), - 0.0); - // clang-format on + GKO_ASSERT_MTX_NEAR( + perm, l({{0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {1.0, 0.0, 0.0}}), + 0.0); } -TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense) +TYPED_TEST(Permutation, AppliesRowPermutationToDense) { - using i_type = typename TestFixture::i_type; - using T = typename TestFixture::v_type; + using index_type = typename TestFixture::index_type; + using T = typename TestFixture::value_type; using Vec = typename TestFixture::Vec; // clang-format off auto x = gko::initialize( {I{2.0, 3.0}, I{4.0, 2.5}}, this->exec); // clang-format on - auto y1 = Vec::create(this->exec, gko::dim<2>{2}); - auto y2 = Vec::create(this->exec, gko::dim<2>{2}); - i_type cdata[] = {1, 0}; - i_type rdata[] = {1, 0}; + auto y = Vec::create(this->exec, gko::dim<2>{2}); + index_type rdata[] = {1, 0}; - auto rperm = gko::matrix::Permutation::create( + auto perm = gko::matrix::Permutation::create( this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata), - gko::matrix::column_permute); - - rperm->apply(x, y1); - cperm->apply(y1, y2); - // clang-format off - GKO_ASSERT_MTX_NEAR(y2, - l({{2.5, 4.0}, - {3.0, 2.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray) -{ - using i_type = typename TestFixture::i_type; - using T = typename TestFixture::v_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize( - {I{2.0, 3.0}, - I{4.0, 2.5}}, this->exec); - // clang-format on - auto y1 = Vec::create(this->exec, gko::dim<2>{2}); - i_type data[] = {1, 0}; - - auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data), - gko::matrix::row_permute | gko::matrix::column_permute); - - perm->apply(x, y1); - // clang-format off - GKO_ASSERT_MTX_NEAR(y1, - l({{2.5, 4.0}, - {3.0, 2.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense) -{ - using i_type = typename TestFixture::i_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y1 = Vec::create(this->exec, gko::dim<2>{3}); - auto y2 = Vec::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - i_type rdata[] = {1, 2, 0}; - - auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), - gko::matrix::row_permute | gko::matrix::inverse_permute); - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::inverse_permute | gko::matrix::column_permute); - - rperm->apply(x, y1); - cperm->apply(y1, y2); - // clang-format off - GKO_ASSERT_MTX_NEAR(y2, - l({{2.5, 0.0, 4.0}, - {0.0, 2.0, 3.0}, - {0.0, 0.0, 1.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray) -{ - using i_type = typename TestFixture::i_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - 
this->exec); - // clang-format on - auto y1 = Vec::create(this->exec, gko::dim<2>{3}); - i_type data[] = {1, 2, 0}; - - auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data), - gko::matrix::column_permute | gko::matrix::row_permute | - gko::matrix::inverse_permute); - - perm->apply(x, y1); - // clang-format off - GKO_ASSERT_MTX_NEAR(y1, - l({{2.5, 0.0, 4.0}, - {0.0, 2.0, 3.0}, - {0.0, 0.0, 1.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense) -{ - using i_type = typename TestFixture::i_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Vec::create(this->exec, gko::dim<2>{3}); - i_type rdata[] = {1, 2, 0}; - - auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), - gko::matrix::row_permute | gko::matrix::inverse_permute); - - rperm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{0.0, 4.0, 2.5}, - {2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseColPermutationToDense) -{ - using i_type = typename TestFixture::i_type; - using Vec = typename TestFixture::Vec; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Vec::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::inverse_permute | gko::matrix::column_permute); - - cperm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{0.0, 2.0, 3.0}, - {0.0, 0.0, 1.0}, - {2.5, 0.0, 4.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesRowPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize( - {{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Csr::create(this->exec, gko::dim<2>{3}); - i_type rdata[] = {1, 2, 0}; - - auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); perm->apply(x, y); // clang-format off GKO_ASSERT_MTX_NEAR(y, - l({{0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}, - {2.0, 3.0, 0.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesColPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize( - {{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Csr::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - - auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::column_permute); - - perm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{3.0, 0.0, 2.0}, - {1.0, 0.0, 0.0}, - {4.0, 2.5, 0.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize( - {{2.0, 3.0, 0.0}, - {0.0, 1.0, 
0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y1 = Csr::create(this->exec, gko::dim<2>{3}); - auto y2 = Csr::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - i_type rdata[] = {1, 2, 0}; - - auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::column_permute); - - rperm->apply(x, y1); - cperm->apply(y1, y2); - // clang-format off - GKO_ASSERT_MTX_NEAR(y2, - l({{1.0, 0.0, 0.0}, - {4.0, 2.5, 0.0}, - {3.0, 0.0, 2.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Csr::create(this->exec, gko::dim<2>{3}); - i_type rdata[] = {1, 2, 0}; - - auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), - gko::matrix::row_permute | gko::matrix::inverse_permute); - - rperm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{0.0, 4.0, 2.5}, - {2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y = Csr::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::inverse_permute | gko::matrix::column_permute); - - cperm->apply(x, y); - // clang-format off - GKO_ASSERT_MTX_NEAR(y, - l({{0.0, 2.0, 3.0}, - {0.0, 0.0, 1.0}, - {2.5, 0.0, 4.0}}), - 0.0); - // clang-format on -} - - -TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr) -{ - using i_type = typename TestFixture::i_type; - using Csr = typename TestFixture::Csr; - // clang-format off - auto x = gko::initialize({{2.0, 3.0, 0.0}, - {0.0, 1.0, 0.0}, - {0.0, 4.0, 2.5}}, - this->exec); - // clang-format on - auto y1 = Csr::create(this->exec, gko::dim<2>{3}); - auto y2 = Csr::create(this->exec, gko::dim<2>{3}); - i_type cdata[] = {1, 2, 0}; - i_type rdata[] = {1, 2, 0}; - - auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), - gko::matrix::row_permute | gko::matrix::inverse_permute); - auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), - gko::matrix::inverse_permute | gko::matrix::column_permute); - - rperm->apply(x, y1); - cperm->apply(y1, y2); - // clang-format off - GKO_ASSERT_MTX_NEAR(y2, - l({{2.5, 0.0, 4.0}, - {0.0, 2.0, 3.0}, - {0.0, 0.0, 1.0}}), + l({{4.0, 2.5}, + {2.0, 3.0}}), 0.0); // clang-format on } diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp new file mode 100644 index 00000000000..a15c0f09bbf --- /dev/null +++ b/reference/test/matrix/scaled_permutation.cpp @@ -0,0 +1,116 @@ +/************************************************************* +Copyright (c) 2017-2023, the 
Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include
+
+
+#include
+
+
+#include
+#include
+#include
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class ScaledPermutation : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Vec = gko::matrix::Dense<value_type>;
+    using Mtx = gko::matrix::ScaledPermutation<value_type, index_type>;
+
+    ScaledPermutation() : exec(gko::ReferenceExecutor::create())
+    {
+        perm3 = Mtx::create(
+            exec, gko::array<value_type>{this->exec, {1.0, 2.0, 4.0}},
+            gko::array<index_type>{this->exec, {1, 2, 0}});
+        perm2 =
+            Mtx::create(exec, gko::array<value_type>{this->exec, {3.0, 5.0}},
+                        gko::array<index_type>{this->exec, {1, 0}});
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<Mtx> perm3;
+    std::unique_ptr<Mtx> perm2;
+};
+
+TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypes,
+                 PairTypenameNameGenerator);
+
+
+TYPED_TEST(ScaledPermutation, Invert)
+{
+    using T = typename TestFixture::value_type;
+    auto inv = this->perm3->invert();
+
+    EXPECT_EQ(inv->get_const_permutation()[0], 2);
+    EXPECT_EQ(inv->get_const_permutation()[1], 0);
+    EXPECT_EQ(inv->get_const_permutation()[2], 1);
+    EXPECT_EQ(inv->get_const_scale()[0], T{0.25});
+    EXPECT_EQ(inv->get_const_scale()[1], T{1.0});
+    EXPECT_EQ(inv->get_const_scale()[2], T{0.5});
+}
+
+
+TYPED_TEST(ScaledPermutation, Write)
+{
+    using T = typename TestFixture::value_type;
+
+    GKO_ASSERT_MTX_NEAR(
+        this->perm3, l({{0.0, 1.0, 0.0}, {0.0, 0.0, 2.0}, {4.0, 0.0, 0.0}}),
+        0.0);
+}
+
+
+TYPED_TEST(ScaledPermutation, AppliesToDense)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({I<T>{2.0, 3.0}, I<T>{4.0, 2.5}}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    this->perm2->apply(x, y);
+
+    GKO_ASSERT_MTX_NEAR(y, l({{12.0, 7.5}, {10.0, 15.0}}), 0.0);
+}
+
+
+}  // namespace
diff --git a/reference/test/reorder/rcm_kernels.cpp
b/reference/test/reorder/rcm_kernels.cpp index 4c79af9e73a..b23e8bec097 100644 --- a/reference/test/reorder/rcm_kernels.cpp +++ b/reference/test/reorder/rcm_kernels.cpp @@ -98,7 +98,7 @@ class Rcm : public ::testing::Test { static bool is_permutation(const perm_type* input_perm) { - const auto perm_size = input_perm->get_permutation_size(); + const auto perm_size = input_perm->get_size()[0]; auto perm_sorted = std::vector(perm_size); std::copy_n(input_perm->get_const_permutation(), perm_size, perm_sorted.begin()); diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt index a03a0a0bb4e..d49373811dc 100644 --- a/test/matrix/CMakeLists.txt +++ b/test/matrix/CMakeLists.txt @@ -14,5 +14,7 @@ else() endif() ginkgo_create_common_test(hybrid_kernels) ginkgo_create_common_test(matrix) +ginkgo_create_common_test(permutation_kernels) +ginkgo_create_common_test(scaled_permutation_kernels) ginkgo_create_common_test(sellp_kernels) ginkgo_create_common_test(sparsity_csr_kernels) diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 84b1335c675..9e8355c284d 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -48,6 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -55,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" @@ -68,6 +71,8 @@ class Csr : public CommonTestFixture { using Mtx = gko::matrix::Csr; using ComplexVec = gko::matrix::Dense>; using ComplexMtx = gko::matrix::Csr>; + using Perm = gko::matrix::Permutation; + using ScaledPerm = gko::matrix::ScaledPermutation; Csr() #ifdef GINKGO_FAST_TESTS @@ -162,8 +167,8 @@ class Csr : public CommonTestFixture { beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(exec, strategy); dmtx->copy_from(mtx); - square_dmtx = Mtx::create(exec, strategy); - square_dmtx->copy_from(square_mtx); + dsquare_mtx = Mtx::create(exec, strategy); + dsquare_mtx->copy_from(square_mtx); dresult = gko::clone(exec, expected); dresult2 = gko::clone(exec, expected2); dy = gko::clone(exec, y); @@ -180,8 +185,22 @@ class Csr : public CommonTestFixture { std::vector tmp2(mtx->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector scale(mtx->get_size()[0]); + std::vector scale2(mtx->get_size()[1]); + std::uniform_real_distribution dist(1, 2); + auto gen = [&] { return dist(rng); }; + std::generate(scale.begin(), scale.end(), gen); + std::generate(scale2.begin(), scale2.end(), gen); rpermute_idxs = std::make_unique(ref, tmp.begin(), tmp.end()); cpermute_idxs = std::make_unique(ref, tmp2.begin(), tmp2.end()); + rpermutation = Perm::create(ref, tmp.size(), *rpermute_idxs); + cpermutation = Perm::create(ref, tmp2.size(), *cpermute_idxs); + srpermutation = ScaledPerm::create( + ref, gko::array(ref, scale.begin(), scale.end()), + *rpermute_idxs); + scpermutation = ScaledPerm::create( + ref, gko::array(ref, scale2.begin(), scale2.end()), + *cpermute_idxs); } template @@ -192,8 +211,8 @@ class Csr : public CommonTestFixture { complex_mtx = ComplexMtx::create(ref, strategy); complex_mtx->move_from( gen_mtx(mtx_size[0], mtx_size[1], 1)); - complex_dmtx = 
ComplexMtx::create(exec, strategy); - complex_dmtx->copy_from(complex_mtx); + dcomplex_mtx = ComplexMtx::create(exec, strategy); + dcomplex_mtx->copy_from(complex_mtx); } void unsort_mtx() @@ -220,8 +239,8 @@ class Csr : public CommonTestFixture { std::unique_ptr dmtx; std::unique_ptr dmtx2; - std::unique_ptr complex_dmtx; - std::unique_ptr square_dmtx; + std::unique_ptr dcomplex_mtx; + std::unique_ptr dsquare_mtx; std::unique_ptr dresult; std::unique_ptr dresult2; std::unique_ptr dy; @@ -232,6 +251,10 @@ class Csr : public CommonTestFixture { std::unique_ptr dbeta2; std::unique_ptr rpermute_idxs; std::unique_ptr cpermute_idxs; + std::unique_ptr rpermutation; + std::unique_ptr cpermutation; + std::unique_ptr srpermutation; + std::unique_ptr scpermutation; }; @@ -510,11 +533,11 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef) auto d_trans = dmtx->transpose(); mtx->apply(alpha, trans, beta, square_mtx); - dmtx->apply(dalpha, d_trans, dbeta, square_dmtx); + dmtx->apply(dalpha, d_trans, dbeta, dsquare_mtx); - GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r::value); - GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); - ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx); + ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index()); } @@ -525,11 +548,11 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef) auto d_trans = dmtx->transpose(); mtx->apply(trans, square_mtx); - dmtx->apply(d_trans, square_dmtx); + dmtx->apply(d_trans, dsquare_mtx); - GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r::value); - GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); - ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); + GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r::value); + GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx); + ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index()); } @@ -542,11 +565,11 @@ TEST_F(Csr, SimpleApplyToSparseCsrMatrixIsEquivalentToRef) dmtx2->copy_from(mtx2); mtx->apply(mtx2, square_mtx); - dmtx->apply(dmtx2, square_dmtx); + dmtx->apply(dmtx2, dsquare_mtx); - GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); - GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r::value); - ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); + GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx); + GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r::value); + ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index()); } @@ -560,11 +583,11 @@ TEST_F(Csr, SimpleApplySparseToSparseCsrMatrixIsEquivalentToRef) auto dmtx2 = gko::clone(exec, mtx2); mtx1->apply(mtx2, square_mtx); - dmtx1->apply(dmtx2, square_dmtx); + dmtx1->apply(dmtx2, dsquare_mtx); - GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); - GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r::value); - ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); + GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx); + GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r::value); + ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index()); } @@ -581,11 +604,11 @@ TEST_F(Csr, SimpleApplyToEmptyCsrMatrixIsEquivalentToRef) dmtx2->copy_from(mtx2); mtx->apply(mtx2, square_mtx); - dmtx->apply(dmtx2, square_dmtx); + dmtx->apply(dmtx2, dsquare_mtx); - GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx); - GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r::value); - ASSERT_TRUE(square_dmtx->is_sorted_by_column_index()); + GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx); + GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r::value); + 
ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index()); } @@ -673,7 +696,7 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef) set_up_apply_complex_data(); auto trans = gko::as(complex_mtx->conj_transpose()); - auto d_trans = gko::as(complex_dmtx->conj_transpose()); + auto d_trans = gko::as(dcomplex_mtx->conj_transpose()); GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0); ASSERT_TRUE(d_trans->is_sorted_by_column_index()); @@ -868,12 +891,152 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef) } +TEST_F(Csr, IsGenericPermutable) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + auto permuted = square_mtx->permute(rpermutation, mode); + auto dpermuted = dsquare_mtx->permute(rpermutation, mode); + + GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted); + ASSERT_TRUE(dpermuted->is_sorted_by_column_index()); + } +} + + +TEST_F(Csr, IsGenericPermutableRectangular) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + auto rpermuted = mtx->permute(rpermutation, permute_mode::rows); + auto drpermuted = dmtx->permute(rpermutation, permute_mode::rows); + auto irpermuted = + mtx->permute(rpermutation, permute_mode::inverse_rows); + auto dirpermuted = + dmtx->permute(rpermutation, permute_mode::inverse_rows); + auto cpermuted = mtx->permute(cpermutation, permute_mode::columns); + auto dcpermuted = dmtx->permute(cpermutation, permute_mode::columns); + auto icpermuted = + mtx->permute(cpermutation, permute_mode::inverse_columns); + auto dicpermuted = + dmtx->permute(cpermutation, permute_mode::inverse_columns); + + GKO_EXPECT_MTX_NEAR(rpermuted, drpermuted, r::value); + GKO_EXPECT_MTX_NEAR(irpermuted, dirpermuted, r::value); + GKO_EXPECT_MTX_NEAR(cpermuted, dcpermuted, r::value); + GKO_EXPECT_MTX_NEAR(icpermuted, dicpermuted, r::value); + GKO_EXPECT_MTX_EQ_SPARSITY(rpermuted, drpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(irpermuted, dirpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(cpermuted, dcpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(icpermuted, dicpermuted); + EXPECT_TRUE(rpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(irpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(cpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(icpermuted->is_sorted_by_column_index()); +} + + +TEST_F(Csr, IsNonsymmPermutable) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + for (auto invert : {false, true}) { + SCOPED_TRACE(invert); + auto permuted = mtx->permute(rpermutation, cpermutation, invert); + auto dpermuted = dmtx->permute(rpermutation, cpermutation, invert); + + GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0); + GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted); + ASSERT_TRUE(dpermuted->is_sorted_by_column_index()); + } +} + + +TEST_F(Csr, IsGenericScalePermutable) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + auto permuted = square_mtx->scale_permute(srpermutation, mode); + auto dpermuted = dsquare_mtx->scale_permute(srpermutation, mode); + + GKO_EXPECT_MTX_NEAR(permuted, dpermuted, r::value); + GKO_EXPECT_MTX_EQ_SPARSITY(permuted, dpermuted); + 
EXPECT_TRUE(dpermuted->is_sorted_by_column_index()); + } +} + + +TEST_F(Csr, IsGenericScalePermutableRectangular) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + auto rpermuted = mtx->scale_permute(srpermutation, permute_mode::rows); + auto drpermuted = dmtx->scale_permute(srpermutation, permute_mode::rows); + auto irpermuted = + mtx->scale_permute(srpermutation, permute_mode::inverse_rows); + auto dirpermuted = + dmtx->scale_permute(srpermutation, permute_mode::inverse_rows); + auto cpermuted = mtx->scale_permute(scpermutation, permute_mode::columns); + auto dcpermuted = dmtx->scale_permute(scpermutation, permute_mode::columns); + auto icpermuted = + mtx->scale_permute(scpermutation, permute_mode::inverse_columns); + auto dicpermuted = + dmtx->scale_permute(scpermutation, permute_mode::inverse_columns); + + GKO_EXPECT_MTX_NEAR(rpermuted, drpermuted, r::value); + GKO_EXPECT_MTX_NEAR(irpermuted, dirpermuted, r::value); + GKO_EXPECT_MTX_NEAR(cpermuted, dcpermuted, r::value); + GKO_EXPECT_MTX_NEAR(icpermuted, dicpermuted, r::value); + GKO_EXPECT_MTX_EQ_SPARSITY(rpermuted, drpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(irpermuted, dirpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(cpermuted, dcpermuted); + GKO_EXPECT_MTX_EQ_SPARSITY(icpermuted, dicpermuted); + EXPECT_TRUE(rpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(irpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(cpermuted->is_sorted_by_column_index()); + EXPECT_TRUE(icpermuted->is_sorted_by_column_index()); +} + + +TEST_F(Csr, IsNonsymmScalePermutable) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + for (auto invert : {false, true}) { + SCOPED_TRACE(invert); + auto permuted = mtx->scale_permute(srpermutation, scpermutation, invert); + auto dpermuted = dmtx->scale_permute(srpermutation, scpermutation, invert); + + GKO_EXPECT_MTX_NEAR(permuted, dpermuted, r::value); + GKO_EXPECT_MTX_EQ_SPARSITY(permuted, dpermuted); + EXPECT_TRUE(dpermuted->is_sorted_by_column_index()); + } +} + + TEST_F(Csr, IsPermutable) { set_up_apply_data(); auto permuted = gko::as(square_mtx->permute(rpermute_idxs.get())); - auto dpermuted = gko::as(square_dmtx->permute(rpermute_idxs.get())); + auto dpermuted = gko::as(dsquare_mtx->permute(rpermute_idxs.get())); GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted); GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0); @@ -887,7 +1050,7 @@ TEST_F(Csr, IsInversePermutable) auto permuted = gko::as(square_mtx->inverse_permute(rpermute_idxs.get())); auto dpermuted = - gko::as(square_dmtx->inverse_permute(rpermute_idxs.get())); + gko::as(dsquare_mtx->inverse_permute(rpermute_idxs.get())); GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted); GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0); @@ -1141,9 +1304,9 @@ TEST_F(Csr, InplaceAbsoluteComplexMatrixIsEquivalentToRef) set_up_apply_complex_data(); complex_mtx->compute_absolute_inplace(); - complex_dmtx->compute_absolute_inplace(); + dcomplex_mtx->compute_absolute_inplace(); - GKO_ASSERT_MTX_NEAR(complex_mtx, complex_dmtx, r::value); + GKO_ASSERT_MTX_NEAR(complex_mtx, dcomplex_mtx, r::value); } @@ -1152,7 +1315,7 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef) set_up_apply_complex_data(); auto abs_mtx = complex_mtx->compute_absolute(); - auto dabs_mtx = complex_dmtx->compute_absolute(); + auto dabs_mtx = dcomplex_mtx->compute_absolute(); GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, r::value); } diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index e9449ee9262..994283915c2 100644 --- a/test/matrix/dense_kernels.cpp +++ 
b/test/matrix/dense_kernels.cpp @@ -50,6 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -70,6 +72,9 @@ class Dense : public CommonTestFixture { using ComplexMtx = gko::matrix::Dense>; using Diagonal = gko::matrix::Diagonal; using MixedComplexMtx = gko::matrix::Dense>; + using Permutation = gko::matrix::Permutation; + using ScaledPermutation = + gko::matrix::ScaledPermutation; Dense() : rand_engine(15) {} @@ -145,16 +150,37 @@ class Dense : public CommonTestFixture { std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); std::vector tmp3(x->get_size()[0] / 10); + std::vector scale_factors(tmp.size()); + std::vector scale_factors2(tmp2.size()); std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + std::uniform_real_distribution scale_dist{1, 2}; for (auto& i : tmp3) { i = row_dist(rng); } + for (auto& s : scale_factors) { + s = scale_dist(rng); + } + for (auto& s : scale_factors2) { + s = scale_dist(rng); + } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); rgather_idxs = std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + rpermutation = Permutation::create(ref, tmp.size(), *rpermute_idxs); + cpermutation = Permutation::create(ref, tmp2.size(), *cpermute_idxs); + rspermutation = ScaledPermutation::create( + ref, + gko::array{ref, scale_factors.begin(), + scale_factors.end()}, + *rpermute_idxs); + cspermutation = ScaledPermutation::create( + ref, + gko::array{ref, scale_factors2.begin(), + scale_factors2.end()}, + *cpermute_idxs); } template @@ -187,6 +213,10 @@ class Dense : public CommonTestFixture { std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; std::unique_ptr cpermute_idxs; + std::unique_ptr rpermutation; + std::unique_ptr cpermutation; + std::unique_ptr rspermutation; + std::unique_ptr cspermutation; std::unique_ptr rgather_idxs; }; @@ -1278,6 +1308,192 @@ TEST_F(Dense, CanAdvancedGatherRowsIntoMixedDenseCrossExecutor) } +TEST_F(Dense, IsGenericPermutable) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + for (auto mode : + {permute_mode::none, permute_mode::rows, permute_mode::columns, + permute_mode::symmetric, permute_mode::inverse_rows, + permute_mode::inverse_columns, permute_mode::inverse_symmetric}) { + SCOPED_TRACE(mode); + auto permuted = square->permute(rpermutation, mode); + auto dpermuted = dsquare->permute(rpermutation, mode); + + GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0); + } +} + + +TEST_F(Dense, IsGenericPermutableRectangular) +{ + using gko::matrix::permute_mode; + set_up_apply_data(); + + auto rpermuted = x->permute(rpermutation, permute_mode::rows); + auto drpermuted = dx->permute(rpermutation, permute_mode::rows); + auto irpermuted = x->permute(rpermutation, permute_mode::inverse_rows); + auto dirpermuted = dx->permute(rpermutation, permute_mode::inverse_rows); + auto cpermuted = x->permute(cpermutation, permute_mode::columns); + auto dcpermuted = dx->permute(cpermutation, permute_mode::columns); + auto icpermuted = x->permute(cpermutation, permute_mode::inverse_columns); + auto dicpermuted = dx->permute(cpermutation, permute_mode::inverse_columns); + + GKO_ASSERT_MTX_NEAR(rpermuted, drpermuted, 0); + GKO_ASSERT_MTX_NEAR(irpermuted, dirpermuted, 0); + GKO_ASSERT_MTX_NEAR(cpermuted, dcpermuted, 0); + GKO_ASSERT_MTX_NEAR(icpermuted, dicpermuted, 0); +} + + +TEST_F(Dense, 
IsGenericPermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto host_permuted = square->clone();
+
+        auto ref_permuted = square->permute(rpermutation, mode);
+        dsquare->permute(rpermutation, host_permuted, mode);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmPermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted = x->permute(rpermutation, cpermutation, invert);
+        auto dpermuted = dx->permute(rpermutation, cpermutation, invert);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmPermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto host_permuted = dx->clone();
+
+        auto ref_permuted = x->permute(rpermutation, cpermutation, invert);
+        dx->permute(rpermutation, cpermutation, host_permuted, invert);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsGenericScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = square->scale_permute(rspermutation, mode);
+        auto dpermuted = dsquare->scale_permute(rspermutation, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r::value);
+    }
+}
+
+
+TEST_F(Dense, IsGenericScalePermutableRectangular)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    auto rpermuted = x->scale_permute(rspermutation, permute_mode::rows);
+    auto drpermuted = dx->scale_permute(rspermutation, permute_mode::rows);
+    auto irpermuted =
+        x->scale_permute(rspermutation, permute_mode::inverse_rows);
+    auto dirpermuted =
+        dx->scale_permute(rspermutation, permute_mode::inverse_rows);
+    auto cpermuted = x->scale_permute(cspermutation, permute_mode::columns);
+    auto dcpermuted = dx->scale_permute(cspermutation, permute_mode::columns);
+    auto icpermuted =
+        x->scale_permute(cspermutation, permute_mode::inverse_columns);
+    auto dicpermuted =
+        dx->scale_permute(cspermutation, permute_mode::inverse_columns);
+
+    GKO_ASSERT_MTX_NEAR(rpermuted, drpermuted, r::value);
+    GKO_ASSERT_MTX_NEAR(irpermuted, dirpermuted, r::value);
+    GKO_ASSERT_MTX_NEAR(cpermuted, dcpermuted, r::value);
+    GKO_ASSERT_MTX_NEAR(icpermuted, dicpermuted, r::value);
+}
+
+
+TEST_F(Dense, IsGenericScalePermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto host_permuted = square->clone();
+
+        auto ref_permuted = square->scale_permute(rspermutation, mode);
+        dsquare->scale_permute(rspermutation, host_permuted, mode);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, r::value);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted = x->scale_permute(rspermutation, cspermutation, invert);
+        auto dpermuted = dx->scale_permute(rspermutation, cspermutation, invert);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r::value);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmScalePermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto host_permuted = dx->clone();
+
+        auto ref_permuted =
+            x->scale_permute(rspermutation, cspermutation, invert);
+        dx->scale_permute(rspermutation, cspermutation, host_permuted, invert);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, r::value);
+    }
+}
+
+
 TEST_F(Dense, IsPermutable)
 {
     set_up_apply_data();
diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp
new file mode 100644
index 00000000000..037040b8fd4
--- /dev/null
+++ b/test/matrix/permutation_kernels.cpp
@@ -0,0 +1,73 @@
+/*************************************************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +class Permutation : public CommonTestFixture { +protected: + using Perm = gko::matrix::Permutation; + + Permutation() : rand_engine(42) + { + std::vector tmp(1000, 0); + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rand_engine); + permutation = Perm::create(ref, tmp.size(), gko::array(ref, tmp.begin(), tmp.end())); + dpermutation = permutation->clone(exec); + } + + std::default_random_engine rand_engine; + + std::unique_ptr permutation; + std::unique_ptr dpermutation; +}; + + +TEST_F(Permutation, InvertIsEquivalentToRef) +{ + auto inv = permutation->invert(); + auto dinv = dpermutation->invert(); + + GKO_ASSERT_MTX_EQ_SPARSITY(inv, dinv); +} diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp new file mode 100644 index 00000000000..d85b9735abc --- /dev/null +++ b/test/matrix/scaled_permutation_kernels.cpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +class ScaledPermutation : public CommonTestFixture { +protected: + using ScaledPerm = gko::matrix::ScaledPermutation; + + ScaledPermutation() : rand_engine(42) + { + std::vector tmp(1000, 0); + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rand_engine); + std::vector scale(tmp.size()); + std::uniform_real_distribution dist(1, 2); + auto gen = [&] { return dist(rand_engine); }; + std::generate(scale.begin(), scale.end(), gen); + permutation = ScaledPerm::create(ref, gko::array(ref, scale.begin(), scale.end()), gko::array(ref, tmp.begin(), tmp.end())); + dpermutation = permutation->clone(exec); + } + + std::default_random_engine rand_engine; + + std::unique_ptr permutation; + std::unique_ptr dpermutation; +}; + + +TEST_F(ScaledPermutation, InvertIsEquivalentToRef) +{ + auto inv = permutation->invert(); + auto dinv = dpermutation->invert(); + + GKO_ASSERT_MTX_NEAR(inv, dinv, r::value); +} From 17da78cb47c51b617252944a90177cbbe22401c7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 9 Oct 2023 16:17:52 +0200 Subject: [PATCH 435/583] improve permutation interface consistency - remove permute_mask - deprecate permute_mask and dim<2> parameters --- core/matrix/csr.cpp | 1 - core/matrix/dense.cpp | 1 - core/matrix/permutation.cpp | 157 ++++++++++++++++++++- core/matrix/scaled_permutation.cpp | 5 +- core/reorder/amd.cpp | 3 +- core/test/matrix/permutation.cpp | 130 +---------------- include/ginkgo/core/matrix/permutation.hpp | 154 ++++++-------------- include/ginkgo/core/reorder/rcm.hpp | 16 +-- reference/test/matrix/csr_kernels.cpp | 4 +- reference/test/matrix/permutation.cpp | 4 +- 10 files changed, 221 insertions(+), 254 deletions(-) diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index e669f4d4718..b99becadccc 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -736,7 +736,6 @@ std::unique_ptr> create_permutation_view( const array& indices) { return Permutation::create_const(indices.get_executor(), - indices.get_num_elems(), indices.as_const_view()); } diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index b263357dc9b..05b5672117b 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1438,7 +1438,6 @@ std::unique_ptr> create_permutation_view( const array& indices) { return Permutation::create_const(indices.get_executor(), - indices.get_num_elems(), indices.as_const_view()); } diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index cc58ced53d2..779bdd964bb 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -31,8 +31,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ #include +#include "core/base/dispatch_helper.hpp" #include "core/matrix/permutation_kernels.hpp" +#include "ginkgo/core/base/exception_helpers.hpp" #include "ginkgo/core/base/executor.hpp" +#include "ginkgo/core/base/precision_dispatch.hpp" +#include "ginkgo/core/base/utils_helper.hpp" namespace gko { @@ -46,6 +50,113 @@ GKO_REGISTER_OPERATION(invert, permutation::invert); } +template +std::unique_ptr> +Permutation::create_const( + std::shared_ptr exec, size_type size, + gko::detail::const_array_view&& perm_idxs, + mask_type enabled_permute) +{ + GKO_ASSERT_EQ(enabled_permute, row_permute); + GKO_ASSERT_EQ(size, perm_idxs.get_num_elems()); + return create_const(std::move(exec), std::move(perm_idxs)); +} + + +template +std::unique_ptr> +Permutation::create_const( + std::shared_ptr exec, + gko::detail::const_array_view&& perm_idxs) +{ + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr>( + new Permutation{ + exec, gko::detail::array_const_cast(std::move(perm_idxs))}); +} + + +template +Permutation::Permutation(std::shared_ptr exec, + size_type size) + : EnableLinOp(exec, size), permutation_{exec, size} +{} + + +template +Permutation::Permutation(std::shared_ptr exec, + array permutation_indices) + : EnableLinOp(exec, permutation_indices.get_num_elems()), + permutation_{exec, std::move(permutation_indices)} +{} + + +template +Permutation::Permutation(std::shared_ptr exec, + const dim<2>& size) + : Permutation{exec, size[0]} +{ + GKO_ASSERT_IS_SQUARE_MATRIX(size); +} + + +template +Permutation::Permutation(std::shared_ptr exec, + const dim<2>& size, + const mask_type& enabled_permute) + : Permutation{exec, size[0]} +{ + GKO_ASSERT_EQ(enabled_permute, row_permute); + GKO_ASSERT_IS_SQUARE_MATRIX(size); +} + + +template +Permutation::Permutation(std::shared_ptr exec, + const dim<2>& size, + array permutation_indices) + : Permutation{std::move(exec), std::move(permutation_indices)} +{ + GKO_ASSERT_EQ(size[0], permutation_.get_num_elems()); + GKO_ASSERT_IS_SQUARE_MATRIX(size); +} + + +template +Permutation::Permutation(std::shared_ptr exec, + const dim<2>& size, + array permutation_indices, + const mask_type& enabled_permute) + : Permutation{std::move(exec), std::move(permutation_indices)} +{ + GKO_ASSERT_EQ(enabled_permute, row_permute); + GKO_ASSERT_EQ(size[0], permutation_.get_num_elems()); + GKO_ASSERT_IS_SQUARE_MATRIX(size); +} + + +template +size_type Permutation::get_permutation_size() const noexcept +{ + return permutation_.get_num_elems(); +} + + +template +mask_type Permutation::get_permute_mask() const +{ + return row_permute; +} + + +template +void Permutation::set_permute_mask(mask_type permute_mask) +{ + GKO_ASSERT_EQ(permute_mask, row_permute); +} + + template std::unique_ptr> Permutation::invert() const { @@ -54,8 +165,7 @@ std::unique_ptr> Permutation::invert() const array inv_permutation{exec, size}; exec->run(permutation::make_invert(this->get_const_permutation(), size, inv_permutation.get_data())); - return Permutation::create(exec, dim<2>{size, size}, - std::move(inv_permutation)); + return Permutation::create(exec, std::move(inv_permutation)); } @@ -70,11 +180,52 @@ void Permutation::write( data.nonzeros.reserve(data.size[0]); for (IndexType row = 0; row < this->get_size()[0]; row++) { data.nonzeros.emplace_back(row, host_this->get_const_permutation()[row], - 1.0); + one()); + } +} + + +template +void dispatch_dense(const 
LinOp* op, Functor fn) +{ + using matrix::Dense; + using std::complex; + if (dynamic_cast>*>(op)) { + run*, const Dense*>(op, fn); + } else if (dynamic_cast>>*>(op)) { + run>*, const Dense>*>(op, + fn); + } else { + GKO_NOT_SUPPORTED(*op); } } +template +void Permutation::apply_impl(const LinOp* in, LinOp* out) const +{ + dispatch_dense(in, [&](auto dense_in) { + auto dense_out = make_temporary_conversion< + typename gko::detail::pointee::value_type>(out); + dense_in->permute(this, dense_out.get(), permute_mode::rows); + }); +} + + +template +void Permutation::apply_impl(const LinOp* alpha, const LinOp* in, + const LinOp* beta, LinOp* out) const +{ + dispatch_dense(in, [&](auto dense_in) { + auto dense_out = make_temporary_conversion< + typename gko::detail::pointee::value_type>(out); + auto tmp = dense_in->permute(this, permute_mode::rows); + dense_out->scale(beta); + dense_out->add_scaled(alpha, tmp); + }); +} + + #define GKO_DECLARE_PERMUTATION_MATRIX(_type) class Permutation<_type> GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_MATRIX); diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index d1ce00b521a..435b928a6b2 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -107,10 +107,9 @@ void ScaledPermutation::apply_impl(const LinOp* alpha, { precision_dispatch_real_complex( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { - auto x_clone = dense_x->clone(); - dense_b->scale_permute(this, x_clone, permute_mode::rows); + auto tmp = dense_b->scale_permute(this, permute_mode::rows); dense_x->scale(dense_beta); - dense_x->add_scaled(dense_alpha, x_clone); + dense_x->add_scaled(dense_alpha, tmp); }, alpha, b, beta, x); } diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp index fa955801c2b..a305a95293d 100644 --- a/core/reorder/amd.cpp +++ b/core/reorder/amd.cpp @@ -212,8 +212,7 @@ std::unique_ptr Amd::generate_impl( head, elen, degree, w)); // permutation gets copied to device via gko::array constructor - return permutation_type::create(exec, dim<2>{num_rows, num_rows}, - std::move(permutation)); + return permutation_type::create(exec, std::move(permutation)); } diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index 166ff0cbcdb..4879d1a8402 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -60,8 +60,7 @@ class Permutation : public ::testing::Test { Permutation() : exec(gko::ReferenceExecutor::create()), mtx(gko::matrix::Permutation::create( - exec, gko::dim<2>{4, 3}, - gko::array{exec, {1, 0, 2, 3}})) + exec, gko::array{exec, {1, 0, 2, 3}})) {} @@ -69,8 +68,7 @@ class Permutation : public ::testing::Test { gko::ptr_param> m) { auto perm = m->get_permutation(); - ASSERT_EQ(m->get_size(), gko::dim<2>(4, 3)); - ASSERT_EQ(m->get_size()[0], 4); + ASSERT_EQ(m->get_size(), gko::dim<2>(4, 4)); ASSERT_EQ(perm[0], 1); ASSERT_EQ(perm[1], 0); ASSERT_EQ(perm[2], 2); @@ -80,7 +78,6 @@ class Permutation : public ::testing::Test { static void assert_empty(gko::matrix::Permutation* m) { ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0)); - ASSERT_EQ(m->get_size()[0], 0); } std::shared_ptr exec; @@ -112,21 +109,9 @@ TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty) TYPED_TEST(Permutation, CanBeConstructedWithSize) { using index_type = typename TestFixture::index_type; - auto m = gko::matrix::Permutation::create(this->exec, - gko::dim<2>{2, 3}); + auto m = gko::matrix::Permutation::create(this->exec, 2); - 
ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size()[0], 2); -} - - -TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask) -{ - using index_type = typename TestFixture::index_type; - auto m = gko::matrix::Permutation::create(this->exec); - auto mask = m->get_permute_mask(); - - ASSERT_EQ(mask, gko::matrix::row_permute); + ASSERT_EQ(m->get_size(), gko::dim<2>(2, 2)); } @@ -136,8 +121,7 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) index_type data[] = {1, 0, 2}; auto m = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3, 5}, - gko::make_array_view(this->exec, 3, data)); + this->exec, gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } @@ -150,88 +134,12 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingConstData) const index_type data[] = {1, 0, 2}; auto m = gko::matrix::Permutation::create_const( - this->exec, 3, gko::array::const_view(this->exec, 3, data)); + this->exec, gko::array::const_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } -TYPED_TEST(Permutation, CanBeConstructedWithSizeAndMask) -{ - using index_type = typename TestFixture::index_type; - auto m = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); - - ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_size()[0], 2); - ASSERT_EQ(m->get_permute_mask(), gko::matrix::column_permute); -} - - -TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask) -{ - using index_type = typename TestFixture::index_type; - auto m = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute); - - auto mask = m->get_permute_mask(); - ASSERT_EQ(mask, gko::matrix::column_permute); - - m->set_permute_mask(gko::matrix::row_permute | - gko::matrix::inverse_permute); - - auto s_mask = m->get_permute_mask(); - ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute); -} - - -TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) -{ - using index_type = typename TestFixture::index_type; - index_type data[] = {0, 2, 1}; - - ASSERT_THROW(gko::matrix::Permutation::create( - this->exec, gko::dim<2>{4, 2}, - gko::make_array_view(this->exec, 3, data)), - gko::ValueMismatch); -} - - -TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) -{ - using index_type = typename TestFixture::index_type; - index_type data[] = {1, 0, 2}; - - auto m = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3, 5}, - gko::make_array_view(this->exec, 3, data)); - - auto mask = m->get_permute_mask(); - ASSERT_EQ(m->get_const_permutation(), data); - ASSERT_EQ(mask, gko::matrix::row_permute); - - m->set_permute_mask(gko::matrix::row_permute | - gko::matrix::inverse_permute); - - auto s_mask = m->get_permute_mask(); - ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute); - ASSERT_EQ(m->get_const_permutation(), data); -} - - -TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) -{ - using index_type = typename TestFixture::index_type; - index_type data[] = {0, 2, 1}; - - ASSERT_THROW(gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3, 4}, - gko::make_array_view(this->exec, 3, data), - gko::matrix::column_permute), - gko::ValueMismatch); -} - - TYPED_TEST(Permutation, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx); @@ -262,32 +170,6 @@ TYPED_TEST(Permutation, CanBeMoved) } -TYPED_TEST(Permutation, CopyingPreservesMask) -{ - using index_type = 
typename TestFixture::index_type; - auto mtx_copy = gko::matrix::Permutation::create(this->exec); - - mtx_copy->copy_from(this->mtx); - - auto o_mask = this->mtx->get_permute_mask(); - auto n_mask = mtx_copy->get_permute_mask(); - ASSERT_EQ(o_mask, gko::matrix::row_permute); - ASSERT_EQ(o_mask, n_mask); - - this->mtx->set_permute_mask(gko::matrix::column_permute); - - o_mask = this->mtx->get_permute_mask(); - n_mask = mtx_copy->get_permute_mask(); - ASSERT_EQ(o_mask, gko::matrix::column_permute); - ASSERT_NE(o_mask, n_mask); - - mtx_copy->copy_from(this->mtx); - - n_mask = mtx_copy->get_permute_mask(); - ASSERT_EQ(o_mask, n_mask); -} - - TYPED_TEST(Permutation, CanBeCloned) { auto mtx_clone = this->mtx->clone(); diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index b577481345b..abfffb11248 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -186,27 +186,13 @@ class Permutation : public EnableLinOp>, * array. */ [[deprecated("use get_size()[0] instead")]] size_type get_permutation_size() - const noexcept - { - return permutation_.get_num_elems(); - } + const noexcept; - /** - * Get the permute masks - * - * @return permute_mask the permute masks - */ - mask_type get_permute_mask() const { return enabled_permute_; } + [[deprecated("permute mask is no longer supported")]] mask_type + get_permute_mask() const; - /** - * Set the permute masks - * - * @param permute_mask the permute masks - */ - void set_permute_mask(mask_type permute_mask) - { - enabled_permute_ = permute_mask; - } + [[deprecated("permute mask is no longer supported")]] void set_permute_mask( + mask_type permute_mask); /** * Returns the inverse permutation. @@ -218,6 +204,23 @@ class Permutation : public EnableLinOp>, void write(gko::matrix_data& data) const override; + /** + * Creates a constant (immutable) Permutation matrix from a constant array. + * + * @param exec the executor to create the matrix on + * @param size the size of the square matrix + * @param perm_idxs the permutation index array of the matrix + * @param enabled_permute the mask describing the type of permutation + * @returns A smart pointer to the constant matrix wrapping the input array + * (if it resides on the same executor as the matrix) or a copy of + * the array on the correct executor. + */ + [[deprecated( + "use create_const without size and permute mask")]] static std:: + unique_ptr + create_const(std::shared_ptr exec, size_type size, + gko::detail::const_array_view&& perm_idxs, + mask_type enabled_permute = row_permute); /** * Creates a constant (immutable) Permutation matrix from a constant array. * @@ -230,16 +233,8 @@ class Permutation : public EnableLinOp>, * the array on the correct executor. */ static std::unique_ptr create_const( - std::shared_ptr exec, size_type size, - gko::detail::const_array_view&& perm_idxs, - mask_type enabled_permute = row_permute) - { - // cast const-ness away, but return a const object afterwards, - // so we can ensure that no modifications take place. 
- return std::unique_ptr(new Permutation{ - exec, size, gko::detail::array_const_cast(std::move(perm_idxs)), - enabled_permute}); - } + std::shared_ptr exec, + gko::detail::const_array_view&& perm_idxs); protected: /** @@ -247,32 +242,12 @@ class Permutation : public EnableLinOp>, * * @param exec Executor associated to the LinOp */ - Permutation(std::shared_ptr exec) - : Permutation(std::move(exec), dim<2>{}) - {} - - /** - * Creates uninitialized Permutation arrays of the specified size. - * - * @param exec Executor associated to the matrix - * @param size size of the permutable matrix - * @param enabled_permute mask for the type of permutation to apply. - */ - Permutation(std::shared_ptr exec, const dim<2>& size, - const mask_type& enabled_permute = row_permute) - : EnableLinOp(exec, size), - permutation_(exec, size[0]), - row_size_(size[0]), - col_size_(size[1]), - enabled_permute_(enabled_permute) - {} + Permutation(std::shared_ptr exec, size_type = 0); /** * Creates a Permutation matrix from an already allocated (and initialized) * row and column permutation arrays. * - * @tparam IndicesArray type of array of indices - * * @param exec Executor associated to the matrix * @param size size of the permutation array. * @param permutation_indices array of permutation array @@ -282,71 +257,34 @@ class Permutation : public EnableLinOp>, * IndexType, or is on the wrong executor, an internal copy will be created, * and the original array data will not be used in the matrix. */ - template - Permutation(std::shared_ptr exec, const dim<2>& size, - IndicesArray&& permutation_indices, - const mask_type& enabled_permute = row_permute) - : EnableLinOp(exec, size), - permutation_{exec, std::forward(permutation_indices)}, - row_size_(size[0]), - col_size_(size[1]), - enabled_permute_(enabled_permute) - { - if (enabled_permute_ & row_permute) { - GKO_ASSERT_EQ(size[0], permutation_.get_num_elems()); - } - if (enabled_permute_ & column_permute) { - GKO_ASSERT_EQ(size[1], permutation_.get_num_elems()); - } - } + Permutation(std::shared_ptr exec, + array permutation_indices); - void apply_impl(const LinOp* in, LinOp* out) const override - { - auto perm = as>(in); - std::unique_ptr tmp{}; - if (enabled_permute_ & inverse_permute) { - if (enabled_permute_ & row_permute) { - tmp = perm->inverse_row_permute(&permutation_); - } - if (enabled_permute_ & column_permute) { - if (enabled_permute_ & row_permute) { - tmp = as>(tmp.get()) - ->inverse_column_permute(&permutation_); - } else { - tmp = perm->inverse_column_permute(&permutation_); - } - } - } else { - if (enabled_permute_ & row_permute) { - tmp = perm->row_permute(&permutation_); - } - if (enabled_permute_ & column_permute) { - if (enabled_permute_ & row_permute) { - tmp = as>(tmp.get())->column_permute( - &permutation_); - } else { - tmp = perm->column_permute(&permutation_); - } - } - } - out->move_from(tmp); - } + [[deprecated( + "dim<2> is no longer supported as a dimension parameter, use size_type " + "instead")]] Permutation(std::shared_ptr exec, + const dim<2>& size); + [[deprecated("permute mask is no longer supported")]] Permutation( + std::shared_ptr exec, const dim<2>& size, + const mask_type& enabled_permute); - void apply_impl(const LinOp*, const LinOp* in, const LinOp*, - LinOp* out) const override - { - // Ignores alpha and beta and just performs a normal permutation as an - // advanced apply does not really make sense here. 
- this->apply_impl(in, out); - } + [[deprecated("use the overload without dimensions")]] Permutation( + std::shared_ptr exec, const dim<2>& size, + array permutation_indices); + + [[deprecated("permute mask is no longer supported")]] Permutation( + std::shared_ptr exec, const dim<2>& size, + array permutation_indices, + const mask_type& enabled_permute); + void apply_impl(const LinOp* in, LinOp* out) const override; + + void apply_impl(const LinOp*, const LinOp* in, const LinOp*, + LinOp* out) const override; private: array permutation_; - size_type row_size_; - size_type col_size_; - mask_type enabled_permute_; }; diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 72ba6827f2b..ab0807194c5 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -177,6 +177,7 @@ class Rcm : public EnablePolymorphicObject, // The adjacency matrix has to be square. GKO_ASSERT_IS_SQUARE_MATRIX(args.system_matrix); + const auto num_rows = args.system_matrix->get_size()[0]; // This is needed because it does not make sense to call the copy and // convert if the existing matrix is empty. if (args.system_matrix->get_size()) { @@ -187,13 +188,12 @@ class Rcm : public EnablePolymorphicObject, adjacency_matrix = tmp->to_adjacency_matrix(); } - auto const dim = adjacency_matrix->get_size(); - permutation_ = PermutationMatrix::create(cpu_exec, dim); + permutation_ = PermutationMatrix::create(cpu_exec, num_rows); // To make it explicit. inv_permutation_ = nullptr; if (parameters_.construct_inverse_permutation) { - inv_permutation_ = PermutationMatrix::create(cpu_exec, dim); + inv_permutation_ = PermutationMatrix::create(cpu_exec, num_rows); } this->generate(cpu_exec, std::move(adjacency_matrix)); @@ -201,19 +201,19 @@ class Rcm : public EnablePolymorphicObject, // Copy back results to gpu if necessary. 
if (is_gpu_executor) { const auto gpu_exec = this->get_executor(); - auto gpu_perm = share(PermutationMatrix::create(gpu_exec, dim)); + auto gpu_perm = + share(PermutationMatrix::create(gpu_exec, num_rows)); gpu_perm->copy_from(permutation_); permutation_ = gpu_perm; if (inv_permutation_) { auto gpu_inv_perm = - share(PermutationMatrix::create(gpu_exec, dim)); + share(PermutationMatrix::create(gpu_exec, num_rows)); gpu_inv_perm->copy_from(inv_permutation_); inv_permutation_ = gpu_inv_perm; } } - auto permutation_array = - make_array_view(this->get_executor(), permutation_->get_size()[0], - permutation_->get_permutation()); + auto permutation_array = make_array_view( + this->get_executor(), num_rows, permutation_->get_permutation()); this->set_permutation_array(permutation_array); } diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index f388922f05d..d7b43ce9495 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -93,9 +93,9 @@ class Csr : public ::testing::Test { mtx3_unsorted( Mtx::create(exec, gko::dim<2>(3, 3), 7, std::make_shared())), - perm3(Perm::create(exec, 3, gko::array{exec, {1, 2, 0}})), + perm3(Perm::create(exec, gko::array{exec, {1, 2, 0}})), perm3_rev(perm3->invert()), - perm2(Perm::create(exec, 2, gko::array{exec, {1, 0}})), + perm2(Perm::create(exec, gko::array{exec, {1, 0}})), perm0(Perm::create(exec)), scale_perm3(ScaledPerm::create( exec, gko::array{this->exec, {2.0, 3.0, 5.0}}, diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index 65e092dfcd5..1301276a424 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -68,7 +68,7 @@ TYPED_TEST(Permutation, Invert) { using index_type = typename TestFixture::index_type; auto perm = gko::matrix::Permutation::create( - this->exec, 3, gko::array{this->exec, {1, 2, 0}}); + this->exec, gko::array{this->exec, {1, 2, 0}}); auto inv = perm->invert(); @@ -104,7 +104,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToDense) index_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); + this->exec, gko::make_array_view(this->exec, 2, rdata)); perm->apply(x, y); // clang-format off From c70dbf723a971c4f73d32c44924206739c648e4d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 9 Oct 2023 17:26:49 +0200 Subject: [PATCH 436/583] swap order of scaling and permutation --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 16 ++-- common/unified/matrix/csr_kernels.cpp | 6 +- .../unified/matrix/dense_kernels.template.cpp | 30 +++++--- .../matrix/scaled_permutation_kernels.cpp | 6 +- core/matrix/scaled_permutation.cpp | 4 +- .../ginkgo/core/matrix/scaled_permutation.hpp | 2 +- omp/matrix/csr_kernels.cpp | 10 +-- reference/matrix/csr_kernels.cpp | 16 ++-- reference/matrix/dense_kernels.cpp | 75 +++++++++++-------- .../matrix/scaled_permutation_kernels.cpp | 5 +- reference/test/matrix/scaled_permutation.cpp | 10 +-- 11 files changed, 101 insertions(+), 79 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 3a762ad5ad1..757cd13e8d6 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -813,7 +813,7 @@ __global__ __launch_bounds__(default_block_size) void row_scale_permute( auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { 
out_cols[out_begin + i] = in_cols[in_begin + i]; - out_vals[out_begin + i] = in_vals[in_begin + i] * scale[out_row]; + out_vals[out_begin + i] = in_vals[in_begin + i] * scale[in_row]; } } @@ -840,7 +840,7 @@ __global__ __launch_bounds__(default_block_size) void inv_row_scale_permute( auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { out_cols[out_begin + i] = in_cols[in_begin + i]; - out_vals[out_begin + i] = in_vals[in_begin + i] / scale[in_row]; + out_vals[out_begin + i] = in_vals[in_begin + i] / scale[out_row]; } } @@ -866,10 +866,10 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_scale_permute( auto in_size = in_row_ptrs[in_row + 1] - in_begin; auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { - const auto in_col = in_cols[in_begin + i]; - out_cols[out_begin + i] = permutation[in_col]; + const auto out_col = permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; out_vals[out_begin + i] = - in_vals[in_begin + i] / (scale[in_row] * scale[in_col]); + in_vals[in_begin + i] / (scale[out_row] * scale[out_col]); } } @@ -897,10 +897,10 @@ __global__ __launch_bounds__(default_block_size) void inv_nonsymm_scale_permute( auto in_size = in_row_ptrs[in_row + 1] - in_begin; auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { - const auto in_col = in_cols[in_begin + i]; - out_cols[out_begin + i] = col_permutation[in_col]; + const auto out_col = col_permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; out_vals[out_begin + i] = - in_vals[in_begin + i] / (row_scale[in_row] * col_scale[in_col]); + in_vals[in_begin + i] / (row_scale[out_row] * col_scale[out_col]); } } diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 4746f88ddfe..10c8d8cd08e 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -103,9 +103,9 @@ void inv_col_scale_permute(std::shared_ptr exec, auto in_vals, auto out_row_ptrs, auto out_col_idxs, auto out_vals) { if (tid < num_nonzeros) { - const auto in_col = in_col_idxs[tid]; - out_col_idxs[tid] = permutation[in_col]; - out_vals[tid] = in_vals[tid] / scale[in_col]; + const auto out_col = permutation[in_col_idxs[tid]]; + out_col_idxs[tid] = out_col; + out_vals[tid] = in_vals[tid] / scale[out_col]; } if (tid <= num_rows) { out_row_ptrs[tid] = in_row_ptrs[tid]; diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index f3723ae8aad..c04f9c14d4c 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -539,7 +539,9 @@ void symm_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(i, j) = scale[i] * scale[j] * orig(perm[i], perm[j]); + const auto row = perm[i]; + const auto col = perm[j]; + permuted(i, j) = scale[row] * scale[col] * orig(row, col); }, orig->get_size(), scale, perm, orig, permuted); } @@ -555,7 +557,9 @@ void inv_symm_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(perm[i], perm[j]) = orig(i, j) / (scale[i] * scale[j]); + const auto row = perm[i]; + const auto col = perm[j]; + permuted(row, col) = orig(i, j) / (scale[row] * scale[col]); }, orig->get_size(), scale, perm, orig, permuted); } @@ -574,8 +578,9 @@ 
void nonsymm_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, auto col_scale, auto col_perm, auto orig, auto permuted) { - permuted(i, j) = - row_scale[i] * col_scale[j] * orig(row_perm[i], col_perm[j]); + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted(i, j) = row_scale[row] * col_scale[col] * orig(row, col); }, orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, permuted); @@ -595,8 +600,9 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, auto col_scale, auto col_perm, auto orig, auto permuted) { - permuted(row_perm[i], row_perm[j]) = - orig(i, j) / (row_scale[i] * col_scale[j]); + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted(row, col) = orig(i, j) / (row_scale[i] * col_scale[j]); }, orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, permuted); @@ -613,7 +619,8 @@ void row_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(i, j) = scale[i] * orig(perm[i], j); + const auto row = perm[i]; + permuted(i, j) = scale[row] * orig(row, j); }, orig->get_size(), scale, perm, orig, permuted); } @@ -629,7 +636,8 @@ void inv_row_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(perm[i], j) = orig(i, j) / scale[i]; + const auto row = perm[i]; + permuted(row, j) = orig(i, j) / scale[row]; }, orig->get_size(), scale, perm, orig, permuted); } @@ -645,7 +653,8 @@ void col_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(i, j) = scale[j] * orig(i, perm[j]); + const auto col = perm[j]; + permuted(i, j) = scale[col] * orig(i, col); }, orig->get_size(), scale, perm, orig, permuted); } @@ -661,7 +670,8 @@ void inv_col_scale_permute(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, auto permuted) { - permuted(i, perm[j]) = orig(i, j) / scale[j]; + const auto col = perm[j]; + permuted(i, col) = orig(i, j) / scale[col]; }, orig->get_size(), scale, perm, orig, permuted); } diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp index 7bebe4c4778..27a70e6c8ab 100644 --- a/common/unified/matrix/scaled_permutation_kernels.cpp +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -55,9 +55,9 @@ void invert(std::shared_ptr exec, exec, [] GKO_KERNEL(auto i, auto input_permutation, auto input_scale, auto output_permutation, auto output_scale) { - output_permutation[input_permutation[i]] = i; - output_scale[input_permutation[i]] = - one(input_scale[i]) / input_scale[i]; + const auto ip = input_permutation[i]; + output_permutation[ip] = i; + output_scale[i] = one(input_scale[ip]) / input_scale[ip]; }, size, input_permutation, input_scale, output_permutation, output_scale); } diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index 435b928a6b2..b6545ee68b4 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -125,8 +125,8 @@ void ScaledPermutation::write( data.nonzeros.clear(); data.nonzeros.reserve(data.size[0]); for (IndexType row = 0; row < this->get_size()[0]; row++) { - data.nonzeros.emplace_back(row, host_this->get_const_permutation()[row], - host_this->get_const_scale()[row]); + 
auto col = host_this->get_const_permutation()[row]; + data.nonzeros.emplace_back(row, col, host_this->get_const_scale()[col]); } } diff --git a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp index 0a5a2d781e7..46d17ecbb75 100644 --- a/include/ginkgo/core/matrix/scaled_permutation.hpp +++ b/include/ginkgo/core/matrix/scaled_permutation.hpp @@ -51,7 +51,7 @@ namespace matrix { /** * ScaledPermutation is a matrix combining a permutation with scaling factors. * It is a combination of Diagonal and Permutation, and can be read as - * $SP = S \cdot P$, i.e. the scaling gets applied after the permutation. + * $SP = P \pdot S$, i.e. the scaling gets applied before the permutation. * * @tparam IndexType index type of permutation indices * @tparam ValueType value type of the scaling factors diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 29459a264c4..ca876d29199 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1079,10 +1079,10 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, auto dst_begin = p_row_ptrs[dst_row]; auto row_size = in_row_ptrs[src_row + 1] - src_begin; for (IndexType i = 0; i < row_size; ++i) { - const auto in_col = in_col_idxs[src_begin + i]; - p_col_idxs[dst_begin + i] = col_perm[in_col]; + const auto out_col = col_perm[in_col_idxs[src_begin + i]]; + p_col_idxs[dst_begin + i] = out_col; p_vals[dst_begin + i] = in_vals[src_begin + i] / - (row_scale[src_row] * col_scale[in_col]); + (row_scale[dst_row] * col_scale[out_col]); } } } @@ -1123,7 +1123,7 @@ void row_scale_permute(std::shared_ptr exec, std::copy_n(orig_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin); for (IndexType i = 0; i < row_size; i++) { - rp_vals[i + dst_begin] = orig_vals[i + src_begin] * scale[dst_row]; + rp_vals[i + dst_begin] = orig_vals[i + src_begin] * scale[src_row]; } } } @@ -1164,7 +1164,7 @@ void inv_row_scale_permute(std::shared_ptr exec, std::copy_n(orig_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin); for (IndexType i = 0; i < row_size; i++) { - rp_vals[i + dst_begin] = orig_vals[i + src_begin] / scale[src_row]; + rp_vals[i + dst_begin] = orig_vals[i + src_begin] / scale[dst_row]; } } } diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index d87e72bc5ab..c45ad22177c 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1028,10 +1028,10 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, auto dst_begin = p_row_ptrs[dst_row]; auto row_size = in_row_ptrs[src_row + 1] - src_begin; for (IndexType i = 0; i < row_size; ++i) { - const auto in_col = in_col_idxs[src_begin + i]; - p_col_idxs[dst_begin + i] = col_perm[in_col]; + const auto out_col = col_perm[in_col_idxs[src_begin + i]]; + p_col_idxs[dst_begin + i] = out_col; p_vals[dst_begin + i] = in_vals[src_begin + i] / - (row_scale[src_row] * col_scale[in_col]); + (row_scale[dst_row] * col_scale[out_col]); } } } @@ -1068,7 +1068,7 @@ void row_scale_permute(std::shared_ptr exec, const auto row_size = in_row_ptrs[src_row + 1] - src_begin; std::copy_n(in_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin); for (IndexType i = 0; i < row_size; i++) { - rp_vals[i + dst_begin] = in_vals[i + src_begin] * scale[dst_row]; + rp_vals[i + dst_begin] = in_vals[i + src_begin] * scale[src_row]; } } } @@ -1105,7 +1105,7 @@ void inv_row_scale_permute(std::shared_ptr exec, auto row_size = in_row_ptrs[src_row + 1] - src_begin; std::copy_n(in_col_idxs + src_begin, row_size, 
rp_col_idxs + dst_begin); for (IndexType i = 0; i < row_size; i++) { - rp_vals[i + dst_begin] = in_vals[i + src_begin] / scale[src_row]; + rp_vals[i + dst_begin] = in_vals[i + src_begin] / scale[dst_row]; } } } @@ -1133,9 +1133,9 @@ void inv_col_scale_permute(std::shared_ptr exec, auto row_end = in_row_ptrs[row + 1]; cp_row_ptrs[row] = in_row_ptrs[row]; for (auto k = row_begin; k < row_end; ++k) { - const auto in_col = in_col_idxs[k]; - cp_col_idxs[k] = perm[in_col]; - cp_vals[k] = in_vals[k] / scale[in_col]; + const auto out_col = perm[in_col_idxs[k]]; + cp_col_idxs[k] = out_col; + cp_vals[k] = in_vals[k] / scale[out_col]; } } cp_row_ptrs[num_rows] = in_row_ptrs[num_rows]; diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 3b28336db11..8b35dcbe6af 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -975,8 +975,8 @@ void col_permute(std::shared_ptr exec, const IndexType* perm, const matrix::Dense* orig, matrix::Dense* col_permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { col_permuted->at(i, j) = orig->at(i, perm[j]); } } @@ -1009,8 +1009,8 @@ void inv_col_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* col_permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { col_permuted->at(i, perm[j]) = orig->at(i, j); } } @@ -1026,10 +1026,11 @@ void symm_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(i, j) = - scale[i] * scale[j] * orig->at(perm[i], perm[j]); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = perm[i]; + const auto col = perm[j]; + permuted->at(i, j) = scale[row] * scale[col] * orig->at(row, col); } } } @@ -1044,10 +1045,11 @@ void inv_symm_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(perm[i], perm[j]) = - orig->at(i, j) / (scale[i] * scale[j]); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = perm[i]; + const auto col = perm[j]; + permuted->at(row, col) = orig->at(i, j) / (scale[row] * scale[col]); } } } @@ -1065,10 +1067,12 @@ void nonsymm_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(i, j) = row_scale[i] * col_scale[j] * - orig->at(row_perm[i], col_perm[j]); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted->at(i, j) = + row_scale[row] * col_scale[col] * orig->at(row, col); } } } @@ -1086,10 +1090,13 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for 
(size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(row_perm[i], col_perm[j]) = - orig->at(i, j) / (row_scale[i] * col_scale[j]); + // TODO this was broken in common, why did the test not pick it up? + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted->at(row, col) = + orig->at(i, j) / (row_scale[row] * col_scale[col]); } } } @@ -1104,9 +1111,10 @@ void row_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(i, j) = scale[i] * orig->at(perm[i], j); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = perm[i]; + permuted->at(i, j) = scale[row] * orig->at(row, j); } } } @@ -1121,9 +1129,10 @@ void inv_row_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(perm[i], j) = orig->at(i, j) / scale[i]; + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto row = perm[i]; + permuted->at(row, j) = orig->at(i, j) / scale[row]; } } } @@ -1138,9 +1147,10 @@ void col_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(i, j) = scale[j] * orig->at(i, perm[j]); + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto col = perm[j]; + permuted->at(i, j) = scale[col] * orig->at(i, col); } } } @@ -1155,9 +1165,10 @@ void inv_col_scale_permute(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* permuted) { - for (size_type j = 0; j < orig->get_size()[1]; ++j) { - for (size_type i = 0; i < orig->get_size()[0]; ++i) { - permuted->at(i, perm[j]) = orig->at(i, j) / scale[j]; + for (size_type i = 0; i < orig->get_size()[0]; ++i) { + for (size_type j = 0; j < orig->get_size()[1]; ++j) { + const auto col = perm[j]; + permuted->at(i, col) = orig->at(i, j) / scale[col]; } } } diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp index 54a68fbdf0a..f0b83128c66 100644 --- a/reference/matrix/scaled_permutation_kernels.cpp +++ b/reference/matrix/scaled_permutation_kernels.cpp @@ -49,8 +49,9 @@ void invert(std::shared_ptr exec, ValueType* output_scale) { for (size_type i = 0; i < size; i++) { - output_permutation[input_permutation[i]] = i; - output_scale[input_permutation[i]] = one() / input_scale[i]; + const auto ip = input_permutation[i]; + output_permutation[ip] = i; + output_scale[i] = one() / input_scale[ip]; } } diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index a15c0f09bbf..d2968692761 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -84,9 +84,9 @@ TYPED_TEST(ScaledPermutation, Invert) EXPECT_EQ(inv->get_const_permutation()[0], 2); EXPECT_EQ(inv->get_const_permutation()[1], 0); EXPECT_EQ(inv->get_const_permutation()[2], 1); - 
EXPECT_EQ(inv->get_const_scale()[0], T{0.25}); - EXPECT_EQ(inv->get_const_scale()[1], T{1.0}); - EXPECT_EQ(inv->get_const_scale()[2], T{0.5}); + EXPECT_EQ(inv->get_const_scale()[0], T{0.5}); + EXPECT_EQ(inv->get_const_scale()[1], T{0.25}); + EXPECT_EQ(inv->get_const_scale()[2], T{1.0}); } @@ -95,7 +95,7 @@ TYPED_TEST(ScaledPermutation, Write) using T = typename TestFixture::value_type; GKO_ASSERT_MTX_NEAR( - this->perm3, l({{0.0, 1.0, 0.0}, {0.0, 0.0, 2.0}, {4.0, 0.0, 0.0}}), + this->perm3, l({{0.0, 2.0, 0.0}, {0.0, 0.0, 4.0}, {1.0, 0.0, 0.0}}), 0.0); } @@ -109,7 +109,7 @@ TYPED_TEST(ScaledPermutation, AppliesToDense) this->perm2->apply(x, y); - GKO_ASSERT_MTX_NEAR(y, l({{12.0, 7.5}, {10.0, 15.0}}), 0.0); + GKO_ASSERT_MTX_NEAR(y, l({{20.0, 12.5}, {6.0, 9.0}}), 0.0); } From 270ba1734a8c07a140aee28c918fe79d7e369a10 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 12 Oct 2023 15:06:49 +0200 Subject: [PATCH 437/583] add SYCL kernels --- dpcpp/matrix/csr_kernels.dp.cpp | 305 +++++++++++++++++++++++++++++++- 1 file changed, 304 insertions(+), 1 deletion(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index f05692c1929..ab57b3c072e 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1170,6 +1170,201 @@ void inv_nonsymm_permute_kernel( }); } + +template +void row_scale_permute_kernel(size_type num_rows, + const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, + ValueType* __restrict__ out_vals, + sycl::nd_item<3> item_ct1) +{ + auto tid = thread::get_subwarp_id_flat(item_ct1); + if (tid >= num_rows) { + return; + } + auto lane = item_ct1.get_local_id(2) % subgroup_size; + auto in_row = permutation[tid]; + auto out_row = tid; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subgroup_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] * scale[in_row]; + } +} + +template +void row_scale_permute_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const ValueType* scale, const IndexType* permutation, + const IndexType* in_row_ptrs, const IndexType* in_cols, + const ValueType* in_vals, const IndexType* out_row_ptrs, + IndexType* out_cols, ValueType* out_vals) +{ + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + row_scale_permute_kernel( + num_rows, scale, permutation, in_row_ptrs, in_cols, in_vals, + out_row_ptrs, out_cols, out_vals, item_ct1); + }); + }); +} + + +template +void inv_row_scale_permute_kernel(size_type num_rows, + const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, + ValueType* __restrict__ out_vals, + sycl::nd_item<3> item_ct1) +{ + auto tid = thread::get_subwarp_id_flat(item_ct1); + if (tid >= num_rows) { + return; + } + auto lane = item_ct1.get_local_id(2) % subgroup_size; + auto in_row = tid; + auto out_row = permutation[tid]; + 
auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subgroup_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] / scale[out_row]; + } +} + +template +void inv_row_scale_permute_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const ValueType* scale, const IndexType* permutation, + const IndexType* in_row_ptrs, const IndexType* in_cols, + const ValueType* in_vals, const IndexType* out_row_ptrs, + IndexType* out_cols, ValueType* out_vals) +{ + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inv_row_scale_permute_kernel( + num_rows, scale, permutation, in_row_ptrs, in_cols, in_vals, + out_row_ptrs, out_cols, out_vals, item_ct1); + }); + }); +} + + +template +void inv_symm_scale_permute_kernel(size_type num_rows, + const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, + ValueType* __restrict__ out_vals, + sycl::nd_item<3> item_ct1) +{ + auto tid = thread::get_subwarp_id_flat(item_ct1); + if (tid >= num_rows) { + return; + } + auto lane = item_ct1.get_local_id(2) % subgroup_size; + auto in_row = tid; + auto out_row = permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subgroup_size) { + const auto out_col = permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (scale[out_row] * scale[out_col]); + } +} + + +template +void inv_nonsymm_scale_permute_kernel( + size_type num_rows, const ValueType* __restrict__ row_scale, + const IndexType* __restrict__ row_permutation, + const ValueType* __restrict__ col_scale, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals, + sycl::nd_item<3> item_ct1) +{ + auto tid = thread::get_subwarp_id_flat(item_ct1); + if (tid >= num_rows) { + return; + } + auto lane = item_ct1.get_local_id(2) % subgroup_size; + auto in_row = tid; + auto out_row = row_permutation[tid]; + auto in_begin = in_row_ptrs[in_row]; + auto in_size = in_row_ptrs[in_row + 1] - in_begin; + auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subgroup_size) { + const auto out_col = col_permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (row_scale[out_row] * col_scale[out_col]); + } +} + +template +void inv_symm_scale_permute_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const ValueType* scale, const IndexType* permutation, + const IndexType* in_row_ptrs, const IndexType* in_cols, + const ValueType* in_vals, const IndexType* out_row_ptrs, + IndexType* out_cols, ValueType* out_vals) +{ + queue->submit([&](sycl::handler& cgh) { + 
cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inv_symm_scale_permute_kernel( + num_rows, scale, permutation, in_row_ptrs, in_cols, in_vals, + out_row_ptrs, out_cols, out_vals, item_ct1); + }); + }); +} + +template +void inv_nonsymm_scale_permute_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const ValueType* row_scale, + const IndexType* row_permutation, const ValueType* col_scale, + const IndexType* col_permutation, const IndexType* in_row_ptrs, + const IndexType* in_cols, const ValueType* in_vals, + const IndexType* out_row_ptrs, IndexType* out_cols, ValueType* out_vals) +{ + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inv_nonsymm_scale_permute_kernel( + num_rows, row_scale, row_permutation, col_scale, + col_permutation, in_row_ptrs, in_cols, in_vals, + out_row_ptrs, out_cols, out_vals, item_ct1); + }); + }); +} + namespace host_kernel { @@ -2329,7 +2524,7 @@ void inv_nonsymm_permute(std::shared_ptr exec, num_rows + 1); auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - inv_symm_permute_kernel( + inv_nonsymm_permute_kernel( copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, row_perm, col_perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), orig->get_const_values(), @@ -2393,6 +2588,114 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + inv_row_ptr_permute_kernel( + count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + inv_symm_scale_permute_kernel( + copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + orig->get_const_values(), permuted->get_row_ptrs(), + permuted->get_col_idxs(), permuted->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + inv_row_ptr_permute_kernel( + count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + row_perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + inv_nonsymm_scale_permute_kernel( + copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + row_scale, row_perm, col_scale, col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), orig->get_const_values(), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + permuted->get_values()); +} + 
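// Note: the scale-permute kernels above follow the SP = P * S convention
// introduced earlier in this series (scaling applied before permutation).
// The forward row variant computes B(i, :) = scale[perm[i]] * A(perm[i], :),
// i.e. B = (P S) A, while the inverse row variant computes
// B(perm[i], :) = A(i, :) / scale[perm[i]], i.e. B = (P S)^-1 A; the
// symmetric and nonsymmetric variants apply the analogous (inverse) scaling
// and permutation to the column indices as well.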
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + row_ptr_permute_kernel( + count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + row_scale_permute_kernel( + copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + orig->get_const_values(), row_permuted->get_row_ptrs(), + row_permuted->get_col_idxs(), row_permuted->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + inv_row_ptr_permute_kernel( + count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + inv_row_scale_permute_kernel( + copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows, + scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + orig->get_const_values(), row_permuted->get_row_ptrs(), + row_permuted->get_col_idxs(), row_permuted->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); + + template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) From 03a7288907123fee7edb6ef159ea7d670ad5889e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 12 Oct 2023 22:54:01 +0200 Subject: [PATCH 438/583] add permutation combination functions --- common/unified/matrix/permutation_kernels.cpp | 18 +++++ .../matrix/scaled_permutation_kernels.cpp | 37 +++++++-- core/device_hooks/common_kernels.inc.cpp | 2 + core/matrix/permutation.cpp | 38 +++++++-- core/matrix/permutation_kernels.hpp | 16 +++- core/matrix/scaled_permutation.cpp | 78 +++++++++++++++++-- core/matrix/scaled_permutation_kernels.hpp | 25 ++++-- include/ginkgo/core/matrix/permutation.hpp | 12 +++ .../ginkgo/core/matrix/scaled_permutation.hpp | 62 ++++++++++----- reference/matrix/permutation_kernels.cpp | 14 ++++ .../matrix/scaled_permutation_kernels.cpp | 26 ++++++- reference/test/matrix/permutation.cpp | 48 +++++++++++- reference/test/matrix/scaled_permutation.cpp | 63 +++++++++++++++ test/matrix/permutation_kernels.cpp | 17 +++- test/matrix/scaled_permutation_kernels.cpp | 21 ++++- 15 files changed, 422 insertions(+), 55 deletions(-) diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp index 58b82c1602e..d94620aca75 100644 --- a/common/unified/matrix/permutation_kernels.cpp +++ b/common/unified/matrix/permutation_kernels.cpp @@ -61,6 +61,24 @@ void 
invert(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); +template +void combine(std::shared_ptr exec, + const IndexType* first_permutation, + const IndexType* second_permutation, size_type size, + IndexType* output_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto first_permutation, auto second_permutation, + auto output_permutation) { + output_permutation[i] = second_permutation[first_permutation[i]]; + }, + size, first_permutation, second_permutation, output_permutation); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMBINE_KERNEL); + + } // namespace permutation } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp index 27a70e6c8ab..46219d45d66 100644 --- a/common/unified/matrix/scaled_permutation_kernels.cpp +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -47,25 +47,50 @@ namespace scaled_permutation { template void invert(std::shared_ptr exec, - const IndexType* input_permutation, const ValueType* input_scale, - size_type size, IndexType* output_permutation, - ValueType* output_scale) + const ValueType* input_scale, const IndexType* input_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) { run_kernel( exec, - [] GKO_KERNEL(auto i, auto input_permutation, auto input_scale, - auto output_permutation, auto output_scale) { + [] GKO_KERNEL(auto i, auto input_scale, auto input_permutation, + auto output_scale, auto output_permutation) { const auto ip = input_permutation[i]; output_permutation[ip] = i; output_scale[i] = one(input_scale[ip]) / input_scale[ip]; }, - size, input_permutation, input_scale, output_permutation, output_scale); + size, input_scale, input_permutation, output_scale, output_permutation); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); +template +void combine(std::shared_ptr exec, + const ValueType* first_scale, const IndexType* first_permutation, + const ValueType* second_scale, const IndexType* second_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto first_scale, auto first_permutation, + auto second_scale, auto second_permutation, + auto output_permutation, auto output_scale) { + const auto first_permuted = first_permutation[i]; + output_permutation[i] = second_permutation[first_permuted]; + output_scale[first_permuted] = + first_scale[first_permuted] * second_scale[i]; + }, + size, first_scale, first_permutation, second_scale, second_permutation, + output_permutation, output_scale); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_COMBINE_KERNEL); + + } // namespace scaled_permutation } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 3f5d097abac..0c58f1a4c0f 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -730,6 +730,7 @@ namespace permutation { GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMBINE_KERNEL); } // namespace permutation @@ -739,6 +740,7 @@ namespace scaled_permutation { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); 
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMBINE_KERNEL); } // namespace scaled_permutation diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 779bdd964bb..00115d0db68 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -31,12 +31,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include + + +#include +#include +#include +#include +#include + + #include "core/base/dispatch_helper.hpp" #include "core/matrix/permutation_kernels.hpp" -#include "ginkgo/core/base/exception_helpers.hpp" -#include "ginkgo/core/base/executor.hpp" -#include "ginkgo/core/base/precision_dispatch.hpp" -#include "ginkgo/core/base/utils_helper.hpp" namespace gko { @@ -45,9 +50,10 @@ namespace permutation { GKO_REGISTER_OPERATION(invert, permutation::invert); +GKO_REGISTER_OPERATION(combine, permutation::combine); -} +} // namespace permutation template @@ -162,10 +168,26 @@ std::unique_ptr> Permutation::invert() const { const auto exec = this->get_executor(); const auto size = this->get_size()[0]; - array inv_permutation{exec, size}; + auto result = Permutation::create(exec, size); exec->run(permutation::make_invert(this->get_const_permutation(), size, - inv_permutation.get_data())); - return Permutation::create(exec, std::move(inv_permutation)); + result->get_permutation())); + return result; +} + + +template +std::unique_ptr> Permutation::combine( + ptr_param> other) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(this, other); + const auto exec = this->get_executor(); + const auto size = this->get_size()[0]; + const auto local_other = make_temporary_clone(exec, other); + auto result = Permutation::create(exec, size); + exec->run(permutation::make_combine(this->get_const_permutation(), + local_other->get_const_permutation(), + size, result->get_permutation())); + return result; } diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp index a77e0c2f618..d4deb4142ad 100644 --- a/core/matrix/permutation_kernels.hpp +++ b/core/matrix/permutation_kernels.hpp @@ -62,10 +62,18 @@ namespace kernels { const IndexType* permutation_indices, size_type size, \ IndexType* inv_permutation) - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType) +#define GKO_DECLARE_PERMUTATION_COMBINE_KERNEL(IndexType) \ + void combine(std::shared_ptr exec, \ + const IndexType* first_permutation, \ + const IndexType* second_permutation, size_type size, \ + IndexType* combined_permutation) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType); \ + template \ + GKO_DECLARE_PERMUTATION_COMBINE_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(permutation, diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index b6545ee68b4..f3de34bd9b6 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -32,6 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "core/matrix/scaled_permutation_kernels.hpp" +#include "ginkgo/core/base/exception_helpers.hpp" #include "ginkgo/core/base/executor.hpp" #include "ginkgo/core/base/precision_dispatch.hpp" @@ -43,6 +44,7 @@ namespace { GKO_REGISTER_OPERATION(invert, scaled_permutation::invert); +GKO_REGISTER_OPERATION(combine, scaled_permutation::combine); } // namespace @@ -71,19 +73,83 @@ ScaledPermutation::ScaledPermutation( } +template +std::unique_ptr> +ScaledPermutation::create( + std::shared_ptr exec, size_type size) +{ + return std::unique_ptr( + new ScaledPermutation{exec, size}); +} + + +template +std::unique_ptr> +ScaledPermutation::create( + ptr_param> permutation) +{ + const auto exec = permutation->get_executor(); + const auto size = permutation->get_size()[0]; + array scale{exec, size}; + array perm{exec, size}; + exec->copy(size, permutation->get_const_permutation(), perm.get_data()); + scale.fill(one()); + return create(exec, std::move(scale), std::move(perm)); +} + + +template +std::unique_ptr> +ScaledPermutation::create( + std::shared_ptr exec, array scaling_factors, + array permutation_indices) +{ + return std::unique_ptr(new ScaledPermutation{ + exec, std::move(scaling_factors), std::move(permutation_indices)}); +} + + +template +std::unique_ptr> +ScaledPermutation::create_const( + std::shared_ptr exec, + gko::detail::const_array_view&& scale, + gko::detail::const_array_view&& perm_idxs) +{ + return create(exec, gko::detail::array_const_cast(std::move(scale)), + gko::detail::array_const_cast(std::move(perm_idxs))); +} + + template std::unique_ptr> ScaledPermutation::invert() const { const auto exec = this->get_executor(); const auto size = this->get_size()[0]; - array inv_permutation{exec, size}; - array inv_scale{exec, size}; + auto result = ScaledPermutation::create(exec, size); exec->run(scaled_permutation::make_invert( - this->get_const_permutation(), this->get_const_scale(), size, - inv_permutation.get_data(), inv_scale.get_data())); - return ScaledPermutation::create(exec, std::move(inv_scale), - std::move(inv_permutation)); + this->get_const_scale(), this->get_const_permutation(), size, + result->get_scale(), result->get_permutation())); + return result; +} + + +template +std::unique_ptr> +ScaledPermutation::combine( + ptr_param other) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(this, other); + const auto exec = this->get_executor(); + const auto size = this->get_size()[0]; + const auto local_other = make_temporary_clone(exec, other); + auto result = ScaledPermutation::create(exec, size); + exec->run(scaled_permutation::make_combine( + this->get_const_scale(), this->get_const_permutation(), + local_other->get_const_scale(), local_other->get_const_permutation(), + size, result->get_scale(), result->get_permutation())); + return result; } diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp index 905321ea885..360e1a947f1 100644 --- a/core/matrix/scaled_permutation_kernels.hpp +++ b/core/matrix/scaled_permutation_kernels.hpp @@ -45,13 +45,24 @@ namespace kernels { #define GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType) \ void invert(std::shared_ptr exec, \ - const IndexType* input_permutation, \ - const ValueType* input_scale, size_type size, \ - IndexType* output_permutation, ValueType* output_scale) - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType) + const ValueType* input_scale, \ + const IndexType* input_permutation, size_type 
size, \ + ValueType* output_scale, IndexType* output_permutation) + +#define GKO_DECLARE_SCALED_PERMUTATION_COMBINE_KERNEL(ValueType, IndexType) \ + void combine(std::shared_ptr exec, \ + const ValueType* first_scale, \ + const IndexType* first_permutation, \ + const ValueType* second_scale, \ + const IndexType* second_permutation, size_type size, \ + ValueType* output_scale, IndexType* output_permutation) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SCALED_PERMUTATION_COMBINE_KERNEL(ValueType, IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(scaled_permutation, diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index abfffb11248..02401cf698e 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -202,6 +202,18 @@ class Permutation : public EnableLinOp>, */ std::unique_ptr invert() const; + /** + * Combines this permutation with another permutation via composition. + * The resulting permutation fulfills `result[i] = other[this[i]]` + * or `result = other * this` from the matrix perspective, which is + * equivalent to first permuting by `this` and then by `other`. + * + * @param other the other permutation + * @return the combined permutation + */ + std::unique_ptr combine( + ptr_param other) const; + void write(gko::matrix_data& data) const override; /** diff --git a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp index 46d17ecbb75..5008590bcc1 100644 --- a/include/ginkgo/core/matrix/scaled_permutation.hpp +++ b/include/ginkgo/core/matrix/scaled_permutation.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -63,9 +64,7 @@ namespace matrix { template class ScaledPermutation : public EnableLinOp>, - public EnableCreateMethod>, public WritableToMatrixData { - friend class EnableCreateMethod; friend class EnablePolymorphicObject; public: @@ -118,8 +117,51 @@ class ScaledPermutation */ std::unique_ptr invert() const; + /** + * Combines this scaled permutation with another scaled permutation via + * composition. This means `result = other * this` from the matrix + * perspective, which is equivalent to first scaling and permuting by `this` + * and then by `other`. + * + * @param other the other permutation + * @return the combined permutation + */ + std::unique_ptr combine( + ptr_param other) const; + void write(gko::matrix_data& data) const override; + /** + * Creates an uninitialized ScaledPermutation matrix. + * + * @param exec Executor associated to the matrix + * @param size dimensions of the (square) scaled permutation matrix + */ + static std::unique_ptr create( + std::shared_ptr exec, size_type size = 0); + + /** + * Create a ScaledPermutation from a Permutation. + * The permutation will be copied, the scaling factors are all set to 1.0. + * + * @param permutation the permutation + * @return the scaled permutation. + */ + static std::unique_ptr create( + ptr_param> permutation); + + /** + * Creates a ScaledPermutation matrix from already allocated (and + * initialized) arrays. 
+ * + * @param exec Executor associated to the matrix + * @param permutation_indices array of permutation indices + * @param scaling_factors array of scaling factors + */ + static std::unique_ptr create( + std::shared_ptr exec, array scaling_factors, + array permutation_indices); + /** * Creates a constant (immutable) ScaledPermutation matrix from constant * arrays. @@ -137,33 +179,17 @@ class ScaledPermutation gko::detail::const_array_view&& perm_idxs); protected: - /** - * Creates an uninitialized ScaledPermutation matrix. - * - * @param exec Executor associated to the matrix - * @param size dimensions of the (square) scaled permutation matrix - */ ScaledPermutation(std::shared_ptr exec, size_type size = 0); - /** - * Creates a ScaledPermutation matrix from already allocated (and - * initialized) arrays. - * - * @param exec Executor associated to the matrix - * @param permutation_indices array of permutation indices - * @param scaling_factors array of scaling factors - */ ScaledPermutation(std::shared_ptr exec, array scaling_factors, array permutation_indices); void apply_impl(const LinOp* in, LinOp* out) const override; - void apply_impl(const LinOp*, const LinOp* in, const LinOp*, LinOp* out) const override; - private: array scale_; array permutation_; diff --git a/reference/matrix/permutation_kernels.cpp b/reference/matrix/permutation_kernels.cpp index cc7a81a1044..7d295394904 100644 --- a/reference/matrix/permutation_kernels.cpp +++ b/reference/matrix/permutation_kernels.cpp @@ -52,6 +52,20 @@ void invert(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); +template +void combine(std::shared_ptr exec, + const IndexType* first_permutation, + const IndexType* second_permutation, size_type size, + IndexType* output_permutation) +{ + for (size_type i = 0; i < size; i++) { + output_permutation[i] = second_permutation[first_permutation[i]]; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMBINE_KERNEL); + + } // namespace permutation } // namespace reference } // namespace kernels diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp index f0b83128c66..e1d418204e8 100644 --- a/reference/matrix/scaled_permutation_kernels.cpp +++ b/reference/matrix/scaled_permutation_kernels.cpp @@ -44,9 +44,9 @@ namespace scaled_permutation { template void invert(std::shared_ptr exec, - const IndexType* input_permutation, const ValueType* input_scale, - size_type size, IndexType* output_permutation, - ValueType* output_scale) + const ValueType* input_scale, const IndexType* input_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) { for (size_type i = 0; i < size; i++) { const auto ip = input_permutation[i]; @@ -59,6 +59,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); +template +void combine(std::shared_ptr exec, + const ValueType* first_scale, const IndexType* first_permutation, + const ValueType* second_scale, const IndexType* second_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) +{ + // P_2 S_2 P_1 S_1 = P_2 P_1 S'_2 S_1 with S'_2 = P_1^-1 S_2 P_1^-T + for (size_type i = 0; i < size; i++) { + const auto first_permuted = first_permutation[i]; + output_permutation[i] = second_permutation[first_permuted]; + output_scale[first_permuted] = + first_scale[first_permuted] * second_scale[i]; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + 
GKO_DECLARE_SCALED_PERMUTATION_COMBINE_KERNEL); + + } // namespace scaled_permutation } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index 1301276a424..cd260ef6301 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -38,9 +38,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include "core/test/utils.hpp" +#include "ginkgo/core/base/exception.hpp" namespace { @@ -78,11 +80,55 @@ TYPED_TEST(Permutation, Invert) } +TYPED_TEST(Permutation, Combine) +{ + using index_type = typename TestFixture::index_type; + auto perm = gko::matrix::Permutation::create( + this->exec, gko::array{this->exec, {1, 2, 0}}); + auto perm2 = gko::matrix::Permutation::create( + this->exec, gko::array{this->exec, {0, 2, 1}}); + + auto combined = perm->combine(perm2); + + EXPECT_EQ(combined->get_const_permutation()[0], 2); + EXPECT_EQ(combined->get_const_permutation()[1], 1); + EXPECT_EQ(combined->get_const_permutation()[2], 0); +} + + +TYPED_TEST(Permutation, CombineWithInverse) +{ + using index_type = typename TestFixture::index_type; + const gko::size_type size = 20; + auto perm = gko::matrix::Permutation::create(this->exec, size); + std::iota(perm->get_permutation(), perm->get_permutation() + size, 0); + std::shuffle(perm->get_permutation(), perm->get_permutation() + size, + std::default_random_engine{29584}); + + auto combined = perm->combine(perm->invert()); + + for (index_type i = 0; i < size; i++) { + ASSERT_EQ(combined->get_const_permutation()[i], i); + } +} + + +TYPED_TEST(Permutation, CombineFailsWithMismatchingSize) +{ + using index_type = typename TestFixture::index_type; + auto perm = gko::matrix::Permutation::create( + this->exec, gko::array{this->exec, {1, 2, 0}}); + auto perm0 = gko::matrix::Permutation::create(this->exec); + + ASSERT_THROW(perm->combine(perm0), gko::DimensionMismatch); +} + + TYPED_TEST(Permutation, Write) { using index_type = typename TestFixture::index_type; auto perm = gko::matrix::Permutation::create( - this->exec, 3, gko::array{this->exec, {1, 2, 0}}); + this->exec, gko::array{this->exec, {1, 2, 0}}); GKO_ASSERT_MTX_NEAR( perm, l({{0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {1.0, 0.0, 0.0}}), diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index d2968692761..9ca0bb26d8c 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -90,6 +90,69 @@ TYPED_TEST(ScaledPermutation, Invert) } +TYPED_TEST(ScaledPermutation, CreateFromPermutation) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Mtx = typename TestFixture::Mtx; + auto non_scaled = gko::matrix::Permutation::create( + this->exec, gko::array{this->exec, {1, 2, 0}}); + + auto scaled = Mtx::create(non_scaled); + + EXPECT_EQ(scaled->get_const_permutation()[0], 1); + EXPECT_EQ(scaled->get_const_permutation()[1], 2); + EXPECT_EQ(scaled->get_const_permutation()[2], 0); + EXPECT_EQ(scaled->get_const_scale()[0], gko::one()); + EXPECT_EQ(scaled->get_const_scale()[1], gko::one()); + EXPECT_EQ(scaled->get_const_scale()[2], gko::one()); +} + + +TYPED_TEST(ScaledPermutation, Combine) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + using Mtx = typename TestFixture::Mtx; + auto other_perm = Mtx::create( + this->exec, 
gko::array{this->exec, {3.0, 5.0, 7.0}}, + gko::array{this->exec, {1, 0, 2}}); + + auto combined = this->perm3->combine(other_perm); + + EXPECT_EQ(combined->get_const_permutation()[0], 0); + EXPECT_EQ(combined->get_const_permutation()[1], 2); + EXPECT_EQ(combined->get_const_permutation()[2], 1); + EXPECT_EQ(combined->get_const_scale()[0], value_type{7}); + EXPECT_EQ(combined->get_const_scale()[1], value_type{6}); + EXPECT_EQ(combined->get_const_scale()[2], value_type{20}); +} + + +TYPED_TEST(ScaledPermutation, CombineWithInverse) +{ + using T = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + const gko::size_type size = 20; + auto rng = std::default_random_engine{3754}; + auto perm = gko::matrix::Permutation::create(this->exec, size); + std::iota(perm->get_permutation(), perm->get_permutation() + size, 0); + std::shuffle(perm->get_permutation(), perm->get_permutation() + size, rng); + + auto combined = perm->combine(perm->invert()); + + for (index_type i = 0; i < size; i++) { + ASSERT_EQ(combined->get_const_permutation()[i], i); + } +} + + +TYPED_TEST(ScaledPermutation, CombineFailsWithMismatchingSize) +{ + ASSERT_THROW(this->perm3->combine(this->perm2), gko::DimensionMismatch); +} + + TYPED_TEST(ScaledPermutation, Write) { using T = typename TestFixture::value_type; diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp index 037040b8fd4..53086478547 100644 --- a/test/matrix/permutation_kernels.cpp +++ b/test/matrix/permutation_kernels.cpp @@ -52,14 +52,20 @@ class Permutation : public CommonTestFixture { { std::vector tmp(1000, 0); std::iota(tmp.begin(), tmp.end(), 0); + auto tmp2 = tmp; std::shuffle(tmp.begin(), tmp.end(), rand_engine); - permutation = Perm::create(ref, tmp.size(), gko::array(ref, tmp.begin(), tmp.end())); + std::shuffle(tmp2.begin(), tmp2.end(), rand_engine); + permutation = Perm::create( + ref, gko::array(ref, tmp.begin(), tmp.end())); + permutation2 = Perm::create( + ref, gko::array(ref, tmp2.begin(), tmp2.end())); dpermutation = permutation->clone(exec); } std::default_random_engine rand_engine; std::unique_ptr permutation; + std::unique_ptr permutation2; std::unique_ptr dpermutation; }; @@ -71,3 +77,12 @@ TEST_F(Permutation, InvertIsEquivalentToRef) GKO_ASSERT_MTX_EQ_SPARSITY(inv, dinv); } + + +TEST_F(Permutation, CombineIsEquivalentToRef) +{ + auto combined = permutation->combine(permutation2); + auto dcombined = dpermutation->combine(permutation2); + + GKO_ASSERT_MTX_EQ_SPARSITY(combined, dcombined); +} diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp index d85b9735abc..688788fb64a 100644 --- a/test/matrix/scaled_permutation_kernels.cpp +++ b/test/matrix/scaled_permutation_kernels.cpp @@ -52,18 +52,28 @@ class ScaledPermutation : public CommonTestFixture { { std::vector tmp(1000, 0); std::iota(tmp.begin(), tmp.end(), 0); + auto tmp2 = tmp; std::shuffle(tmp.begin(), tmp.end(), rand_engine); + std::shuffle(tmp2.begin(), tmp2.end(), rand_engine); std::vector scale(tmp.size()); + std::vector scale2(tmp2.size()); std::uniform_real_distribution dist(1, 2); auto gen = [&] { return dist(rand_engine); }; std::generate(scale.begin(), scale.end(), gen); - permutation = ScaledPerm::create(ref, gko::array(ref, scale.begin(), scale.end()), gko::array(ref, tmp.begin(), tmp.end())); + std::generate(scale2.begin(), scale2.end(), gen); + permutation = ScaledPerm::create( + ref, gko::array(ref, scale.begin(), scale.end()), + gko::array(ref, tmp.begin(), 
tmp.end())); + permutation2 = ScaledPerm::create( + ref, gko::array(ref, scale2.begin(), scale2.end()), + gko::array(ref, tmp2.begin(), tmp2.end())); dpermutation = permutation->clone(exec); } std::default_random_engine rand_engine; std::unique_ptr permutation; + std::unique_ptr permutation2; std::unique_ptr dpermutation; }; @@ -75,3 +85,12 @@ TEST_F(ScaledPermutation, InvertIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(inv, dinv, r::value); } + + +TEST_F(ScaledPermutation, CombineIsEquivalentToRef) +{ + auto combined = permutation->combine(permutation2); + auto dcombined = dpermutation->combine(permutation2); + + GKO_ASSERT_MTX_NEAR(combined, dcombined, r::value); +} From 430769a51513f574c43ddcce559e1e30f760646c Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 12 Oct 2023 20:56:31 +0000 Subject: [PATCH 439/583] Format files Co-authored-by: Tobias Ribizel --- core/matrix/dense.cpp | 2 +- core/matrix/permutation_kernels.hpp | 4 +--- core/matrix/scaled_permutation.cpp | 10 +++++++--- core/matrix/scaled_permutation_kernels.hpp | 1 + reference/test/matrix/permutation.cpp | 6 ++++-- reference/test/matrix/scaled_permutation.cpp | 4 ++-- test/matrix/csr_kernels2.cpp | 15 +++++++-------- test/matrix/permutation_kernels.cpp | 6 +++--- test/matrix/scaled_permutation_kernels.cpp | 6 +++--- 9 files changed, 29 insertions(+), 25 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 05b5672117b..0f5a9397f27 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -60,7 +61,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/matrix/hybrid_kernels.hpp" -#include "ginkgo/core/base/temporary_clone.hpp" namespace gko { diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp index d4deb4142ad..b5186fdaaf9 100644 --- a/core/matrix/permutation_kernels.hpp +++ b/core/matrix/permutation_kernels.hpp @@ -34,13 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_ -#include - - #include #include #include #include +#include #include #include #include diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index f3de34bd9b6..cb3b5f9bf6c 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -31,10 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include + + +#include +#include +#include + + #include "core/matrix/scaled_permutation_kernels.hpp" -#include "ginkgo/core/base/exception_helpers.hpp" -#include "ginkgo/core/base/executor.hpp" -#include "ginkgo/core/base/precision_dispatch.hpp" namespace gko { diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp index 360e1a947f1..9aa5421fd07 100644 --- a/core/matrix/scaled_permutation_kernels.hpp +++ b/core/matrix/scaled_permutation_kernels.hpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_ #define GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_ + #include diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index cd260ef6301..6d19fa7a9e3 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -33,16 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include +#include #include #include -#include #include "core/test/utils.hpp" -#include "ginkgo/core/base/exception.hpp" namespace { diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index 9ca0bb26d8c..8a5fbe9f6c5 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include +#include #include "core/test/utils.hpp" diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 9e8355c284d..0884e203733 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -918,14 +918,11 @@ TEST_F(Csr, IsGenericPermutableRectangular) auto rpermuted = mtx->permute(rpermutation, permute_mode::rows); auto drpermuted = dmtx->permute(rpermutation, permute_mode::rows); - auto irpermuted = - mtx->permute(rpermutation, permute_mode::inverse_rows); - auto dirpermuted = - dmtx->permute(rpermutation, permute_mode::inverse_rows); + auto irpermuted = mtx->permute(rpermutation, permute_mode::inverse_rows); + auto dirpermuted = dmtx->permute(rpermutation, permute_mode::inverse_rows); auto cpermuted = mtx->permute(cpermutation, permute_mode::columns); auto dcpermuted = dmtx->permute(cpermutation, permute_mode::columns); - auto icpermuted = - mtx->permute(cpermutation, permute_mode::inverse_columns); + auto icpermuted = mtx->permute(cpermutation, permute_mode::inverse_columns); auto dicpermuted = dmtx->permute(cpermutation, permute_mode::inverse_columns); @@ -1021,8 +1018,10 @@ TEST_F(Csr, IsNonsymmScalePermutable) for (auto invert : {false, true}) { SCOPED_TRACE(invert); - auto permuted = mtx->scale_permute(srpermutation, scpermutation, invert); - auto dpermuted = dmtx->scale_permute(srpermutation, scpermutation, invert); + auto permuted = + mtx->scale_permute(srpermutation, scpermutation, invert); + auto dpermuted = + dmtx->scale_permute(srpermutation, scpermutation, invert); GKO_EXPECT_MTX_NEAR(permuted, dpermuted, r::value); GKO_EXPECT_MTX_EQ_SPARSITY(permuted, dpermuted); diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp index 53086478547..f04d7a9e58b 100644 --- a/test/matrix/permutation_kernels.cpp +++ b/test/matrix/permutation_kernels.cpp @@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include - - #include #include @@ -40,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include
+#include
+
+
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"
diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp
index 688788fb64a..249ffe8867b 100644
--- a/test/matrix/scaled_permutation_kernels.cpp
+++ b/test/matrix/scaled_permutation_kernels.cpp
@@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************/
 
-#include
-
-
 #include
 #include
@@ -40,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
+#include
+
+
 #include "core/test/utils.hpp"
 #include "test/utils/executor.hpp"

From bb856dfeee9c8fcecabdac5ab49d1f645a4c5e0b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 18 Oct 2023 13:11:45 +0200
Subject: [PATCH 440/583] fix warning

---
 core/factorization/factorization.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index 436359a417a..5877124bf77 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -112,6 +112,7 @@ Factorization::unpack() const
     }
     case storage_type::combined_ldu:
     case storage_type::symm_combined_ldl:
+    default:
         GKO_NOT_IMPLEMENTED;
     }
 }

From a7d4dff91fbdf1d31c834dea3a5205b82ae8ca35 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 24 Oct 2023 12:48:00 -0400
Subject: [PATCH 441/583] review updates

- fix and test csr column permutation for hypersparse matrices
- fix inverse nonsymm scaled permutation for dense
- extract permutation dimension validation
- simplify permutation dispatch
- re-add templated constructors to preserve interface
- move more code to source
- add documentation for permutation to permute_mode instead of matrix classes
- improve documentation
- make scaled_permutation final
- remove unnecessary InvalidValueError
- improve error message on dispatch_helper run(...)
- simplify rectangular permutation tests
- fix dense tests not calling scaled permutation
- test apply in (scaled) permutation on the device
- throw DimensionMismatch instead of ValueMismatch
- add more round-trip and inverted permutation tests
- add dpcpp subgroup size

Co-authored-by: Pratik Nayak
Co-authored-by: Yuhsiang M.
Tsai Co-authored-by: Marcel Koch --- common/unified/matrix/csr_kernels.cpp | 4 +- .../unified/matrix/dense_kernels.template.cpp | 2 +- core/base/dispatch_helper.hpp | 4 +- core/matrix/csr.cpp | 28 ++-- core/matrix/dense.cpp | 33 ++-- core/matrix/permutation.cpp | 106 +++++++++---- core/matrix/permutation.hpp | 59 +++++++ dpcpp/matrix/csr_kernels.dp.cpp | 56 ++++--- include/ginkgo/core/base/exception.hpp | 19 --- include/ginkgo/core/matrix/csr.hpp | 12 +- include/ginkgo/core/matrix/dense.hpp | 12 +- include/ginkgo/core/matrix/permutation.hpp | 122 ++++++++------- .../ginkgo/core/matrix/scaled_permutation.hpp | 14 +- omp/matrix/csr_kernels.cpp | 6 +- reference/matrix/csr_kernels.cpp | 6 +- reference/matrix/dense_kernels.cpp | 1 - reference/test/matrix/csr_kernels.cpp | 145 ++++++++++-------- reference/test/matrix/dense_kernels.cpp | 28 ++-- test/matrix/csr_kernels2.cpp | 123 +++++++++------ test/matrix/dense_kernels.cpp | 70 ++++----- test/matrix/permutation_kernels.cpp | 46 +++++- test/matrix/scaled_permutation_kernels.cpp | 45 +++++- 22 files changed, 557 insertions(+), 384 deletions(-) create mode 100644 core/matrix/permutation.hpp diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 10c8d8cd08e..d1abb043c44 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -62,7 +62,7 @@ void inv_col_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto nnz = orig->get_num_stored_elements(); - auto size = std::max(num_rows, nnz); + auto size = std::max(num_rows + 1, nnz); run_kernel( exec, [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, @@ -95,7 +95,7 @@ void inv_col_scale_permute(std::shared_ptr exec, { auto num_rows = orig->get_size()[0]; auto nnz = orig->get_num_stored_elements(); - auto size = std::max(num_rows, nnz); + auto size = std::max(num_rows + 1, nnz); run_kernel( exec, [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, auto scale, diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index c04f9c14d4c..ed508066ba8 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -602,7 +602,7 @@ void inv_nonsymm_scale_permute(std::shared_ptr exec, auto col_scale, auto col_perm, auto orig, auto permuted) { const auto row = row_perm[i]; const auto col = col_perm[j]; - permuted(row, col) = orig(i, j) / (row_scale[i] * col_scale[j]); + permuted(row, col) = orig(i, j) / (row_scale[row] * col_scale[col]); }, orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, permuted); diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp index 2226ffc6b6d..9653c7f8622 100644 --- a/core/base/dispatch_helper.hpp +++ b/core/base/dispatch_helper.hpp @@ -97,9 +97,9 @@ void run(T obj, Func f, Args... args) */ template