Merge pull request #593 from tonyjie/jiajie/pr_gpu_layout_clean

[Feature] Enable GPU to accelerate odgi-layout with massive speedup. Build with a flag `USE_GPU`
pangenome · Oct 28, 2024 · 237fc1b · 237fc1b
2 parents 1895f49 + dd7257e
commit 237fc1b
Show file tree

Hide file tree

Showing 7 changed files with 735 additions and 26 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -21,7 +21,7 @@
 cmake_minimum_required(VERSION 3.16)
 
 # Project's name
-project(odgi)
+project(odgi LANGUAGES CXX)
 
 # Enforce c++17
 set(CMAKE_CXX_STANDARD 17)
@@ -30,13 +30,27 @@ set(CMAKE_CXX_STANDARD 17)
 option(PIC "Compile all odgi sources with -fPIC - required for shared libs" ON)
 option(ASAN "Use address sanitiser" OFF)
 option(INLINE_HANDLEGRAPH_SOURCES "Compile handlegraph sources inline" OFF)
+# USE_GPU (default OFF): build the CUDA implementation of `odgi layout`. Configuring with ON requires the CUDA toolkit.
+option(USE_GPU "Enable GPU support if available" OFF)
 
 include(ExternalProject)
 include(FeatureSummary)
 
 find_package(PkgConfig REQUIRED)
 find_package(pybind11 CONFIG)
 find_package(OpenMP)
+# Locate the CUDA toolkit and enable the CUDA language when GPU support is requested.
+if(USE_GPU)
+    # REQUIRED makes CMake abort the configure step when the toolkit is missing, so no
+    # separate CUDA_FOUND check is needed. The legacy FindCUDA module is used on purpose:
+    # it provides CUDA_INCLUDE_DIRS and the FindCUDA/select_compute_arch helper, both of
+    # which are used further down in this file.
+    find_package(CUDA REQUIRED)
+    enable_language(CUDA)
+    message(STATUS "CUDA found. GPU support enabled.")
+else()
+    message(STATUS "Building with CPU-only support.")
+endif()
 
 feature_summary(
   FATAL_ON_MISSING_REQUIRED_PACKAGES
@@ -621,6 +635,9 @@ add_library(odgi_objs OBJECT
   ${lodepng_SOURCES}
   ${handlegraph_sources}
 )
+if(USE_GPU)  # GPU builds additionally compile the CUDA layout source into odgi_objs
+  target_sources(odgi_objs PRIVATE "${CMAKE_SOURCE_DIR}/src/cuda/layout.cu")
+endif()
 
 set(odgi_DEPS
     # lodepng
@@ -687,6 +704,9 @@ set(odgi_INCLUDES
   "${xoshiro_INCLUDE}"
   "${atomicbitvector_INCLUDE}"
   "${mio_INCLUDE}")
+if(USE_GPU)  # make the CUDA toolkit headers reported by find_package(CUDA) visible
+  list(APPEND odgi_INCLUDES "${CUDA_INCLUDE_DIRS}")
+endif()
 
 set(odgi_LIBS
   jemalloc
@@ -803,9 +823,26 @@ set(odgi_HEADERS
   ${CMAKE_SOURCE_DIR}/src/algorithms/path_length.hpp
   ${CMAKE_SOURCE_DIR}/src/algorithms/path_keep.hpp
   ${CMAKE_SOURCE_DIR}/src/algorithms/diffpriv.cpp)
+if(USE_GPU)  # GPU builds also list the CUDA layout header in odgi_HEADERS
+  list(APPEND odgi_HEADERS "${CMAKE_SOURCE_DIR}/src/cuda/layout.h")
+endif()
 
 target_include_directories(odgi_objs PUBLIC ${odgi_INCLUDES})
 
+if(USE_GPU)
+  # Ask the legacy FindCUDA helper which compute capabilities to build for and turn its
+  # answer (e.g. " 7.5 8.6") into the dot-free list form used by CUDA_ARCHITECTURES ("75;86").
+  include(FindCUDA/select_compute_arch)
+  CUDA_DETECT_INSTALLED_GPUS(INSTALLED_GPU_CCS_1)
+  string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2)
+  string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}")
+  string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}")
+  # When no GPU can be probed the helper falls back to a list of common architectures whose
+  # entries may carry a "+PTX" suffix. CUDA_ARCHITECTURES does not accept that suffix, and a
+  # plain number already generates both real and virtual (PTX) code, so drop the suffix.
+  string(REPLACE "+PTX" "" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
+  if(NOT "${CUDA_ARCH_LIST}" STREQUAL "")
+    list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
+  endif()
+  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
+  # CMAKE_CUDA_ARCHITECTURES only initialises targets created after it is set. odgi_objs
+  # already exists at this point, so the property has to be set on the target explicitly,
+  # otherwise the detected architectures never reach the compilation of the CUDA sources.
+  if(NOT "${CUDA_ARCH_LIST}" STREQUAL "")
+    set_target_properties(odgi_objs PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}")
+  endif()
+  message(STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}")
+  # Flags applied to CUDA translation units only. The generator expression is quoted and
+  # ';'-separated so that it reaches CMake as one argument instead of being split on spaces.
+  target_compile_options(odgi_objs PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:-std=c++17;-Xcompiler=-fopenmp;-lineinfo>")
+  # Define the USE_GPU macro so the `#ifdef USE_GPU` sections of the sources are compiled.
+  target_compile_definitions(odgi_objs PRIVATE USE_GPU)
+endif()
+
 add_library(libodgi_static STATIC $<TARGET_OBJECTS:odgi_objs>)
 set_target_properties(libodgi_static PROPERTIES OUTPUT_NAME "odgi")
 set_target_properties(libodgi_static PROPERTIES PUBLIC_HEADER "${odgi_HEADERS}")

diff --git a/README.md b/README.md
@@ -51,6 +51,21 @@ Static builds are unlikely to be supported on OSX, and require appropriate stati
 
 For more information on optimisations, debugging and GNU Guix builds, see [INSTALL.md](./INSTALL.md) and [CMakeLists.txt](./CMakeLists.txt).
 
+### building with GPU
+
+If you have GPUs and CUDA installed, you can build with GPU support to use our GPU-accelerated `odgi layout`. This provides a significant speedup (57.3x over the CPU implementation on an NVIDIA A100 GPU), reducing execution time from hours to minutes. See this [paper](https://arxiv.org/abs/2409.00876) and [repo](https://github.com/tonyjie/odgi) for detailed performance numbers. It's going to be presented at [SC'24](https://sc24.conference-program.com/presentation/?id=pap443&sess=sess382)!
+
+Simply pass `-DUSE_GPU=ON` when configuring with cmake:
+```
+cmake -DUSE_GPU=ON -H. -Bbuild && cmake --build build -- -j 3
+```
+
+To run `odgi layout` on the GPU, add the `--gpu` flag to the other arguments, for example:
+```
+odgi layout -i ${OG_FILE} -o ${LAY_FILE} --threads ${NUM_THREAD} --gpu
+```
+
+
 ### Nix build
 
 If you have `nix`, build and installation in your profile are as simple as:
@@ -120,6 +135,8 @@ work with output from `odgi stats`! For more details take a look at the document
 **Andrea Guarracino\*, Simon Heumos\*, Sven Nahnsen, Pjotr Prins, Erik Garrison**. [ODGI: understanding pangenome graphs](https://doi.org/10.1093/bioinformatics/btac308), Bioinformatics, 2022\
 **\*Shared first authorship**
 
+**Jiajie Li, Jan-Niklas Schmelzle, Yixiao Du, Simon Heumos, Andrea Guarracino, Giulia Guidi, Pjotr Prins, Erik Garrison, Zhiru Zhang**. [Rapid GPU-Based Pangenome Graph Layout](https://arxiv.org/abs/2409.00876), SC (The International Conference for High Performance Computing, Networking, Storage, and Analysis), 2024
+
 ## funding sources
 
 `odgi` has been funded through a variety of mechanisms, including a Wellcome Sanger PhD fellowship and diverse NIH and NSF grants (listed in our paper), as well as funding from the State of Tennessee.

diff --git a/src/algorithms/path_sgd_layout.cpp b/src/algorithms/path_sgd_layout.cpp
@@ -150,7 +150,7 @@ namespace odgi {
                                     } else {
                                         eta.store(etas[iteration]); // update our learning rate
                                         Delta_max.store(delta); // set our delta max to the threshold
-                                        if (iteration > first_cooling_iteration) {
+                                        if (iteration >= first_cooling_iteration) {
                                             //std::cerr << std::endl << "setting cooling!!" << std::endl;
                                             adj_theta.store(0.001);
                                             cooling.store(true);
@@ -466,6 +466,43 @@ namespace odgi {
 #endif
             return etas;
         }
+#ifdef USE_GPU
+        /// GPU entry point for the path-guided SGD layout; compiled only when USE_GPU is defined.
+        ///
+        /// Copies the schedule parameters into a cuda::layout_config_t and passes it, together
+        /// with the graph and the X and Y vectors, to cuda::gpu_layout.
+        ///
+        /// `graph` is downcast by reference to odgi::graph_t, so it must be an odgi::graph_t
+        /// (or a type derived from it); otherwise the dynamic_cast throws std::bad_cast.
+        ///
+        /// Not read by this implementation: path_index, path_sgd_use_paths, delta, progress,
+        /// snapshot and snapshot_prefix.
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
+                                    const xp::XP &path_index,
+                                    const std::vector<path_handle_t> &path_sgd_use_paths,
+                                    const uint64_t &iter_max,
+                                    const uint64_t &iter_with_max_learning_rate,
+                                    const uint64_t &min_term_updates,
+                                    const double &delta,
+                                    const double &eps,
+                                    const double &eta_max,
+                                    const double &theta,
+                                    const uint64_t &space,
+                                    const uint64_t &space_max,
+                                    const uint64_t &space_quantization_step,
+                                    const double &cooling_start,
+                                    const uint64_t &nthreads,
+                                    const bool &progress,
+                                    const bool &snapshot,
+                                    const std::string &snapshot_prefix,
+                                    std::vector<std::atomic<double>> &X,
+                                    std::vector<std::atomic<double>> &Y) {
+            cuda::layout_config_t gpu_config;
+            gpu_config.iter_max = iter_max;
+            gpu_config.min_term_updates = min_term_updates;
+            gpu_config.eta_max = eta_max;
+            gpu_config.eps = eps;
+            gpu_config.iter_with_max_learning_rate = static_cast<int32_t>(iter_with_max_learning_rate);
+            // Cooling starts at the given fraction of the total number of iterations.
+            gpu_config.first_cooling_iteration = std::floor(cooling_start * static_cast<double>(iter_max));
+            gpu_config.theta = theta;
+            // The space parameters arrive as 64-bit values and are converted to 32 bits here.
+            gpu_config.space = static_cast<uint32_t>(space);
+            gpu_config.space_max = static_cast<uint32_t>(space_max);
+            gpu_config.space_quantization_step = static_cast<uint32_t>(space_quantization_step);
+            gpu_config.nthreads = nthreads;
+
+            const auto &concrete_graph = dynamic_cast<const odgi::graph_t&>(graph);
+            cuda::gpu_layout(gpu_config, concrete_graph, X, Y);
+        }
+#endif
 /*
         void deterministic_path_linear_sgd(const PathHandleGraph &graph,
                                            const xp::XP &path_index,

diff --git a/src/algorithms/path_sgd_layout.hpp b/src/algorithms/path_sgd_layout.hpp
@@ -19,6 +19,9 @@
 #include "dirty_zipfian_int_distribution.h"
 #include "XoshiroCpp.hpp"
 #include "progress.hpp"
+#ifdef USE_GPU
+#include "cuda/layout.h"
+#endif
 
 namespace odgi {
     namespace algorithms {
@@ -53,7 +56,28 @@ namespace odgi {
                                                             const uint64_t &iter_max,
                                                             const uint64_t &iter_with_max_learning_rate,
                                                             const double &eps);
-
+#ifdef USE_GPU // the GPU entry point below is declared only in builds that define USE_GPU
+        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph, // downcast to odgi::graph_t by the definition in path_sgd_layout.cpp
+                                    const xp::XP &path_index, // not read by the definition
+                                    const std::vector<path_handle_t> &path_sgd_use_paths, // not read by the definition
+                                    const uint64_t &iter_max,
+                                    const uint64_t &iter_with_max_learning_rate, // cast to int32_t by the definition
+                                    const uint64_t &min_term_updates,
+                                    const double &delta, // not read by the definition
+                                    const double &eps,
+                                    const double &eta_max,
+                                    const double &theta,
+                                    const uint64_t &space, // space, space_max and space_quantization_step are cast to uint32_t by the definition
+                                    const uint64_t &space_max,
+                                    const uint64_t &space_quantization_step,
+                                    const double &cooling_start, // first cooling iteration = floor(cooling_start * iter_max)
+                                    const uint64_t &nthreads,
+                                    const bool &progress, // not read by the definition
+                                    const bool &snapshot, // not read by the definition
+                                    const std::string &snapshot_prefix, // not read by the definition
+                                    std::vector<std::atomic<double>> &X, // X and Y are passed on to cuda::gpu_layout
+                                    std::vector<std::atomic<double>> &Y);
+#endif
 /// single threaded and deterministic path guided 1D linear SGD
 /*
         void deterministic_path_linear_sgd_layout(const PathHandleGraph &graph,