Skip to content

Commit

Permalink
Merge pull request #593 from tonyjie/jiajie/pr_gpu_layout_clean
Browse files Browse the repository at this point in the history
[Feature] Enable GPU to accelerate odgi-layout with massive speedup. Build with a flag `USE_GPU`
  • Loading branch information
AndreaGuarracino authored Oct 28, 2024
2 parents 1895f49 + dd7257e commit 237fc1b
Show file tree
Hide file tree
Showing 7 changed files with 735 additions and 26 deletions.
39 changes: 38 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
cmake_minimum_required(VERSION 3.16)

# Project's name
project(odgi)
project(odgi LANGUAGES CXX)

# Enforce c++17
set(CMAKE_CXX_STANDARD 17)
Expand All @@ -30,13 +30,27 @@ set(CMAKE_CXX_STANDARD 17)
option(PIC "Compile all odgi sources with -fPIC - required for shared libs" ON)
option(ASAN "Use address sanitiser" OFF)
option(INLINE_HANDLEGRAPH_SOURCES "Compile handlegraph sources inline" OFF)
# Add the GPU option (default is OFF)
option(USE_GPU "Enable GPU support if available" OFF)

include(ExternalProject)
include(FeatureSummary)

find_package(PkgConfig REQUIRED)
find_package(pybind11 CONFIG)
find_package(OpenMP)
# Locate the CUDA toolkit and enable the CUDA language when GPU support is requested.
# find_package(... REQUIRED) stops the configure step by itself when CUDA cannot be
# found, so no separate "not found" branch is needed here.
if(USE_GPU)
  # The legacy FindCUDA module is used on purpose: other parts of this file consume
  # the variables and helpers it provides (CUDA_INCLUDE_DIRS, select_compute_arch).
  find_package(CUDA REQUIRED)
  enable_language(CUDA)
  message(STATUS "CUDA found. GPU support enabled.")
else()
  message(STATUS "Building with CPU-only support.")
endif()

feature_summary(
FATAL_ON_MISSING_REQUIRED_PACKAGES
Expand Down Expand Up @@ -621,6 +635,9 @@ add_library(odgi_objs OBJECT
${lodepng_SOURCES}
${handlegraph_sources}
)
# GPU builds additionally compile the CUDA layout kernel into the object library.
if(USE_GPU)
  target_sources(odgi_objs
    PRIVATE
      "${CMAKE_SOURCE_DIR}/src/cuda/layout.cu")
endif()

set(odgi_DEPS
# lodepng
Expand Down Expand Up @@ -687,6 +704,9 @@ set(odgi_INCLUDES
"${xoshiro_INCLUDE}"
"${atomicbitvector_INCLUDE}"
"${mio_INCLUDE}")
# GPU builds also need the CUDA toolkit headers on the include path.
if(USE_GPU)
  list(APPEND odgi_INCLUDES
    "${CUDA_INCLUDE_DIRS}")
endif()

set(odgi_LIBS
jemalloc
Expand Down Expand Up @@ -803,9 +823,26 @@ set(odgi_HEADERS
${CMAKE_SOURCE_DIR}/src/algorithms/path_length.hpp
${CMAKE_SOURCE_DIR}/src/algorithms/path_keep.hpp
${CMAKE_SOURCE_DIR}/src/algorithms/diffpriv.cpp)
# GPU builds expose the CUDA layout header together with the other public headers.
if(USE_GPU)
  list(APPEND odgi_HEADERS
    "${CMAKE_SOURCE_DIR}/src/cuda/layout.h")
endif()

target_include_directories(odgi_objs PUBLIC ${odgi_INCLUDES})

if(USE_GPU)
  # Ask the FindCUDA helper for the compute capabilities to build for. It reports
  # entries such as "8.0" (and, when no GPU can be queried on the build host, a
  # fallback list of common architectures that may carry a "+PTX" suffix).
  include(FindCUDA/select_compute_arch)
  CUDA_DETECT_INSTALLED_GPUS(INSTALLED_GPU_CCS_1)
  string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2)
  string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}")
  string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}")

  # Normalise to what CUDA_ARCHITECTURES accepts: a plain number such as "86"
  # already means "real + PTX", so the "+PTX" suffix is dropped. Anything that is
  # still not purely numeric is discarded, and repeated entries (several identical
  # GPUs in one machine) are collapsed.
  string(REPLACE "+PTX" "" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
  list(FILTER CUDA_ARCH_LIST INCLUDE REGEX "^[0-9]+$")
  if(CUDA_ARCH_LIST)
    list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
  endif()

  # Default for targets created after this point.
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
  message(STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}")

  # odgi_objs was created before CMAKE_CUDA_ARCHITECTURES was set, and that variable
  # only initialises the CUDA_ARCHITECTURES property at target creation time. Set the
  # property explicitly so the detected architectures are actually used for layout.cu.
  if(CUDA_ARCH_LIST)
    set_target_properties(odgi_objs PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}")
  else()
    message(WARNING "No usable CUDA architecture detected; using the CUDA compiler's default.")
  endif()

  # Flags applied to CUDA sources only. The generator expression is quoted and uses
  # ';' separators so that it reaches the target as a single, well-formed expression.
  target_compile_options(odgi_objs PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:-std=c++17;-Xcompiler=-fopenmp;-lineinfo>")

  # Expose the USE_GPU macro to the sources so GPU-only code paths get compiled in.
  target_compile_definitions(odgi_objs PRIVATE USE_GPU)
endif()

add_library(libodgi_static STATIC $<TARGET_OBJECTS:odgi_objs>)
set_target_properties(libodgi_static PROPERTIES OUTPUT_NAME "odgi")
set_target_properties(libodgi_static PROPERTIES PUBLIC_HEADER "${odgi_HEADERS}")
Expand Down
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,21 @@ Static builds are unlikely to be supported on OSX, and require appropriate stati

For more information on optimisations, debugging and GNU Guix builds, see [INSTALL.md](./INSTALL.md) and [CMakeLists.txt](./CMakeLists.txt).

### building with GPU

If you have GPUs and CUDA installed, you can build with GPU support to use our GPU-accelerated `odgi-layout`. This provides a significant speedup over the CPU solution (57.3x on an NVIDIA A100 GPU), reducing execution time from hours to minutes. Check out this [paper](https://arxiv.org/abs/2409.00876) and [repo](https://github.com/tonyjie/odgi) for the detailed performance numbers. It's going to be presented at [SC'24](https://sc24.conference-program.com/presentation/?id=pap443&sess=sess382)!

Simply pass `-DUSE_GPU=ON` when running cmake:
```
cmake -DUSE_GPU=ON -H. -Bbuild && cmake --build build -- -j 3
```

To run `odgi layout` on the GPU, simply add the `--gpu` flag alongside the other arguments, for example:
```
odgi layout -i ${OG_FILE} -o ${LAY_FILE} --threads ${NUM_THREAD} --gpu
```


### Nix build

If you have `nix`, build and installation in your profile are as simple as:
Expand Down Expand Up @@ -120,6 +135,8 @@ work with output from `odgi stats`! For more details take a look at the document
**Andrea Guarracino\*, Simon Heumos\*, Sven Nahnsen, Pjotr Prins, Erik Garrison**. [ODGI: understanding pangenome graphs](https://doi.org/10.1093/bioinformatics/btac308), Bioinformatics, 2022\
**\*Shared first authorship**

**Jiajie Li, Jan-Niklas Schmelzle, Yixiao Du, Simon Heumos, Andrea Guarracino, Giulia Guidi, Pjotr Prins, Erik Garrison, Zhiru Zhang**. [Rapid GPU-Based Pangenome Graph Layout](https://arxiv.org/abs/2409.00876), SC (The International Conference for High Performance Computing, Networking, Storage, and Analysis), 2024

## funding sources

`odgi` has been funded through a variety of mechanisms, including a Wellcome Sanger PhD fellowship and diverse NIH and NSF grants (listed in our paper), as well as funding from the State of Tennessee.
Expand Down
39 changes: 38 additions & 1 deletion src/algorithms/path_sgd_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ namespace odgi {
} else {
eta.store(etas[iteration]); // update our learning rate
Delta_max.store(delta); // set our delta max to the threshold
if (iteration > first_cooling_iteration) {
if (iteration >= first_cooling_iteration) {
//std::cerr << std::endl << "setting cooling!!" << std::endl;
adj_theta.store(0.001);
cooling.store(true);
Expand Down Expand Up @@ -466,6 +466,43 @@ namespace odgi {
#endif
return etas;
}
#ifdef USE_GPU
        /// GPU variant of the path-guided SGD layout: packs the relevant parameters
        /// into a cuda::layout_config_t and hands the work over to cuda::gpu_layout,
        /// which writes the resulting coordinates into X and Y.
        /// path_index, path_sgd_use_paths, delta, progress, snapshot and
        /// snapshot_prefix are accepted but not forwarded to the GPU implementation.
        void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
                                        const xp::XP &path_index,
                                        const std::vector<path_handle_t> &path_sgd_use_paths,
                                        const uint64_t &iter_max,
                                        const uint64_t &iter_with_max_learning_rate,
                                        const uint64_t &min_term_updates,
                                        const double &delta,
                                        const double &eps,
                                        const double &eta_max,
                                        const double &theta,
                                        const uint64_t &space,
                                        const uint64_t &space_max,
                                        const uint64_t &space_quantization_step,
                                        const double &cooling_start,
                                        const uint64_t &nthreads,
                                        const bool &progress,
                                        const bool &snapshot,
                                        const std::string &snapshot_prefix,
                                        std::vector<std::atomic<double>> &X,
                                        std::vector<std::atomic<double>> &Y) {
            // Iteration at which cooling starts: the given fraction of the schedule, rounded down.
            const double cooling_iteration = std::floor(cooling_start * static_cast<double>(iter_max));

            cuda::layout_config_t gpu_config;

            // Iteration schedule.
            gpu_config.iter_max = iter_max;
            gpu_config.iter_with_max_learning_rate = static_cast<int32_t>(iter_with_max_learning_rate);
            gpu_config.first_cooling_iteration = cooling_iteration;
            gpu_config.min_term_updates = min_term_updates;

            // Learning-rate parameters.
            gpu_config.eta_max = eta_max;
            gpu_config.eps = eps;
            gpu_config.theta = theta;

            // Sampling-space parameters, narrowed to 32 bit.
            gpu_config.space = static_cast<uint32_t>(space);
            gpu_config.space_max = static_cast<uint32_t>(space_max);
            gpu_config.space_quantization_step = static_cast<uint32_t>(space_quantization_step);

            gpu_config.nthreads = nthreads;

            // The GPU implementation works on the concrete odgi::graph_t type.
            const auto &concrete_graph = dynamic_cast<const odgi::graph_t&>(graph);
            cuda::gpu_layout(gpu_config, concrete_graph, X, Y);
        }
#endif
/*
void deterministic_path_linear_sgd(const PathHandleGraph &graph,
const xp::XP &path_index,
Expand Down
26 changes: 25 additions & 1 deletion src/algorithms/path_sgd_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "dirty_zipfian_int_distribution.h"
#include "XoshiroCpp.hpp"
#include "progress.hpp"
#ifdef USE_GPU
#include "cuda/layout.h"
#endif

namespace odgi {
namespace algorithms {
Expand Down Expand Up @@ -53,7 +56,28 @@ namespace odgi {
const uint64_t &iter_max,
const uint64_t &iter_with_max_learning_rate,
const double &eps);

#ifdef USE_GPU
/// GPU-accelerated path guided SGD layout; only declared when odgi is built with USE_GPU.
/// The implementation copies iter_max, iter_with_max_learning_rate, min_term_updates,
/// eps, eta_max, theta, space, space_max, space_quantization_step and nthreads into a
/// cuda::layout_config_t, derives the first cooling iteration as
/// floor(cooling_start * iter_max), and then calls cuda::gpu_layout with the graph, X and Y.
/// The graph is dynamic_cast to odgi::graph_t, so it must be of that concrete type.
/// NOTE: path_index, path_sgd_use_paths, delta, progress, snapshot and snapshot_prefix
/// are part of the signature but are not forwarded to the GPU implementation.
void path_linear_sgd_layout_gpu(const PathHandleGraph &graph,
const xp::XP &path_index,
const std::vector<path_handle_t> &path_sgd_use_paths,
const uint64_t &iter_max,
const uint64_t &iter_with_max_learning_rate,
const uint64_t &min_term_updates,
const double &delta,
const double &eps,
const double &eta_max,
const double &theta,
const uint64_t &space,
const uint64_t &space_max,
const uint64_t &space_quantization_step,
const double &cooling_start,
const uint64_t &nthreads,
const bool &progress,
const bool &snapshot,
const std::string &snapshot_prefix,
std::vector<std::atomic<double>> &X,
std::vector<std::atomic<double>> &Y);
#endif
/// single threaded and deterministic path guided 1D linear SGD
/*
void deterministic_path_linear_sgd_layout(const PathHandleGraph &graph,
Expand Down
Loading

0 comments on commit 237fc1b

Please sign in to comment.