cscs-ci (#2149)
# CI on CSCS machines

Leverages CSCS's CI-Ext facilities to run pipelines on some of the CSCS
machines. As of now, three CI pipelines are enabled:

| Pipeline name | Description |
| ------------- | ----------- |
| `daint-mc` | Piz Daint MC partition: dual-socket Intel Xeon E5-2695 v4 (2x18 Broadwell cores) |
| `daint-gpu` | Piz Daint GPU partition: single-socket Intel Xeon E5-2690 v3 (12 Haswell cores) + 1 NVIDIA P100 |
| `hohgant-cpu` | Hohgant Alps vCluster CPU partition: dual-socket AMD EPYC 7742 (2x64 Zen2 cores) |

The CI is triggered when someone posts a PR comment of the form `cscs-ci run
PIPELINE_NAME`, or when a whitelisted user pushes a commit to a PR. At the
moment, only PRs targeting the master branch run this CI. Behind the scenes,
the workflow is roughly as follows:

1. GitHub sends a webhook to cicd-ext-mw.cscs.ch (the CI middleware).
2. The CI middleware fetches your repository from GitHub and pushes a mirror
to GitLab.
3. GitLab notices the change in the mirrored repository and starts a pipeline
(i.e. it uses the CI YAML as its entry point).
4. The specified runner takes its input Dockerfile and executes
`docker build -f $DOCKERFILE .`, where the build context is the whole
(recursively) cloned repository (see the sketch below).
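
To make steps 1 and 4 concrete, here is a minimal sketch of triggering the
`daint-mc` pipeline and of the image build it results in. The exact build
arguments and registry tags are filled in by the CSCS CI templates, so this is
an approximation only:

```bash
# Trigger: post this as a PR comment on GitHub:
#   cscs-ci run daint-mc

# Step 4, approximately: the container-builder runner builds the image from
# the per-pipeline Dockerfile, using the cloned repository as build context.
docker build \
    -f ci/cscs/daint_mc/Dockerfile \
    --build-arg BASE_IMG="$BASE_IMAGE" \
    .
```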

All pipelines run both on a single node and on two distinct nodes. Unit
tests and examples (including Python examples) are executed.

## Code Changes

### GPU fixes

- Pulled in some of the fixes from #2143
- Fixed the AMD hipcc build path by updating the compiler flags and properly
marking the CUDA files in CMake (see the configure-time sketch after this
list)
- Fixed some Python tests (default GPU selection)
- Fixed a C++ example (proper compiler flags)
- Fixed some clang warnings
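
With these changes the offload architectures are no longer hard-coded and can
be chosen at configure time. A hedged sketch of such invocations (the
architecture values below are examples, not the defaults):

```bash
# CUDA: CMAKE_CUDA_ARCHITECTURES now only falls back to "60;70;80" when unset.
cmake .. -DARB_GPU=cuda -DCMAKE_CUDA_ARCHITECTURES=80

# HIP: targets come from the new ARB_HIP_ARCHITECTURES cache variable
# (semicolon-separated list; the default is "gfx906;gfx900").
cmake .. -DARB_GPU=hip -DARB_HIP_ARCHITECTURES="gfx906;gfx90a"
```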

### Incidental Changes

- Redesigned the `run_cpp_examples.sh` script to accept more arguments. This
is useful for pipelines where several MPI examples cannot be run back-to-back
in the same job (likely due to an MPI bug).
- Python install prefix: converted to an absolute path (a relative path could
previously be supplied through `ccmake`, which led to a wrong installation
directory structure); a sketch of the normalization follows below.
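
A minimal sketch of that kind of normalization, assuming a user-supplied
prefix variable; the variable name here is illustrative, not necessarily the
one used in Arbor's CMake:

```cmake
# Illustrative only: resolve a possibly-relative, user-supplied install path
# (as ccmake allowed) to an absolute one before deriving install destinations.
if(NOT IS_ABSOLUTE "${ARB_PYTHON_LIB_DIR}")
  get_filename_component(ARB_PYTHON_LIB_DIR "${ARB_PYTHON_LIB_DIR}"
                         ABSOLUTE BASE_DIR "${CMAKE_BINARY_DIR}")
endif()
```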

## Future Work

- Add a pipeline for the Hohgant amdgpu partition: single-socket AMD EPYC 7A53
(64 Zen3 cores) + 8x MI200
- Add a pipeline for the Hohgant nvgpu partition: single-socket AMD EPYC 7713
(64 Zen3 cores) + 4x A100

These pipelines require some work on our side (developing and deploying
appropriate GitLab runners, adding default recipes, etc.), so I will address
them in a separate PR.

## Current Limitations

The Hohgant runners are currently not picking up any pipelines because of
maintenance work on the CSCS systems. I have therefore disabled the
`hohgant-cpu` pipeline temporarily and will re-enable it at the end of August.
During this maintenance period, the runners targeting Piz Daint may also be
queued for longer than usual.
boeschf authored Aug 4, 2023
1 parent c899572 commit 93ddab3
Showing 21 changed files with 690 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-spack.yml
@@ -19,7 +19,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
with:
path: arbor
path: arbor
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
22 changes: 19 additions & 3 deletions CMakeLists.txt
@@ -136,18 +136,26 @@ if(ARB_GPU STREQUAL "cuda")
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
endif()
# We _still_ need this otherwise CUDA symbols will not be exported
# from libarbor.a leading to linker errors when link external clients.
# Unit tests are NOT external enough. Re-review this somewhere in the
# future.
find_package(CUDA ${CUDAToolkit_VERSION_MAJOR} REQUIRED)
elseif(ARB_GPU STREQUAL "cuda-clang")
include(FindCUDAToolkit)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
endif()
set(ARB_WITH_CUDA_CLANG TRUE)
enable_language(CUDA)
elseif(ARB_GPU STREQUAL "hip")
set(ARB_WITH_HIP_CLANG TRUE)
# Specify AMD architecture using a (user provided) list.
# Note: CMake native HIP architectures are introduced with version 3.21.
set(ARB_HIP_ARCHITECTURES gfx906 gfx900 CACHE STRING "AMD offload architectures (semicolon separated)")
endif()

if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
@@ -415,11 +423,19 @@ if(ARB_WITH_GPU)
target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA)
target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA)
elseif(ARB_WITH_CUDA_CLANG)
set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-gpu-arch=sm_80 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
# Transform cuda archtitecture list into clang cuda flags
list(TRANSFORM CMAKE_CUDA_ARCHITECTURES PREPEND "--cuda-gpu-arch=sm_" OUTPUT_VARIABLE TMP)
string(REPLACE ";" " " CUDA_ARCH_STR "${TMP}")

set(clang_options_ -DARB_CUDA -xcuda ${CUDA_ARCH_STR} --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
elseif(ARB_WITH_HIP_CLANG)
set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
# Transform hip archtitecture list into clang hip flags
list(TRANSFORM ARB_HIP_ARCHITECTURES PREPEND "--offload-arch=" OUTPUT_VARIABLE TMP)
string(REPLACE ";" " " HIP_ARCH_STR "${TMP}")

set(clang_options_ -DARB_HIP -xhip ${HIP_ARCH_STR})
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
endif()
6 changes: 3 additions & 3 deletions arbor/CMakeLists.txt
@@ -123,11 +123,11 @@ install(TARGETS arbor-private-headers EXPORT arbor-targets)
# directory-local.

add_subdirectory(../mechanisms "${CMAKE_BINARY_DIR}/mechanisms")
set_source_files_properties(${arbor-builtin-mechanisms} PROPERTIES GENERATED TRUE)
set_source_files_properties(${arbor-builtin-mechanisms} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES GENERATED TRUE)

if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
set_source_files_properties(${arbor_sources} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${arbor-builtin-mechanism} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${arbor_sources} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${arbor-builtin-mechanisms} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES LANGUAGE CXX)
endif()

# Library target:
2 changes: 1 addition & 1 deletion arbor/backends/gpu/shared_state.hpp
@@ -139,7 +139,7 @@ struct ARB_ARBOR_API shared_state: shared_state_base<shared_state, array, ion_st
using cable_solver = arb::gpu::matrix_state_fine<arb_value_type, arb_index_type>;
cable_solver solver;

static constexpr std::size_t alignment = std::max(array::alignment(), iarray::alignment());
static constexpr unsigned alignment = std::max(array::alignment(), iarray::alignment());

arb_size_type n_intdom = 0; // Number of distinct integration domains.
arb_size_type n_detector = 0; // Max number of detectors on all cells.
10 changes: 3 additions & 7 deletions arbor/util/pimpl_src.hpp
@@ -13,8 +13,9 @@ namespace util {
template<typename T>
pimpl<T>::~pimpl() = default;

// ctor is empty instead of defaulted because of hipcc complaints
template<typename T>
pimpl<T>::pimpl() noexcept = default;
pimpl<T>::pimpl() noexcept {}

template<typename T>
pimpl<T>::pimpl(T* ptr) noexcept : m{ptr} {}
@@ -53,9 +54,4 @@ pimpl<T> make_pimpl(Args&&... args) {
// In order to avoid linker errors for the constructors and destructor, the pimpl template needs to
// be instantiated in the source file. This macro helps with this boilerplate code. Note, that it
// needs to be placed in the default namespace.
#define ARB_INSTANTIATE_PIMPL(T) \
namespace arb { \
namespace util { \
template struct pimpl<T>; \
} \
}
#define ARB_INSTANTIATE_PIMPL(T) template class ::arb::util::pimpl<T>;
27 changes: 27 additions & 0 deletions ci/cscs/daint_gpu/Dockerfile
@@ -0,0 +1,27 @@
ARG BASE_IMG
FROM $BASE_IMG

ARG NUM_PROCS

COPY . /arbor.src

RUN mkdir -p /arbor.src/build \
&& cd /arbor.src/build \
&& cmake .. \
-GNinja \
-DCMAKE_INSTALL_PREFIX=/arbor.install \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-march=haswell" \
-DARB_ARCH=none \
-DARB_WITH_ASSERTIONS=ON \
-DARB_WITH_PROFILING=ON \
-DARB_VECTORIZE=ON \
-DARB_WITH_PYTHON=ON \
-DARB_USE_HWLOC=ON \
-DARB_WITH_MPI=ON \
-DARB_GPU=cuda\
-DCMAKE_CUDA_ARCHITECTURES=60 \
-DARB_USE_GPU_RNG=ON \
&& ninja -j${NUM_PROCS} tests examples pyarb \
&& ninja install

50 changes: 50 additions & 0 deletions ci/cscs/daint_gpu/Dockerfile.base
@@ -0,0 +1,50 @@
FROM docker.io/finkandreas/spack:0.19.2-cuda11.7.1-ubuntu22.04 as builder

ARG NUM_PROCS

RUN spack-install-helper daint-mc \
"git" \
"meson" \
"ninja" \
"cmake" \
"valgrind" \
"python" \
"hwloc" \
"boost" \
"fmt" \
"random123" \
"py-mpi4py" \
"py-sphinx" \
"py-svgwrite" \
"nlohmann-json" \
"py-pybind11" \
"py-numpy" \
"py-flake8" \
"py-black" \
"py-pytest" \
"py-seaborn" \
"py-pandas" \
"pugixml"

# end of builder container, now we are ready to copy necessary files

# copy only relevant parts to the final container
FROM docker.io/finkandreas/spack:base-cuda11.7.1-ubuntu22.04

# it is important to keep the paths, otherwise your installation is broken
# all these paths are created with the above `spack-install-helper` invocation
COPY --from=builder /opt/spack-environment /opt/spack-environment
COPY --from=builder /opt/software /opt/software
COPY --from=builder /opt/._view /opt/._view
COPY --from=builder /etc/profile.d/z10_spack_environment.sh /etc/profile.d/z10_spack_environment.sh

# Some boilerplate to get all paths correctly - fix_spack_install is part of the base image
# and makes sure that all important things are being correctly setup
RUN fix_spack_install

# Finally install software that is needed, e.g. compilers
# It is also possible to build compilers via spack and let all dependencies be handled by spack
RUN apt-get -yqq update && apt-get -yqq upgrade \
&& apt-get -yqq install build-essential gfortran \
&& rm -rf /var/lib/apt/lists/*

67 changes: 67 additions & 0 deletions ci/cscs/daint_gpu/pipeline.yml
@@ -0,0 +1,67 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- build_base # build stage is running on Kubernetes cluster
- build # build stage is running on Kubernetes cluster
- test # test stage is running on slurm cluster

variables:
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/software/arbor_daint_gpu:$CI_COMMIT_SHORT_SHA

build-base:
extends: .container-builder-dynamic-name
stage: build_base
variables:
DOCKERFILE: ci/cscs/daint_gpu/Dockerfile.base
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/arbor_daint_gpu_base_image
WATCH_FILECHANGES: 'ci/cscs/daint_gpu/Dockerfile.base'

build-arbor:
extends: .container-builder
stage: build
variables:
DOCKERFILE: ci/cscs/daint_gpu/Dockerfile
DOCKER_BUILD_ARGS: '["BASE_IMG=$BASE_IMAGE"]'
GIT_SUBMODULE_STRATEGY: recursive

test-single-node:
extends: .container-runner-daint-gpu
stage: test
image: $PERSIST_IMAGE_NAME
script:
- cd /arbor.src
- build/bin/unit-modcc
- build/bin/unit-local
- build/bin/unit
- scripts/run_cpp_examples.sh
- python -m venv --system-site-packages /arbor.install
- source /arbor.install/bin/activate
- python -m unittest discover -v -s python
- scripts/run_python_examples.sh
- scripts/test_executables.sh
- deactivate
variables:
SLURM_JOB_PARTITION: normal
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 1
SLURM_CPUS_PER_TASK: 12
SLURM_TIMELIMIT: "00:30:00"
USE_MPI: "NO"

test-distributed:
extends: .container-runner-daint-gpu
stage: test
image: $PERSIST_IMAGE_NAME
script:
- cd /arbor.src
- build/bin/unit-mpi
- scripts/run_cpp_examples.sh -d
variables:
SLURM_JOB_PARTITION: normal
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 2
SLURM_CPUS_PER_TASK: 12
SLURM_TIMELIMIT: "00:30:00"
USE_MPI: "YES"

23 changes: 23 additions & 0 deletions ci/cscs/daint_mc/Dockerfile
@@ -0,0 +1,23 @@
ARG BASE_IMG
FROM $BASE_IMG

ARG NUM_PROCS

COPY . /arbor.src

RUN mkdir -p /arbor.src/build \
&& cd /arbor.src/build \
&& cmake .. \
-GNinja \
-DCMAKE_INSTALL_PREFIX=/arbor.install \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-march=broadwell" \
-DARB_ARCH=none \
-DARB_WITH_ASSERTIONS=ON \
-DARB_WITH_PROFILING=ON \
-DARB_VECTORIZE=ON \
-DARB_WITH_PYTHON=ON \
-DARB_USE_HWLOC=ON \
-DARB_WITH_MPI=ON \
&& ninja -j${NUM_PROCS} tests examples pyarb \
&& ninja install
49 changes: 49 additions & 0 deletions ci/cscs/daint_mc/Dockerfile.base
@@ -0,0 +1,49 @@
FROM docker.io/finkandreas/spack:0.19.2-ubuntu22.04 as builder

ARG NUM_PROCS

RUN spack-install-helper daint-mc \
"git" \
"meson" \
"ninja" \
"cmake" \
"valgrind" \
"python" \
"hwloc" \
"boost" \
"fmt" \
"random123" \
"py-mpi4py" \
"py-sphinx" \
"py-svgwrite" \
"nlohmann-json" \
"py-pybind11" \
"py-numpy" \
"py-flake8" \
"py-black" \
"py-pytest" \
"py-seaborn" \
"py-pandas" \
"pugixml"

# end of builder container, now we are ready to copy necessary files

# copy only relevant parts to the final container
FROM docker.io/finkandreas/spack:base-ubuntu22.04

# it is important to keep the paths, otherwise your installation is broken
# all these paths are created with the above `spack-install-helper` invocation
COPY --from=builder /opt/spack-environment /opt/spack-environment
COPY --from=builder /opt/software /opt/software
COPY --from=builder /opt/._view /opt/._view
COPY --from=builder /etc/profile.d/z10_spack_environment.sh /etc/profile.d/z10_spack_environment.sh

# Some boilerplate to get all paths correctly - fix_spack_install is part of the base image
# and makes sure that all important things are being correctly setup
RUN fix_spack_install

# Finally install software that is needed, e.g. compilers
# It is also possible to build compilers via spack and let all dependencies be handled by spack
RUN apt-get -yqq update && apt-get -yqq upgrade \
&& apt-get -yqq install build-essential gfortran \
&& rm -rf /var/lib/apt/lists/*
66 changes: 66 additions & 0 deletions ci/cscs/daint_mc/pipeline.yml
@@ -0,0 +1,66 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- build_base # build stage is running on Kubernetes cluster
- build # build stage is running on Kubernetes cluster
- test # test stage is running on slurm cluster

variables:
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/software/arbor_daint_mc:$CI_COMMIT_SHORT_SHA

build-base:
extends: .container-builder-dynamic-name
stage: build_base
variables:
DOCKERFILE: ci/cscs/daint_mc/Dockerfile.base
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/arbor_daint_mc_base_image
WATCH_FILECHANGES: 'ci/cscs/daint_mc/Dockerfile.base'

build-arbor:
extends: .container-builder
stage: build
variables:
DOCKERFILE: ci/cscs/daint_mc/Dockerfile
DOCKER_BUILD_ARGS: '["BASE_IMG=$BASE_IMAGE"]'
GIT_SUBMODULE_STRATEGY: recursive

test-single-node:
extends: .container-runner-daint-mc
stage: test
image: $PERSIST_IMAGE_NAME
script:
- cd /arbor.src
- build/bin/unit-modcc
- build/bin/unit-local
- build/bin/unit
- scripts/run_cpp_examples.sh
- python -m venv --system-site-packages /arbor.install
- source /arbor.install/bin/activate
- python -m unittest discover -v -s python
- scripts/run_python_examples.sh
- scripts/test_executables.sh
- deactivate
variables:
SLURM_JOB_PARTITION: normal
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 1
SLURM_CPUS_PER_TASK: 36
SLURM_TIMELIMIT: "00:30:00"
USE_MPI: "NO"

test-distributed:
extends: .container-runner-daint-mc
stage: test
image: $PERSIST_IMAGE_NAME
script:
- cd /arbor.src
- build/bin/unit-mpi
- scripts/run_cpp_examples.sh -d
variables:
SLURM_JOB_PARTITION: normal
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 4
SLURM_CPUS_PER_TASK: 18
SLURM_TIMELIMIT: "00:30:00"
USE_MPI: "YES"