From 28b88a0f115e35170c242b251083ff653c5f000c Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 11 Oct 2024 18:55:45 -0700 Subject: [PATCH] Add LC gitlab CI for GPU build/run tests --- .gitlab-ci.yml | 98 +++++++++++++++++++++++++++ .gitlab/custom-jobs-and-variables.yml | 62 +++++++++++++++++ .gitlab/jobs/lassen.yml | 59 ++++++++++++++++ .gitlab/subscribed-pipelines.yml | 91 +++++++++++++++++++++++++ tests/gitlab/ci-tests.sh | 88 ++++++++++++++++++++++++ 5 files changed, 398 insertions(+) create mode 100644 .gitlab-ci.yml create mode 100644 .gitlab/custom-jobs-and-variables.yml create mode 100644 .gitlab/jobs/lassen.yml create mode 100644 .gitlab/subscribed-pipelines.yml create mode 100755 tests/gitlab/ci-tests.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..7830073846 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,98 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# DESCRIPTION: +############################################################################### +# General GitLab pipelines configurations for supercomputers and Linux clusters +# at Lawrence Livermore National Laboratory (LLNL). +# This entire pipeline is LLNL-specific +# +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# Remains to set variable values, change the reference to the radiuss-shared-ci +# repo, opt-in and out optional features. The project can then extend it with +# additional stages. 
+#
+# In addition, each project should copy over and complete:
+# - .gitlab/custom-jobs-and-variables.yml
+# - .gitlab/subscribed-pipelines.yml
+#
+# The jobs should be specified in a file local to the project,
+# - .gitlab/jobs/${CI_MACHINE}.yml
+# or generated (see LLNL/Umpire for an example).
+###############################################################################
+
+# We define the following GitLab pipeline variables:
+variables:
+##### LC GITLAB CONFIGURATION
+# Use an LLNL service user to run CI. This prevents pipelines from running as
+# an actual user.
+  LLNL_SERVICE_USER: ""
+# Use the service user workspace. Solves permission issues, stores everything
+# at the same location whoever triggers a pipeline.
+  CUSTOM_CI_BUILDS_DIR: "/usr/workspace/BOUT-GPU/gitlab-runner"
+# Tells Gitlab to recursively update the submodules when cloning the project.
+#  GIT_SUBMODULE_STRATEGY: recursive
+
+##### PROJECT VARIABLES
+# We build the projects in the CI clone directory.
+# Used in script/gitlab/build_and_test.sh script.
+# TODO: add a clean-up mechanism.
+  BUILD_ROOT: ${CI_PROJECT_DIR}
+
+##### SHARED_CI CONFIGURATION
+# GitHub repository coordinates (github.com/ORG/NAME), used in status API URLs.
+  GITHUB_PROJECT_NAME: "BOUT-dev"
+  GITHUB_PROJECT_ORG: "boutproject"
+# Set the build-and-test command.
+# Nested variables are allowed and useful to customize the job command. We
+# prevent variable expansion so that you can define them at job level.
+  JOB_CMD:
+    value: "tests/gitlab/ci-tests.sh"
+    expand: false
+# Override the pattern describing branches that will skip the "draft PR filter
+# test". Add protected branches here. See default value in
+# preliminary-ignore-draft-pr.yml.
+#  ALWAYS_RUN_PATTERN: ""
+
+# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline
+# corresponds to a test batch on a given machine.
+
+# High-level stages; `.machine-check` jobs run in `prerequisites`.
+stages:
+  - prerequisites
+  - build-and-test
+
+# Template for build-and-test sub-pipeline triggers; extenders set CI_MACHINE.
+.build-and-test:
+  stage: build-and-test
+  trigger:
+    include:
+      - local: '.gitlab/custom-jobs-and-variables.yml'
+      - project: 'radiuss/radiuss-shared-ci'
+        ref: 'v2024.07.0'
+        file: 'pipelines/${CI_MACHINE}.yml'
+      # Add your jobs
+      # you can use a local file
+      - local: '.gitlab/jobs/${CI_MACHINE}.yml'
+      # or a file generated in the previous steps
+      # - artifact: '${CI_MACHINE}-jobs.yml'
+      #   job: 'generate-job-file'
+      # (See Umpire CI setup for an example).
+    strategy: depend
+    forward:
+      pipeline_variables: true
+
+include:
+  # Sets ID tokens for every job using `default:`
+  - project: 'lc-templates/id_tokens'
+    file: 'id_tokens.yml'
+  # [Optional] checks preliminary to running the actual CI test
+  - project: 'radiuss/radiuss-shared-ci'
+    ref: 'v2024.07.0'
+    file: 'utilities/preliminary-ignore-draft-pr.yml'
+  # Pipelines subscribed to by the project (defines the per-machine jobs).
+  - local: '.gitlab/subscribed-pipelines.yml'
diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
new file mode 100644
index 0000000000..5af33aee04
--- /dev/null
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -0,0 +1,62 @@
+###############################################################################
+# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS
+# project contributors. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (MIT)
+###############################################################################
+
+# We define the following GitLab pipeline variables:
+variables:
+# In some pipelines we create only one allocation shared among jobs in
+# order to save time and resources. This allocation has to be uniquely
+# named so that we are sure to retrieve it and avoid collisions.
+  ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID}
+
+# Ruby (its jobs are commented out in .gitlab/subscribed-pipelines.yml)
+# Arguments for top level allocation
+  RUBY_SHARED_ALLOC: "--mpi=none --exclusive --reservation=ci --time=20 --nodes=1"
+# Arguments for job level allocation
+  RUBY_JOB_ALLOC: "--mpi=none --reservation=ci --nodes=1"
+# Add variables that should apply to all the jobs on a machine:
+#  RUBY_MY_VAR: "..."
+
+# Poodle (its jobs are commented out in .gitlab/subscribed-pipelines.yml)
+# Arguments for top level allocation
+  POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=10 --nodes=1"
+# Arguments for job level allocation
+  POODLE_JOB_ALLOC: "--nodes=1"
+# Add variables that should apply to all the jobs on a machine:
+#  POODLE_MY_VAR: "..."
+
+# Corona (its jobs are commented out in .gitlab/subscribed-pipelines.yml)
+# Arguments for top level allocation
+# OPTIONAL: "-o per-resource.count=2" allows to get 2 jobs running on each node.
+  CORONA_SHARED_ALLOC: "--exclusive --time-limit=15m --nodes=1"
+# Arguments for job level allocation
+  CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
+# Add variables that should apply to all the jobs on a machine:
+#  CORONA_MY_VAR: "..."
+
+# Tioga (its jobs are commented out in .gitlab/subscribed-pipelines.yml)
+# Arguments for top level allocation
+# OPTIONAL: "-o per-resource.count=2" allows to get 2 jobs running on each node.
+  TIOGA_SHARED_ALLOC: "--queue=pci --exclusive --time-limit=15m --nodes=1"
+# Arguments for job level allocation
+  TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
+# Add variables that should apply to all the jobs on a machine:
+#  TIOGA_MY_VAR: "..."
+
+# Lassen uses a different job scheduler (Spectrum LSF) that does not allow
+# pre-allocation the same way Slurm does. Arguments for job level allocation:
+  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"  # presumably 1 node, 30 min, pci queue - TODO confirm
+# Add variables that should apply to all the jobs on a machine:
+#  LASSEN_MY_VAR: "..."
+
+
+# Configuration shared by build and test jobs specific to this project.
+# Not all configuration can be shared. Here projects can fine tune the
+# CI behavior.
+# See Umpire for an example (export junit test reports).
+.custom_job:
+  variables:
+    JOB_TEMPLATE_CANNOT_BE_EMPTY: "True"
diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml
new file mode 100644
index 0000000000..6363dd9336
--- /dev/null
+++ b/.gitlab/jobs/lassen.yml
@@ -0,0 +1,59 @@
+###############################################################################
+# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS
+# project contributors. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (MIT)
+###############################################################################
+
+# We require projects to define their job command using a variable (JOB_CMD).
+# In customization/gitlab-ci.yml, we encourage defining this variable as
+# non-expandable, so that projects can use nested variables to configure the job
+# command. The caveat is that the reproducer here cannot capture the
+# definition of these variables in a generic fashion. By overriding the
+# following section, projects can specify the variables to define in the
+# reproducer to exactly reproduce the CI build.
+.lassen_reproducer_vars:
+  script:
+    - echo -e "Running on Lassen\n"
+
+# With GitLab CI, included files cannot be empty.
+# TODO: remove when you have at least one job defined.
+variables:
+  INCLUDED_FILE_CANNOT_BE_EMPTY: "True"
+
+###############
+# Explanations:
+###############
+# RADIUSS Shared CI provides a pipeline for each machine, where a template job
+# is provided. Each of your jobs must extend this template to be added to the
+# list of jobs running on the associated machine.
+#
+# The job template then expects you to define the "JOB_CMD" variable with the
+# one line command used to trigger the build and test of your project.
+#
+# We suggest that you set your command in such a way that you can then
+# customize it per job with variables. E.g.:
+#   "./path/to/my_ci_script ${A_VARIABLE}"
+
+## Adding jobs defined by the project.
+## Note: placing the extends section first allows you to override part of the
+## shared implementation if needed (and if you know what you are doing).
+#my-job-name:
+#  extends: .job_on_lassen
+#  variables:
+#    MY_VARIABLE: "my-value"
+
+.base-job:
+  extends: .job_on_lassen
+  before_script:
+    # Update BOUT-configs in the shared directory.
+    - pushd /usr/workspace/BOUT-GPU/BOUT-configs
+    - git pull
+    - popd
+    # Create the environment.
+    - source /usr/workspace/BOUT-GPU/BOUT-configs/lassen/setup-env.sh
+  after_script:  # NOTE(review): confirm CI_BUILDS_DIR is not shared with concurrent jobs
+    - rm -rf ${CI_BUILDS_DIR} ${CI_PROJECT_DIR}
+
+build-test-cuda-minimal:
+  extends: .base-job
diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml
new file mode 100644
index 0000000000..265a344ba8
--- /dev/null
+++ b/.gitlab/subscribed-pipelines.yml
@@ -0,0 +1,91 @@
+###############################################################################
+# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS
+# project contributors. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (MIT)
+###############################################################################
+
+# The template job to test whether a machine is up; on zero nodes up it posts
+# a "failure" commit status to GitHub and fails. Expects CI_MACHINE to be set.
+.machine-check:
+  stage: prerequisites
+  tags: [shell, oslic]
+  variables:
+    GIT_STRATEGY: none
+  script:
+    - |
+      if [[ $(jq '.[env.CI_MACHINE].total_nodes_up' /usr/global/tools/lorenz/data/loginnodeStatus) == 0 ]]
+      then
+        echo -e "\e[31mNo node available on ${CI_MACHINE}\e[0m"
+        curl --url "https://api.github.com/repos/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}/statuses/${CI_COMMIT_SHA}" \
+          --header 'Content-Type: application/json' \
+          --header "authorization: Bearer ${GITHUB_TOKEN}" \
+          --data "{ \"state\": \"failure\", \"target_url\": \"${CI_PIPELINE_URL}\", \"description\": \"GitLab ${CI_MACHINE} down\", \"context\": \"ci/gitlab/${CI_MACHINE}\" }"
+        exit 1
+      fi
+
+###
+# Trigger a build-and-test pipeline for a machine.
+# Comment out the jobs for machines you don’t need.
+###
+
+# RUBY
+#ruby-up-check:
+#  variables:
+#    CI_MACHINE: "ruby"
+#  extends: [.machine-check]
+#
+#ruby-build-and-test:
+#  variables:
+#    CI_MACHINE: "ruby"
+#  needs: [ruby-up-check]
+#  extends: [.build-and-test]
+
+## POODLE
+#poodle-up-check:
+#  variables:
+#    CI_MACHINE: "poodle"
+#  extends: [.machine-check]
+#
+#poodle-build-and-test:
+#  variables:
+#    CI_MACHINE: "poodle"
+#  needs: [poodle-up-check]
+#  extends: [.build-and-test]
+#
+## CORONA
+#corona-up-check:
+#  variables:
+#    CI_MACHINE: "corona"
+#  extends: [.machine-check]
+#
+#corona-build-and-test:
+#  variables:
+#    CI_MACHINE: "corona"
+#  needs: [corona-up-check]
+#  extends: [.build-and-test]
+#
+## TIOGA
+#tioga-up-check:
+#  variables:
+#    CI_MACHINE: "tioga"
+#  extends: [.machine-check]
+#
+#tioga-build-and-test:
+#  variables:
+#    CI_MACHINE: "tioga"
+#  needs: [tioga-up-check]
+#  extends: [.build-and-test]
+
+# LASSEN (the only machine currently enabled)
+lassen-up-check:
+  variables:
+    CI_MACHINE: "lassen"
+  extends: [.machine-check]
+
+lassen-build-and-test:
+  variables:
+    CI_MACHINE: "lassen"
+  needs: [lassen-up-check]
+  extends: [.build-and-test]
+
diff --git a/tests/gitlab/ci-tests.sh b/tests/gitlab/ci-tests.sh
new file mode 100755
index 0000000000..a237d85be9
--- /dev/null
+++ b/tests/gitlab/ci-tests.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+set -e  # abort on the first failing command
+
+echo "===> Building BOUT-dev CUDA minimal"
+cmake -S . -B build \
+      -DCMAKE_C_COMPILER=gcc \
+      -DCMAKE_CXX_COMPILER=g++ \
+      -DBOUT_ENABLE_RAJA=on \
+      -DBOUT_ENABLE_UMPIRE=on \
+      -DBOUT_ENABLE_CUDA=on \
+      -DCMAKE_CUDA_ARCHITECTURES=70 \
+      -DCUDA_ARCH=compute_70,code=sm_70 \
+      -DBOUT_ENABLE_WARNINGS=off \
+      -DBOUT_USE_SYSTEM_FMT=on
+
+pushd build
+make -j "$(nproc)"  # bare -j is unbounded; cap at the available CPUs
+
+echo "===> Building and running blob2d-outerloop"
+pushd examples/blob2d-outerloop
+make -j "$(nproc)"
+# Check the output using Sim Time and RHS evals. Must be careful splitting the
+if ./blob2d-outerloop | grep -Pzoq "(?s)Sim Time \| RHS evals \| Wall Time \| Calc Inv Comm I/O SOLVER\n.*\n"\ +"0\.000e\+00 2 .*"\ +"5\.000e\+01 53 .*"\ +"1\.000e\+02 17 .*"\ +"1\.500e\+02 27 .*"; then + echo "Sim Time and RHS evals match" +else + echo "Sim Time and RHS evals DO NOT match" + exit 1 +fi +popd + +echo "===> Building and running elm-pb-outerloop" +pushd examples/elm-pb-outerloop +make -j +if ./elm_pb_outerloop | grep -Pzoq "(?s)Sim Time \| RHS evals \| Wall Time \| Calc Inv Comm I/O SOLVER\n.*\n"\ +"0\.000e\+00 2 .*"\ +"1\.000e\+00 44 .*"\ +"2\.000e\+00 37 .*"\ +"3\.000e\+00 37 .*"\ +"4\.000e\+00 37 .*"\ +"5\.000e\+00 30 .*"\ +"6\.000e\+00 31 .*"\ +"7\.000e\+00 31 .*"\ +"8\.000e\+00 25 .*"\ +"9\.000e\+00 21 .*"\ +"1\.000e\+01 24 .*"\ +"1\.100e\+01 19 .*"\ +"1\.200e\+01 25 .*"\ +"1\.300e\+01 25 .*"\ +"1\.400e\+01 25 .*"\ +"1\.500e\+01 25 .*"\ +"1\.600e\+01 25 .*"\ +"1\.700e\+01 25 .*"\ +"1\.800e\+01 25 .*"\ +"1\.900e\+01 20 .*"\ +"2\.000e\+01 29 .*"\ +"2\.100e\+01 29 .*"\ +"2\.200e\+01 29 .*"\ +"2\.300e\+01 29 .*"\ +"2\.400e\+01 29 .*"\ +"2\.500e\+01 29 .*"\ +"2\.600e\+01 29 .*"\ +"2\.700e\+01 22 .*"\ +"2\.800e\+01 29 .*"\ +"2\.900e\+01 29 .*"\ +"3\.000e\+01 29 .*"\ +"3\.100e\+01 29 .*"\ +"3\.200e\+01 29 .*"\ +"3\.300e\+01 32 .*"\ +"3\.400e\+01 25 .*"\ +"3\.500e\+01 33 .*"\ +"3\.600e\+01 33 .*"\ +"3\.700e\+01 39 .*"\ +"3\.800e\+01 31 .*"\ +"3\.900e\+01 31 .*"\ +"4\.000e\+01 36 .*"; then + echo "Sim Time and RHS evals match" +else + echo "Sim Time and RHS evals DO NOT match" + exit 1 +fi +popd + +popd \ No newline at end of file