
[WIP] Initial AMDGPU support #26

Draft
wants to merge 5 commits into main
Changes from 1 commit
new testing infrastructure works for NVIDIABackend()
carstenbauer committed Aug 18, 2023
commit 5c155247a2543c1a5718b3630069a6d2ec2c6b81
22 changes: 18 additions & 4 deletions .gitlab-ci.yml
@@ -2,24 +2,23 @@ stages:
  - test
  - documentation
variables:
  SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
  JULIA_NUM_THREADS: "10"
  JULIA_EXCLUSIVE: "1"
  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
  MKL_DYNAMIC: "false"
  MKL_NUM_THREADS: "1"
default:
  tags:
    - bauerc-noctua2

# Generates code coverage
julia/1.9:
julia/1.9/NVIDIA:
  stage: test
  rules:
    - changes:
        - "README.md"
    - when: on_success
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
@@ -28,6 +27,19 @@ julia/1.9:
    - julia --color=yes --project=test/coverage test/coverage/coverage.jl
  allow_failure: false

julia/1.9/AMD:
  stage: test
  rules:
    - changes:
        - "README.md"
    - when: on_success
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
    - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);'
  allow_failure: true

# Documentation
build-and-deploy-docs:
@@ -37,6 +49,8 @@ build-and-deploy-docs:
    - pushes
    - tags
    - external_pull_requests
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
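
Aside: the job-level `variables:` blocks above rely on standard GitLab CI precedence, where a variable defined inside a job shadows a global variable of the same name. This is what lets the NVIDIA jobs keep requesting A100 nodes while the new AMD job requests the `hacc` partition. A minimal sketch of the pattern (job name and values are illustrative, not from this PR):

```yaml
variables:
  SCHEDULER_PARAMETERS: "default-slurm-flags"    # global fallback

example-gpu-job:
  variables:
    SCHEDULER_PARAMETERS: "-p gpu --gres=gpu:1"  # shadows the global value
  script:
    - echo "$SCHEDULER_PARAMETERS"               # prints the job-level value
```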
3 changes: 2 additions & 1 deletion ext/AMDGPUExt/AMDGPUExt.jl
@@ -24,7 +24,8 @@ using GPUInspector:
    MonitoringResults,
    _defaultylims,
    @unroll,
    AMDBackend
    AMDBackend,
    getstdout

include("utility.jl")
# include("stresstests.jl")
8 changes: 4 additions & 4 deletions ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -2,7 +2,7 @@ function GPUInspector.ngpus(::AMDBackend)
    return length(AMDGPU.devices())
end

function GPUInspector.gpus(::AMDBackend; io::IO=stdout)
function GPUInspector.gpus(::AMDBackend; io=getstdout())
    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
    devs = AMDGPU.devices()
    if isempty(devs)
@@ -32,18 +32,18 @@ Print out detailed information about the AMD GPU with the given `deviceid`.

(This method is from the AMD backend.)
"""
function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout)
function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout())
    0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id."))
    return gpuinfo(HIPDevice(deviceid); io)
end
function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout)
function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout())
    # printing
    println(io, "Device: $dev \n")
    show(io, AMDGPU.HIP.properties(dev))
    return nothing
end

function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout)
function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout())
    # check p2p access
    ndevs = ngpus(AMDBackend())
    if ndevs <= 1
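
Note on the recurring signature change: replacing `io::IO=stdout` with `io=getstdout()` does two things. It drops the `::IO` restriction, and, because keyword defaults in Julia are evaluated at call time, it makes each call re-read `GPUInspector.DEFAULT_IO` instead of capturing the `stdout` binding once at method definition. A minimal self-contained sketch of this mechanism (the names `TARGET`, `gettarget`, and `hello` are illustrative, not from the PR):

```julia
# Keyword defaults are evaluated at call time, so a Ref-backed
# default can be redirected between calls.
const TARGET = Ref{Union{IO,Nothing}}(nothing)
gettarget() = something(TARGET[], stdout)  # first non-nothing value wins

hello(; io=gettarget()) = println(io, "hello")

hello()             # prints to stdout (TARGET[] is nothing)
buf = IOBuffer()
TARGET[] = buf
hello()             # now prints into buf
TARGET[] = nothing  # restore the stdout fallback
String(take!(buf))  # == "hello\n"
```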
4 changes: 2 additions & 2 deletions ext/AMDGPUExt/implementations/host2device_bandwidth.jl
@@ -4,7 +4,7 @@ function GPUInspector.host2device_bandwidth(
    dtype=Cchar,
    DtoDfactor=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -36,7 +36,7 @@ function _perform_memcpy(
    stats=false,
    DtoDfactor=false,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
    ts = zeros(nbench)
10 changes: 5 additions & 5 deletions ext/AMDGPUExt/implementations/membw.jl
@@ -1,5 +1,5 @@
# function theoretical_memory_bandwidth(
#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
# )
#     max_mem_clock_rate =
#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth(
    verbose=true,
    DtoDfactor=true,
    device=AMDGPU.device(),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)::Float64
    AMDGPU.device!(device) do
@@ -40,7 +40,7 @@ function GPUInspector.memory_bandwidth_scaling(
    device=AMDGPU.device(),
    sizes=logspace(1, exp2(30), 10),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    bandwidths = zeros(length(sizes))
@@ -85,7 +85,7 @@ end
#     dtype=Float32,
#     cublas=true,
#     verbose=true,
#     io::IO=stdout,
#     io=getstdout(),
# )::Float64
#     device!(device) do
#         a = dtype(pi)
@@ -130,7 +130,7 @@ end
#     device=CUDA.device(),
#     sizes=[2^20 * i for i in 10:10:300],
#     verbose=true,
#     io::IO=stdout,
#     io=getstdout(),
#     kwargs...,
# )
#     # sizes = [2^20 * i for i in 8:128] # V100
3 changes: 2 additions & 1 deletion ext/CUDAExt/CUDAExt.jl
@@ -24,7 +24,8 @@ using GPUInspector:
    MonitoringResults,
    _defaultylims,
    @unroll,
    NVIDIABackend
    NVIDIABackend,
    getstdout

# for convenience
const BFloat16 = CUDA.BFloat16
8 changes: 4 additions & 4 deletions ext/CUDAExt/implementations/gpuinfo.jl
@@ -2,7 +2,7 @@ function GPUInspector.ngpus(::NVIDIABackend)
    length(CUDA.devices())
end

function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout)
function GPUInspector.gpus(::NVIDIABackend; io=getstdout())
    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
    devs = devices()
    if isempty(devs)
@@ -45,11 +45,11 @@ Heavily inspired by the CUDA sample "deviceQueryDrv.cpp".

(This method is from the NVIDIA Backend.)
"""
function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout)
function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout())
    0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id."))
    return gpuinfo(CuDevice(deviceid); io)
end
function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout())
    # query
    mp = nmultiprocessors(dev)
    cores = ncudacores(dev)
@@ -216,7 +216,7 @@ function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::
    return nothing
end

function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout)
function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout())
    # check p2p access
    ndevs = ngpus(NVIDIABackend())
    if ndevs <= 1
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/host2device_bandwidth.jl
@@ -3,7 +3,7 @@ function GPUInspector.host2device_bandwidth(::NVIDIABackend;
    dtype=Cchar,
    DtoDfactor=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -42,7 +42,7 @@ function _perform_memcpy(
    stats=false,
    DtoDfactor=false,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    NVTX.@range "host2dev: $title" begin
        sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
10 changes: 5 additions & 5 deletions ext/CUDAExt/implementations/membw.jl
@@ -1,5 +1,5 @@
function GPUInspector.theoretical_memory_bandwidth(
    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
)
    max_mem_clock_rate =
        CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth(
    verbose=true,
    DtoDfactor=true,
    device=CUDA.device(),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)::Float64
    device!(device) do
@@ -46,7 +46,7 @@ function GPUInspector.memory_bandwidth_scaling(
    device=CUDA.device(),
    sizes=logspace(1, exp2(30), 10),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    bandwidths = zeros(length(sizes))
@@ -91,7 +91,7 @@ function GPUInspector.memory_bandwidth_saxpy(
    dtype=Float32,
    cublas=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)::Float64
    device!(device) do
        a = dtype(pi)
@@ -136,7 +136,7 @@ function GPUInspector.memory_bandwidth_saxpy_scaling(
    device=CUDA.device(),
    sizes=[2^20 * i for i in 10:10:300],
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    # sizes = [2^20 * i for i in 8:128] # V100
6 changes: 3 additions & 3 deletions ext/CUDAExt/implementations/p2p_bandwidth.jl
@@ -9,7 +9,7 @@ function GPUInspector.p2p_bandwidth(
    dtype=Float32,
    src=0,
    dst=1,
    io::IO=stdout,
    io=getstdout(),
)
    if ngpus(NVIDIABackend()) < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -66,7 +66,7 @@ function GPUInspector.p2p_bandwidth(
    return bw_max
end

function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...)
function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...)
    ngpus = length(CUDA.devices())
    if ngpus < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -93,7 +93,7 @@ function GPUInspector.p2p_bandwidth_bidirectional(
    dev1=0,
    dev2=1,
    repeat=100,
    io::IO=stdout,
    io=getstdout(),
)
    if ngpus(NVIDIABackend()) < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/peakflops_gpu.jl
@@ -16,7 +16,7 @@ function GPUInspector.theoretical_peakflops_gpu(
    tensorcores=hastensorcores(),
    dtype=tensorcores ? Float16 : Float32,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    if tensorcores
        max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype)
@@ -109,7 +109,7 @@ function GPUInspector.peakflops_gpu(
    tensorcores=hastensorcores(),
    verbose=true,
    dtype=tensorcores ? Float16 : Float32,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    if tensorcores
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/stresstest.jl
@@ -12,7 +12,7 @@ function GPUInspector.stresstest(
    clearmem=false,
    monitoring=false,
    batch_duration=nothing,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    logger = ConsoleLogger(io)
@@ -69,7 +69,7 @@ function GPUInspector.stresstest(
    Δt = @elapsed _run_stresstests(ts; verbose, kwargs...)
    if clearmem
        verbose && @info("Clearing GPU memory.")
        clear_all_gpus_memory(devices)
        GPUInspector.clear_all_gpus_memory(; devices=devices)
    end
    verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.")
    if monitoring
2 changes: 1 addition & 1 deletion ext/CUDAExt/peakflops_gpu_fmas.jl
@@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(;
    nkernel=5,
    device::CuDevice=CUDA.device(),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        d_a = CUDA.rand(dtype, size)
8 changes: 4 additions & 4 deletions ext/CUDAExt/peakflops_gpu_matmul.jl
@@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling(
    device=CUDA.device(),
    verbose=true,
    sizes=2 .^ (10:15),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
) where {F}
    flops = zeros(length(sizes))
    for (i, s) in enumerate(sizes)
        flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...)
        clear_gpu_memory(device)
        GPUInspector.clear_gpu_memory(; device=device)
    end
    if verbose
        peak_val, idx = findmax(flops)
@@ -64,7 +64,7 @@ function peakflops_gpu_matmul(;
    nmatmuls=5,
    nbench=5,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        C = CUDA.zeros(dtype, size, size)
@@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(;
    nmatmuls=5,
    nbench=5,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        C = CUDA.zeros(dtype, size, size)
2 changes: 1 addition & 1 deletion ext/CUDAExt/peakflops_gpu_wmmas.jl
@@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(;
    nkernel=10,
    verbose=true,
    dtype=Float16,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        if Symbol(dtype) == :Float16
3 changes: 3 additions & 0 deletions src/GPUInspector.jl
@@ -16,6 +16,9 @@ using CpuId: cachesize
using HDF5: h5open
using Glob: glob

const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing)
getstdout() = something(DEFAULT_IO[], stdout)

include("backends.jl")
include("UnitPrefixedBytes.jl")
include("utility.jl")