
[WIP] Initial AMDGPU support #26

Draft
wants to merge 5 commits into main
Changes from 1 commit
new testing infrastructure works for NVIDIABackend()
carstenbauer committed Aug 18, 2023
commit 5c155247a2543c1a5718b3630069a6d2ec2c6b81
22 changes: 18 additions & 4 deletions .gitlab-ci.yml
@@ -2,24 +2,23 @@ stages:
  - test
  - documentation
variables:
  SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
  JULIA_NUM_THREADS: "10"
  JULIA_EXCLUSIVE: "1"
  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
  MKL_DYNAMIC: "false"
  MKL_NUM_THREADS: "1"
default:
  tags:
    - bauerc-noctua2

# Generates code coverage
julia/1.9:
julia/1.9/NVIDIA:
  stage: test
  rules:
    - changes:
        - "README.md"
    - when: on_success
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
@@ -28,6 +27,19 @@ julia/1.9:
    - julia --color=yes --project=test/coverage test/coverage/coverage.jl
  allow_failure: false

julia/1.9/AMD:
  stage: test
  rules:
    - changes:
        - "README.md"
    - when: on_success
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
    - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);'
  allow_failure: true

# Documentation
build-and-deploy-docs:
@@ -37,6 +49,8 @@ build-and-deploy-docs:
    - pushes
    - tags
    - external_pull_requests
  variables:
    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
  script:
    - /bin/bash -l
    - module load $JULIA_1_9
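
Aside: the job-level `variables:` blocks above rely on standard GitLab CI precedence, where a variable defined inside a job shadows a global variable of the same name. This is what lets the NVIDIA jobs keep requesting A100 nodes while the new AMD job requests the `hacc` partition. A minimal sketch of the pattern (job name and values are illustrative, not from this PR):

```yaml
variables:
  SCHEDULER_PARAMETERS: "default-slurm-flags"    # global fallback

example-gpu-job:
  variables:
    SCHEDULER_PARAMETERS: "-p gpu --gres=gpu:1"  # shadows the global value
  script:
    - echo "$SCHEDULER_PARAMETERS"               # prints the job-level value
```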
3 changes: 2 additions & 1 deletion ext/AMDGPUExt/AMDGPUExt.jl
@@ -24,7 +24,8 @@ using GPUInspector:
    MonitoringResults,
    _defaultylims,
    @unroll,
    AMDBackend
    AMDBackend,
    getstdout

include("utility.jl")
# include("stresstests.jl")
8 changes: 4 additions & 4 deletions ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -2,7 +2,7 @@ function GPUInspector.ngpus(::AMDBackend)
    return length(AMDGPU.devices())
end

function GPUInspector.gpus(::AMDBackend; io::IO=stdout)
function GPUInspector.gpus(::AMDBackend; io=getstdout())
    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
    devs = AMDGPU.devices()
    if isempty(devs)
@@ -32,18 +32,18 @@ Print out detailed information about the AMD GPU with the given `deviceid`.

(This method is from the AMD backend.)
"""
function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout)
function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout())
    0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id."))
    return gpuinfo(HIPDevice(deviceid); io)
end
function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout)
function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout())
    # printing
    println(io, "Device: $dev \n")
    show(io, AMDGPU.HIP.properties(dev))
    return nothing
end

function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout)
function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout())
    # check p2p access
    ndevs = ngpus(AMDBackend())
    if ndevs <= 1
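
Note on the recurring signature change: replacing `io::IO=stdout` with `io=getstdout()` does two things. It drops the `::IO` restriction, and, because keyword defaults in Julia are evaluated at call time, it makes each call re-read `GPUInspector.DEFAULT_IO` instead of capturing the `stdout` binding once at method definition. A minimal self-contained sketch of this mechanism (the names `TARGET`, `gettarget`, and `hello` are illustrative, not from the PR):

```julia
# Keyword defaults are evaluated at call time, so a Ref-backed
# default can be redirected between calls.
const TARGET = Ref{Union{IO,Nothing}}(nothing)
gettarget() = something(TARGET[], stdout)  # first non-nothing value wins

hello(; io=gettarget()) = println(io, "hello")

hello()             # prints to stdout (TARGET[] is nothing)
buf = IOBuffer()
TARGET[] = buf
hello()             # now prints into buf
TARGET[] = nothing  # restore the stdout fallback
String(take!(buf))  # == "hello\n"
```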
4 changes: 2 additions & 2 deletions ext/AMDGPUExt/implementations/host2device_bandwidth.jl
@@ -4,7 +4,7 @@ function GPUInspector.host2device_bandwidth(
    dtype=Cchar,
    DtoDfactor=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -36,7 +36,7 @@ function _perform_memcpy(
    stats=false,
    DtoDfactor=false,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
    ts = zeros(nbench)
10 changes: 5 additions & 5 deletions ext/AMDGPUExt/implementations/membw.jl
@@ -1,5 +1,5 @@
# function theoretical_memory_bandwidth(
#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
# )
#     max_mem_clock_rate =
#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth(
    verbose=true,
    DtoDfactor=true,
    device=AMDGPU.device(),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)::Float64
    AMDGPU.device!(device) do
@@ -40,7 +40,7 @@ function GPUInspector.memory_bandwidth_scaling(
    device=AMDGPU.device(),
    sizes=logspace(1, exp2(30), 10),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    bandwidths = zeros(length(sizes))
@@ -85,7 +85,7 @@ end
#     dtype=Float32,
#     cublas=true,
#     verbose=true,
#     io::IO=stdout,
#     io=getstdout(),
# )::Float64
#     device!(device) do
#         a = dtype(pi)
@@ -130,7 +130,7 @@ end
#     device=CUDA.device(),
#     sizes=[2^20 * i for i in 10:10:300],
#     verbose=true,
#     io::IO=stdout,
#     io=getstdout(),
#     kwargs...,
# )
#     # sizes = [2^20 * i for i in 8:128] # V100
3 changes: 2 additions & 1 deletion ext/CUDAExt/CUDAExt.jl
@@ -24,7 +24,8 @@ using GPUInspector:
    MonitoringResults,
    _defaultylims,
    @unroll,
    NVIDIABackend
    NVIDIABackend,
    getstdout

# for convenience
const BFloat16 = CUDA.BFloat16
8 changes: 4 additions & 4 deletions ext/CUDAExt/implementations/gpuinfo.jl
@@ -2,7 +2,7 @@ function GPUInspector.ngpus(::NVIDIABackend)
    length(CUDA.devices())
end

function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout)
function GPUInspector.gpus(::NVIDIABackend; io=getstdout())
    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
    devs = devices()
    if isempty(devs)
@@ -45,11 +45,11 @@ Heavily inspired by the CUDA sample "deviceQueryDrv.cpp".

(This method is from the NVIDIA Backend.)
"""
function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout)
function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout())
    0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id."))
    return gpuinfo(CuDevice(deviceid); io)
end
function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout())
    # query
    mp = nmultiprocessors(dev)
    cores = ncudacores(dev)
@@ -216,7 +216,7 @@ function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::
    return nothing
end

function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout)
function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout())
    # check p2p access
    ndevs = ngpus(NVIDIABackend())
    if ndevs <= 1
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/host2device_bandwidth.jl
@@ -3,7 +3,7 @@ function GPUInspector.host2device_bandwidth(::NVIDIABackend;
    dtype=Cchar,
    DtoDfactor=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -42,7 +42,7 @@ function _perform_memcpy(
    stats=false,
    DtoDfactor=false,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    NVTX.@range "host2dev: $title" begin
        sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
10 changes: 5 additions & 5 deletions ext/CUDAExt/implementations/membw.jl
@@ -1,5 +1,5 @@
function GPUInspector.theoretical_memory_bandwidth(
    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
)
    max_mem_clock_rate =
        CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth(
    verbose=true,
    DtoDfactor=true,
    device=CUDA.device(),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)::Float64
    device!(device) do
@@ -46,7 +46,7 @@ function GPUInspector.memory_bandwidth_scaling(
    device=CUDA.device(),
    sizes=logspace(1, exp2(30), 10),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    bandwidths = zeros(length(sizes))
@@ -91,7 +91,7 @@ function GPUInspector.memory_bandwidth_saxpy(
    dtype=Float32,
    cublas=true,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)::Float64
    device!(device) do
        a = dtype(pi)
@@ -136,7 +136,7 @@ function GPUInspector.memory_bandwidth_saxpy_scaling(
    device=CUDA.device(),
    sizes=[2^20 * i for i in 10:10:300],
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    # sizes = [2^20 * i for i in 8:128] # V100
6 changes: 3 additions & 3 deletions ext/CUDAExt/implementations/p2p_bandwidth.jl
@@ -9,7 +9,7 @@ function GPUInspector.p2p_bandwidth(
    dtype=Float32,
    src=0,
    dst=1,
    io::IO=stdout,
    io=getstdout(),
)
    if ngpus(NVIDIABackend()) < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -66,7 +66,7 @@ function GPUInspector.p2p_bandwidth(
    return bw_max
end

function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...)
function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...)
    ngpus = length(CUDA.devices())
    if ngpus < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -93,7 +93,7 @@ function GPUInspector.p2p_bandwidth_bidirectional(
    dev1=0,
    dev2=1,
    repeat=100,
    io::IO=stdout,
    io=getstdout(),
)
    if ngpus(NVIDIABackend()) < 2
        error("At least 2 GPUs are needed for the P2P benchmark.")
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/peakflops_gpu.jl
@@ -16,7 +16,7 @@ function GPUInspector.theoretical_peakflops_gpu(
    tensorcores=hastensorcores(),
    dtype=tensorcores ? Float16 : Float32,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    if tensorcores
        max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype)
@@ -109,7 +109,7 @@ function GPUInspector.peakflops_gpu(
    tensorcores=hastensorcores(),
    verbose=true,
    dtype=tensorcores ? Float16 : Float32,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    if tensorcores
4 changes: 2 additions & 2 deletions ext/CUDAExt/implementations/stresstest.jl
@@ -12,7 +12,7 @@ function GPUInspector.stresstest(
    clearmem=false,
    monitoring=false,
    batch_duration=nothing,
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
)
    logger = ConsoleLogger(io)
@@ -69,7 +69,7 @@ function GPUInspector.stresstest(
    Δt = @elapsed _run_stresstests(ts; verbose, kwargs...)
    if clearmem
        verbose && @info("Clearing GPU memory.")
        clear_all_gpus_memory(devices)
        GPUInspector.clear_all_gpus_memory(; devices=devices)
    end
    verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.")
    if monitoring
2 changes: 1 addition & 1 deletion ext/CUDAExt/peakflops_gpu_fmas.jl
@@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(;
    nkernel=5,
    device::CuDevice=CUDA.device(),
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        d_a = CUDA.rand(dtype, size)
8 changes: 4 additions & 4 deletions ext/CUDAExt/peakflops_gpu_matmul.jl
@@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling(
    device=CUDA.device(),
    verbose=true,
    sizes=2 .^ (10:15),
    io::IO=stdout,
    io=getstdout(),
    kwargs...,
) where {F}
    flops = zeros(length(sizes))
    for (i, s) in enumerate(sizes)
        flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...)
        clear_gpu_memory(device)
        GPUInspector.clear_gpu_memory(; device=device)
    end
    if verbose
        peak_val, idx = findmax(flops)
@@ -64,7 +64,7 @@ function peakflops_gpu_matmul(;
    nmatmuls=5,
    nbench=5,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        C = CUDA.zeros(dtype, size, size)
@@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(;
    nmatmuls=5,
    nbench=5,
    verbose=true,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        C = CUDA.zeros(dtype, size, size)
2 changes: 1 addition & 1 deletion ext/CUDAExt/peakflops_gpu_wmmas.jl
@@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(;
    nkernel=10,
    verbose=true,
    dtype=Float16,
    io::IO=stdout,
    io=getstdout(),
)
    device!(device) do
        if Symbol(dtype) == :Float16
3 changes: 3 additions & 0 deletions src/GPUInspector.jl
@@ -16,6 +16,9 @@ using CpuId: cachesize
using HDF5: h5open
using Glob: glob

const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing)
getstdout() = something(DEFAULT_IO[], stdout)

include("backends.jl")
include("UnitPrefixedBytes.jl")
include("utility.jl")