diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7ae5973..5900b30 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,32 +2,45 @@ stages:
   - test
   - documentation
 variables:
-  SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
   JULIA_NUM_THREADS: "10"
   JULIA_EXCLUSIVE: "1"
-  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
-  MKL_DYNAMIC: "false"
-  MKL_NUM_THREADS: "1"
+  JULIAHPC_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIA_1_9: "lang/Julia/1.9.2-linux-x86_64"
 default:
   tags:
     - bauerc-noctua2

 # Generates code coverage
-julia/1.9:
+julia/1.9/NVIDIA:
   stage: test
   rules:
     - changes:
        - "README.md"
     - when: on_success
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = true);'
     - julia --color=yes --project=test/coverage -e 'import Pkg; Pkg.instantiate()'
     - julia --color=yes --project=test/coverage test/coverage/coverage.jl
   allow_failure: false
+julia/1.9/AMD:
+  stage: test
+  rules:
+    - changes:
+       - "README.md"
+    - when: on_success
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:20:00 -A pc2-mitarbeiter -p hacc --exclusive"
+  script:
+    - /bin/bash -l
+    - module load $JULIA_1_9
+    - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);'
+  allow_failure: true

 # Documentation
 build-and-deploy-docs:
@@ -37,9 +50,11 @@ build-and-deploy-docs:
     - pushes
     - tags
     - external_pull_requests
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - cd docs
     - julia --color=yes build_docs.jl
   allow_failure: false
diff --git a/Project.toml b/Project.toml
index 053af89..e478059 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,20 +14,22 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"

 [extensions]
 CUDAExt = "CUDA"
+AMDGPUExt = "AMDGPU"
 CairoMakieExt = "CairoMakie"

 [compat]
+AMDGPU = "0.5.5"
 CUDA = "3.8.4, 3.12, 4.4"
 CairoMakie = "0.7, 0.10.7"
 CpuId = "0.3"
@@ -35,18 +37,16 @@ DocStringExtensions = "0.9"
 Glob = "1.3"
 HDF5 = "0.16"
 NVTX = "0.3"
-Reexport = "1.2"
-TestItemRunner = "0.2"
 ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7"
 UnicodePlots = "2.8, 3"
 julia = "1.9"

 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

 [targets]
-test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "TestItemRunner"]
+test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU"]
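With AMDGPU registered under [weakdeps] and wired to AMDGPUExt under [extensions], Julia 1.9+ compiles and loads the extension code only once AMDGPU.jl is loaded into the session. A minimal sketch of the intended behavior (the comments are illustrative, not actual REPL output):

    using GPUInspector   # core package only
    using AMDGPU         # triggers loading of the AMDGPUExt extension
    backend()            # AMDBackend(), set in AMDGPUExt.__init__ (see below)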
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
new file mode 100644
index 0000000..2edd10c
--- /dev/null
+++ b/ext/AMDGPUExt/AMDGPUExt.jl
@@ -0,0 +1,76 @@
+module AMDGPUExt
+
+using GPUInspector
+using AMDGPU
+using AMDGPU: device, device!, devices
+
+# stdlibs etc.
+using Base: UUID
+using Statistics
+using Logging
+using LinearAlgebra
+
+# pkgs
+using UnicodePlots
+using ThreadPinning
+
+# for usage in AMDGPUExt
+using GPUInspector:
+    logspace,
+    ismonitoring,
+    _monitoring!,
+    _set_monitoring_task,
+    _get_monitoring_task,
+    MonitoringResults,
+    _defaultylims,
+    @unroll,
+    AMDBackend,
+    getstdout
+
+include("utility.jl")
+# include("stresstests.jl")
+# include("peakflops_gpu_fmas.jl")
+# include("peakflops_gpu_wmmas.jl")
+# include("peakflops_gpu_matmul.jl")
+include("implementations/general.jl")
+include("implementations/gpuinfo.jl")
+# include("implementations/p2p_bandwidth.jl")
+include("implementations/host2device_bandwidth.jl")
+include("implementations/membw.jl")
+# include("implementations/stresstest.jl")
+# include("implementations/monitoring.jl")
+# include("implementations/peakflops_gpu.jl")
+
+function __init__()
+    GPUInspector.AMDGPUJL_LOADED[] = true
+    GPUInspector.backend!(AMDBackend())
+    GPUInspector.AMDGPUExt = Base.get_extension(GPUInspector, :AMDGPUExt)
+    return nothing
+end
+
+function backendinfo(::AMDBackend)
+    # somewhat crude way to figure out which API functions are implemented :)
+    funcs = String[]
+    impl_dir = joinpath(@__DIR__, "implementations/")
+    for f in readdir(impl_dir)
+        lines = readlines(joinpath(impl_dir, f))
+        func_lines = filter(startswith("function"), lines)
+        for fl in func_lines
+            fname = strip(split(split(fl, "function")[2], "(")[1])
+            if startswith(fname, "_") || startswith(fname, "Base")
+                continue
+            end
+            if fname in funcs # avoid duplicates
+                continue
+            end
+            push!(funcs, fname)
+        end
+    end
+    println("Implemented API functions for AMDBackend:")
+    for f in funcs
+        println("\t", f)
+    end
+    return nothing
+end
+
+end # module
diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl
new file mode 100644
index 0000000..4735281
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/general.jl
@@ -0,0 +1,21 @@
+function GPUInspector.functional(::AMDBackend; verbose=true)
+    if AMDGPU.functional()
+        verbose && @info("AMDGPU.jl is functional.")
+        working = true
+    else
+        verbose && @info("AMDGPU.jl is not functional.")
+        working = false
+    end
+    return working
+end
+
+function GPUInspector.clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        AMDGPU.HIP.reclaim()
+    end
+    return nothing
+end
+
+GPUInspector.device(::AMDBackend) = AMDGPU.device()
+GPUInspector.devices(::AMDBackend) = AMDGPU.devices()
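Note the pattern used throughout the new extension: instead of importing names, methods are attached by qualification (GPUInspector.functional, GPUInspector.clear_gpu_memory, ...), and the exported stubs forward to the active backend. A usage sketch, assuming the AMD backend is active:

    using GPUInspector, AMDGPU
    functional()         # -> GPUInspector.functional(AMDBackend(); ...)
    clear_gpu_memory()   # -> GPUInspector.clear_gpu_memory(AMDBackend(); ...)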
diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl
new file mode 100644
index 0000000..64c4d70
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -0,0 +1,70 @@
+function GPUInspector.ngpus(::AMDBackend)
+    return length(AMDGPU.devices())
+end
+
+function GPUInspector.gpus(::AMDBackend; io=getstdout())
+    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
+    devs = AMDGPU.devices()
+    if isempty(devs)
+        println(io, "No AMD devices found.")
+    elseif length(devs) == 1
+        println(io, "1 device:")
+    else
+        println(io, length(devs), " devices:")
+    end
+    for (i, dev) in enumerate(devs)
+        mem_free, mem_tot = AMDGPU.device!(dev) do
+            AMDGPU.Runtime.Mem.info()
+        end
+        println(
+            io,
+            "  $(_gpuid(dev)): ",
+            repr(dev),
+            " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)",
+        )
+    end
+end
+
+"""
+    gpuinfo(deviceid::Integer)
+
+Print out detailed information about the AMD GPU with the given `deviceid`.
+
+(This method is from the AMD backend.)
+"""
+function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout())
+    0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id."))
+    return gpuinfo(HIPDevice(deviceid); io)
+end
+function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout())
+    # printing
+    println(io, "Device: $dev \n")
+    show(io, AMDGPU.HIP.properties(dev))
+    return nothing
+end
+
+function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout())
+    # check p2p access
+    ndevs = ngpus(AMDBackend())
+    if ndevs <= 1
+        error("Only a single GPU available.")
+    else
+        devs = AMDGPU.devices()
+        mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs)
+        for i in 1:ndevs
+            for j in 1:ndevs
+                if i != j
+                    mat_p2p_can_access[i, j] = Bool(AMDGPU.HIP.can_access_peer(devs[i], devs[j]))
+                else
+                    mat_p2p_can_access[i, j] = false
+                end
+            end
+        end
+
+        printstyled(io, "P2P Can Access:\n"; bold=true)
+        show(io, "text/plain", mat_p2p_can_access)
+        println(io)
+        println(io)
+    end
+    return nothing
+end
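With these methods in place the generic inspection calls work unchanged on AMD hardware, e.g. (a sketch; output depends on the machine, and device ids are 0-based as validated above):

    gpus()                 # one line per HIP device incl. free/total memory
    gpuinfo()              # HIP properties of the current device
    gpuinfo(0)             # the same, selected by device id
    gpuinfo_p2p_access()   # Bool matrix of peer access (requires ≥ 2 GPUs)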
diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl
new file mode 100644
index 0000000..8095f45
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl
@@ -0,0 +1,85 @@
+function GPUInspector.host2device_bandwidth(
+    ::AMDBackend;
+    memsize::UnitPrefixedBytes=GiB(0.5),
+    dtype=Cchar,
+    DtoDfactor=true,
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    N = Int(bytes(memsize) ÷ sizeof(dtype))
+    mem_host = rand(dtype, N)
+    # mem_host_pinned = Mem.pin(rand(dtype, N)) # TODO
+    mem_gpu = AMDGPU.rand(dtype, N)
+
+    _perform_memcpy(mem_host, mem_gpu; title="Host <-> Device", verbose, io=io, kwargs...)
+    verbose && println(io)
+    # _perform_memcpy(
+    #     mem_host_pinned,
+    #     mem_gpu;
+    #     title="Host (pinned) <-> Device",
+    #     verbose,
+    #     io=io,
+    #     kwargs...,
+    # )
+    # verbose && println()
+    # _perform_memcpy(mem_gpu, mem_gpu2; title="Device <-> Device (same device)", DtoDfactor, verbose, kwargs...)
+    return nothing
+end
+
+function _perform_memcpy(
+    mem1,
+    mem2;
+    title="",
+    nbench=10,
+    times=false,
+    stats=false,
+    DtoDfactor=false,
+    verbose=true,
+    io=getstdout(),
+)
+    sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
+    ts = zeros(nbench)
+
+    @inbounds for i in 1:nbench
+        if i % 2 == 0
+            ts[i] = AMDGPU.@elapsed copyto!(mem1, mem2)
+        else
+            ts[i] = AMDGPU.@elapsed copyto!(mem2, mem1)
+        end
+    end
+
+    t_min = minimum(ts)
+    t_max = maximum(ts)
+    t_avg = mean(ts)
+
+    actual_memsize_GiB = sizeof(mem1) * 2^(-30)
+    if DtoDfactor
+        actual_memsize_GiB *= 2 # must count both the read and the write here (taken from the p2pBandwidthLatencyTest CUDA sample)
+    end
+    bws = actual_memsize_GiB ./ ts
+    bw_min = minimum(bws)
+    bw_max = maximum(bws)
+    bw_avg = mean(bws)
+
+    if verbose
+        if times
+            println(io, "t_min: $t_min")
+            println(io, "t_max: $t_max")
+            println(io, "t_avg: $t_avg")
+        end
+        printstyled(io, "$(title) Bandwidth (GiB/s):\n"; bold=true)
+        if stats
+            print(io, " ├ max: ")
+            printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
+            println(io, " ├ min: ", round(bw_min; digits=2))
+            println(io, " ├ avg: ", round(bw_avg; digits=2))
+            print(io, " └ std_dev: ")
+            printstyled(io, round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
+        else
+            print(io, " └ max: ")
+            printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
+        end
+    end
+    return bw_max
+end
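For reference, _perform_memcpy computes the bandwidth as payload size in GiB divided by copy time: copying the default 0.5 GiB buffer in 25 ms yields 0.5 / 0.025 = 20 GiB/s. With DtoDfactor the payload is doubled because a device-to-device copy both reads and writes device memory, following the convention of the p2pBandwidthLatencyTest CUDA sample.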
diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl
new file mode 100644
index 0000000..b309e05
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/membw.jl
@@ -0,0 +1,152 @@
+# function theoretical_memory_bandwidth(
+#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
+# )
+#     max_mem_clock_rate =
+#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
+#     max_mem_bus_width =
+#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) / 8.0 # in bytes
+#     max_bw = 2.0 * max_mem_clock_rate * max_mem_bus_width * 2^(-30)
+#     if verbose
+#         printstyled(io, "Theoretical Maximal Memory Bandwidth (GiB/s):\n"; bold=true)
+#         print(io, " └ max: ")
+#         printstyled(io, round(max_bw; digits=1), "\n"; color=:green, bold=true)
+#     end
+#     return max_bw
+# end
+
+function GPUInspector.memory_bandwidth(
+    ::AMDBackend;
+    memsize::UnitPrefixedBytes=GiB(0.5),
+    dtype=Cchar,
+    verbose=true,
+    DtoDfactor=true,
+    device=AMDGPU.device(),
+    io=getstdout(),
+    kwargs...,
+)::Float64
+    AMDGPU.device!(device) do
+        N = Int(bytes(memsize) ÷ sizeof(dtype))
+        mem_gpu = AMDGPU.rand(dtype, N)
+        mem_gpu2 = AMDGPU.rand(dtype, N)
+
+        return _perform_memcpy(
+            mem_gpu, mem_gpu2; title="Memory", DtoDfactor, verbose, io=io, kwargs...
+        )
+    end
+end
+
+function GPUInspector.memory_bandwidth_scaling(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    sizes=logspace(1, exp2(30), 10),
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    bandwidths = zeros(length(sizes))
+    for (i, s) in enumerate(sizes)
+        bandwidths[i] = GPUInspector.memory_bandwidth(
+            AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs...
+        )
+        clear_gpu_memory(AMDBackend(); device=device)
+    end
+    if verbose
+        peak_val, idx = findmax(bandwidths)
+        peak_size = sizes[idx]
+        p = UnicodePlots.lineplot(
+            sizes,
+            bandwidths;
+            xlabel="data size",
+            ylabel="GiB/s",
+            title=string(
+                "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))"
+            ),
+            xscale=:log2,
+        )
+        UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red)
+        println(io) # top margin
+        println(io, p)
+        println(io) # bottom margin
+    end
+    return (sizes=sizes, bandwidths=bandwidths)
+end
+
+function GPUInspector.memory_bandwidth_saxpy(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    size=2^26,
+    nbench=10,
+    dtype=Float32,
+    verbose=true,
+    io=getstdout(),
+)::Float64
+    device!(device) do
+        a = dtype(pi)
+        x = AMDGPU.rand(dtype, size)
+        y = AMDGPU.rand(dtype, size)
+        z = AMDGPU.zeros(dtype, size)
+
+        kernel = @roc launch = false _saxpy_gpu_kernel!(z, a, x, y)
+        occupancy = AMDGPU.launch_configuration(kernel)
+        t = Inf
+        for _ in 1:nbench
+            Δt = AMDGPU.@elapsed @roc(
+                groupsize = occupancy.groupsize, _saxpy_gpu_kernel!(z, a, x, y)
+            )
+            t = min(t, Δt)
+        end
+
+        bandwidth = 3.0 * sizeof(dtype) * size / t / (1024)^3
+        if verbose
+            printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true)
+            print(io, " └ max: ")
+            printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true)
+        end
+        return bandwidth
+    end
+end
+
+function _saxpy_gpu_kernel!(z, a, x, y)
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    if i <= length(z)
+        @inbounds z[i] = a * x[i] + y[i]
+    end
+    return nothing
+end
+
+function GPUInspector.memory_bandwidth_saxpy_scaling(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    sizes=[2^20 * i for i in 10:10:300],
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    # sizes = [2^20 * i for i in 8:128] # V100
+    bandwidths = zeros(length(sizes))
+    for (i, s) in enumerate(sizes)
+        bandwidths[i] = GPUInspector.memory_bandwidth_saxpy(
+            AMDBackend(); device=device, size=s, verbose=false, kwargs...
+        )
+        clear_gpu_memory(AMDBackend(); device=device)
+    end
+    if verbose
+        peak_val, idx = findmax(bandwidths)
+        peak_size = sizes[idx]
+        p = UnicodePlots.lineplot(
+            sizes,
+            bandwidths;
+            xlabel="vector length",
+            ylabel="GiB/s",
+            title=string(
+                "Peak: ", round(peak_val; digits=2), " GiB/s (vector size = $(bytes(peak_size)))"
+            ),
+            xscale=:log2,
+        )
+        UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red)
+        println(io) # top margin
+        println(io, p)
+        println(io) # bottom margin
+    end
+    return (sizes=sizes, bandwidths=bandwidths)
+end
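The factor 3.0 in the SAXPY bandwidth formula reflects that z[i] = a * x[i] + y[i] touches three arrays per element (two reads, one write). Worked example with the defaults dtype=Float32 and size=2^26: 3 * 4 B * 2^26 = 0.75 GiB of traffic, so a kernel time of 1 ms corresponds to 750 GiB/s.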
diff --git a/ext/AMDGPUExt/utility.jl b/ext/AMDGPUExt/utility.jl
new file mode 100644
index 0000000..f9878ea
--- /dev/null
+++ b/ext/AMDGPUExt/utility.jl
@@ -0,0 +1,5 @@
+_device2string(dev::HIPDevice) = "GPU $(_gpuid(dev)): $(_name(dev))"
+
+_gpuid(dev::HIPDevice) = AMDGPU.HIP.device_id(dev) + 1
+
+_name(dev::HIPDevice) = AMDGPU.HIP.name(dev)
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
index 9d2a770..0703604 100644
--- a/ext/CUDAExt/CUDAExt.jl
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -12,6 +12,7 @@ using LinearAlgebra
 # pkgs
 using UnicodePlots
 using NVTX
+using ThreadPinning

 # for usage in CUDAExt
 using GPUInspector:
@@ -23,38 +24,8 @@ using GPUInspector:
     MonitoringResults,
     _defaultylims,
     @unroll,
-    NVIDIABackend
-
-# import stubs to implement them
-import GPUInspector: backendinfo, functional
-# gpuinfo
-import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
-# p2p bw
-import GPUInspector:
-    p2p_bandwidth,
-    p2p_bandwidth_all,
-    p2p_bandwidth_bidirectional,
-    p2p_bandwidth_bidirectional_all
-# host2device bw
-import GPUInspector: host2device_bandwidth
-# membw
-import GPUInspector:
-    theoretical_memory_bandwidth,
-    memory_bandwidth,
-    memory_bandwidth_scaling,
-    memory_bandwidth_saxpy,
-    memory_bandwidth_saxpy_scaling
-# stresstest
-import GPUInspector: stresstest
-# monitoring
-import GPUInspector:
-    monitoring_start,
-    monitoring_stop,
-    livemonitor_something,
-    livemonitor_powerusage,
-    livemonitor_temperature
-# peakflops_gpu
-import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu
+    NVIDIABackend,
+    getstdout

 # for convenience
 const BFloat16 = CUDA.BFloat16
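This drops the long import list in favor of defining the API methods fully qualified, matching the style of the new AMDGPUExt. Both forms extend the same functions; the qualified form simply avoids keeping an import list in sync:

    # before: import, then extend the bare name
    import GPUInspector: ngpus
    ngpus(::NVIDIABackend) = length(CUDA.devices())

    # after: extend by qualification, no import needed
    GPUInspector.ngpus(::NVIDIABackend) = length(CUDA.devices())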
diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl
index b44e5fd..f147679 100644
--- a/ext/CUDAExt/implementations/general.jl
+++ b/ext/CUDAExt/implementations/general.jl
@@ -1,4 +1,4 @@
-function functional(::NVIDIABackend; verbose=true)
+function GPUInspector.functional(::NVIDIABackend; verbose=true)
     if CUDA.functional()
         verbose && @info("CUDA/GPU available.")
         hascuda = true
@@ -21,3 +21,14 @@ function functional(::NVIDIABackend; verbose=true)
     end
     return hascuda
 end
+
+function GPUInspector.clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        CUDA.reclaim()
+    end
+    return nothing
+end
+
+GPUInspector.device(::NVIDIABackend) = CUDA.device()
+GPUInspector.devices(::NVIDIABackend) = CUDA.devices()
diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl
index b960b08..bff635d 100644
--- a/ext/CUDAExt/implementations/gpuinfo.jl
+++ b/ext/CUDAExt/implementations/gpuinfo.jl
@@ -1,6 +1,8 @@
-ngpus(::NVIDIABackend) = length(CUDA.devices())
+function GPUInspector.ngpus(::NVIDIABackend)
+    length(CUDA.devices())
+end

-function gpus(::NVIDIABackend; io::IO=stdout)
+function GPUInspector.gpus(::NVIDIABackend; io=getstdout())
     # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
     devs = devices()
     if isempty(devs)
@@ -41,13 +43,13 @@ Print out detailed information about the NVIDIA GPU with the given `deviceid`.

 Heavily inspired by the CUDA sample "deviceQueryDrv.cpp".

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout)
+function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout())
     0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id."))
     return gpuinfo(CuDevice(deviceid); io)
 end
-function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
+function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout())
     # query
     mp = nmultiprocessors(dev)
     cores = ncudacores(dev)
@@ -214,7 +216,7 @@ function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
     return nothing
 end

-function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout)
+function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout())
     # check p2p access
     ndevs = ngpus(NVIDIABackend())
     if ndevs <= 1
diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl
index 82f2f5d..5b3b31c 100644
--- a/ext/CUDAExt/implementations/host2device_bandwidth.jl
+++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl
@@ -1,9 +1,9 @@
-function host2device_bandwidth(::NVIDIABackend;
+function GPUInspector.host2device_bandwidth(::NVIDIABackend;
     memsize::UnitPrefixedBytes=GiB(0.5),
     dtype=Cchar,
     DtoDfactor=true,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -42,7 +42,7 @@ function _perform_memcpy(
     stats=false,
     DtoDfactor=false,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     NVTX.@range "host2dev: $title" begin
         sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl
index c0b68f1..a9dc2bf 100644
--- a/ext/CUDAExt/implementations/membw.jl
+++ b/ext/CUDAExt/implementations/membw.jl
@@ -1,5 +1,5 @@
-function theoretical_memory_bandwidth(
-    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
+function GPUInspector.theoretical_memory_bandwidth(
+    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
 )
     max_mem_clock_rate =
         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -14,14 +14,14 @@ function theoretical_memory_bandwidth(
     return max_bw
 end

-function memory_bandwidth(
+function GPUInspector.memory_bandwidth(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=GiB(0.5),
     dtype=Cchar,
     verbose=true,
     DtoDfactor=true,
     device=CUDA.device(),
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )::Float64
     device!(device) do
@@ -41,12 +41,12 @@ function memory_bandwidth(
     end
 end

-function memory_bandwidth_scaling(
+function GPUInspector.memory_bandwidth_scaling(
     ::NVIDIABackend;
     device=CUDA.device(),
     sizes=logspace(1, exp2(30), 10),
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     bandwidths = zeros(length(sizes))
@@ -54,7 +54,7 @@ function memory_bandwidth_scaling(
         bandwidths[i] = memory_bandwidth(
             NVIDIABackend(); memsize=B(s), device=device, verbose=false, kwargs...
         )
-        clear_gpu_memory(device)
+        clear_gpu_memory(NVIDIABackend(); device=device)
     end
     if verbose
         peak_val, idx = findmax(bandwidths)
@@ -81,9 +81,9 @@ end
 Extra keyword arguments:
 * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`.

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function memory_bandwidth_saxpy(
+function GPUInspector.memory_bandwidth_saxpy(
     ::NVIDIABackend;
     device=CUDA.device(),
     size=2^20 * 10,
@@ -91,7 +91,7 @@ function memory_bandwidth_saxpy(
     dtype=Float32,
     cublas=true,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )::Float64
     device!(device) do
         a = dtype(pi)
@@ -131,12 +131,12 @@ function _saxpy_gpu_kernel!(z, a, x, y)
     return nothing
 end

-function memory_bandwidth_saxpy_scaling(
+function GPUInspector.memory_bandwidth_saxpy_scaling(
     ::NVIDIABackend;
     device=CUDA.device(),
     sizes=[2^20 * i for i in 10:10:300],
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     # sizes = [2^20 * i for i in 8:128] # V100
@@ -145,7 +145,7 @@ function memory_bandwidth_saxpy_scaling(
         bandwidths[i] = memory_bandwidth_saxpy(
             NVIDIABackend(); device=device, size=s, verbose=false, kwargs...
         )
-        clear_gpu_memory(device)
+        clear_gpu_memory(NVIDIABackend(); device=device)
     end
     if verbose
         peak_val, idx = findmax(bandwidths)
diff --git a/ext/CUDAExt/implementations/monitoring.jl b/ext/CUDAExt/implementations/monitoring.jl
index 483da9b..54c24f5 100644
--- a/ext/CUDAExt/implementations/monitoring.jl
+++ b/ext/CUDAExt/implementations/monitoring.jl
@@ -1,4 +1,4 @@
-function monitoring_start(
+function GPUInspector.monitoring_start(
     ::NVIDIABackend; freq=1, devices=CUDA.devices(), thread=Threads.nthreads(), verbose=true
 )
     if ismonitoring()
@@ -54,9 +54,9 @@
 Specifically, `results` is a named tuple with the following keys:
 * `time`: the (relative) times at which we measured
 * `temperature`, `power`, `compute`, `mem`

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
+function GPUInspector.monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
     if ismonitoring()
         verbose && @info("Stopping monitoring and fetching results...")
         _monitoring!(false)
@@ -67,7 +67,7 @@ function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
     end
 end

-function livemonitor_temperature(::NVIDIABackend, duration; kwargs...)
+function GPUInspector.livemonitor_temperature(::NVIDIABackend, duration; kwargs...)
     return livemonitor_something(
         NVIDIABackend(),
         get_temperatures,
@@ -78,7 +78,7 @@
     )
 end

-function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...)
+function GPUInspector.livemonitor_powerusage(::NVIDIABackend, duration; kwargs...)
     return livemonitor_something(
         NVIDIABackend(),
         get_power_usages,
@@ -89,7 +89,7 @@
     )
 end

-function livemonitor_something(
+function GPUInspector.livemonitor_something(
     ::NVIDIABackend,
     f::F,
     duration;
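The monitoring workflow itself is unchanged; only the definitions move behind the backend dispatch. A typical manual session, as also exercised in tests_stresstest.jl below:

    devs = GPUInspector.devices()
    monitoring_start(; freq=1, devices=devs, verbose=false)
    stresstest(; devices=devs, duration=2, monitoring=false)
    results = monitoring_stop()   # MonitoringResults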
diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl
index 6b5a83e..2734ecb 100644
--- a/ext/CUDAExt/implementations/p2p_bandwidth.jl
+++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl
@@ -1,4 +1,4 @@
-function p2p_bandwidth(
+function GPUInspector.p2p_bandwidth(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=B(40_000_000),
     nbench=5,
@@ -9,7 +9,7 @@
     dtype=Float32,
     src=0,
     dst=1,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if ngpus(NVIDIABackend()) < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -66,7 +66,7 @@ function p2p_bandwidth(
     return bw_max
 end

-function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...)
+function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...)
     ngpus = length(CUDA.devices())
     if ngpus < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -82,7 +82,7 @@ function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs
     ]
 end

-function p2p_bandwidth_bidirectional(
+function GPUInspector.p2p_bandwidth_bidirectional(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=B(40_000_000),
     nbench=20,
@@ -93,7 +93,7 @@
     dev1=0,
     dev2=1,
     repeat=100,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if ngpus(NVIDIABackend()) < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -142,7 +142,7 @@ function p2p_bandwidth_bidirectional(
     return bw_max
 end

-function p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...)
+function GPUInspector.p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...)
     ngpus = length(CUDA.devices())
     if ngpus < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
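Likewise for the P2P benchmarks; with the NVIDIA backend active they can still be called with the default device pair or explicitly (a sketch):

    p2p_bandwidth(; verbose=false)                      # src=0, dst=1 by default
    p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1))   # explicit devices
    p2p_bandwidth_bidirectional_all()                   # all device pairs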
diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl
index 0d6bb2e..99a117c 100644
--- a/ext/CUDAExt/implementations/peakflops_gpu.jl
+++ b/ext/CUDAExt/implementations/peakflops_gpu.jl
@@ -8,15 +8,15 @@ Estimates the theoretical peak performance of a CUDA device in TFLOP/s.
 * `dtype` (default: `tensorcores ? Float16 : Float32`): element type of the matrices
 * `io` (default: `stdout`): set the stream where the results should be printed.

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function theoretical_peakflops_gpu(
+function GPUInspector.theoretical_peakflops_gpu(
     ::NVIDIABackend;
     device=CUDA.device(),
     tensorcores=hastensorcores(),
     dtype=tensorcores ? Float16 : Float32,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if tensorcores
         max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype)
@@ -104,12 +104,12 @@ it takes to perform
 For more keyword argument options see [`peakflops_gpu_fmas`](@ref) and
 [`peakflops_gpu_wmmas`](@ref).
 """
-function peakflops_gpu(
+function GPUInspector.peakflops_gpu(
     ::NVIDIABackend;
     tensorcores=hastensorcores(),
     verbose=true,
     dtype=tensorcores ? Float16 : Float32,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     if tensorcores
diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl
index 358d9ca..9c14994 100644
--- a/ext/CUDAExt/implementations/stresstest.jl
+++ b/ext/CUDAExt/implementations/stresstest.jl
@@ -1,4 +1,4 @@
-function stresstest(
+function GPUInspector.stresstest(
     ::NVIDIABackend;
     devices=[CUDA.device()],
     mem=nothing,
@@ -12,7 +12,7 @@
     clearmem=false,
     monitoring=false,
     batch_duration=nothing,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     logger = ConsoleLogger(io)
@@ -69,7 +69,7 @@ function stresstest(
     Δt = @elapsed _run_stresstests(ts; verbose, kwargs...)
     if clearmem
         verbose && @info("Clearing GPU memory.")
-        clear_all_gpus_memory(devices)
+        GPUInspector.clear_all_gpus_memory(; devices=devices)
     end
     verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.")
     if monitoring
diff --git a/ext/CUDAExt/peakflops_gpu_fmas.jl b/ext/CUDAExt/peakflops_gpu_fmas.jl
index a251341..eb3ff6f 100644
--- a/ext/CUDAExt/peakflops_gpu_fmas.jl
+++ b/ext/CUDAExt/peakflops_gpu_fmas.jl
@@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(;
     nkernel=5,
     device::CuDevice=CUDA.device(),
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         d_a = CUDA.rand(dtype, size)
diff --git a/ext/CUDAExt/peakflops_gpu_matmul.jl b/ext/CUDAExt/peakflops_gpu_matmul.jl
index a081b69..93bf221 100644
--- a/ext/CUDAExt/peakflops_gpu_matmul.jl
+++ b/ext/CUDAExt/peakflops_gpu_matmul.jl
@@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling(
     device=CUDA.device(),
     verbose=true,
     sizes=2 .^ (10:15),
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 ) where {F}
     flops = zeros(length(sizes))
     for (i, s) in enumerate(sizes)
         flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...)
-        clear_gpu_memory(device)
+        GPUInspector.clear_gpu_memory(; device=device)
     end
     if verbose
         peak_val, idx = findmax(flops)
@@ -64,7 +64,7 @@ function peakflops_gpu_matmul(;
     nmatmuls=5,
     nbench=5,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         C = CUDA.zeros(dtype, size, size)
@@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(;
     nmatmuls=5,
     nbench=5,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         C = CUDA.zeros(dtype, size, size)
diff --git a/ext/CUDAExt/peakflops_gpu_wmmas.jl b/ext/CUDAExt/peakflops_gpu_wmmas.jl
index a12295b..f6c000c 100644
--- a/ext/CUDAExt/peakflops_gpu_wmmas.jl
+++ b/ext/CUDAExt/peakflops_gpu_wmmas.jl
@@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(;
     nkernel=10,
     verbose=true,
     dtype=Float16,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         if Symbol(dtype) == :Float16
diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl
index 7f6a504..3c9681d 100644
--- a/ext/CUDAExt/utility.jl
+++ b/ext/CUDAExt/utility.jl
@@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa
     return mem_handles
 end

-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of the currently active GPU (i.e. `device()`)."
-function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true)
-    device!(device) do
-        gc && GC.gc()
-        CUDA.reclaim()
-    end
-    return nothing
-end
-
-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of all available GPUs."
-function clear_all_gpus_memory(devices=CUDA.devices(); gc=true)
-    gc && GC.gc()
-    for dev in devices
-        clear_gpu_memory(dev; gc=false)
-    end
-    return nothing
-end
-
 """
     toggle_tensorcoremath([enable::Bool; verbose=true])

 Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`).
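clear_gpu_memory and clear_all_gpus_memory leave CUDAExt and become backend-agnostic API (see src/utility.jl and src/stubs/stubs_general.jl below); note the keyword-based call style replacing the old positional device argument:

    clear_gpu_memory(; device=GPUInspector.device(), gc=true)
    clear_all_gpus_memory(; devices=GPUInspector.devices())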
diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl
index 9dbe9e9..7790536 100644
--- a/src/GPUInspector.jl
+++ b/src/GPUInspector.jl
@@ -9,14 +9,16 @@ using Base: UUID
 using Pkg: Pkg

 # external
-using Reexport
-@reexport using ThreadPinning
+using ThreadPinning
 using DocStringExtensions
 using UnicodePlots
 using CpuId: cachesize
 using HDF5: h5open
 using Glob: glob

+const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing)
+getstdout() = something(DEFAULT_IO[], stdout)
+
 include("backends.jl")
 include("UnitPrefixedBytes.jl")
 include("utility.jl")
@@ -27,8 +29,8 @@ include("monitoring_io.jl")

 function not_implemented_yet()
     return error(
-        "Not implemented yet. You either haven't loaded a backend (like CUDA.jl) yet, or" *
-        " the loaded backend doesn't provide this functionality.",
+        "Not implemented yet. You either haven't loaded a backend yet (e.g. CUDA.jl or " *
+        "AMDGPU.jl), or the loaded backend doesn't provide this functionality.",
     )
 end
 include("stubs/stubs_general.jl")
@@ -42,7 +44,7 @@ include("stubs/stubs_peakflops_gpu.jl")

 # backends
 export Backend, NoBackend, NVIDIABackend, AMDBackend, backend, backend!, backendinfo
-export CUDAExt
+export CUDAExt, AMDGPUExt

 # monitoring io+plotting
 export plot_monitoring_results, load_monitoring_results, save_monitoring_results
@@ -50,11 +52,14 @@ export plot_monitoring_results, load_monitoring_results, save_monitoring_results

 # utilities
 export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, bytes, simplify, change_base, value
-export logspace
+export logspace, clear_all_gpus_memory

 # Let's currently not export the CPU tests. After all, this is GPUInspector.jl :)
 # export stresstest_cpu

+# stubs general
+export clear_gpu_memory
+
 # stubs gpuinfo
 export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
 # stubs p2p bandwidth
diff --git a/src/backends.jl b/src/backends.jl
index bd5d1de..ae147fc 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -56,6 +56,7 @@ function check_backend(b::Backend)
 end

 CUDAExt::Union{Nothing,Module} = nothing
+AMDGPUExt::Union{Nothing,Module} = nothing

 """
 Query information about a specific backend, e.g., what functionality the backend currently
diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl
index eee42ed..219d185 100644
--- a/src/stubs/stubs_general.jl
+++ b/src/stubs/stubs_general.jl
@@ -4,3 +4,19 @@ If not, print some hopefully useful debug information (or turn it off with `verb
 """
 functional(; kwargs...) = functional(backend(); kwargs...)
 functional(::Backend; kwargs...) = not_implemented_yet()
+
+"""
+    clear_gpu_memory(; device, gc)
+
+Reclaim the unused memory of a GPU.
+"""
+clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...)
+clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet()
+
+"Return the current device of the active backend."
+device() = device(backend())
+device(::Backend) = not_implemented_yet()
+
+"Return the devices of the active backend."
+devices() = devices(backend())
+devices(::Backend) = not_implemented_yet()
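DEFAULT_IO plus getstdout() replaces the hard io::IO=stdout defaults used before: since the Ref is dereferenced at call time, all default printing can be redirected globally, which the quiet test mode below relies on. A sketch:

    GPUInspector.DEFAULT_IO[] = Base.BufferStream()   # silence default output
    gpus()                                            # now prints into the buffer
    GPUInspector.DEFAULT_IO[] = nothing               # back to stdout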
diff --git a/src/utility.jl b/src/utility.jl
index bba0d55..0b38851 100644
--- a/src/utility.jl
+++ b/src/utility.jl
@@ -2,6 +2,15 @@ function logspace(start, stop, length)
     return exp2.(range(log2(start), log2(stop); length=length))
 end

+"Reclaim the unused memory of all available GPUs."
+function clear_all_gpus_memory(; gc=true, devices)
+    gc && GC.gc()
+    for dev in devices
+        clear_gpu_memory(; device=dev, gc=false)
+    end
+    return nothing
+end
+
 # L2_cachesize() = cachesize()[2]

 # """
diff --git a/test/backend_tests.jl b/test/backend_tests.jl
deleted file mode 100644
index d772080..0000000
--- a/test/backend_tests.jl
+++ /dev/null
@@ -1,10 +0,0 @@
-@testitem "CUDA backend" begin
-    using CUDA
-    @test GPUInspector.is_cuda_loaded()
-    @test GPUInspector.is_backend_loaded(NVIDIABackend())
-    @test backend() == NVIDIABackend()
-    @test isnothing(backend!(NoBackend()))
-    @test backend() == NoBackend()
-    @test isnothing(backend!(:cuda))
-    @test backend() == NVIDIABackend()
-end
diff --git a/test/gpuinfo_tests.jl b/test/gpuinfo_tests.jl
deleted file mode 100644
index 699a766..0000000
--- a/test/gpuinfo_tests.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-@testitem "gpuinfo / gpus" begin
-    using CUDA
-    @test isnothing(gpus())
-    @test isnothing(gpuinfo())
-    @test isnothing(gpuinfo(0))
-    @test isnothing(gpuinfo(device()))
-end
diff --git a/test/peakflops_tests.jl b/test/peakflops_tests.jl
deleted file mode 100644
index 9005fce..0000000
--- a/test/peakflops_tests.jl
+++ /dev/null
@@ -1,23 +0,0 @@
-@testitem "peakflops_gpu (CUDA cores)" begin
-    using CUDA
-    @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64
-    @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) ==
-        Float64
-    @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) ==
-        Float64
-end
-
-@testitem "peakflops_gpu (Tensor cores)" begin
-    using CUDA
-    @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64
-    @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == Float64
-end
-
-@testitem "peakflops_gpu_matmul / scaling" begin
-    using CUDA
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) ==
-        Tuple{Vector{Int64},Vector{Float64}}
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index 111ff35..fb1f3f8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,24 +1,100 @@
-using TestItemRunner
 using GPUInspector
-using CUDA
+using Test
+using LinearAlgebra
+using Logging

-if !GPUInspector.functional()
-    error("Can't run testsuite since CUDA/GPU not present or not functional!")
+# Environment variables:
+# - "TEST_BACKEND": can be set to manually specify a backend
+# - "TEST_QUIET": can be set to true/false to enable/disable non-verbose testing
+# - "TESTS": a comma-separated list of test suites to run (see TEST_NAMES below)
+
+# figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA)
+if haskey(ENV, "TEST_BACKEND")
+    if lowercase(ENV["TEST_BACKEND"]) in ("nvidia", "cuda", "nvidiabackend")
+        using CUDA
+        TEST_BACKEND = NVIDIABackend()
+    elseif lowercase(ENV["TEST_BACKEND"]) in ("amd", "amdgpu", "amdbackend")
+        using AMDGPU
+        TEST_BACKEND = AMDBackend()
+    else
+        error("""
+              TEST_BACKEND environment variable contains unsupported value.
+              """)
+    end
+else
+    using CUDA
+    using AMDGPU
+    if CUDA.functional()
+        @info("NVIDIA GPUs detected.", CUDA.devices())
+        TEST_BACKEND = NVIDIABackend()
+    elseif AMDGPU.functional()
+        @info("AMD GPUs detected.", AMDGPU.devices())
+        TEST_BACKEND = AMDBackend()
+    else
+        error("""
+              Aborting because neither CUDA.jl nor AMDGPU.jl is functional.
+              Are there any GPUs in the system?
+              """)
+    end
+end
+backend!(TEST_BACKEND)
+@info "Running tests with the following backend: $TEST_BACKEND."
+
+const TEST_NAMES = [
+    "bandwidth", "peakflops", "stresstest", "gpuinfo", "utility", "backend_specific", "core"
+]
+if haskey(ENV, "TESTS")
+    tests = split(ENV["TESTS"], ",")
+    if !all(t -> t in TEST_NAMES, tests)
+        error("""
+              TESTS environment variable contains unknown test names.
+              Valid test names are: $(TEST_NAMES)
+              """)
+    end
+    TARGET_TESTS = tests
+else
+    # run all tests
+    const TARGET_TESTS = TEST_NAMES
 end

-if Threads.nthreads() == 1 || (Threads.nthreads() < length(CUDA.devices()) + 1)
-    # we should have at least one thread per gpu + one monitoring thread
-    @warn(
-        "You should run the tests with at least $(length(CUDA.devices()) + 1) Julia threads.",
-        Threads.nthreads(),
-        length(CUDA.devices())
-    )
+@info "Running following tests: $TARGET_TESTS."
+
+if "stresstest" in TARGET_TESTS
+    # error if we aren't running with enough threads
+    if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1)
+        # we should have at least one thread per gpu + one monitoring thread
+        error("You should run the tests with at least $(ngpus() + 1) Julia threads.")
+    end
 end

-@run_package_tests
+quiet_testing = parse(Bool, get(ENV, "TEST_QUIET", "true"))
+if quiet_testing
+    GPUInspector.DEFAULT_IO[] = Base.BufferStream()
+    global_logger(Logging.NullLogger())
+end

-include("backend_tests.jl")
-include("utility_tests.jl")
-include("stresstest_tests.jl")
-include("bandwidth_tests.jl")
-include("peakflops_tests.jl")
-include("gpuinfo_tests.jl")
+if "core" in TARGET_TESTS
+    include("tests_core.jl")
+end
+if "utility" in TARGET_TESTS
+    include("tests_utility.jl")
+end
+if "gpuinfo" in TARGET_TESTS
+    include("tests_gpuinfo.jl")
+end
+if "bandwidth" in TARGET_TESTS
+    include("tests_bandwidth.jl")
+end
+if "stresstest" in TARGET_TESTS
+    using CairoMakie
+    include("tests_stresstest.jl")
+end
+if "peakflops" in TARGET_TESTS
+    include("tests_peakflops.jl")
+end
+if "backend_specific" in TARGET_TESTS
+    if TEST_BACKEND == NVIDIABackend()
+        include("tests_nvidia_only.jl")
+    elseif TEST_BACKEND == AMDBackend()
+        include("tests_amd_only.jl")
+    end
+end
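With these hooks, backend and test selection can be driven from the environment, e.g. (a sketch; any names from TEST_NAMES are valid values for TESTS):

    using Pkg
    withenv("TEST_BACKEND" => "cuda", "TESTS" => "core,utility") do
        Pkg.test("GPUInspector")
    end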
diff --git a/test/tests_amd_only.jl b/test/tests_amd_only.jl
new file mode 100644
index 0000000..e69de29
diff --git a/test/bandwidth_tests.jl b/test/tests_bandwidth.jl
similarity index 60%
rename from test/bandwidth_tests.jl
rename to test/tests_bandwidth.jl
index a68ea19..6626617 100644
--- a/test/bandwidth_tests.jl
+++ b/test/tests_bandwidth.jl
@@ -1,15 +1,12 @@
-@testitem "p2p_bandwidth" begin
-    using LinearAlgebra
-    using CUDA
-
+@testset "p2p_bandwidth" begin
     @testset "unidirectional" begin
         # p2p_bandwidth
         @test typeof(p2p_bandwidth(; verbose=false)) == Float64
         @test 0 ≤ p2p_bandwidth(; verbose=false)
         # options
         @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64
-        @test typeof(p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1), verbose=false)) ==
-            Float64
+        dev_src, dev_dst = collect(GPUInspector.devices())[1:2]
+        @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; hist=true, verbose=true)) == Float64
@@ -26,7 +23,8 @@
         @test typeof(p2p_bandwidth_bidirectional(; verbose=false)) == Float64
         @test 0 ≤ p2p_bandwidth_bidirectional(; verbose=false)
         # options
-        @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == Float64
+        @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) ==
+            Float64
         @test typeof(p2p_bandwidth_bidirectional(; dtype=Float16, verbose=false)) == Float64
         @test typeof(p2p_bandwidth_bidirectional(; nbench=10, verbose=false)) == Float64
         @test typeof(p2p_bandwidth_bidirectional(; hist=true, verbose=true)) == Float64
@@ -41,20 +39,29 @@
     end
 end

-@testitem "host2device_bandwidth" begin
-    using CUDA
+@testset "host2device_bandwidth" begin
     @test isnothing(host2device_bandwidth())
-    @test isnothing(host2device_bandwidth(; memsize=MB(100)))
-    @test isnothing(host2device_bandwidth(; dtype=Float16))
+    @test isnothing(host2device_bandwidth(; memsize=MB(1)))
+    @test isnothing(host2device_bandwidth(; dtype=Float64))
 end

-@testitem "memory_bandwidth" begin
-    using CUDA
-    @test typeof(memory_bandwidth()) == Float64
-    @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64
-    @test typeof(memory_bandwidth(; dtype=Float32)) == Float64
-
-    @test typeof(memory_bandwidth_saxpy()) == Float64
-    @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64
-    @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64
+@testset "memory_bandwidth" begin
+    @testset "regular" begin
+        @test typeof(memory_bandwidth()) == Float64
+        @test typeof(memory_bandwidth(; memsize=MiB(1))) == Float64
+        @test typeof(memory_bandwidth(; dtype=Float32)) == Float64
+    end
+    @testset "regular, scaling" begin
+        @test typeof(memory_bandwidth_scaling()) ==
+            NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Float64},Vector{Float64}}}
+    end
+    @testset "saxpy" begin
+        @test typeof(memory_bandwidth_saxpy()) == Float64
+        @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64
+        @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64
+    end
+    @testset "saxpy, scaling" begin
+        @test typeof(memory_bandwidth_saxpy_scaling()) ==
+            NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Int64},Vector{Float64}}}
+    end
 end
diff --git a/test/tests_core.jl b/test/tests_core.jl
new file mode 100644
index 0000000..1db7309
--- /dev/null
+++ b/test/tests_core.jl
@@ -0,0 +1,21 @@
+@testset "Backend switching" begin
+    if TEST_BACKEND == NVIDIABackend()
+        @test GPUInspector.is_cuda_loaded()
+        @test GPUInspector.is_backend_loaded(NVIDIABackend())
+        @test backend() == NVIDIABackend()
+        @test isnothing(backend!(NoBackend()))
+        @test backend() == NoBackend()
+        @test isnothing(backend!(:cuda))
+        @test backend() == NVIDIABackend()
+        @test isnothing(backend!(NVIDIABackend()))
+    elseif TEST_BACKEND == AMDBackend()
+        @test GPUInspector.is_amdgpu_loaded()
+        @test GPUInspector.is_backend_loaded(AMDBackend())
+        @test backend() == AMDBackend()
+        @test isnothing(backend!(NoBackend()))
+        @test backend() == NoBackend()
+        @test isnothing(backend!(:amd))
+        @test backend() == AMDBackend()
+        @test isnothing(backend!(AMDBackend()))
+    end
+end
diff --git a/test/tests_gpuinfo.jl b/test/tests_gpuinfo.jl
new file mode 100644
index 0000000..9ea54c7
--- /dev/null
+++ b/test/tests_gpuinfo.jl
@@ -0,0 +1,8 @@
+@testset "gpuinfo / gpus" begin
+    @test isnothing(gpus())
+    @test isnothing(gpuinfo())
+    @test isnothing(gpuinfo(GPUInspector.device()))
+    if ngpus() > 1
+        @test isnothing(gpuinfo_p2p_access())
+    end
+end
diff --git a/test/tests_nvidia_only.jl b/test/tests_nvidia_only.jl
new file mode 100644
index 0000000..27ffcd7
--- /dev/null
+++ b/test/tests_nvidia_only.jl
@@ -0,0 +1,11 @@
+@testset "toggle_tensorcoremath" begin
+    @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false))
+    @test CUDA.math_mode() == CUDA.FAST_MATH
+    @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false))
+    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
+    # test toggle
+    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
+    @test CUDA.math_mode() == CUDA.FAST_MATH
+    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
+    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
+end
diff --git a/test/tests_peakflops.jl b/test/tests_peakflops.jl
new file mode 100644
index 0000000..0d835e0
--- /dev/null
+++ b/test/tests_peakflops.jl
@@ -0,0 +1,28 @@
+if backend() == NVIDIABackend()
+    @testset "peakflops_gpu (CUDA cores)" begin
+        @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64
+        @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) ==
+            Float64
+        @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) ==
+            Float64
+    end
+
+    @testset "peakflops_gpu (Tensor cores)" begin
+        @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64
+        @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) ==
+            Float64
+    end
+
+    @testset "peakflops_gpu_matmul / scaling" begin
+        @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64
+        @test typeof(
+            CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)
+        ) == Float64
+        @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) ==
+            Float64
+        @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) ==
+            Tuple{Vector{Int64},Vector{Float64}}
+    end
+elseif backend() == AMDBackend()
+    # TODO
+end
diff --git a/test/stresstest_tests.jl b/test/tests_stresstest.jl
similarity index 81%
rename from test/stresstest_tests.jl
rename to test/tests_stresstest.jl
index 8b99fc0..32152b9 100644
--- a/test/stresstest_tests.jl
+++ b/test/tests_stresstest.jl
@@ -1,5 +1,4 @@
-@testitem "Stresstest: different kinds" begin
-    using CUDA
+@testset "Stresstest: different kinds" begin
     @test isnothing(stresstest(; duration=2, verbose=false))
     @test isnothing(stresstest(; enforced_duration=2, verbose=false))
     @test isnothing(stresstest(; approx_duration=2, verbose=false))
@@ -8,10 +7,9 @@
     @test isnothing(stresstest(; mem=0.2, verbose=false))
 end

-@testitem "Stresstest: keyword options" begin
-    using CUDA
+@testset "Stresstest: keyword options" begin
     @test isnothing(stresstest(; duration=2, verbose=false))
-    @test isnothing(stresstest(; duration=2, devices=devices(), verbose=false))
+    @test isnothing(stresstest(; duration=2, devices=GPUInspector.devices(), verbose=false))
     @test isnothing(stresstest(; duration=2, size=3000, verbose=false))
     @test isnothing(stresstest(; duration=2, dtype=Float16, verbose=false))
     @test isnothing(stresstest(; duration=2, clearmem=true, verbose=false))
     @test ...
     # TODO: kwargs: threads, parallel
 end

-@testitem "Stresstest: monitoring" begin
-    using CUDA
+@testset "Stresstest: monitoring" begin
     @testset "automatic (monitoring=true)" begin
         @test typeof(
             stresstest(;
                 devices=GPUInspector.devices(), duration=2, verbose=false, monitoring=true
+            ),
         ) == MonitoringResults
     end
     @testset "manual" begin
-        devs = devices()
+        devs = GPUInspector.devices()
         @test isnothing(monitoring_start(; freq=1, devices=devs, verbose=false))
         @test isnothing(
             stresstest(; devices=devs, duration=2, verbose=false, monitoring=false)
         )
@@ -41,8 +40,7 @@ end
     end
 end

-@testitem "Stresstest: monitoring results" begin
-    using CUDA
+@testset "Stresstest: monitoring results" begin
     @testset "MonitoringResults" begin
         r = stresstest(; duration=2, verbose=false, monitoring=true)
         @test typeof(r) == MonitoringResults
@@ -53,12 +51,13 @@ end
     @testset "save / load" begin
         d = Dict{Symbol,Vector{Vector{Float64}}}()
-        ndevs = length(CUDA.devices())
+        ndevs = ngpus()
         d[:asd] = [rand(ndevs) for _ in 1:5]
         d[:qwe] = [rand(ndevs) for _ in 1:5]
         d[:jkl] = [rand(ndevs) for _ in 1:5]
         devices = Tuple{String,Base.UUID}[
-            (CUDAExt._device2string(dev), uuid(dev)) for dev in collect(CUDA.devices())
+            (CUDAExt._device2string(dev), uuid(dev)) for
+            dev in collect(GPUInspector.devices())
         ]
         r = MonitoringResults(rand(5), devices, d)
         cd(mktempdir()) do
@@ -74,8 +73,7 @@ end
         end
     end
 end

-@testitem "Stresstest: monitoring results (CairoMakie)" begin
-    using CairoMakie
+@testset "Stresstest: monitoring results (CairoMakie)" begin
     r = load_monitoring_results(joinpath(@__DIR__, "test.h5"))
     @test isnothing(savefig_monitoring_results(r))
     @test isnothing(savefig_monitoring_results(r, (:compute, :mem)))
diff --git a/test/utility_tests.jl b/test/tests_utility.jl
similarity index 78%
rename from test/utility_tests.jl
rename to test/tests_utility.jl
index 1bf5bd3..9ec5c5f 100644
--- a/test/utility_tests.jl
+++ b/test/tests_utility.jl
@@ -1,4 +1,4 @@
-@testitem "UnitPrefixedBytes" begin
+@testset "UnitPrefixedBytes" begin
     using InteractiveUtils: subtypes

     # general stuff
@@ -64,16 +64,3 @@ end

     @test B(40_000_000) + MB(3) - 2 * KiB(2) ≈ MB(42.995904)
 end
-
-@testitem "toggle_tensorcoremath" begin
-    using CUDA
-    @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false))
-    @test CUDA.math_mode() == CUDA.FAST_MATH
-    @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false))
-    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
-    # test toggle
-    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
-    @test CUDA.math_mode() == CUDA.FAST_MATH
-    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
-    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
-end