From e7be3edf97aeb799f4564e3252ae9e72b88694e5 Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Wed, 16 Aug 2023 15:46:07 +0200 Subject: [PATCH 1/5] initial attempt --- Project.toml | 6 +- ext/AMDGPUExt/AMDGPUExt.jl | 106 +++++++ ext/AMDGPUExt/implementations/general.jl | 18 ++ ext/AMDGPUExt/implementations/gpuinfo.jl | 261 ++++++++++++++++++ .../implementations/host2device_bandwidth.jl | 85 ++++++ ext/AMDGPUExt/implementations/membw.jl | 163 +++++++++++ ext/CUDAExt/implementations/general.jl | 8 + ext/CUDAExt/implementations/gpuinfo.jl | 4 +- ext/CUDAExt/implementations/membw.jl | 4 +- ext/CUDAExt/utility.jl | 20 -- src/GPUInspector.jl | 9 +- src/stubs/stubs_general.jl | 8 + src/utility.jl | 9 + 13 files changed, 674 insertions(+), 27 deletions(-) create mode 100644 ext/AMDGPUExt/AMDGPUExt.jl create mode 100644 ext/AMDGPUExt/implementations/general.jl create mode 100644 ext/AMDGPUExt/implementations/gpuinfo.jl create mode 100644 ext/AMDGPUExt/implementations/host2device_bandwidth.jl create mode 100644 ext/AMDGPUExt/implementations/membw.jl diff --git a/Project.toml b/Project.toml index 053af89..dfbf455 100644 --- a/Project.toml +++ b/Project.toml @@ -21,13 +21,16 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" [extensions] CUDAExt = "CUDA" +AMDGPUExt = "AMDGPU" CairoMakieExt = "CairoMakie" [compat] +AMDGPU = "0.5" CUDA = "3.8.4, 3.12, 4.4" CairoMakie = "0.7, 0.10.7" CpuId = "0.3" @@ -43,10 +46,11 @@ julia = "1.9" [extras] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [targets] -test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "TestItemRunner"] +test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU", "TestItemRunner"] diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl new file mode 100644 index 0000000..d141faf --- /dev/null +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -0,0 +1,106 @@ +module AMDGPUExt + +using GPUInspector +using AMDGPU +using AMDGPU: device, device!, devices + +# stdlibs etc. 
+using Base: UUID
+using Statistics
+using Logging
+using LinearAlgebra
+
+# pkgs
+using UnicodePlots
+
+# for usage in AMDGPUExt
+using GPUInspector:
+    logspace,
+    ismonitoring,
+    _monitoring!,
+    _set_monitoring_task,
+    _get_monitoring_task,
+    MonitoringResults,
+    _defaultylims,
+    @unroll,
+    AMDBackend
+
+# import stubs to implement them
+import GPUInspector: backendinfo, functional, clear_gpu_memory
+# gpuinfo
+import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
+# p2p bw
+import GPUInspector:
+    p2p_bandwidth,
+    p2p_bandwidth_all,
+    p2p_bandwidth_bidirectional,
+    p2p_bandwidth_bidirectional_all
+# host2device bw
+import GPUInspector: host2device_bandwidth
+# membw
+import GPUInspector:
+    theoretical_memory_bandwidth,
+    memory_bandwidth,
+    memory_bandwidth_scaling,
+    memory_bandwidth_saxpy,
+    memory_bandwidth_saxpy_scaling
+# stresstest
+import GPUInspector: stresstest
+# monitoring
+import GPUInspector:
+    monitoring_start,
+    monitoring_stop,
+    livemonitor_something,
+    livemonitor_powerusage,
+    livemonitor_temperature
+# peakflops_gpu
+import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu
+
+# include("cuda_wrappers.jl")
+# include("utility.jl")
+# include("stresstests.jl")
+# include("peakflops_gpu_fmas.jl")
+# include("peakflops_gpu_wmmas.jl")
+# include("peakflops_gpu_matmul.jl")
+include("implementations/general.jl")
+include("implementations/gpuinfo.jl")
+# include("implementations/p2p_bandwidth.jl")
+include("implementations/host2device_bandwidth.jl")
+include("implementations/membw.jl")
+# include("implementations/stresstest.jl")
+# include("implementations/monitoring.jl")
+# include("implementations/peakflops_gpu.jl")
+
+function __init__()
+    GPUInspector.AMDGPUJL_LOADED[] = true
+    GPUInspector.backend!(AMDBackend())
+    GPUInspector.AMDGPUExt = Base.get_extension(GPUInspector, :AMDGPUExt)
+    return nothing
+end
+
+function backendinfo(::AMDBackend)
+    # somewhat crude way to figure out which API functions are implemented :)
+    funcs = String[]
+    impl_dir = joinpath(@__DIR__, "implementations/")
+    for f in readdir(impl_dir)
+        lines = readlines(joinpath(impl_dir, f))
+        func_lines = filter(startswith("function"), lines)
+        for fl in func_lines
+            fname = strip(split(split(fl, "function")[2], "(")[1])
+            if startswith(fname, "_") || startswith(fname, "Base")
+                continue
+            end
+            if fname in funcs # avoid duplicates
+                continue
+            end
+            push!(funcs, fname)
+        end
+    end
+    println("Implemented API functions for AMDBackend:")
+    for f in funcs
+        println("\t", f)
+    end
+    return nothing
+end
+
+end # module
diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl
new file mode 100644
index 0000000..7ca7b14
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/general.jl
@@ -0,0 +1,18 @@
+function functional(::AMDBackend; verbose=true)
+    if AMDGPU.functional()
+        verbose && @info("AMDGPU.jl is functional.")
+        working = true
+    else
+        verbose && @info("AMDGPU.jl is not functional.")
+        working = false
+    end
+    return working
+end
+
+function clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        AMDGPU.HIP.reclaim()
+    end
+    return nothing
+end
diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl
new file mode 100644
index 0000000..92db2e2
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -0,0 +1,261 @@
+function ngpus(::AMDBackend)
+    length(AMDGPU.devices())
+end
+
+function gpus(::AMDBackend; io::IO=stdout)
+    # Based on
https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 + devs = AMDGPU.devices() + if isempty(devs) + println(io, "No AMD devices found.") + elseif length(devs) == 1 + println(io, "1 device:") + else + println(io, length(devs), " devices:") + end + for (i, dev) in enumerate(devs) + mem_free, mem_tot = AMDGPU.device!(dev) do + AMDGPU.Runtime.Mem.info() + end + println( + io, + " $(i-1): ", repr(dev), " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", + ) + end +end + +# """ +# gpuinfo(deviceid::Integer) + +# Print out detailed information about the NVIDIA GPU with the given `deviceid`. + +# Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". + +# (This method is from the CUDA backend.) +# """ +# function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +# 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) +# return gpuinfo(CuDevice(deviceid); io) +# end +# function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +# # query +# mp = nmultiprocessors(dev) +# cores = ncudacores(dev) +# max_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) ÷ 1000 +# mem_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) ÷ 1000 +# mem_bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) +# l2cachesize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE) +# maxTex1D = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH) +# maxTex2D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH) +# maxTex2D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT) +# maxTex3D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH) +# maxTex3D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT) +# maxTex3D_depth = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH) +# maxTex1DLayered_width = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH +# ) +# maxTex1DLayered_layers = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS +# ) +# maxTex2DLayered_width = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH +# ) +# maxTex2DLayered_height = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT +# ) +# maxTex2DLayered_layers = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS +# ) +# total_constant_mem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY) +# shared_mem_per_block = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK +# ) +# regs_per_block = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK) +# warpsize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE) +# max_threads_per_mp = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR +# ) +# max_threads_per_block = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK +# ) +# blockdim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X) +# blockdim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y) +# blockdim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z) +# griddim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X) +# griddim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) 
+# griddim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) +# texture_align = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT) +# max_mem_pitch = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_PITCH) +# async_engine_count = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) +# gpu_overlap = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)) +# kernel_exec_timeout_enabled = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT) +# ) +# integrated = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_INTEGRATED)) +# can_map_host_mem = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY) +# ) +# concurrent_kernels = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS) +# ) +# surface_alignment = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT) > 0 +# ecc_enabled = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ECC_ENABLED)) +# unified_addressing = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) +# ) +# managed_memory = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) +# compute_preemption = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) +# ) +# cooperative_launch = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH) +# ) +# cooperative_multi_dev_launch = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH) +# ) +# pci_domainid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID) +# pci_busid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID) +# pci_deviceid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) +# compute_mode = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) +# comp_modes = [ +# "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", +# "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", +# "Prohibited (no host thread can use ::cudaSetDevice() with this device)", +# "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", +# "Unknown", +# ] + +# # printing +# println(io, "Device: ", name(dev), " ($dev)") +# println( +# io, "Total amount of global memory: ", Base.format_bytes(Int(CUDA.totalmem(dev))) +# ) +# println(io, "Number of CUDA cores: ", cores) +# println(io, "Number of multiprocessors: ", mp, " ($(cores ÷ mp) CUDA cores each)") +# println(io, "GPU max. clock rate: ", max_clock_rate, " MHz") +# println(io, "Memory clock rate: ", mem_clock_rate, " MHz") +# println(io, "Memory bus width: ", mem_bus_width, "-bit") +# println(io, "L2 cache size: ", Base.format_bytes(l2cachesize)) +# println(io, "Max. texture dimension sizes (1D): $maxTex1D") +# println(io, "Max. texture dimension sizes (2D): $maxTex2D_width, $maxTex2D_height") +# println( +# io, +# "Max. texture dimension sizes (3D): $maxTex3D_width, $maxTex3D_height, $maxTex3D_depth", +# ) +# println( +# io, +# "Max. layered 1D texture size: $(maxTex1DLayered_width) ($(maxTex1DLayered_layers) layers)", +# ) +# println( +# io, +# "Max. 
layered 2D texture size: $(maxTex2DLayered_width), $(maxTex2DLayered_height) ($(maxTex2DLayered_layers) layers)", +# ) +# println(io, "Total amount of constant memory: ", Base.format_bytes(total_constant_mem)) +# println( +# io, +# "Total amount of shared memory per block: ", +# Base.format_bytes(shared_mem_per_block), +# ) +# println(io, "Total number of registers available per block: ", regs_per_block) +# println(io, "Warp size: ", warpsize) +# println(io, "Max. number of threads per multiprocessor: ", max_threads_per_mp) +# println(io, "Max. number of threads per block: ", max_threads_per_block) +# println( +# io, +# "Max. dimension size of a thread block (x,y,z): $(blockdim_x), $(blockdim_y), $(blockdim_z)", +# ) +# println( +# io, +# "Max. dimension size of a grid size (x,y,z): $(griddim_x), $(griddim_y), $(griddim_z)", +# ) +# println(io, "Texture alignment: ", Base.format_bytes(texture_align)) +# println(io, "Maximum memory pitch: ", Base.format_bytes(max_mem_pitch)) +# println( +# io, +# "Concurrent copy and kernel execution: ", +# gpu_overlap ? "Yes" : "No", +# " with $(async_engine_count) copy engine(s)", +# ) +# println(io, "Run time limit on kernels: ", kernel_exec_timeout_enabled ? "Yes" : "No") +# println(io, "Integrated GPU sharing host memory: ", integrated ? "Yes" : "No") +# println( +# io, "Support host page-locked memory mapping: ", can_map_host_mem ? "Yes" : "No" +# ) +# println(io, "Concurrent kernel execution: ", concurrent_kernels ? "Yes" : "No") +# println(io, "Alignment requirement for surfaces: ", surface_alignment ? "Yes" : "No") +# println(io, "Device has ECC support: ", ecc_enabled ? "Yes" : "No") +# println( +# io, "Device supports Unified Addressing (UVA): ", unified_addressing ? "Yes" : "No" +# ) +# println(io, "Device supports managed memory: ", managed_memory ? "Yes" : "No") +# println(io, "Device supports compute preemption: ", compute_preemption ? "Yes" : "No") +# println(io, "Supports cooperative kernel launch: ", cooperative_launch ? "Yes" : "No") +# println( +# io, +# "Supports multi-device co-op kernel launch: ", +# cooperative_multi_dev_launch ? 
"Yes" : "No", +# ) +# println( +# io, +# "Device PCI domain ID / bus ID / device ID: $(pci_domainid) / $(pci_busid) / $(pci_deviceid)", +# ) +# println(io, "Compute mode: ", comp_modes[compute_mode + 1]) + +# return nothing +# end + +# function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +# # check p2p access +# ndevs = ngpus(NVIDIABackend()) +# if ndevs <= 1 +# error("Only a single GPU available.") +# else +# mat_p2p_access_supported = Matrix{Bool}(undef, ndevs, ndevs) +# mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) +# mat_p2p_atomic_supported = Matrix{Bool}(undef, ndevs, ndevs) +# for i in 1:ndevs +# dev_i = CuDevice(i - 1) +# for j in 1:ndevs +# dev_j = CuDevice(j - 1) +# if i != j +# p2p_access_supported = Bool( +# CUDA.p2p_attribute( +# dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED +# ), +# ) +# p2p_can_access = Bool(CUDA.can_access_peer(dev_i, dev_j)) +# p2p_atomic_supported = Bool( +# CUDA.p2p_attribute( +# dev_i, +# dev_j, +# CUDA.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED, +# ), +# ) +# mat_p2p_atomic_supported[i, j] = p2p_atomic_supported +# mat_p2p_access_supported[i, j] = p2p_access_supported +# mat_p2p_can_access[i, j] = p2p_can_access +# # p2p_performance_rank = CUDA.p2p_attribute(dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK) +# else +# mat_p2p_atomic_supported[i, i] = false +# mat_p2p_access_supported[i, i] = false +# mat_p2p_can_access[i, j] = false +# end +# end +# end + +# printstyled(io, "P2P Access Supported:\n"; bold=true) +# show(io, "text/plain", mat_p2p_access_supported) +# println(io) +# println(io) +# if mat_p2p_access_supported != mat_p2p_can_access +# printstyled(io, "P2P Can Access:\n"; bold=true) +# show(io, "text/plain", mat_p2p_can_access) +# println(io) +# println(io) +# end +# printstyled(io, "P2P Atomic Supported:\n"; bold=true) +# show(io, "text/plain", mat_p2p_atomic_supported) +# println(io) +# println(io) +# end +# return nothing +# end diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl new file mode 100644 index 0000000..b32a534 --- /dev/null +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -0,0 +1,85 @@ +function host2device_bandwidth( + ::AMDBackend; + memsize::UnitPrefixedBytes=GiB(0.5), + dtype=Cchar, + DtoDfactor=true, + verbose=true, + io::IO=stdout, + kwargs..., +) + N = Int(bytes(memsize) ÷ sizeof(dtype)) + mem_host = rand(dtype, N) + # mem_host_pinned = Mem.pin(rand(dtype, N)) # TODO + mem_gpu = AMDGPU.rand(dtype, N) + + _perform_memcpy(mem_host, mem_gpu; title="Host <-> Device", verbose, io=io, kwargs...) + verbose && println(io) + # _perform_memcpy( + # mem_host_pinned, + # mem_gpu; + # title="Host (pinned) <-> Device", + # verbose, + # io=io, + # kwargs..., + # ) + # verbose && println() + # _perform_memcpy(mem_gpu, mem_gpu2; title="Device <-> Device (same device)", DtoDfactor, verbose, kwargs...) 
+ return nothing +end + +function _perform_memcpy( + mem1, + mem2; + title="", + nbench=10, + times=false, + stats=false, + DtoDfactor=false, + verbose=true, + io::IO=stdout, +) + sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") + ts = zeros(nbench) + + @inbounds for i in 1:nbench + if i % 2 == 0 + ts[i] = AMDGPU.@elapsed copyto!(mem1, mem2) + else + ts[i] = AMDGPU.@elapsed copyto!(mem2, mem1) + end + end + + t_min = minimum(ts) + t_max = maximum(ts) + t_avg = mean(ts) + + actual_memsize_GiB = sizeof(mem1) * 2^(-30) + if DtoDfactor + actual_memsize_GiB *= 2 # must count both the read and the write here (taken from p2pBandwidthLatencyTest cuda sample....) + end + bws = actual_memsize_GiB ./ ts + bw_min = minimum(bws) + bw_max = maximum(bws) + bw_avg = mean(bws) + + if verbose + if times + println(io, "t_min: $t_min") + println(io, "t_max: $t_max") + println(io, "t_avg: $t_avg") + end + printstyled(io, "$(title) Bandwidth (GiB/s):\n"; bold=true) + if stats + print(io, " ├ max: ") + printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true) + println(io, " ├ min: ", round(bw_min; digits=2)) + println(io, " ├ avg: ", round(bw_avg; digits=2)) + print(io, " └ std_dev: ") + printstyled(io, round(std(bws); digits=2), "\n"; color=:yellow, bold=true) + else + print(io, " └ max: ") + printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true) + end + end + return bw_max +end diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl new file mode 100644 index 0000000..e43aefc --- /dev/null +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -0,0 +1,163 @@ +# function theoretical_memory_bandwidth( +# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout +# ) +# max_mem_clock_rate = +# CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz +# max_mem_bus_width = +# CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) / 8.0 # in bytes +# max_bw = 2.0 * max_mem_clock_rate * max_mem_bus_width * 2^(-30) +# if verbose +# printstyled(io, "Theoretical Maximal Memory Bandwidth (GiB/s):\n"; bold=true) +# print(io, " └ max: ") +# printstyled(io, round(max_bw; digits=1), "\n"; color=:green, bold=true) +# end +# return max_bw +# end + +function memory_bandwidth( + ::AMDBackend; + memsize::UnitPrefixedBytes=GiB(0.5), + dtype=Cchar, + verbose=true, + DtoDfactor=true, + device=AMDGPU.device(), + io::IO=stdout, + kwargs..., +)::Float64 + AMDGPU.device!(device) do + N = Int(bytes(memsize) ÷ sizeof(dtype)) + mem_gpu = AMDGPU.rand(dtype, N) + mem_gpu2 = AMDGPU.rand(dtype, N) + + return _perform_memcpy( + mem_gpu, mem_gpu2; title="Memory", DtoDfactor, verbose, io=io, kwargs... + ) + end +end + +function memory_bandwidth_scaling( + ::AMDBackend; + device=AMDGPU.device(), + sizes=logspace(1, exp2(30), 10), + verbose=true, + io::IO=stdout, + kwargs..., +) + bandwidths = zeros(length(sizes)) + for (i, s) in enumerate(sizes) + bandwidths[i] = memory_bandwidth( + AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs... 
+ ) + clear_gpu_memory(AMDBackend(); device=device) + end + if verbose + peak_val, idx = findmax(bandwidths) + peak_size = sizes[idx] + p = UnicodePlots.lineplot( + sizes, + bandwidths; + xlabel="data size", + ylabel="GiB/s", + title=string( + "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" + ), + xscale=:log2, + ) + UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) + println(io) # top margin + println(io, p) + println(io) # bottom margin + end + return (sizes=sizes, bandwidths=bandwidths) +end + +# """ +# Extra keyword arguments: +# * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. + +# (This method is from the CUDA backend.) +# """ +# function memory_bandwidth_saxpy( +# ::NVIDIABackend; +# device=CUDA.device(), +# size=2^20 * 10, +# nbench=10, +# dtype=Float32, +# cublas=true, +# verbose=true, +# io::IO=stdout, +# )::Float64 +# device!(device) do +# a = dtype(pi) +# x = CUDA.rand(dtype, size) +# y = CUDA.rand(dtype, size) +# z = CUDA.zeros(dtype, size) + +# nthreads = CUDA.attribute(device, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) +# nblocks = cld(size, nthreads) +# t = Inf +# for _ in 1:nbench +# if cublas +# Δt = CUDA.@elapsed CUBLAS.axpy!(size, a, x, y) +# else +# Δt = CUDA.@elapsed @cuda( +# threads = nthreads, blocks = nblocks, _saxpy_gpu_kernel!(z, a, x, y) +# ) +# end +# t = min(t, Δt) +# end + +# bandwidth = 3.0 * sizeof(dtype) * size * (1024)^(-3) / t +# if verbose +# printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) +# print(io, " └ max: ") +# printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) +# end +# return bandwidth +# end +# end + +# function _saxpy_gpu_kernel!(z, a, x, y) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# if i <= length(z) +# @inbounds z[i] = a * x[i] + y[i] +# end +# return nothing +# end + +# function memory_bandwidth_saxpy_scaling( +# ::NVIDIABackend; +# device=CUDA.device(), +# sizes=[2^20 * i for i in 10:10:300], +# verbose=true, +# io::IO=stdout, +# kwargs..., +# ) +# # sizes = [2^20 * i for i in 8:128] # V100 +# bandwidths = zeros(length(sizes)) +# for (i, s) in enumerate(sizes) +# bandwidths[i] = memory_bandwidth_saxpy( +# NVIDIABackend(); device=device, size=s, verbose=false, kwargs... 
+# ) +# clear_gpu_memory(AMDBackend(); device=device) +# end +# if verbose +# peak_val, idx = findmax(bandwidths) +# peak_size = sizes[idx] +# p = UnicodePlots.lineplot( +# sizes, +# bandwidths; +# xlabel="vector length", +# ylabel="GiB/s", +# title=string( +# "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" +# ), +# xscale=:log2, +# ) +# UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) +# println(io) # top margin +# println(io, p) +# println(io) # bottom margin +# end +# return (sizes=sizes, bandwidths=bandwidths) +# end diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl index b44e5fd..ac4b4b3 100644 --- a/ext/CUDAExt/implementations/general.jl +++ b/ext/CUDAExt/implementations/general.jl @@ -21,3 +21,11 @@ function functional(::NVIDIABackend; verbose=true) end return hascuda end + +function clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) + device!(device) do + gc && GC.gc() + CUDA.reclaim() + end + return nothing +end diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index b960b08..7fbc0ce 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -1,4 +1,6 @@ -ngpus(::NVIDIABackend) = length(CUDA.devices()) +function ngpus(::NVIDIABackend) + length(CUDA.devices()) +end function gpus(::NVIDIABackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index c0b68f1..003f106 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -54,7 +54,7 @@ function memory_bandwidth_scaling( bandwidths[i] = memory_bandwidth( NVIDIABackend(); memsize=B(s), device=device, verbose=false, kwargs... ) - clear_gpu_memory(device) + clear_gpu_memory(NVIDIABackend(); device=device) end if verbose peak_val, idx = findmax(bandwidths) @@ -145,7 +145,7 @@ function memory_bandwidth_saxpy_scaling( bandwidths[i] = memory_bandwidth_saxpy( NVIDIABackend(); device=device, size=s, verbose=false, kwargs... ) - clear_gpu_memory(device) + clear_gpu_memory(NVIDIABackend(); device=device) end if verbose peak_val, idx = findmax(bandwidths) diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl index 7f6a504..3c9681d 100644 --- a/ext/CUDAExt/utility.jl +++ b/ext/CUDAExt/utility.jl @@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa return mem_handles end -# TODO: Maybe make API/stub? -"Reclaim the unused memory of the currently active GPU (i.e. `device()`)." -function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true) - device!(device) do - gc && GC.gc() - CUDA.reclaim() - end - return nothing -end - -# TODO: Maybe make API/stub? -"Reclaim the unused memory of all available GPUs." -function clear_all_gpus_memory(devices=CUDA.devices(); gc=true) - gc && GC.gc() - for dev in devices - clear_gpu_memory(dev; gc=false) - end - return nothing -end - """ toggle_tensorcoremath([enable::Bool; verbose=true]) Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`). diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 9dbe9e9..fe4f641 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -27,8 +27,8 @@ include("monitoring_io.jl") function not_implemented_yet() return error( - "Not implemented yet. 
You either haven't loaded a backend (like CUDA.jl) yet, or" *
-        " the loaded backend doesn't provide this functionality.",
+        "Not implemented yet. You either haven't loaded a backend yet (e.g. CUDA.jl or " *
+        "AMDGPU.jl), or the loaded backend doesn't provide this functionality.",
     )
 end
 include("stubs/stubs_general.jl")
@@ -50,11 +50,14 @@ export plot_monitoring_results, load_monitoring_results, save_monitoring_results
 
 # utilities
 export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, bytes, simplify, change_base, value
-export logspace
+export logspace, clear_all_gpus_memory
 
 # Let's currently not export the CPU tests. After all, this is GPUInspector.jl :)
 # export stresstest_cpu
 
+# stubs general
+export clear_gpu_memory
+
 # stubs gpuinfo
 export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
 # stubs p2p bandwidth
diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl
index eee42ed..57b7136 100644
--- a/src/stubs/stubs_general.jl
+++ b/src/stubs/stubs_general.jl
@@ -4,3 +4,11 @@ If not, print some hopefully useful debug information (or turn it off with `verb
 """
 functional(; kwargs...) = functional(backend(); kwargs...)
 functional(::Backend; kwargs...) = not_implemented_yet()
+
+"""
+    clear_gpu_memory(; device, gc)
+
+Reclaim the unused memory of a GPU (default: the currently active device).
+"""
+clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...)
+clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet()
diff --git a/src/utility.jl b/src/utility.jl
index bba0d55..0b38851 100644
--- a/src/utility.jl
+++ b/src/utility.jl
@@ -2,6 +2,15 @@ function logspace(start, stop, length)
     return exp2.(range(log2(start), log2(stop); length=length))
 end
 
+"Reclaim the unused memory of all available GPUs."
+function clear_all_gpus_memory(; gc=true, devices)
+    gc && GC.gc()
+    for dev in devices
+        clear_gpu_memory(; device=dev, gc=false)
+    end
+    return nothing
+end
+
 # L2_cachesize() = cachesize()[2]
 
 # """

From 375c6e45b27e1be3f9a8e1a489efea43b4cc6b90 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Thu, 17 Aug 2023 15:39:20 +0200
Subject: [PATCH 2/5] CI restructuring; gpuinfo_p2p_access etc

---
 Project.toml                                  |   6 +-
 ext/AMDGPUExt/AMDGPUExt.jl                    |  35 +--
 ext/AMDGPUExt/implementations/general.jl      |   7 +-
 ext/AMDGPUExt/implementations/gpuinfo.jl      | 281 +++---
 .../implementations/host2device_bandwidth.jl  |   2 +-
 ext/AMDGPUExt/implementations/membw.jl        |   6 +-
 ext/AMDGPUExt/utility.jl                      |   5 +
 ext/CUDAExt/CUDAExt.jl                        |  32 +-
 ext/CUDAExt/implementations/general.jl        |   7 +-
 ext/CUDAExt/implementations/gpuinfo.jl        |  12 +-
 .../implementations/host2device_bandwidth.jl  |   2 +-
 ext/CUDAExt/implementations/membw.jl          |  12 +-
 ext/CUDAExt/implementations/monitoring.jl     |  12 +-
 ext/CUDAExt/implementations/p2p_bandwidth.jl  |   8 +-
 ext/CUDAExt/implementations/peakflops_gpu.jl  |   6 +-
 ext/CUDAExt/implementations/stresstest.jl     |   2 +-
 src/GPUInspector.jl                           |   5 +-
 src/backends.jl                               |   1 +
 src/stubs/stubs_general.jl                    |   8 +
 test/gpuinfo_tests.jl                         |   7 -
 test/runtests.jl                              |  87 ++++--
 test/{backend_tests.jl => tests_backend.jl}   |   4 +-
 ...{bandwidth_tests.jl => tests_bandwidth.jl} |  15 +-
 test/tests_core.jl                            |   0
 test/tests_gpuinfo.jl                         |   8 +
 ...{peakflops_tests.jl => tests_peakflops.jl} |   0
 ...tresstest_tests.jl => tests_stresstest.jl} |   0
 test/{utility_tests.jl => tests_utility.jl}   |   0
 28 files changed, 191 insertions(+), 379 deletions(-)
 create mode 100644 ext/AMDGPUExt/utility.jl
 delete mode 100644 test/gpuinfo_tests.jl
 rename test/{backend_tests.jl => tests_backend.jl} (79%)
 rename test/{bandwidth_tests.jl => tests_bandwidth.jl} (89%)
 create mode
100644 test/tests_core.jl create mode 100644 test/tests_gpuinfo.jl rename test/{peakflops_tests.jl => tests_peakflops.jl} (100%) rename test/{stresstest_tests.jl => tests_stresstest.jl} (100%) rename test/{utility_tests.jl => tests_utility.jl} (100%) diff --git a/Project.toml b/Project.toml index dfbf455..c4a6b50 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,6 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" @@ -38,8 +37,6 @@ DocStringExtensions = "0.9" Glob = "1.3" HDF5 = "0.16" NVTX = "0.3" -Reexport = "1.2" -TestItemRunner = "0.2" ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7" UnicodePlots = "2.8, 3" julia = "1.9" @@ -50,7 +47,6 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [targets] -test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU", "TestItemRunner"] +test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU"] diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl index d141faf..f6d829a 100644 --- a/ext/AMDGPUExt/AMDGPUExt.jl +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -12,6 +12,7 @@ using LinearAlgebra # pkgs using UnicodePlots +using ThreadPinning # for usage in AMDGPUExt using GPUInspector: @@ -25,39 +26,7 @@ using GPUInspector: @unroll, AMDBackend -# import stubs to implement them -import GPUInspector: backendinfo, functional, clear_gpu_memory -# gpuinfo -import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus -# p2p bw -import GPUInspector: - p2p_bandwidth, - p2p_bandwidth_all, - p2p_bandwidth_bidirectional, - p2p_bandwidth_bidirectional_all -# host2device bw -import GPUInspector: host2device_bandwidth -# membw -import GPUInspector: - theoretical_memory_bandwidth, - memory_bandwidth, - memory_bandwidth_scaling, - memory_bandwidth_saxpy, - memory_bandwidth_saxpy_scaling -# stresstest -import GPUInspector: stresstest -# monitoring -import GPUInspector: - monitoring_start, - monitoring_stop, - livemonitor_something, - livemonitor_powerusage, - livemonitor_temperature -# peakflops_gpu -import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu - -# include("cuda_wrappers.jl") -# include("utility.jl") +include("utility.jl") # include("stresstests.jl") # include("peakflops_gpu_fmas.jl") # include("peakflops_gpu_wmmas.jl") diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl index 7ca7b14..4735281 100644 --- a/ext/AMDGPUExt/implementations/general.jl +++ b/ext/AMDGPUExt/implementations/general.jl @@ -1,4 +1,4 @@ -function functional(::AMDBackend; verbose=true) +function GPUInspector.functional(::AMDBackend; verbose=true) if AMDGPU.functional() verbose && @info("AMDGPU.jl is functional.") working = true @@ -9,10 +9,13 @@ function functional(::AMDBackend; verbose=true) return working end -function clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true) +function GPUInspector.clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true) device!(device) do gc && GC.gc() AMDGPU.HIP.reclaim() end return nothing end + 
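+# Implementations of the new GPUInspector.device/devices stubs (cf. src/stubs/stubs_general.jl) for the AMD backend.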
+GPUInspector.device(::AMDBackend) = AMDGPU.device() +GPUInspector.devices(::AMDBackend) = AMDGPU.devices() diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl index 92db2e2..b172a52 100644 --- a/ext/AMDGPUExt/implementations/gpuinfo.jl +++ b/ext/AMDGPUExt/implementations/gpuinfo.jl @@ -1,8 +1,8 @@ -function ngpus(::AMDBackend) - length(AMDGPU.devices()) +function GPUInspector.ngpus(::AMDBackend) + return length(AMDGPU.devices()) end -function gpus(::AMDBackend; io::IO=stdout) +function GPUInspector.gpus(::AMDBackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = AMDGPU.devices() if isempty(devs) @@ -18,244 +18,53 @@ function gpus(::AMDBackend; io::IO=stdout) end println( io, - " $(i-1): ", repr(dev), " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", + " $(_gpuid(dev)): ", + repr(dev), + " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", ) end end -# """ -# gpuinfo(deviceid::Integer) +""" + gpuinfo(deviceid::Integer) -# Print out detailed information about the NVIDIA GPU with the given `deviceid`. +Print out detailed information about the AMD GPU with the given `deviceid`. -# Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". - -# (This method is from the CUDA backend.) -# """ -# function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) -# 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) -# return gpuinfo(CuDevice(deviceid); io) -# end -# function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) -# # query -# mp = nmultiprocessors(dev) -# cores = ncudacores(dev) -# max_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) ÷ 1000 -# mem_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) ÷ 1000 -# mem_bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) -# l2cachesize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE) -# maxTex1D = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH) -# maxTex2D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH) -# maxTex2D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT) -# maxTex3D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH) -# maxTex3D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT) -# maxTex3D_depth = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH) -# maxTex1DLayered_width = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH -# ) -# maxTex1DLayered_layers = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS -# ) -# maxTex2DLayered_width = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH -# ) -# maxTex2DLayered_height = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT -# ) -# maxTex2DLayered_layers = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS -# ) -# total_constant_mem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY) -# shared_mem_per_block = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK -# ) -# regs_per_block = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK) -# warpsize = CUDA.attribute(dev, 
CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE) -# max_threads_per_mp = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR -# ) -# max_threads_per_block = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK -# ) -# blockdim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X) -# blockdim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y) -# blockdim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z) -# griddim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X) -# griddim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) -# griddim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) -# texture_align = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT) -# max_mem_pitch = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_PITCH) -# async_engine_count = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) -# gpu_overlap = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)) -# kernel_exec_timeout_enabled = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT) -# ) -# integrated = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_INTEGRATED)) -# can_map_host_mem = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY) -# ) -# concurrent_kernels = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS) -# ) -# surface_alignment = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT) > 0 -# ecc_enabled = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ECC_ENABLED)) -# unified_addressing = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) -# ) -# managed_memory = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) -# compute_preemption = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) -# ) -# cooperative_launch = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH) -# ) -# cooperative_multi_dev_launch = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH) -# ) -# pci_domainid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID) -# pci_busid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID) -# pci_deviceid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) -# compute_mode = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) -# comp_modes = [ -# "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", -# "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", -# "Prohibited (no host thread can use ::cudaSetDevice() with this device)", -# "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", -# "Unknown", -# ] - -# # printing -# println(io, "Device: ", name(dev), " ($dev)") -# println( -# io, "Total amount of global memory: ", Base.format_bytes(Int(CUDA.totalmem(dev))) -# ) -# println(io, "Number of CUDA cores: ", cores) -# println(io, "Number of multiprocessors: ", mp, " ($(cores ÷ mp) CUDA cores each)") -# println(io, "GPU max. clock rate: ", max_clock_rate, " MHz") -# println(io, "Memory clock rate: ", mem_clock_rate, " MHz") -# println(io, "Memory bus width: ", mem_bus_width, "-bit") -# println(io, "L2 cache size: ", Base.format_bytes(l2cachesize)) -# println(io, "Max. texture dimension sizes (1D): $maxTex1D") -# println(io, "Max. 
texture dimension sizes (2D): $maxTex2D_width, $maxTex2D_height") -# println( -# io, -# "Max. texture dimension sizes (3D): $maxTex3D_width, $maxTex3D_height, $maxTex3D_depth", -# ) -# println( -# io, -# "Max. layered 1D texture size: $(maxTex1DLayered_width) ($(maxTex1DLayered_layers) layers)", -# ) -# println( -# io, -# "Max. layered 2D texture size: $(maxTex2DLayered_width), $(maxTex2DLayered_height) ($(maxTex2DLayered_layers) layers)", -# ) -# println(io, "Total amount of constant memory: ", Base.format_bytes(total_constant_mem)) -# println( -# io, -# "Total amount of shared memory per block: ", -# Base.format_bytes(shared_mem_per_block), -# ) -# println(io, "Total number of registers available per block: ", regs_per_block) -# println(io, "Warp size: ", warpsize) -# println(io, "Max. number of threads per multiprocessor: ", max_threads_per_mp) -# println(io, "Max. number of threads per block: ", max_threads_per_block) -# println( -# io, -# "Max. dimension size of a thread block (x,y,z): $(blockdim_x), $(blockdim_y), $(blockdim_z)", -# ) -# println( -# io, -# "Max. dimension size of a grid size (x,y,z): $(griddim_x), $(griddim_y), $(griddim_z)", -# ) -# println(io, "Texture alignment: ", Base.format_bytes(texture_align)) -# println(io, "Maximum memory pitch: ", Base.format_bytes(max_mem_pitch)) -# println( -# io, -# "Concurrent copy and kernel execution: ", -# gpu_overlap ? "Yes" : "No", -# " with $(async_engine_count) copy engine(s)", -# ) -# println(io, "Run time limit on kernels: ", kernel_exec_timeout_enabled ? "Yes" : "No") -# println(io, "Integrated GPU sharing host memory: ", integrated ? "Yes" : "No") -# println( -# io, "Support host page-locked memory mapping: ", can_map_host_mem ? "Yes" : "No" -# ) -# println(io, "Concurrent kernel execution: ", concurrent_kernels ? "Yes" : "No") -# println(io, "Alignment requirement for surfaces: ", surface_alignment ? "Yes" : "No") -# println(io, "Device has ECC support: ", ecc_enabled ? "Yes" : "No") -# println( -# io, "Device supports Unified Addressing (UVA): ", unified_addressing ? "Yes" : "No" -# ) -# println(io, "Device supports managed memory: ", managed_memory ? "Yes" : "No") -# println(io, "Device supports compute preemption: ", compute_preemption ? "Yes" : "No") -# println(io, "Supports cooperative kernel launch: ", cooperative_launch ? "Yes" : "No") -# println( -# io, -# "Supports multi-device co-op kernel launch: ", -# cooperative_multi_dev_launch ? "Yes" : "No", -# ) -# println( -# io, -# "Device PCI domain ID / bus ID / device ID: $(pci_domainid) / $(pci_busid) / $(pci_deviceid)", -# ) -# println(io, "Compute mode: ", comp_modes[compute_mode + 1]) - -# return nothing -# end +(This method is from the AMD backend.) 
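+
+# Examples
+
+A usage sketch (assuming at least one AMD GPU is visible; see also `tests_gpuinfo.jl`):
+
+    julia> gpuinfo()                      # defaults to the current device
+    julia> gpuinfo(GPUInspector.device()) # same, with an explicit device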
+""" +function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout) + 0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id.")) + return gpuinfo(HIPDevice(deviceid); io) +end +function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout) + # printing + println(io, "Device: $dev \n") + show(io, AMDGPU.HIP.properties(dev)) + return nothing +end -# function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) -# # check p2p access -# ndevs = ngpus(NVIDIABackend()) -# if ndevs <= 1 -# error("Only a single GPU available.") -# else -# mat_p2p_access_supported = Matrix{Bool}(undef, ndevs, ndevs) -# mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) -# mat_p2p_atomic_supported = Matrix{Bool}(undef, ndevs, ndevs) -# for i in 1:ndevs -# dev_i = CuDevice(i - 1) -# for j in 1:ndevs -# dev_j = CuDevice(j - 1) -# if i != j -# p2p_access_supported = Bool( -# CUDA.p2p_attribute( -# dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED -# ), -# ) -# p2p_can_access = Bool(CUDA.can_access_peer(dev_i, dev_j)) -# p2p_atomic_supported = Bool( -# CUDA.p2p_attribute( -# dev_i, -# dev_j, -# CUDA.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED, -# ), -# ) -# mat_p2p_atomic_supported[i, j] = p2p_atomic_supported -# mat_p2p_access_supported[i, j] = p2p_access_supported -# mat_p2p_can_access[i, j] = p2p_can_access -# # p2p_performance_rank = CUDA.p2p_attribute(dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK) -# else -# mat_p2p_atomic_supported[i, i] = false -# mat_p2p_access_supported[i, i] = false -# mat_p2p_can_access[i, j] = false -# end -# end -# end +function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout) + # check p2p access + ndevs = ngpus(AMDBackend()) + if ndevs <= 1 + error("Only a single GPU available.") + else + devs = AMDGPU.devices() + mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) + for i in 1:ndevs + for j in 1:ndevs + if i != j + mat_p2p_can_access[i, j] = Bool(AMDGPU.HIP.can_access_peer(devs[i], devs[j])) + else + mat_p2p_can_access[i, j] = false + end + end + end -# printstyled(io, "P2P Access Supported:\n"; bold=true) -# show(io, "text/plain", mat_p2p_access_supported) -# println(io) -# println(io) -# if mat_p2p_access_supported != mat_p2p_can_access -# printstyled(io, "P2P Can Access:\n"; bold=true) -# show(io, "text/plain", mat_p2p_can_access) -# println(io) -# println(io) -# end -# printstyled(io, "P2P Atomic Supported:\n"; bold=true) -# show(io, "text/plain", mat_p2p_atomic_supported) -# println(io) -# println(io) -# end -# return nothing -# end + printstyled(io, "P2P Can Access:\n"; bold=true) + show(io, "text/plain", mat_p2p_can_access) + println(io) + println(io) + end + return nothing +end diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl index b32a534..de80ef5 100644 --- a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -1,4 +1,4 @@ -function host2device_bandwidth( +function GPUInspector.host2device_bandwidth( ::AMDBackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl index e43aefc..8ddba6a 100644 --- a/ext/AMDGPUExt/implementations/membw.jl +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -14,7 +14,7 @@ # return max_bw # end -function memory_bandwidth( +function GPUInspector.memory_bandwidth( ::AMDBackend; 
memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, @@ -35,7 +35,7 @@ function memory_bandwidth( end end -function memory_bandwidth_scaling( +function GPUInspector.memory_bandwidth_scaling( ::AMDBackend; device=AMDGPU.device(), sizes=logspace(1, exp2(30), 10), @@ -75,7 +75,7 @@ end # Extra keyword arguments: # * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. -# (This method is from the CUDA backend.) +# (This method is from the NVIDIA Backend.) # """ # function memory_bandwidth_saxpy( # ::NVIDIABackend; diff --git a/ext/AMDGPUExt/utility.jl b/ext/AMDGPUExt/utility.jl new file mode 100644 index 0000000..f9878ea --- /dev/null +++ b/ext/AMDGPUExt/utility.jl @@ -0,0 +1,5 @@ +_device2string(dev::HIPDevice) = "GPU $(_gpuid(dev)): $(_name(dev))" + +_gpuid(dev::HIPDevice) = AMDGPU.HIP.device_id(dev) + 1 + +_name(dev::HIPDevice) = AMDGPU.HIP.name(dev) diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index 9d2a770..91081ec 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -12,6 +12,7 @@ using LinearAlgebra # pkgs using UnicodePlots using NVTX +using ThreadPinning # for usage in CUDAExt using GPUInspector: @@ -25,37 +26,6 @@ using GPUInspector: @unroll, NVIDIABackend -# import stubs to implement them -import GPUInspector: backendinfo, functional -# gpuinfo -import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus -# p2p bw -import GPUInspector: - p2p_bandwidth, - p2p_bandwidth_all, - p2p_bandwidth_bidirectional, - p2p_bandwidth_bidirectional_all -# host2device bw -import GPUInspector: host2device_bandwidth -# membw -import GPUInspector: - theoretical_memory_bandwidth, - memory_bandwidth, - memory_bandwidth_scaling, - memory_bandwidth_saxpy, - memory_bandwidth_saxpy_scaling -# stresstest -import GPUInspector: stresstest -# monitoring -import GPUInspector: - monitoring_start, - monitoring_stop, - livemonitor_something, - livemonitor_powerusage, - livemonitor_temperature -# peakflops_gpu -import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu - # for convenience const BFloat16 = CUDA.BFloat16 diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl index ac4b4b3..f147679 100644 --- a/ext/CUDAExt/implementations/general.jl +++ b/ext/CUDAExt/implementations/general.jl @@ -1,4 +1,4 @@ -function functional(::NVIDIABackend; verbose=true) +function GPUInspector.functional(::NVIDIABackend; verbose=true) if CUDA.functional() verbose && @info("CUDA/GPU available.") hascuda = true @@ -22,10 +22,13 @@ function functional(::NVIDIABackend; verbose=true) return hascuda end -function clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) +function GPUInspector.clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) device!(device) do gc && GC.gc() CUDA.reclaim() end return nothing end + +GPUInspector.device(::NVIDIABackend) = CUDA.device() +GPUInspector.devices(::NVIDIABackend) = CUDA.devices() diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index 7fbc0ce..73a1fda 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -1,8 +1,8 @@ -function ngpus(::NVIDIABackend) +function GPUInspector.ngpus(::NVIDIABackend) length(CUDA.devices()) end -function gpus(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = devices() if isempty(devs) @@ 
-43,13 +43,13 @@ Print out detailed information about the NVIDIA GPU with the given `deviceid`. Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(CuDevice(deviceid); io) end -function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) # query mp = nmultiprocessors(dev) cores = ncudacores(dev) @@ -216,7 +216,7 @@ function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) return nothing end -function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) # check p2p access ndevs = ngpus(NVIDIABackend()) if ndevs <= 1 diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl index 82f2f5d..d3b747c 100644 --- a/ext/CUDAExt/implementations/host2device_bandwidth.jl +++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl @@ -1,4 +1,4 @@ -function host2device_bandwidth(::NVIDIABackend; +function GPUInspector.host2device_bandwidth(::NVIDIABackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, DtoDfactor=true, diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index 003f106..58d9b7b 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -1,4 +1,4 @@ -function theoretical_memory_bandwidth( +function GPUInspector.theoretical_memory_bandwidth( ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout ) max_mem_clock_rate = @@ -14,7 +14,7 @@ function theoretical_memory_bandwidth( return max_bw end -function memory_bandwidth( +function GPUInspector.memory_bandwidth( ::NVIDIABackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, @@ -41,7 +41,7 @@ function memory_bandwidth( end end -function memory_bandwidth_scaling( +function GPUInspector.memory_bandwidth_scaling( ::NVIDIABackend; device=CUDA.device(), sizes=logspace(1, exp2(30), 10), @@ -81,9 +81,9 @@ end Extra keyword arguments: * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) 
""" -function memory_bandwidth_saxpy( +function GPUInspector.memory_bandwidth_saxpy( ::NVIDIABackend; device=CUDA.device(), size=2^20 * 10, @@ -131,7 +131,7 @@ function _saxpy_gpu_kernel!(z, a, x, y) return nothing end -function memory_bandwidth_saxpy_scaling( +function GPUInspector.memory_bandwidth_saxpy_scaling( ::NVIDIABackend; device=CUDA.device(), sizes=[2^20 * i for i in 10:10:300], diff --git a/ext/CUDAExt/implementations/monitoring.jl b/ext/CUDAExt/implementations/monitoring.jl index 483da9b..54c24f5 100644 --- a/ext/CUDAExt/implementations/monitoring.jl +++ b/ext/CUDAExt/implementations/monitoring.jl @@ -1,4 +1,4 @@ -function monitoring_start( +function GPUInspector.monitoring_start( ::NVIDIABackend; freq=1, devices=CUDA.devices(), thread=Threads.nthreads(), verbose=true ) if ismonitoring() @@ -54,9 +54,9 @@ Specifically, `results` is a named tuple with the following keys: * `time`: the (relative) times at which we measured * `temperature`, `power`, `compute`, `mem` -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults +function GPUInspector.monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults if ismonitoring() verbose && @info("Stopping monitoring and fetching results...") _monitoring!(false) @@ -67,7 +67,7 @@ function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults end end -function livemonitor_temperature(::NVIDIABackend, duration; kwargs...) +function GPUInspector.livemonitor_temperature(::NVIDIABackend, duration; kwargs...) return livemonitor_something( NVIDIABackend(), get_temperatures, @@ -78,7 +78,7 @@ function livemonitor_temperature(::NVIDIABackend, duration; kwargs...) ) end -function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) +function GPUInspector.livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) return livemonitor_something( NVIDIABackend(), get_power_usages, @@ -89,7 +89,7 @@ function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) ) end -function livemonitor_something( +function GPUInspector.livemonitor_something( ::NVIDIABackend, f::F, duration; diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl index 6b5a83e..61cb1af 100644 --- a/ext/CUDAExt/implementations/p2p_bandwidth.jl +++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl @@ -1,4 +1,4 @@ -function p2p_bandwidth( +function GPUInspector.p2p_bandwidth( ::NVIDIABackend; memsize::UnitPrefixedBytes=B(40_000_000), nbench=5, @@ -66,7 +66,7 @@ function p2p_bandwidth( return bw_max end -function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) +function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -82,7 +82,7 @@ function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs ] end -function p2p_bandwidth_bidirectional( +function GPUInspector.p2p_bandwidth_bidirectional( ::NVIDIABackend; memsize::UnitPrefixedBytes=B(40_000_000), nbench=20, @@ -142,7 +142,7 @@ function p2p_bandwidth_bidirectional( return bw_max end -function p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...) +function GPUInspector.p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...) 
ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl index 0d6bb2e..6751cbf 100644 --- a/ext/CUDAExt/implementations/peakflops_gpu.jl +++ b/ext/CUDAExt/implementations/peakflops_gpu.jl @@ -8,9 +8,9 @@ Estimates the theoretical peak performance of a CUDA device in TFLOP/s. * `dtype` (default: `tensorcores ? Float16 : Float32`): element type of the matrices * `io` (default: `stdout`): set the stream where the results should be printed. -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function theoretical_peakflops_gpu( +function GPUInspector.theoretical_peakflops_gpu( ::NVIDIABackend; device=CUDA.device(), tensorcores=hastensorcores(), @@ -104,7 +104,7 @@ it takes to perform For more keyword argument options see [`peakflops_gpu_fmas`](@ref) and [`peakflops_gpu_wmmas`](@ref). """ -function peakflops_gpu( +function GPUInspector.peakflops_gpu( ::NVIDIABackend; tensorcores=hastensorcores(), verbose=true, diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl index 358d9ca..41288ea 100644 --- a/ext/CUDAExt/implementations/stresstest.jl +++ b/ext/CUDAExt/implementations/stresstest.jl @@ -1,4 +1,4 @@ -function stresstest( +function GPUInspector.stresstest( ::NVIDIABackend; devices=[CUDA.device()], mem=nothing, diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index fe4f641..19ba114 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -9,8 +9,7 @@ using Base: UUID using Pkg: Pkg # external -using Reexport -@reexport using ThreadPinning +using ThreadPinning using DocStringExtensions using UnicodePlots using CpuId: cachesize @@ -42,7 +41,7 @@ include("stubs/stubs_peakflops_gpu.jl") # backends export Backend, NoBackend, NVIDIABackend, AMDBackend, backend, backend!, backendinfo -export CUDAExt +export CUDAExt, AMDGPUExt # monitoring io+plotting export plot_monitoring_results, load_monitoring_results, save_monitoring_results diff --git a/src/backends.jl b/src/backends.jl index bd5d1de..ae147fc 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -56,6 +56,7 @@ function check_backend(b::Backend) end CUDAExt::Union{Nothing,Module} = nothing +AMDGPUExt::Union{Nothing,Module} = nothing """ Query information about a specific backend, e.g., what functionality the backend currently diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl index 57b7136..219d185 100644 --- a/src/stubs/stubs_general.jl +++ b/src/stubs/stubs_general.jl @@ -12,3 +12,11 @@ Reclaim the unused memory of a GPU """ clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...) clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet() + +"Return the current device of the active backend." +device() = device(backend()) +device(::Backend) = not_implemented_yet() + +"Return the devices of the active backend." 
+devices() = devices(backend()) +devices(::Backend) = not_implemented_yet() diff --git a/test/gpuinfo_tests.jl b/test/gpuinfo_tests.jl deleted file mode 100644 index 699a766..0000000 --- a/test/gpuinfo_tests.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testitem "gpuinfo / gpus" begin - using CUDA - @test isnothing(gpus()) - @test isnothing(gpuinfo()) - @test isnothing(gpuinfo(0)) - @test isnothing(gpuinfo(device())) -end diff --git a/test/runtests.jl b/test/runtests.jl index 111ff35..1707833 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,24 +1,75 @@ -using TestItemRunner using GPUInspector -using CUDA +using Test +using LinearAlgebra -if !GPUInspector.functional() - error("Can't run testsuite since CUDA/GPU not present or not functional!") +# figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA) +if haskey(ENV, "TEST_BACKEND") + if lowercase(ENV["TEST_BACKEND"]) in ("nvidia", "cuda", "nvidiabackend") + using CUDA + TEST_BACKEND = NVIDIABackend() + elseif lowercase(ENV["TEST_BACKEND"]) in ("amd", "amdgpu", "amdbackend") + using AMDGPU + TEST_BACKEND = AMDBackend() + else + error(""" + TEST_BACKEND environment variable contains unsupported value. + """) + end +else + using CUDA + using AMDGPU + if CUDA.functional() + @info("NVIDIA GPUs detected.", CUDA.devices()) + TEST_BACKEND = NVIDIABackend() + elseif AMDGPU.functional() + @info("AMD GPUs detected.", AMDGPU.devices()) + TEST_BACKEND = AMDBackend() + else + error(""" + Aborting because neither CUDA.jl nor AMDGPU.jl are functional. + Are there any GPUs in the system? + """) + end end -if Threads.nthreads() == 1 || (Threads.nthreads() < length(CUDA.devices()) + 1) - # we should have at least one thread per gpu + one monitoring thread - @warn( - "You should run the tests with at least $(length(CUDA.devices()) + 1) Julia threads.", - Threads.nthreads(), - length(CUDA.devices()) - ) +backend!(TEST_BACKEND) +@info "Running tests with the following backend: $TEST_BACKEND." + +const TEST_NAMES = ["bandwidth", "peakflops", "stresstest", "gpuinfo", "core"] +if haskey(ENV, "TESTS") + tests = split(ENV["TESTS"], ",") + if !all(t -> t in TEST_NAMES, tests) + error(""" + TESTS environment variable contains unknown test names. + Valid test names are: $(TEST_NAMES) + """) + end + TARGET_TESTS = tests +else + # run all tests + const TARGET_TESTS = TEST_NAMES end +@info "Running following tests: $TARGET_TESTS." 
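# A hedged usage sketch of the TEST_BACKEND/TESTS switches above (hypothetical
# values; the strings must match the supported backend spellings and TEST_NAMES):
#
#     ENV["TEST_BACKEND"] = "amd"            # or e.g. "cuda" / "nvidia"
#     ENV["TESTS"] = "gpuinfo,bandwidth"     # comma-separated subset of TEST_NAMES
#     import Pkg; Pkg.test("GPUInspector")   # the test process inherits both variables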
+ -@run_package_tests +if "stresstest" in TARGET_TESTS + # error if we aren't running with enough threads + if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1) + # we should have at least one thread per gpu + one monitoring thread + error("You should run the tests with at least $(ngpus() + 1) Julia threads.") + end +end -include("backend_tests.jl") -include("utility_tests.jl") -include("stresstest_tests.jl") -include("bandwidth_tests.jl") -include("peakflops_tests.jl") -include("gpuinfo_tests.jl") +if "gpuinfo" in TARGET_TESTS + include("tests_gpuinfo.jl") +end +# if "stresstest" in TARGET_TESTS +# include("tests_stresstest.jl") +# end +# if "peakflops" in TARGET_TESTS +# include("tests_peakflops.jl") +# end +# if "bandwidth" in TARGET_TESTS +# include("tests_bandwidth.jl") +# end +# include("tests_backend.jl") +# include("tests_utility.jl") diff --git a/test/backend_tests.jl b/test/tests_backend.jl similarity index 79% rename from test/backend_tests.jl rename to test/tests_backend.jl index d772080..c6bed8c 100644 --- a/test/backend_tests.jl +++ b/test/tests_backend.jl @@ -1,5 +1,5 @@ -@testitem "CUDA backend" begin - using CUDA +@testset "Backend switching" begin + @test GPUInspector.is_cuda_loaded() @test GPUInspector.is_cuda_loaded() @test GPUInspector.is_backend_loaded(NVIDIABackend()) @test backend() == NVIDIABackend() diff --git a/test/bandwidth_tests.jl b/test/tests_bandwidth.jl similarity index 89% rename from test/bandwidth_tests.jl rename to test/tests_bandwidth.jl index a68ea19..9d174ae 100644 --- a/test/bandwidth_tests.jl +++ b/test/tests_bandwidth.jl @@ -1,14 +1,13 @@ -@testitem "p2p_bandwidth" begin - using LinearAlgebra - using CUDA - +@testset "p2p_bandwidth" begin @testset "unidirectional" begin # p2p_bandwidth @test typeof(p2p_bandwidth(; verbose=false)) == Float64 @test 0 ≤ p2p_bandwidth(; verbose=false) # options @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64 - @test typeof(p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1), verbose=false)) == + dev_src = GPUInspector.devices()[1] + dev_dst = GPUInspector.devices()[2] + @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64 @@ -41,15 +40,13 @@ end end -@testitem "host2device_bandwidth" begin - using CUDA +@testset "host2device_bandwidth" begin @test isnothing(host2device_bandwidth()) @test isnothing(host2device_bandwidth(; memsize=MB(100))) @test isnothing(host2device_bandwidth(; dtype=Float16)) end -@testitem "memory_bandwidth" begin - using CUDA +@testset "memory_bandwidth" begin @test typeof(memory_bandwidth()) == Float64 @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64 @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 diff --git a/test/tests_core.jl b/test/tests_core.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/tests_gpuinfo.jl b/test/tests_gpuinfo.jl new file mode 100644 index 0000000..9ea54c7 --- /dev/null +++ b/test/tests_gpuinfo.jl @@ -0,0 +1,8 @@ +@testset "gpuinfo / gpus" begin + @test isnothing(gpus()) + @test isnothing(gpuinfo()) + @test isnothing(gpuinfo(GPUInspector.device())) + if ngpus() > 1 + @test isnothing(gpuinfo_p2p_access()) + end +end diff --git a/test/peakflops_tests.jl b/test/tests_peakflops.jl similarity index 100% rename from test/peakflops_tests.jl rename to test/tests_peakflops.jl diff --git a/test/stresstest_tests.jl b/test/tests_stresstest.jl 
similarity index 100% rename from test/stresstest_tests.jl rename to test/tests_stresstest.jl diff --git a/test/utility_tests.jl b/test/tests_utility.jl similarity index 100% rename from test/utility_tests.jl rename to test/tests_utility.jl From 5c155247a2543c1a5718b3630069a6d2ec2c6b81 Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Fri, 18 Aug 2023 19:55:51 +0200 Subject: [PATCH 3/5] new testing infrastructure works for NVIDIABackend() --- .gitlab-ci.yml | 22 ++++++-- ext/AMDGPUExt/AMDGPUExt.jl | 3 +- ext/AMDGPUExt/implementations/gpuinfo.jl | 8 +-- .../implementations/host2device_bandwidth.jl | 4 +- ext/AMDGPUExt/implementations/membw.jl | 10 ++-- ext/CUDAExt/CUDAExt.jl | 3 +- ext/CUDAExt/implementations/gpuinfo.jl | 8 +-- .../implementations/host2device_bandwidth.jl | 4 +- ext/CUDAExt/implementations/membw.jl | 10 ++-- ext/CUDAExt/implementations/p2p_bandwidth.jl | 6 +-- ext/CUDAExt/implementations/peakflops_gpu.jl | 4 +- ext/CUDAExt/implementations/stresstest.jl | 4 +- ext/CUDAExt/peakflops_gpu_fmas.jl | 2 +- ext/CUDAExt/peakflops_gpu_matmul.jl | 8 +-- ext/CUDAExt/peakflops_gpu_wmmas.jl | 2 +- src/GPUInspector.jl | 3 ++ test/runtests.jl | 51 ++++++++++++++----- test/tests_amd_only.jl | 0 test/tests_backend.jl | 10 ---- test/tests_bandwidth.jl | 9 ++-- test/tests_core.jl | 10 ++++ test/tests_nvidia_only.jl | 11 ++++ test/tests_peakflops.jl | 45 ++++++++-------- test/tests_stresstest.jl | 28 +++++----- test/tests_utility.jl | 15 +----- 25 files changed, 162 insertions(+), 118 deletions(-) create mode 100644 test/tests_amd_only.jl delete mode 100644 test/tests_backend.jl create mode 100644 test/tests_nvidia_only.jl diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7ae5973..3becffc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,24 +2,23 @@ stages: - test - documentation variables: - SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci" JULIA_NUM_THREADS: "10" JULIA_EXCLUSIVE: "1" JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0" - MKL_DYNAMIC: "false" - MKL_NUM_THREADS: "1" default: tags: - bauerc-noctua2 # Generates code coverage -julia/1.9: +julia/1.9/NVIDIA: stage: test rules: - changes: - "README.md" - when: on_success + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" script: - /bin/bash -l - module load $JULIA_1_9 @@ -28,6 +27,19 @@ julia/1.9: - julia --color=yes --project=test/coverage test/coverage/coverage.jl allow_failure: false +julia/1.9/AMD: + stage: test + rules: + - changes: + - "README.md" + - when: on_success + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive" + script: + - /bin/bash -l + - module load $JULIA_1_9 + - julia --color=yes --project=. 
-e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);' + allow_failure: true # Documentation build-and-deploy-docs: @@ -37,6 +49,8 @@ build-and-deploy-docs: - pushes - tags - external_pull_requests + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" script: - /bin/bash -l - module load $JULIA_1_9 diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl index f6d829a..2edd10c 100644 --- a/ext/AMDGPUExt/AMDGPUExt.jl +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -24,7 +24,8 @@ using GPUInspector: MonitoringResults, _defaultylims, @unroll, - AMDBackend + AMDBackend, + getstdout include("utility.jl") # include("stresstests.jl") diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl index b172a52..64c4d70 100644 --- a/ext/AMDGPUExt/implementations/gpuinfo.jl +++ b/ext/AMDGPUExt/implementations/gpuinfo.jl @@ -2,7 +2,7 @@ function GPUInspector.ngpus(::AMDBackend) return length(AMDGPU.devices()) end -function GPUInspector.gpus(::AMDBackend; io::IO=stdout) +function GPUInspector.gpus(::AMDBackend; io=getstdout()) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = AMDGPU.devices() if isempty(devs) @@ -32,18 +32,18 @@ Print out detailed information about the AMD GPU with the given `deviceid`. (This method is from the AMD backend.) """ -function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout()) 0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(HIPDevice(deviceid); io) end -function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout()) # printing println(io, "Device: $dev \n") show(io, AMDGPU.HIP.properties(dev)) return nothing end -function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout()) # check p2p access ndevs = ngpus(AMDBackend()) if ndevs <= 1 diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl index de80ef5..8095f45 100644 --- a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -4,7 +4,7 @@ function GPUInspector.host2device_bandwidth( dtype=Cchar, DtoDfactor=true, verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) N = Int(bytes(memsize) ÷ sizeof(dtype)) @@ -36,7 +36,7 @@ function _perform_memcpy( stats=false, DtoDfactor=false, verbose=true, - io::IO=stdout, + io=getstdout(), ) sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") ts = zeros(nbench) diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl index 8ddba6a..d177993 100644 --- a/ext/AMDGPUExt/implementations/membw.jl +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -1,5 +1,5 @@ # function theoretical_memory_bandwidth( -# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout +# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout() # ) # max_mem_clock_rate = # CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz @@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth( verbose=true, DtoDfactor=true, 
device=AMDGPU.device(), - io::IO=stdout, + io=getstdout(), kwargs..., )::Float64 AMDGPU.device!(device) do @@ -40,7 +40,7 @@ function GPUInspector.memory_bandwidth_scaling( device=AMDGPU.device(), sizes=logspace(1, exp2(30), 10), verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) bandwidths = zeros(length(sizes)) @@ -85,7 +85,7 @@ end # dtype=Float32, # cublas=true, # verbose=true, -# io::IO=stdout, +# io=getstdout(), # )::Float64 # device!(device) do # a = dtype(pi) @@ -130,7 +130,7 @@ end # device=CUDA.device(), # sizes=[2^20 * i for i in 10:10:300], # verbose=true, -# io::IO=stdout, +# io=getstdout(), # kwargs..., # ) # # sizes = [2^20 * i for i in 8:128] # V100 diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index 91081ec..0703604 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -24,7 +24,8 @@ using GPUInspector: MonitoringResults, _defaultylims, @unroll, - NVIDIABackend + NVIDIABackend, + getstdout # for convenience const BFloat16 = CUDA.BFloat16 diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index 73a1fda..bff635d 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -2,7 +2,7 @@ function GPUInspector.ngpus(::NVIDIABackend) length(CUDA.devices()) end -function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpus(::NVIDIABackend; io=getstdout()) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = devices() if isempty(devs) @@ -45,11 +45,11 @@ Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". (This method is from the NVIDIA Backend.) """ -function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout()) 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(CuDevice(deviceid); io) end -function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout()) # query mp = nmultiprocessors(dev) cores = ncudacores(dev) @@ -216,7 +216,7 @@ function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io:: return nothing end -function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout()) # check p2p access ndevs = ngpus(NVIDIABackend()) if ndevs <= 1 diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl index d3b747c..5b3b31c 100644 --- a/ext/CUDAExt/implementations/host2device_bandwidth.jl +++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl @@ -3,7 +3,7 @@ function GPUInspector.host2device_bandwidth(::NVIDIABackend; dtype=Cchar, DtoDfactor=true, verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) N = Int(bytes(memsize) ÷ sizeof(dtype)) @@ -42,7 +42,7 @@ function _perform_memcpy( stats=false, DtoDfactor=false, verbose=true, - io::IO=stdout, + io=getstdout(), ) NVTX.@range "host2dev: $title" begin sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index 58d9b7b..a9dc2bf 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -1,5 +1,5 @@ function 
GPUInspector.theoretical_memory_bandwidth( - ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout + ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout() ) max_mem_clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz @@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth( verbose=true, DtoDfactor=true, device=CUDA.device(), - io::IO=stdout, + io=getstdout(), kwargs..., )::Float64 device!(device) do @@ -46,7 +46,7 @@ function GPUInspector.memory_bandwidth_scaling( device=CUDA.device(), sizes=logspace(1, exp2(30), 10), verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) bandwidths = zeros(length(sizes)) @@ -91,7 +91,7 @@ function GPUInspector.memory_bandwidth_saxpy( dtype=Float32, cublas=true, verbose=true, - io::IO=stdout, + io=getstdout(), )::Float64 device!(device) do a = dtype(pi) @@ -136,7 +136,7 @@ function GPUInspector.memory_bandwidth_saxpy_scaling( device=CUDA.device(), sizes=[2^20 * i for i in 10:10:300], verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) # sizes = [2^20 * i for i in 8:128] # V100 diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl index 61cb1af..2734ecb 100644 --- a/ext/CUDAExt/implementations/p2p_bandwidth.jl +++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl @@ -9,7 +9,7 @@ function GPUInspector.p2p_bandwidth( dtype=Float32, src=0, dst=1, - io::IO=stdout, + io=getstdout(), ) if ngpus(NVIDIABackend()) < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -66,7 +66,7 @@ function GPUInspector.p2p_bandwidth( return bw_max end -function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) +function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...) ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -93,7 +93,7 @@ function GPUInspector.p2p_bandwidth_bidirectional( dev1=0, dev2=1, repeat=100, - io::IO=stdout, + io=getstdout(), ) if ngpus(NVIDIABackend()) < 2 error("At least 2 GPUs are needed for the P2P benchmark.") diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl index 6751cbf..99a117c 100644 --- a/ext/CUDAExt/implementations/peakflops_gpu.jl +++ b/ext/CUDAExt/implementations/peakflops_gpu.jl @@ -16,7 +16,7 @@ function GPUInspector.theoretical_peakflops_gpu( tensorcores=hastensorcores(), dtype=tensorcores ? Float16 : Float32, verbose=true, - io::IO=stdout, + io=getstdout(), ) if tensorcores max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype) @@ -109,7 +109,7 @@ function GPUInspector.peakflops_gpu( tensorcores=hastensorcores(), verbose=true, dtype=tensorcores ? Float16 : Float32, - io::IO=stdout, + io=getstdout(), kwargs..., ) if tensorcores diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl index 41288ea..9c14994 100644 --- a/ext/CUDAExt/implementations/stresstest.jl +++ b/ext/CUDAExt/implementations/stresstest.jl @@ -12,7 +12,7 @@ function GPUInspector.stresstest( clearmem=false, monitoring=false, batch_duration=nothing, - io::IO=stdout, + io=getstdout(), kwargs..., ) logger = ConsoleLogger(io) @@ -69,7 +69,7 @@ function GPUInspector.stresstest( Δt = @elapsed _run_stresstests(ts; verbose, kwargs...) 
if clearmem verbose && @info("Clearing GPU memory.") - clear_all_gpus_memory(devices) + GPUInspector.clear_all_gpus_memory(; devices=devices) end verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.") if monitoring diff --git a/ext/CUDAExt/peakflops_gpu_fmas.jl b/ext/CUDAExt/peakflops_gpu_fmas.jl index a251341..eb3ff6f 100644 --- a/ext/CUDAExt/peakflops_gpu_fmas.jl +++ b/ext/CUDAExt/peakflops_gpu_fmas.jl @@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(; nkernel=5, device::CuDevice=CUDA.device(), verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do d_a = CUDA.rand(dtype, size) diff --git a/ext/CUDAExt/peakflops_gpu_matmul.jl b/ext/CUDAExt/peakflops_gpu_matmul.jl index a081b69..93bf221 100644 --- a/ext/CUDAExt/peakflops_gpu_matmul.jl +++ b/ext/CUDAExt/peakflops_gpu_matmul.jl @@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling( device=CUDA.device(), verbose=true, sizes=2 .^ (10:15), - io::IO=stdout, + io=getstdout(), kwargs..., ) where {F} flops = zeros(length(sizes)) for (i, s) in enumerate(sizes) flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...) - clear_gpu_memory(device) + GPUInspector.clear_gpu_memory(; device=device) end if verbose peak_val, idx = findmax(flops) @@ -64,7 +64,7 @@ function peakflops_gpu_matmul(; nmatmuls=5, nbench=5, verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do C = CUDA.zeros(dtype, size, size) @@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(; nmatmuls=5, nbench=5, verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do C = CUDA.zeros(dtype, size, size) diff --git a/ext/CUDAExt/peakflops_gpu_wmmas.jl b/ext/CUDAExt/peakflops_gpu_wmmas.jl index a12295b..f6c000c 100644 --- a/ext/CUDAExt/peakflops_gpu_wmmas.jl +++ b/ext/CUDAExt/peakflops_gpu_wmmas.jl @@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(; nkernel=10, verbose=true, dtype=Float16, - io::IO=stdout, + io=getstdout(), ) device!(device) do if Symbol(dtype) == :Float16 diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 19ba114..7790536 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -16,6 +16,9 @@ using CpuId: cachesize using HDF5: h5open using Glob: glob +const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing) +getstdout() = something(DEFAULT_IO[], stdout) + include("backends.jl") include("UnitPrefixedBytes.jl") include("utility.jl") diff --git a/test/runtests.jl b/test/runtests.jl index 1707833..fb1f3f8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,12 @@ using GPUInspector using Test using LinearAlgebra +using Logging + +# Environment variables: +# - "TEST_BACKEND": can be set to manually specify a backend +# - "TEST_QUIET": can be set to true/false to enable/disable non-verbose testing +# - "TESTS": a comma separated list of test suites to run (see TEST_NAMES below) # figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA) if haskey(ENV, "TEST_BACKEND") @@ -34,7 +40,9 @@ end backend!(TEST_BACKEND) @info "Running tests with the following backend: $TEST_BACKEND." -const TEST_NAMES = ["bandwidth", "peakflops", "stresstest", "gpuinfo", "core"] +const TEST_NAMES = [ + "bandwidth", "peakflops", "stresstest", "gpuinfo", "utility", "backend_specific", "core" +] if haskey(ENV, "TESTS") tests = split(ENV["TESTS"], ",") if !all(t -> t in TEST_NAMES, tests) @@ -50,7 +58,6 @@ else end @info "Running following tests: $TARGET_TESTS." 
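# The `DEFAULT_IO`/`getstdout` hook added to src/GPUInspector.jl above is the basis
# for the quiet-testing setup below. A minimal sketch of the mechanism, using only
# names defined in this patch plus `devnull` from Base:
#
#     GPUInspector.DEFAULT_IO[] = devnull   # output routed via `io=getstdout()` is dropped
#     host2device_bandwidth()               # benchmark prints are now suppressed
#     GPUInspector.DEFAULT_IO[] = nothing   # restore printing to stdout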
- if "stresstest" in TARGET_TESTS # error if we aren't running with enough threads if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1) @@ -59,17 +66,35 @@ if "stresstest" in TARGET_TESTS end end +quiet_testing = parse(Bool, get(ENV, "TEST_QUIET", "true")) +if quiet_testing + GPUInspector.DEFAULT_IO[] = Base.BufferStream() + global_logger(Logging.NullLogger()) +end + +if "core" in TARGET_TESTS + include("tests_core.jl") +end +if "utility" in TARGET_TESTS + include("tests_utility.jl") +end if "gpuinfo" in TARGET_TESTS include("tests_gpuinfo.jl") end -# if "stresstest" in TARGET_TESTS -# include("tests_stresstest.jl") -# end -# if "peakflops" in TARGET_TESTS -# include("tests_peakflops.jl") -# end -# if "bandwidth" in TARGET_TESTS -# include("tests_bandwidth.jl") -# end -# include("tests_backend.jl") -# include("tests_utility.jl") +if "bandwidth" in TARGET_TESTS + include("tests_bandwidth.jl") +end +if "stresstest" in TARGET_TESTS + using CairoMakie + include("tests_stresstest.jl") +end +if "peakflops" in TARGET_TESTS + include("tests_peakflops.jl") +end +if "backend_specific" in TARGET_TESTS + if TEST_BACKEND == NVIDIABackend() + include("tests_nvidia_only.jl") + elseif TEST_BACKEND == AMDBackend() + include("tests_amd_only.jl") + end +end diff --git a/test/tests_amd_only.jl b/test/tests_amd_only.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/tests_backend.jl b/test/tests_backend.jl deleted file mode 100644 index c6bed8c..0000000 --- a/test/tests_backend.jl +++ /dev/null @@ -1,10 +0,0 @@ -@testset "Backend switching" begin - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_backend_loaded(NVIDIABackend()) - @test backend() == NVIDIABackend() - @test isnothing(backend!(NoBackend())) - @test backend() == NoBackend() - @test isnothing(backend!(:cuda)) - @test backend() == NVIDIABackend() -end diff --git a/test/tests_bandwidth.jl b/test/tests_bandwidth.jl index 9d174ae..7f81fc4 100644 --- a/test/tests_bandwidth.jl +++ b/test/tests_bandwidth.jl @@ -5,10 +5,8 @@ @test 0 ≤ p2p_bandwidth(; verbose=false) # options @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64 - dev_src = GPUInspector.devices()[1] - dev_dst = GPUInspector.devices()[2] - @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == - Float64 + dev_src, dev_dst = collect(GPUInspector.devices())[1:2] + @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; hist=true, verbose=true)) == Float64 @@ -25,7 +23,8 @@ @test typeof(p2p_bandwidth_bidirectional(; verbose=false)) == Float64 @test 0 ≤ p2p_bandwidth_bidirectional(; verbose=false) # options - @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == Float64 + @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == + Float64 @test typeof(p2p_bandwidth_bidirectional(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth_bidirectional(; nbench=10, verbose=false)) == Float64 @test typeof(p2p_bandwidth_bidirectional(; hist=true, verbose=true)) == Float64 diff --git a/test/tests_core.jl b/test/tests_core.jl index e69de29..c6bed8c 100644 --- a/test/tests_core.jl +++ b/test/tests_core.jl @@ -0,0 +1,10 @@ +@testset "Backend switching" begin + @test GPUInspector.is_cuda_loaded() + @test GPUInspector.is_cuda_loaded() + @test 
GPUInspector.is_backend_loaded(NVIDIABackend()) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:cuda)) + @test backend() == NVIDIABackend() +end diff --git a/test/tests_nvidia_only.jl b/test/tests_nvidia_only.jl new file mode 100644 index 0000000..27ffcd7 --- /dev/null +++ b/test/tests_nvidia_only.jl @@ -0,0 +1,11 @@ +@testset "toggle_tensorcoremath" begin + @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false)) + @test CUDA.math_mode() == CUDA.FAST_MATH + @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false)) + @test CUDA.math_mode() == CUDA.DEFAULT_MATH + # test toggle + @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) + @test CUDA.math_mode() == CUDA.FAST_MATH + @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) + @test CUDA.math_mode() == CUDA.DEFAULT_MATH +end diff --git a/test/tests_peakflops.jl b/test/tests_peakflops.jl index 9005fce..0d835e0 100644 --- a/test/tests_peakflops.jl +++ b/test/tests_peakflops.jl @@ -1,23 +1,28 @@ -@testitem "peakflops_gpu (CUDA cores)" begin - using CUDA - @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64 - @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) == - Float64 - @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) == - Float64 -end +if backend() == NVIDIABackend() + @testset "peakflops_gpu (CUDA cores)" begin + @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64 + @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) == + Float64 + @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) == + Float64 + end -@testitem "peakflops_gpu (Tensor cores)" begin - using CUDA - @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64 - @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == Float64 -end + @testset "peakflops_gpu (Tensor cores)" begin + @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64 + @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == + Float64 + end -@testitem "peakflops_gpu_matmul / scaling" begin - using CUDA - @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) == - Tuple{Vector{Int64},Vector{Float64}} + @testset "peakflops_gpu_matmul / scaling" begin + @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64 + @test typeof( + CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false) + ) == Float64 + @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == + Float64 + @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) == + Tuple{Vector{Int64},Vector{Float64}} + end +elseif backend() == AMDBackend() + # TODO end diff --git a/test/tests_stresstest.jl b/test/tests_stresstest.jl index 8b99fc0..32152b9 100644 --- a/test/tests_stresstest.jl +++ b/test/tests_stresstest.jl @@ -1,5 +1,4 @@ -@testitem "Stresstest: different kinds" begin - using CUDA +@testset "Stresstest: different kinds" begin @test isnothing(stresstest(; duration=2, verbose=false)) @test isnothing(stresstest(; enforced_duration=2, 
verbose=false)) @test isnothing(stresstest(; approx_duration=2, verbose=false)) @@ -8,10 +7,9 @@ @test isnothing(stresstest(; mem=0.2, verbose=false)) end -@testitem "Stresstest: keyword options" begin - using CUDA +@testset "Stresstest: keyword options" begin @test isnothing(stresstest(; duration=2, verbose=false)) - @test isnothing(stresstest(; duration=2, devices=devices(), verbose=false)) + @test isnothing(stresstest(; duration=2, devices=GPUInspector.devices(), verbose=false)) @test isnothing(stresstest(; duration=2, size=3000, verbose=false)) @test isnothing(stresstest(; duration=2, dtype=Float16, verbose=false)) @test isnothing(stresstest(; duration=2, clearmem=true, verbose=false)) @@ -19,15 +17,16 @@ end # TODO: kwargs: threads, parallel end -@testitem "Stresstest: monitoring" begin - using CUDA +@testset "Stresstest: monitoring" begin @testset "automatic (monitoring=true)" begin @test typeof( - stresstest(; devices=devices(), duration=2, verbose=false, monitoring=true) + stresstest(; + devices=GPUInspector.devices(), duration=2, verbose=false, monitoring=true + ), ) == MonitoringResults end @testset "manual" begin - devs = devices() + devs = GPUInspector.devices() @test isnothing(monitoring_start(; freq=1, devices=devs, verbose=false)) @test isnothing( stresstest(; devices=devs, duration=2, verbose=false, monitoring=false) @@ -41,8 +40,7 @@ end end end -@testitem "Stresstest: monitoring results" begin - using CUDA +@testset "Stresstest: monitoring results" begin @testset "MonitoringResults" begin r = stresstest(; duration=2, verbose=false, monitoring=true) @test typeof(r) == MonitoringResults @@ -53,12 +51,13 @@ end end @testset "save / load" begin d = Dict{Symbol,Vector{Vector{Float64}}}() - ndevs = length(CUDA.devices()) + ndevs = ngpus() d[:asd] = [rand(ndevs) for _ in 1:5] d[:qwe] = [rand(ndevs) for _ in 1:5] d[:jkl] = [rand(ndevs) for _ in 1:5] devices = Tuple{String,Base.UUID}[ - (CUDAExt._device2string(dev), uuid(dev)) for dev in collect(CUDA.devices()) + (CUDAExt._device2string(dev), uuid(dev)) for + dev in collect(GPUInspector.devices()) ] r = MonitoringResults(rand(5), devices, d) cd(mktempdir()) do @@ -74,8 +73,7 @@ end end end -@testitem "Stresstest: monitoring results (CairoMakie)" begin - using CairoMakie +@testset "Stresstest: monitoring results (CairoMakie)" begin r = load_monitoring_results(joinpath(@__DIR__, "test.h5")) @test isnothing(savefig_monitoring_results(r)) @test isnothing(savefig_monitoring_results(r, (:compute, :mem))) diff --git a/test/tests_utility.jl b/test/tests_utility.jl index 1bf5bd3..9ec5c5f 100644 --- a/test/tests_utility.jl +++ b/test/tests_utility.jl @@ -1,4 +1,4 @@ -@testitem "UnitPrefixedBytes" begin +@testset "UnitPrefixedBytes" begin using InteractiveUtils: subtypes # general stuff @@ -64,16 +64,3 @@ end @test B(40_000_000) + MB(3) - 2 * KiB(2) ≈ MB(42.995904) end - -@testitem "toggle_tensorcoremath" begin - using CUDA - @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false)) - @test CUDA.math_mode() == CUDA.FAST_MATH - @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false)) - @test CUDA.math_mode() == CUDA.DEFAULT_MATH - # test toggle - @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) - @test CUDA.math_mode() == CUDA.FAST_MATH - @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) - @test CUDA.math_mode() == CUDA.DEFAULT_MATH -end From b7dd03a7237b08ceb0eb69dfc0b2d96b468cd8ae Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Fri, 18 Aug 2023 20:07:26 +0200 Subject: [PATCH 4/5] AMD CI: 
 use Julia instead of JuliaHPC

---
 .gitlab-ci.yml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3becffc..5900b30 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,7 +5,8 @@ variables:
   JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
   JULIA_NUM_THREADS: "10"
   JULIA_EXCLUSIVE: "1"
-  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIAHPC_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIA_1_9: "lang/Julia/1.9.2-linux-x86_64"
 default:
   tags:
     - bauerc-noctua2
@@ -18,10 +19,10 @@ julia/1.9/NVIDIA:
       - "README.md"
     - when: on_success
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = true);'
     - julia --color=yes --project=test/coverage -e 'import Pkg; Pkg.instantiate()'
     - julia --color=yes --project=test/coverage test/coverage/coverage.jl
@@ -34,7 +35,7 @@ julia/1.9/AMD:
       - "README.md"
     - when: on_success
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:20:00 -A pc2-mitarbeiter -p hacc --exclusive"
   script:
     - /bin/bash -l
     - module load $JULIA_1_9
@@ -50,10 +51,10 @@ build-and-deploy-docs:
     - tags
     - external_pull_requests
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - cd docs
     - julia --color=yes build_docs.jl
   allow_failure: false

From c8bac39d3fdf47069be5cc1efcdc5119019a16e6 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Thu, 24 Aug 2023 14:48:41 +0200
Subject: [PATCH 5/5] membw saxpy etc

---
 Project.toml                           |   2 +-
 ext/AMDGPUExt/implementations/membw.jl | 163 ++++++++++++-------------
 test/tests_bandwidth.jl                |  29 +++--
 test/tests_core.jl                     |  27 ++--
 4 files changed, 116 insertions(+), 105 deletions(-)

diff --git a/Project.toml b/Project.toml
index c4a6b50..e478059 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,7 +29,7 @@ AMDGPUExt = "AMDGPU"
 CairoMakieExt = "CairoMakie"
 
 [compat]
-AMDGPU = "0.5"
+AMDGPU = "0.5.5"
 CUDA = "3.8.4, 3.12, 4.4"
 CairoMakie = "0.7, 0.10.7"
 CpuId = "0.3"
diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl
index d177993..b309e05 100644
--- a/ext/AMDGPUExt/implementations/membw.jl
+++ b/ext/AMDGPUExt/implementations/membw.jl
@@ -45,7 +45,7 @@ function GPUInspector.memory_bandwidth_scaling(
 )
     bandwidths = zeros(length(sizes))
     for (i, s) in enumerate(sizes)
-        bandwidths[i] = memory_bandwidth(
+        bandwidths[i] = GPUInspector.memory_bandwidth(
             AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs...
         )
         clear_gpu_memory(AMDBackend(); device=device)
@@ -71,93 +71,82 @@ function GPUInspector.memory_bandwidth_scaling(
     return (sizes=sizes, bandwidths=bandwidths)
 end
 
-# """
-# Extra keyword arguments:
-# * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`.
-
-# (This method is from the NVIDIA Backend.)
-# """ -# function memory_bandwidth_saxpy( -# ::NVIDIABackend; -# device=CUDA.device(), -# size=2^20 * 10, -# nbench=10, -# dtype=Float32, -# cublas=true, -# verbose=true, -# io=getstdout(), -# )::Float64 -# device!(device) do -# a = dtype(pi) -# x = CUDA.rand(dtype, size) -# y = CUDA.rand(dtype, size) -# z = CUDA.zeros(dtype, size) +function GPUInspector.memory_bandwidth_saxpy( + ::AMDBackend; + device=AMDGPU.device(), + size=2^26, + nbench=10, + dtype=Float32, + verbose=true, + io=getstdout(), +)::Float64 + device!(device) do + a = dtype(pi) + x = AMDGPU.rand(dtype, size) + y = AMDGPU.rand(dtype, size) + z = AMDGPU.zeros(dtype, size) -# nthreads = CUDA.attribute(device, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) -# nblocks = cld(size, nthreads) -# t = Inf -# for _ in 1:nbench -# if cublas -# Δt = CUDA.@elapsed CUBLAS.axpy!(size, a, x, y) -# else -# Δt = CUDA.@elapsed @cuda( -# threads = nthreads, blocks = nblocks, _saxpy_gpu_kernel!(z, a, x, y) -# ) -# end -# t = min(t, Δt) -# end + kernel = @roc launch = false _saxpy_gpu_kernel!(z, a, x, y) + occupancy = AMDGPU.launch_configuration(kernel) + t = Inf + for _ in 1:nbench + Δt = AMDGPU.@elapsed @roc( + groupsize = occupancy.groupsize, _saxpy_gpu_kernel!(z, a, x, y) + ) + t = min(t, Δt) + end -# bandwidth = 3.0 * sizeof(dtype) * size * (1024)^(-3) / t -# if verbose -# printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) -# print(io, " └ max: ") -# printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) -# end -# return bandwidth -# end -# end + bandwidth = 3.0 * sizeof(dtype) * size / t / (1024)^3 + if verbose + printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) + print(io, " └ max: ") + printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) + end + return bandwidth + end +end -# function _saxpy_gpu_kernel!(z, a, x, y) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# if i <= length(z) -# @inbounds z[i] = a * x[i] + y[i] -# end -# return nothing -# end +function _saxpy_gpu_kernel!(z, a, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i <= length(z) + @inbounds z[i] = a * x[i] + y[i] + end + return nothing +end -# function memory_bandwidth_saxpy_scaling( -# ::NVIDIABackend; -# device=CUDA.device(), -# sizes=[2^20 * i for i in 10:10:300], -# verbose=true, -# io=getstdout(), -# kwargs..., -# ) -# # sizes = [2^20 * i for i in 8:128] # V100 -# bandwidths = zeros(length(sizes)) -# for (i, s) in enumerate(sizes) -# bandwidths[i] = memory_bandwidth_saxpy( -# NVIDIABackend(); device=device, size=s, verbose=false, kwargs... 
-# ) -# clear_gpu_memory(AMDBackend(); device=device) -# end -# if verbose -# peak_val, idx = findmax(bandwidths) -# peak_size = sizes[idx] -# p = UnicodePlots.lineplot( -# sizes, -# bandwidths; -# xlabel="vector length", -# ylabel="GiB/s", -# title=string( -# "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" -# ), -# xscale=:log2, -# ) -# UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) -# println(io) # top margin -# println(io, p) -# println(io) # bottom margin -# end -# return (sizes=sizes, bandwidths=bandwidths) -# end +function GPUInspector.memory_bandwidth_saxpy_scaling( + ::AMDBackend; + device=AMDGPU.device(), + sizes=[2^20 * i for i in 10:10:300], + verbose=true, + io=getstdout(), + kwargs..., +) + # sizes = [2^20 * i for i in 8:128] # V100 + bandwidths = zeros(length(sizes)) + for (i, s) in enumerate(sizes) + bandwidths[i] = GPUInspector.memory_bandwidth_saxpy( + AMDBackend(); device=device, size=s, verbose=false, kwargs... + ) + clear_gpu_memory(AMDBackend(); device=device) + end + if verbose + peak_val, idx = findmax(bandwidths) + peak_size = sizes[idx] + p = UnicodePlots.lineplot( + sizes, + bandwidths; + xlabel="vector length", + ylabel="GiB/s", + title=string( + "Peak: ", round(peak_val; digits=2), " GiB/s (vector size = $(bytes(peak_size)))" + ), + xscale=:log2, + ) + UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) + println(io) # top margin + println(io, p) + println(io) # bottom margin + end + return (sizes=sizes, bandwidths=bandwidths) +end diff --git a/test/tests_bandwidth.jl b/test/tests_bandwidth.jl index 7f81fc4..6626617 100644 --- a/test/tests_bandwidth.jl +++ b/test/tests_bandwidth.jl @@ -41,16 +41,27 @@ end @testset "host2device_bandwidth" begin @test isnothing(host2device_bandwidth()) - @test isnothing(host2device_bandwidth(; memsize=MB(100))) - @test isnothing(host2device_bandwidth(; dtype=Float16)) + @test isnothing(host2device_bandwidth(; memsize=MB(1))) + @test isnothing(host2device_bandwidth(; dtype=Float64)) end @testset "memory_bandwidth" begin - @test typeof(memory_bandwidth()) == Float64 - @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64 - @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 - - @test typeof(memory_bandwidth_saxpy()) == Float64 - @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64 - @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64 + @testset "regular" begin + @test typeof(memory_bandwidth()) == Float64 + @test typeof(memory_bandwidth(; memsize=MiB(1))) == Float64 + @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 + end + @testset "regular, scaling" begin + @test typeof(memory_bandwidth_scaling()) == + NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Float64},Vector{Float64}}} + end + @testset "saxpy" begin + @test typeof(memory_bandwidth_saxpy()) == Float64 + @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64 + @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64 + end + @testset "saxpy, scaling" begin + @test typeof(memory_bandwidth_saxpy_scaling()) == + NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Int64},Vector{Float64}}} + end end diff --git a/test/tests_core.jl b/test/tests_core.jl index c6bed8c..1db7309 100644 --- a/test/tests_core.jl +++ b/test/tests_core.jl @@ -1,10 +1,21 @@ @testset "Backend switching" begin - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_backend_loaded(NVIDIABackend()) - @test backend() == 
NVIDIABackend() - @test isnothing(backend!(NoBackend())) - @test backend() == NoBackend() - @test isnothing(backend!(:cuda)) - @test backend() == NVIDIABackend() + if TEST_BACKEND == NVIDIABackend() + @test GPUInspector.is_cuda_loaded() + @test GPUInspector.is_backend_loaded(NVIDIABackend()) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:cuda)) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NVIDIABackend())) + elseif TEST_BACKEND == AMDBackend() + @test GPUInspector.is_amdgpu_loaded() + @test GPUInspector.is_backend_loaded(AMDBackend()) + @test backend() == AMDBackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:amd)) + @test backend() == AMDBackend() + @test isnothing(backend!(AMDBackend())) + end end
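To close the loop on what the series enables, here is a minimal sketch of a session on an AMD node, exercising only functionality that this series implements for `AMDBackend` (the sizes are illustrative; loading AMDGPU.jl activates the `AMDGPUExt` extension and selects the backend):

    using GPUInspector
    using AMDGPU                          # backend() is now AMDBackend()

    gpuinfo()                             # HIP properties of the current device
    memory_bandwidth(; memsize=MiB(512))  # memcpy-based bandwidth in GiB/s
    memory_bandwidth_saxpy(; size=2^26)   # kernel reads x and y and writes z,
                                          # hence the factor 3 in the GiB/s formula
    memory_bandwidth_saxpy_scaling()      # sweep over vector lengths (UnicodePlots)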