From e7be3edf97aeb799f4564e3252ae9e72b88694e5 Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Wed, 16 Aug 2023 15:46:07 +0200 Subject: [PATCH 1/5] initial attempt --- Project.toml | 6 +- ext/AMDGPUExt/AMDGPUExt.jl | 106 +++++++ ext/AMDGPUExt/implementations/general.jl | 18 ++ ext/AMDGPUExt/implementations/gpuinfo.jl | 261 ++++++++++++++++++ .../implementations/host2device_bandwidth.jl | 85 ++++++ ext/AMDGPUExt/implementations/membw.jl | 163 +++++++++++ ext/CUDAExt/implementations/general.jl | 8 + ext/CUDAExt/implementations/gpuinfo.jl | 4 +- ext/CUDAExt/implementations/membw.jl | 4 +- ext/CUDAExt/utility.jl | 20 -- src/GPUInspector.jl | 9 +- src/stubs/stubs_general.jl | 8 + src/utility.jl | 9 + 13 files changed, 674 insertions(+), 27 deletions(-) create mode 100644 ext/AMDGPUExt/AMDGPUExt.jl create mode 100644 ext/AMDGPUExt/implementations/general.jl create mode 100644 ext/AMDGPUExt/implementations/gpuinfo.jl create mode 100644 ext/AMDGPUExt/implementations/host2device_bandwidth.jl create mode 100644 ext/AMDGPUExt/implementations/membw.jl diff --git a/Project.toml b/Project.toml index 053af89..dfbf455 100644 --- a/Project.toml +++ b/Project.toml @@ -21,13 +21,16 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" [extensions] CUDAExt = "CUDA" +AMDGPUExt = "AMDGPU" CairoMakieExt = "CairoMakie" [compat] +AMDGPU = "0.5" CUDA = "3.8.4, 3.12, 4.4" CairoMakie = "0.7, 0.10.7" CpuId = "0.3" @@ -43,10 +46,11 @@ julia = "1.9" [extras] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [targets] -test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "TestItemRunner"] +test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU", "TestItemRunner"] diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl new file mode 100644 index 0000000..d141faf --- /dev/null +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -0,0 +1,106 @@ +module AMDGPUExt + +using GPUInspector +using AMDGPU +using AMDGPU: device, device!, devices + +# stdlibs etc. 
+using Base: UUID
+using Statistics
+using Logging
+using LinearAlgebra
+
+# pkgs
+using UnicodePlots
+
+# for usage in AMDGPUExt
+using GPUInspector:
+    logspace,
+    ismonitoring,
+    _monitoring!,
+    _set_monitoring_task,
+    _get_monitoring_task,
+    MonitoringResults,
+    _defaultylims,
+    @unroll,
+    AMDBackend
+
+# import stubs to implement them
+import GPUInspector: backendinfo, functional, clear_gpu_memory
+# gpuinfo
+import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
+# p2p bw
+import GPUInspector:
+    p2p_bandwidth,
+    p2p_bandwidth_all,
+    p2p_bandwidth_bidirectional,
+    p2p_bandwidth_bidirectional_all
+# host2device bw
+import GPUInspector: host2device_bandwidth
+# membw
+import GPUInspector:
+    theoretical_memory_bandwidth,
+    memory_bandwidth,
+    memory_bandwidth_scaling,
+    memory_bandwidth_saxpy,
+    memory_bandwidth_saxpy_scaling
+# stresstest
+import GPUInspector: stresstest
+# monitoring
+import GPUInspector:
+    monitoring_start,
+    monitoring_stop,
+    livemonitor_something,
+    livemonitor_powerusage,
+    livemonitor_temperature
+# peakflops_gpu
+import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu
+
+# include("cuda_wrappers.jl")
+# include("utility.jl")
+# include("stresstests.jl")
+# include("peakflops_gpu_fmas.jl")
+# include("peakflops_gpu_wmmas.jl")
+# include("peakflops_gpu_matmul.jl")
+include("implementations/general.jl")
+include("implementations/gpuinfo.jl")
+# include("implementations/p2p_bandwidth.jl")
+include("implementations/host2device_bandwidth.jl")
+include("implementations/membw.jl")
+# include("implementations/stresstest.jl")
+# include("implementations/monitoring.jl")
+# include("implementations/peakflops_gpu.jl")
+
+function __init__()
+    GPUInspector.AMDGPUJL_LOADED[] = true
+    GPUInspector.backend!(AMDBackend())
+    GPUInspector.AMDGPUExt = Base.get_extension(GPUInspector, :AMDGPUExt)
+    return nothing
+end
+
+function backendinfo(::AMDBackend)
+    # somewhat crude way to figure out which API functions are implemented :)
+    funcs = String[]
+    impl_dir = joinpath(@__DIR__, "implementations/")
+    for f in readdir(impl_dir)
+        lines = readlines(joinpath(impl_dir, f))
+        func_lines = filter(startswith("function"), lines)
+        for fl in func_lines
+            fname = strip(split(split(fl, "function")[2], "(")[1])
+            if startswith(fname, "_") || startswith(fname, "Base")
+                continue
+            end
+            if fname in funcs # avoid duplicates
+                continue
+            end
+            push!(funcs, fname)
+        end
+    end
+    println("Implemented API functions for AMDBackend:")
+    for f in funcs
+        println("\t", f)
+    end
+    return nothing
+end
+
+end # module
diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl
new file mode 100644
index 0000000..7ca7b14
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/general.jl
@@ -0,0 +1,18 @@
+function functional(::AMDBackend; verbose=true)
+    if AMDGPU.functional()
+        verbose && @info("AMDGPU.jl is functional.")
+        working = true
+    else
+        verbose && @info("AMDGPU.jl is not functional.")
+        working = false
+    end
+    return working
+end
+
+function clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        AMDGPU.HIP.reclaim()
+    end
+    return nothing
+end
diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl
new file mode 100644
index 0000000..92db2e2
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -0,0 +1,261 @@
+function ngpus(::AMDBackend)
+    length(AMDGPU.devices())
+end
+
+function gpus(::AMDBackend; io::IO=stdout)
+    # Based on
https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 + devs = AMDGPU.devices() + if isempty(devs) + println(io, "No AMD devices found.") + elseif length(devs) == 1 + println(io, "1 device:") + else + println(io, length(devs), " devices:") + end + for (i, dev) in enumerate(devs) + mem_free, mem_tot = AMDGPU.device!(dev) do + AMDGPU.Runtime.Mem.info() + end + println( + io, + " $(i-1): ", repr(dev), " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", + ) + end +end + +# """ +# gpuinfo(deviceid::Integer) + +# Print out detailed information about the NVIDIA GPU with the given `deviceid`. + +# Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". + +# (This method is from the CUDA backend.) +# """ +# function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +# 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) +# return gpuinfo(CuDevice(deviceid); io) +# end +# function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +# # query +# mp = nmultiprocessors(dev) +# cores = ncudacores(dev) +# max_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) ÷ 1000 +# mem_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) ÷ 1000 +# mem_bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) +# l2cachesize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE) +# maxTex1D = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH) +# maxTex2D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH) +# maxTex2D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT) +# maxTex3D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH) +# maxTex3D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT) +# maxTex3D_depth = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH) +# maxTex1DLayered_width = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH +# ) +# maxTex1DLayered_layers = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS +# ) +# maxTex2DLayered_width = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH +# ) +# maxTex2DLayered_height = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT +# ) +# maxTex2DLayered_layers = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS +# ) +# total_constant_mem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY) +# shared_mem_per_block = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK +# ) +# regs_per_block = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK) +# warpsize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE) +# max_threads_per_mp = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR +# ) +# max_threads_per_block = CUDA.attribute( +# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK +# ) +# blockdim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X) +# blockdim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y) +# blockdim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z) +# griddim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X) +# griddim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) 
+# griddim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) +# texture_align = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT) +# max_mem_pitch = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_PITCH) +# async_engine_count = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) +# gpu_overlap = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)) +# kernel_exec_timeout_enabled = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT) +# ) +# integrated = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_INTEGRATED)) +# can_map_host_mem = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY) +# ) +# concurrent_kernels = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS) +# ) +# surface_alignment = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT) > 0 +# ecc_enabled = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ECC_ENABLED)) +# unified_addressing = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) +# ) +# managed_memory = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) +# compute_preemption = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) +# ) +# cooperative_launch = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH) +# ) +# cooperative_multi_dev_launch = Bool( +# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH) +# ) +# pci_domainid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID) +# pci_busid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID) +# pci_deviceid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) +# compute_mode = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) +# comp_modes = [ +# "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", +# "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", +# "Prohibited (no host thread can use ::cudaSetDevice() with this device)", +# "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", +# "Unknown", +# ] + +# # printing +# println(io, "Device: ", name(dev), " ($dev)") +# println( +# io, "Total amount of global memory: ", Base.format_bytes(Int(CUDA.totalmem(dev))) +# ) +# println(io, "Number of CUDA cores: ", cores) +# println(io, "Number of multiprocessors: ", mp, " ($(cores ÷ mp) CUDA cores each)") +# println(io, "GPU max. clock rate: ", max_clock_rate, " MHz") +# println(io, "Memory clock rate: ", mem_clock_rate, " MHz") +# println(io, "Memory bus width: ", mem_bus_width, "-bit") +# println(io, "L2 cache size: ", Base.format_bytes(l2cachesize)) +# println(io, "Max. texture dimension sizes (1D): $maxTex1D") +# println(io, "Max. texture dimension sizes (2D): $maxTex2D_width, $maxTex2D_height") +# println( +# io, +# "Max. texture dimension sizes (3D): $maxTex3D_width, $maxTex3D_height, $maxTex3D_depth", +# ) +# println( +# io, +# "Max. layered 1D texture size: $(maxTex1DLayered_width) ($(maxTex1DLayered_layers) layers)", +# ) +# println( +# io, +# "Max. 
layered 2D texture size: $(maxTex2DLayered_width), $(maxTex2DLayered_height) ($(maxTex2DLayered_layers) layers)", +# ) +# println(io, "Total amount of constant memory: ", Base.format_bytes(total_constant_mem)) +# println( +# io, +# "Total amount of shared memory per block: ", +# Base.format_bytes(shared_mem_per_block), +# ) +# println(io, "Total number of registers available per block: ", regs_per_block) +# println(io, "Warp size: ", warpsize) +# println(io, "Max. number of threads per multiprocessor: ", max_threads_per_mp) +# println(io, "Max. number of threads per block: ", max_threads_per_block) +# println( +# io, +# "Max. dimension size of a thread block (x,y,z): $(blockdim_x), $(blockdim_y), $(blockdim_z)", +# ) +# println( +# io, +# "Max. dimension size of a grid size (x,y,z): $(griddim_x), $(griddim_y), $(griddim_z)", +# ) +# println(io, "Texture alignment: ", Base.format_bytes(texture_align)) +# println(io, "Maximum memory pitch: ", Base.format_bytes(max_mem_pitch)) +# println( +# io, +# "Concurrent copy and kernel execution: ", +# gpu_overlap ? "Yes" : "No", +# " with $(async_engine_count) copy engine(s)", +# ) +# println(io, "Run time limit on kernels: ", kernel_exec_timeout_enabled ? "Yes" : "No") +# println(io, "Integrated GPU sharing host memory: ", integrated ? "Yes" : "No") +# println( +# io, "Support host page-locked memory mapping: ", can_map_host_mem ? "Yes" : "No" +# ) +# println(io, "Concurrent kernel execution: ", concurrent_kernels ? "Yes" : "No") +# println(io, "Alignment requirement for surfaces: ", surface_alignment ? "Yes" : "No") +# println(io, "Device has ECC support: ", ecc_enabled ? "Yes" : "No") +# println( +# io, "Device supports Unified Addressing (UVA): ", unified_addressing ? "Yes" : "No" +# ) +# println(io, "Device supports managed memory: ", managed_memory ? "Yes" : "No") +# println(io, "Device supports compute preemption: ", compute_preemption ? "Yes" : "No") +# println(io, "Supports cooperative kernel launch: ", cooperative_launch ? "Yes" : "No") +# println( +# io, +# "Supports multi-device co-op kernel launch: ", +# cooperative_multi_dev_launch ? 
"Yes" : "No", +# ) +# println( +# io, +# "Device PCI domain ID / bus ID / device ID: $(pci_domainid) / $(pci_busid) / $(pci_deviceid)", +# ) +# println(io, "Compute mode: ", comp_modes[compute_mode + 1]) + +# return nothing +# end + +# function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +# # check p2p access +# ndevs = ngpus(NVIDIABackend()) +# if ndevs <= 1 +# error("Only a single GPU available.") +# else +# mat_p2p_access_supported = Matrix{Bool}(undef, ndevs, ndevs) +# mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) +# mat_p2p_atomic_supported = Matrix{Bool}(undef, ndevs, ndevs) +# for i in 1:ndevs +# dev_i = CuDevice(i - 1) +# for j in 1:ndevs +# dev_j = CuDevice(j - 1) +# if i != j +# p2p_access_supported = Bool( +# CUDA.p2p_attribute( +# dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED +# ), +# ) +# p2p_can_access = Bool(CUDA.can_access_peer(dev_i, dev_j)) +# p2p_atomic_supported = Bool( +# CUDA.p2p_attribute( +# dev_i, +# dev_j, +# CUDA.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED, +# ), +# ) +# mat_p2p_atomic_supported[i, j] = p2p_atomic_supported +# mat_p2p_access_supported[i, j] = p2p_access_supported +# mat_p2p_can_access[i, j] = p2p_can_access +# # p2p_performance_rank = CUDA.p2p_attribute(dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK) +# else +# mat_p2p_atomic_supported[i, i] = false +# mat_p2p_access_supported[i, i] = false +# mat_p2p_can_access[i, j] = false +# end +# end +# end + +# printstyled(io, "P2P Access Supported:\n"; bold=true) +# show(io, "text/plain", mat_p2p_access_supported) +# println(io) +# println(io) +# if mat_p2p_access_supported != mat_p2p_can_access +# printstyled(io, "P2P Can Access:\n"; bold=true) +# show(io, "text/plain", mat_p2p_can_access) +# println(io) +# println(io) +# end +# printstyled(io, "P2P Atomic Supported:\n"; bold=true) +# show(io, "text/plain", mat_p2p_atomic_supported) +# println(io) +# println(io) +# end +# return nothing +# end diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl new file mode 100644 index 0000000..b32a534 --- /dev/null +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -0,0 +1,85 @@ +function host2device_bandwidth( + ::AMDBackend; + memsize::UnitPrefixedBytes=GiB(0.5), + dtype=Cchar, + DtoDfactor=true, + verbose=true, + io::IO=stdout, + kwargs..., +) + N = Int(bytes(memsize) ÷ sizeof(dtype)) + mem_host = rand(dtype, N) + # mem_host_pinned = Mem.pin(rand(dtype, N)) # TODO + mem_gpu = AMDGPU.rand(dtype, N) + + _perform_memcpy(mem_host, mem_gpu; title="Host <-> Device", verbose, io=io, kwargs...) + verbose && println(io) + # _perform_memcpy( + # mem_host_pinned, + # mem_gpu; + # title="Host (pinned) <-> Device", + # verbose, + # io=io, + # kwargs..., + # ) + # verbose && println() + # _perform_memcpy(mem_gpu, mem_gpu2; title="Device <-> Device (same device)", DtoDfactor, verbose, kwargs...) 
+ return nothing +end + +function _perform_memcpy( + mem1, + mem2; + title="", + nbench=10, + times=false, + stats=false, + DtoDfactor=false, + verbose=true, + io::IO=stdout, +) + sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") + ts = zeros(nbench) + + @inbounds for i in 1:nbench + if i % 2 == 0 + ts[i] = AMDGPU.@elapsed copyto!(mem1, mem2) + else + ts[i] = AMDGPU.@elapsed copyto!(mem2, mem1) + end + end + + t_min = minimum(ts) + t_max = maximum(ts) + t_avg = mean(ts) + + actual_memsize_GiB = sizeof(mem1) * 2^(-30) + if DtoDfactor + actual_memsize_GiB *= 2 # must count both the read and the write here (taken from p2pBandwidthLatencyTest cuda sample....) + end + bws = actual_memsize_GiB ./ ts + bw_min = minimum(bws) + bw_max = maximum(bws) + bw_avg = mean(bws) + + if verbose + if times + println(io, "t_min: $t_min") + println(io, "t_max: $t_max") + println(io, "t_avg: $t_avg") + end + printstyled(io, "$(title) Bandwidth (GiB/s):\n"; bold=true) + if stats + print(io, " ├ max: ") + printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true) + println(io, " ├ min: ", round(bw_min; digits=2)) + println(io, " ├ avg: ", round(bw_avg; digits=2)) + print(io, " └ std_dev: ") + printstyled(io, round(std(bws); digits=2), "\n"; color=:yellow, bold=true) + else + print(io, " └ max: ") + printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true) + end + end + return bw_max +end diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl new file mode 100644 index 0000000..e43aefc --- /dev/null +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -0,0 +1,163 @@ +# function theoretical_memory_bandwidth( +# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout +# ) +# max_mem_clock_rate = +# CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz +# max_mem_bus_width = +# CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) / 8.0 # in bytes +# max_bw = 2.0 * max_mem_clock_rate * max_mem_bus_width * 2^(-30) +# if verbose +# printstyled(io, "Theoretical Maximal Memory Bandwidth (GiB/s):\n"; bold=true) +# print(io, " └ max: ") +# printstyled(io, round(max_bw; digits=1), "\n"; color=:green, bold=true) +# end +# return max_bw +# end + +function memory_bandwidth( + ::AMDBackend; + memsize::UnitPrefixedBytes=GiB(0.5), + dtype=Cchar, + verbose=true, + DtoDfactor=true, + device=AMDGPU.device(), + io::IO=stdout, + kwargs..., +)::Float64 + AMDGPU.device!(device) do + N = Int(bytes(memsize) ÷ sizeof(dtype)) + mem_gpu = AMDGPU.rand(dtype, N) + mem_gpu2 = AMDGPU.rand(dtype, N) + + return _perform_memcpy( + mem_gpu, mem_gpu2; title="Memory", DtoDfactor, verbose, io=io, kwargs... + ) + end +end + +function memory_bandwidth_scaling( + ::AMDBackend; + device=AMDGPU.device(), + sizes=logspace(1, exp2(30), 10), + verbose=true, + io::IO=stdout, + kwargs..., +) + bandwidths = zeros(length(sizes)) + for (i, s) in enumerate(sizes) + bandwidths[i] = memory_bandwidth( + AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs... 
+ ) + clear_gpu_memory(AMDBackend(); device=device) + end + if verbose + peak_val, idx = findmax(bandwidths) + peak_size = sizes[idx] + p = UnicodePlots.lineplot( + sizes, + bandwidths; + xlabel="data size", + ylabel="GiB/s", + title=string( + "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" + ), + xscale=:log2, + ) + UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) + println(io) # top margin + println(io, p) + println(io) # bottom margin + end + return (sizes=sizes, bandwidths=bandwidths) +end + +# """ +# Extra keyword arguments: +# * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. + +# (This method is from the CUDA backend.) +# """ +# function memory_bandwidth_saxpy( +# ::NVIDIABackend; +# device=CUDA.device(), +# size=2^20 * 10, +# nbench=10, +# dtype=Float32, +# cublas=true, +# verbose=true, +# io::IO=stdout, +# )::Float64 +# device!(device) do +# a = dtype(pi) +# x = CUDA.rand(dtype, size) +# y = CUDA.rand(dtype, size) +# z = CUDA.zeros(dtype, size) + +# nthreads = CUDA.attribute(device, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) +# nblocks = cld(size, nthreads) +# t = Inf +# for _ in 1:nbench +# if cublas +# Δt = CUDA.@elapsed CUBLAS.axpy!(size, a, x, y) +# else +# Δt = CUDA.@elapsed @cuda( +# threads = nthreads, blocks = nblocks, _saxpy_gpu_kernel!(z, a, x, y) +# ) +# end +# t = min(t, Δt) +# end + +# bandwidth = 3.0 * sizeof(dtype) * size * (1024)^(-3) / t +# if verbose +# printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) +# print(io, " └ max: ") +# printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) +# end +# return bandwidth +# end +# end + +# function _saxpy_gpu_kernel!(z, a, x, y) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# if i <= length(z) +# @inbounds z[i] = a * x[i] + y[i] +# end +# return nothing +# end + +# function memory_bandwidth_saxpy_scaling( +# ::NVIDIABackend; +# device=CUDA.device(), +# sizes=[2^20 * i for i in 10:10:300], +# verbose=true, +# io::IO=stdout, +# kwargs..., +# ) +# # sizes = [2^20 * i for i in 8:128] # V100 +# bandwidths = zeros(length(sizes)) +# for (i, s) in enumerate(sizes) +# bandwidths[i] = memory_bandwidth_saxpy( +# NVIDIABackend(); device=device, size=s, verbose=false, kwargs... 
+# ) +# clear_gpu_memory(AMDBackend(); device=device) +# end +# if verbose +# peak_val, idx = findmax(bandwidths) +# peak_size = sizes[idx] +# p = UnicodePlots.lineplot( +# sizes, +# bandwidths; +# xlabel="vector length", +# ylabel="GiB/s", +# title=string( +# "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" +# ), +# xscale=:log2, +# ) +# UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) +# println(io) # top margin +# println(io, p) +# println(io) # bottom margin +# end +# return (sizes=sizes, bandwidths=bandwidths) +# end diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl index b44e5fd..ac4b4b3 100644 --- a/ext/CUDAExt/implementations/general.jl +++ b/ext/CUDAExt/implementations/general.jl @@ -21,3 +21,11 @@ function functional(::NVIDIABackend; verbose=true) end return hascuda end + +function clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) + device!(device) do + gc && GC.gc() + CUDA.reclaim() + end + return nothing +end diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index b960b08..7fbc0ce 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -1,4 +1,6 @@ -ngpus(::NVIDIABackend) = length(CUDA.devices()) +function ngpus(::NVIDIABackend) + length(CUDA.devices()) +end function gpus(::NVIDIABackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index c0b68f1..003f106 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -54,7 +54,7 @@ function memory_bandwidth_scaling( bandwidths[i] = memory_bandwidth( NVIDIABackend(); memsize=B(s), device=device, verbose=false, kwargs... ) - clear_gpu_memory(device) + clear_gpu_memory(NVIDIABackend(); device=device) end if verbose peak_val, idx = findmax(bandwidths) @@ -145,7 +145,7 @@ function memory_bandwidth_saxpy_scaling( bandwidths[i] = memory_bandwidth_saxpy( NVIDIABackend(); device=device, size=s, verbose=false, kwargs... ) - clear_gpu_memory(device) + clear_gpu_memory(NVIDIABackend(); device=device) end if verbose peak_val, idx = findmax(bandwidths) diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl index 7f6a504..3c9681d 100644 --- a/ext/CUDAExt/utility.jl +++ b/ext/CUDAExt/utility.jl @@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa return mem_handles end -# TODO: Maybe make API/stub? -"Reclaim the unused memory of the currently active GPU (i.e. `device()`)." -function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true) - device!(device) do - gc && GC.gc() - CUDA.reclaim() - end - return nothing -end - -# TODO: Maybe make API/stub? -"Reclaim the unused memory of all available GPUs." -function clear_all_gpus_memory(devices=CUDA.devices(); gc=true) - gc && GC.gc() - for dev in devices - clear_gpu_memory(dev; gc=false) - end - return nothing -end - """ toggle_tensorcoremath([enable::Bool; verbose=true]) Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`). diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 9dbe9e9..fe4f641 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -27,8 +27,8 @@ include("monitoring_io.jl") function not_implemented_yet() return error( - "Not implemented yet. 
You either haven't loaded a backend (like CUDA.jl) yet, or" *
-        " the loaded backend doesn't provide this functionality.",
+        "Not implemented yet. You either haven't loaded a backend yet (e.g. CUDA.jl or " *
+        "AMDGPU.jl), or the loaded backend doesn't provide this functionality.",
     )
 end
 include("stubs/stubs_general.jl")
@@ -50,11 +50,14 @@ export plot_monitoring_results, load_monitoring_results, save_monitoring_results
 
 # utilities
 export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, bytes, simplify, change_base, value
-export logspace
+export logspace, clear_all_gpus_memory
 
 # Let's currently not export the CPU tests. After all, this is GPUInspector.jl :)
 # export stresstest_cpu
 
+# stubs general
+export clear_gpu_memory
+
 # stubs gpuinfo
 export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
 # stubs p2p bandwidth
diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl
index eee42ed..57b7136 100644
--- a/src/stubs/stubs_general.jl
+++ b/src/stubs/stubs_general.jl
@@ -4,3 +4,11 @@ If not, print some hopefully useful debug information (or turn it off with `verb
 """
 functional(; kwargs...) = functional(backend(); kwargs...)
 functional(::Backend; kwargs...) = not_implemented_yet()
+
+"""
+    clear_gpu_memory(; device, gc)
+
+Reclaim the unused memory of a GPU (default: the currently active device).
+"""
+clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...)
+clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet()
diff --git a/src/utility.jl b/src/utility.jl
index bba0d55..0b38851 100644
--- a/src/utility.jl
+++ b/src/utility.jl
@@ -2,6 +2,15 @@ function logspace(start, stop, length)
     return exp2.(range(log2(start), log2(stop); length=length))
 end
 
+"Reclaim the unused memory of all available GPUs."
+function clear_all_gpus_memory(; gc=true, devices)
+    gc && GC.gc()
+    for dev in devices
+        clear_gpu_memory(; device=dev, gc=false)
+    end
+    return nothing
+end
+
 # L2_cachesize() = cachesize()[2]
 
 # """

From 375c6e45b27e1be3f9a8e1a489efea43b4cc6b90 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Thu, 17 Aug 2023 15:39:20 +0200
Subject: [PATCH 2/5] CI restructuring; gpuinfo_p2p_access etc

---
 Project.toml                                  |   6 +-
 ext/AMDGPUExt/AMDGPUExt.jl                    |  35 +--
 ext/AMDGPUExt/implementations/general.jl      |   7 +-
 ext/AMDGPUExt/implementations/gpuinfo.jl      | 281 +++---
 .../implementations/host2device_bandwidth.jl  |   2 +-
 ext/AMDGPUExt/implementations/membw.jl        |   6 +-
 ext/AMDGPUExt/utility.jl                      |   5 +
 ext/CUDAExt/CUDAExt.jl                        |  32 +-
 ext/CUDAExt/implementations/general.jl        |   7 +-
 ext/CUDAExt/implementations/gpuinfo.jl        |  12 +-
 .../implementations/host2device_bandwidth.jl  |   2 +-
 ext/CUDAExt/implementations/membw.jl          |  12 +-
 ext/CUDAExt/implementations/monitoring.jl     |  12 +-
 ext/CUDAExt/implementations/p2p_bandwidth.jl  |   8 +-
 ext/CUDAExt/implementations/peakflops_gpu.jl  |   6 +-
 ext/CUDAExt/implementations/stresstest.jl     |   2 +-
 src/GPUInspector.jl                           |   5 +-
 src/backends.jl                               |   1 +
 src/stubs/stubs_general.jl                    |   8 +
 test/gpuinfo_tests.jl                         |   7 -
 test/runtests.jl                              |  87 ++++--
 test/{backend_tests.jl => tests_backend.jl}   |   4 +-
 ...{bandwidth_tests.jl => tests_bandwidth.jl} |  15 +-
 test/tests_core.jl                            |   0
 test/tests_gpuinfo.jl                         |   8 +
 ...{peakflops_tests.jl => tests_peakflops.jl} |   0
 ...tresstest_tests.jl => tests_stresstest.jl} |   0
 test/{utility_tests.jl => tests_utility.jl}   |   0
 28 files changed, 191 insertions(+), 379 deletions(-)
 create mode 100644 ext/AMDGPUExt/utility.jl
 delete mode 100644 test/gpuinfo_tests.jl
 rename test/{backend_tests.jl => tests_backend.jl} (79%)
 rename test/{bandwidth_tests.jl => tests_bandwidth.jl} (89%)
 create mode
100644 test/tests_core.jl create mode 100644 test/tests_gpuinfo.jl rename test/{peakflops_tests.jl => tests_peakflops.jl} (100%) rename test/{stresstest_tests.jl => tests_stresstest.jl} (100%) rename test/{utility_tests.jl => tests_utility.jl} (100%) diff --git a/Project.toml b/Project.toml index dfbf455..c4a6b50 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,6 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" @@ -38,8 +37,6 @@ DocStringExtensions = "0.9" Glob = "1.3" HDF5 = "0.16" NVTX = "0.3" -Reexport = "1.2" -TestItemRunner = "0.2" ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7" UnicodePlots = "2.8, 3" julia = "1.9" @@ -50,7 +47,6 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [targets] -test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU", "TestItemRunner"] +test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU"] diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl index d141faf..f6d829a 100644 --- a/ext/AMDGPUExt/AMDGPUExt.jl +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -12,6 +12,7 @@ using LinearAlgebra # pkgs using UnicodePlots +using ThreadPinning # for usage in AMDGPUExt using GPUInspector: @@ -25,39 +26,7 @@ using GPUInspector: @unroll, AMDBackend -# import stubs to implement them -import GPUInspector: backendinfo, functional, clear_gpu_memory -# gpuinfo -import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus -# p2p bw -import GPUInspector: - p2p_bandwidth, - p2p_bandwidth_all, - p2p_bandwidth_bidirectional, - p2p_bandwidth_bidirectional_all -# host2device bw -import GPUInspector: host2device_bandwidth -# membw -import GPUInspector: - theoretical_memory_bandwidth, - memory_bandwidth, - memory_bandwidth_scaling, - memory_bandwidth_saxpy, - memory_bandwidth_saxpy_scaling -# stresstest -import GPUInspector: stresstest -# monitoring -import GPUInspector: - monitoring_start, - monitoring_stop, - livemonitor_something, - livemonitor_powerusage, - livemonitor_temperature -# peakflops_gpu -import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu - -# include("cuda_wrappers.jl") -# include("utility.jl") +include("utility.jl") # include("stresstests.jl") # include("peakflops_gpu_fmas.jl") # include("peakflops_gpu_wmmas.jl") diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl index 7ca7b14..4735281 100644 --- a/ext/AMDGPUExt/implementations/general.jl +++ b/ext/AMDGPUExt/implementations/general.jl @@ -1,4 +1,4 @@ -function functional(::AMDBackend; verbose=true) +function GPUInspector.functional(::AMDBackend; verbose=true) if AMDGPU.functional() verbose && @info("AMDGPU.jl is functional.") working = true @@ -9,10 +9,13 @@ function functional(::AMDBackend; verbose=true) return working end -function clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true) +function GPUInspector.clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true) device!(device) do gc && GC.gc() AMDGPU.HIP.reclaim() end return nothing end + 
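+# Implementations of the new GPUInspector.device/devices stubs (cf. src/stubs/stubs_general.jl) for the AMD backend.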
+GPUInspector.device(::AMDBackend) = AMDGPU.device() +GPUInspector.devices(::AMDBackend) = AMDGPU.devices() diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl index 92db2e2..b172a52 100644 --- a/ext/AMDGPUExt/implementations/gpuinfo.jl +++ b/ext/AMDGPUExt/implementations/gpuinfo.jl @@ -1,8 +1,8 @@ -function ngpus(::AMDBackend) - length(AMDGPU.devices()) +function GPUInspector.ngpus(::AMDBackend) + return length(AMDGPU.devices()) end -function gpus(::AMDBackend; io::IO=stdout) +function GPUInspector.gpus(::AMDBackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = AMDGPU.devices() if isempty(devs) @@ -18,244 +18,53 @@ function gpus(::AMDBackend; io::IO=stdout) end println( io, - " $(i-1): ", repr(dev), " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", + " $(_gpuid(dev)): ", + repr(dev), + " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)", ) end end -# """ -# gpuinfo(deviceid::Integer) +""" + gpuinfo(deviceid::Integer) -# Print out detailed information about the NVIDIA GPU with the given `deviceid`. +Print out detailed information about the AMD GPU with the given `deviceid`. -# Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". - -# (This method is from the CUDA backend.) -# """ -# function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) -# 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) -# return gpuinfo(CuDevice(deviceid); io) -# end -# function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) -# # query -# mp = nmultiprocessors(dev) -# cores = ncudacores(dev) -# max_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) ÷ 1000 -# mem_clock_rate = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) ÷ 1000 -# mem_bus_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) -# l2cachesize = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE) -# maxTex1D = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH) -# maxTex2D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH) -# maxTex2D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT) -# maxTex3D_width = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH) -# maxTex3D_height = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT) -# maxTex3D_depth = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH) -# maxTex1DLayered_width = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH -# ) -# maxTex1DLayered_layers = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS -# ) -# maxTex2DLayered_width = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH -# ) -# maxTex2DLayered_height = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT -# ) -# maxTex2DLayered_layers = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS -# ) -# total_constant_mem = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY) -# shared_mem_per_block = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK -# ) -# regs_per_block = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK) -# warpsize = CUDA.attribute(dev, 
CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE) -# max_threads_per_mp = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR -# ) -# max_threads_per_block = CUDA.attribute( -# dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK -# ) -# blockdim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X) -# blockdim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y) -# blockdim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z) -# griddim_x = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X) -# griddim_y = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) -# griddim_z = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) -# texture_align = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT) -# max_mem_pitch = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_PITCH) -# async_engine_count = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) -# gpu_overlap = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)) -# kernel_exec_timeout_enabled = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT) -# ) -# integrated = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_INTEGRATED)) -# can_map_host_mem = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY) -# ) -# concurrent_kernels = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS) -# ) -# surface_alignment = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT) > 0 -# ecc_enabled = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_ECC_ENABLED)) -# unified_addressing = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) -# ) -# managed_memory = Bool(CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) -# compute_preemption = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) -# ) -# cooperative_launch = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH) -# ) -# cooperative_multi_dev_launch = Bool( -# CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH) -# ) -# pci_domainid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID) -# pci_busid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID) -# pci_deviceid = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) -# compute_mode = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) -# comp_modes = [ -# "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", -# "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", -# "Prohibited (no host thread can use ::cudaSetDevice() with this device)", -# "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", -# "Unknown", -# ] - -# # printing -# println(io, "Device: ", name(dev), " ($dev)") -# println( -# io, "Total amount of global memory: ", Base.format_bytes(Int(CUDA.totalmem(dev))) -# ) -# println(io, "Number of CUDA cores: ", cores) -# println(io, "Number of multiprocessors: ", mp, " ($(cores ÷ mp) CUDA cores each)") -# println(io, "GPU max. clock rate: ", max_clock_rate, " MHz") -# println(io, "Memory clock rate: ", mem_clock_rate, " MHz") -# println(io, "Memory bus width: ", mem_bus_width, "-bit") -# println(io, "L2 cache size: ", Base.format_bytes(l2cachesize)) -# println(io, "Max. texture dimension sizes (1D): $maxTex1D") -# println(io, "Max. 
texture dimension sizes (2D): $maxTex2D_width, $maxTex2D_height") -# println( -# io, -# "Max. texture dimension sizes (3D): $maxTex3D_width, $maxTex3D_height, $maxTex3D_depth", -# ) -# println( -# io, -# "Max. layered 1D texture size: $(maxTex1DLayered_width) ($(maxTex1DLayered_layers) layers)", -# ) -# println( -# io, -# "Max. layered 2D texture size: $(maxTex2DLayered_width), $(maxTex2DLayered_height) ($(maxTex2DLayered_layers) layers)", -# ) -# println(io, "Total amount of constant memory: ", Base.format_bytes(total_constant_mem)) -# println( -# io, -# "Total amount of shared memory per block: ", -# Base.format_bytes(shared_mem_per_block), -# ) -# println(io, "Total number of registers available per block: ", regs_per_block) -# println(io, "Warp size: ", warpsize) -# println(io, "Max. number of threads per multiprocessor: ", max_threads_per_mp) -# println(io, "Max. number of threads per block: ", max_threads_per_block) -# println( -# io, -# "Max. dimension size of a thread block (x,y,z): $(blockdim_x), $(blockdim_y), $(blockdim_z)", -# ) -# println( -# io, -# "Max. dimension size of a grid size (x,y,z): $(griddim_x), $(griddim_y), $(griddim_z)", -# ) -# println(io, "Texture alignment: ", Base.format_bytes(texture_align)) -# println(io, "Maximum memory pitch: ", Base.format_bytes(max_mem_pitch)) -# println( -# io, -# "Concurrent copy and kernel execution: ", -# gpu_overlap ? "Yes" : "No", -# " with $(async_engine_count) copy engine(s)", -# ) -# println(io, "Run time limit on kernels: ", kernel_exec_timeout_enabled ? "Yes" : "No") -# println(io, "Integrated GPU sharing host memory: ", integrated ? "Yes" : "No") -# println( -# io, "Support host page-locked memory mapping: ", can_map_host_mem ? "Yes" : "No" -# ) -# println(io, "Concurrent kernel execution: ", concurrent_kernels ? "Yes" : "No") -# println(io, "Alignment requirement for surfaces: ", surface_alignment ? "Yes" : "No") -# println(io, "Device has ECC support: ", ecc_enabled ? "Yes" : "No") -# println( -# io, "Device supports Unified Addressing (UVA): ", unified_addressing ? "Yes" : "No" -# ) -# println(io, "Device supports managed memory: ", managed_memory ? "Yes" : "No") -# println(io, "Device supports compute preemption: ", compute_preemption ? "Yes" : "No") -# println(io, "Supports cooperative kernel launch: ", cooperative_launch ? "Yes" : "No") -# println( -# io, -# "Supports multi-device co-op kernel launch: ", -# cooperative_multi_dev_launch ? "Yes" : "No", -# ) -# println( -# io, -# "Device PCI domain ID / bus ID / device ID: $(pci_domainid) / $(pci_busid) / $(pci_deviceid)", -# ) -# println(io, "Compute mode: ", comp_modes[compute_mode + 1]) - -# return nothing -# end +(This method is from the AMD backend.) 
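+
+# Examples
+
+A usage sketch (assuming at least one AMD GPU is visible; see also `tests_gpuinfo.jl`):
+
+    julia> gpuinfo()                      # defaults to the current device
+    julia> gpuinfo(GPUInspector.device()) # same, with an explicit device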
+""" +function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout) + 0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id.")) + return gpuinfo(HIPDevice(deviceid); io) +end +function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout) + # printing + println(io, "Device: $dev \n") + show(io, AMDGPU.HIP.properties(dev)) + return nothing +end -# function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) -# # check p2p access -# ndevs = ngpus(NVIDIABackend()) -# if ndevs <= 1 -# error("Only a single GPU available.") -# else -# mat_p2p_access_supported = Matrix{Bool}(undef, ndevs, ndevs) -# mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) -# mat_p2p_atomic_supported = Matrix{Bool}(undef, ndevs, ndevs) -# for i in 1:ndevs -# dev_i = CuDevice(i - 1) -# for j in 1:ndevs -# dev_j = CuDevice(j - 1) -# if i != j -# p2p_access_supported = Bool( -# CUDA.p2p_attribute( -# dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED -# ), -# ) -# p2p_can_access = Bool(CUDA.can_access_peer(dev_i, dev_j)) -# p2p_atomic_supported = Bool( -# CUDA.p2p_attribute( -# dev_i, -# dev_j, -# CUDA.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED, -# ), -# ) -# mat_p2p_atomic_supported[i, j] = p2p_atomic_supported -# mat_p2p_access_supported[i, j] = p2p_access_supported -# mat_p2p_can_access[i, j] = p2p_can_access -# # p2p_performance_rank = CUDA.p2p_attribute(dev_i, dev_j, CUDA.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK) -# else -# mat_p2p_atomic_supported[i, i] = false -# mat_p2p_access_supported[i, i] = false -# mat_p2p_can_access[i, j] = false -# end -# end -# end +function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout) + # check p2p access + ndevs = ngpus(AMDBackend()) + if ndevs <= 1 + error("Only a single GPU available.") + else + devs = AMDGPU.devices() + mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs) + for i in 1:ndevs + for j in 1:ndevs + if i != j + mat_p2p_can_access[i, j] = Bool(AMDGPU.HIP.can_access_peer(devs[i], devs[j])) + else + mat_p2p_can_access[i, j] = false + end + end + end -# printstyled(io, "P2P Access Supported:\n"; bold=true) -# show(io, "text/plain", mat_p2p_access_supported) -# println(io) -# println(io) -# if mat_p2p_access_supported != mat_p2p_can_access -# printstyled(io, "P2P Can Access:\n"; bold=true) -# show(io, "text/plain", mat_p2p_can_access) -# println(io) -# println(io) -# end -# printstyled(io, "P2P Atomic Supported:\n"; bold=true) -# show(io, "text/plain", mat_p2p_atomic_supported) -# println(io) -# println(io) -# end -# return nothing -# end + printstyled(io, "P2P Can Access:\n"; bold=true) + show(io, "text/plain", mat_p2p_can_access) + println(io) + println(io) + end + return nothing +end diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl index b32a534..de80ef5 100644 --- a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -1,4 +1,4 @@ -function host2device_bandwidth( +function GPUInspector.host2device_bandwidth( ::AMDBackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl index e43aefc..8ddba6a 100644 --- a/ext/AMDGPUExt/implementations/membw.jl +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -14,7 +14,7 @@ # return max_bw # end -function memory_bandwidth( +function GPUInspector.memory_bandwidth( ::AMDBackend; 
memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, @@ -35,7 +35,7 @@ function memory_bandwidth( end end -function memory_bandwidth_scaling( +function GPUInspector.memory_bandwidth_scaling( ::AMDBackend; device=AMDGPU.device(), sizes=logspace(1, exp2(30), 10), @@ -75,7 +75,7 @@ end # Extra keyword arguments: # * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. -# (This method is from the CUDA backend.) +# (This method is from the NVIDIA Backend.) # """ # function memory_bandwidth_saxpy( # ::NVIDIABackend; diff --git a/ext/AMDGPUExt/utility.jl b/ext/AMDGPUExt/utility.jl new file mode 100644 index 0000000..f9878ea --- /dev/null +++ b/ext/AMDGPUExt/utility.jl @@ -0,0 +1,5 @@ +_device2string(dev::HIPDevice) = "GPU $(_gpuid(dev)): $(_name(dev))" + +_gpuid(dev::HIPDevice) = AMDGPU.HIP.device_id(dev) + 1 + +_name(dev::HIPDevice) = AMDGPU.HIP.name(dev) diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index 9d2a770..91081ec 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -12,6 +12,7 @@ using LinearAlgebra # pkgs using UnicodePlots using NVTX +using ThreadPinning # for usage in CUDAExt using GPUInspector: @@ -25,37 +26,6 @@ using GPUInspector: @unroll, NVIDIABackend -# import stubs to implement them -import GPUInspector: backendinfo, functional -# gpuinfo -import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus -# p2p bw -import GPUInspector: - p2p_bandwidth, - p2p_bandwidth_all, - p2p_bandwidth_bidirectional, - p2p_bandwidth_bidirectional_all -# host2device bw -import GPUInspector: host2device_bandwidth -# membw -import GPUInspector: - theoretical_memory_bandwidth, - memory_bandwidth, - memory_bandwidth_scaling, - memory_bandwidth_saxpy, - memory_bandwidth_saxpy_scaling -# stresstest -import GPUInspector: stresstest -# monitoring -import GPUInspector: - monitoring_start, - monitoring_stop, - livemonitor_something, - livemonitor_powerusage, - livemonitor_temperature -# peakflops_gpu -import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu - # for convenience const BFloat16 = CUDA.BFloat16 diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl index ac4b4b3..f147679 100644 --- a/ext/CUDAExt/implementations/general.jl +++ b/ext/CUDAExt/implementations/general.jl @@ -1,4 +1,4 @@ -function functional(::NVIDIABackend; verbose=true) +function GPUInspector.functional(::NVIDIABackend; verbose=true) if CUDA.functional() verbose && @info("CUDA/GPU available.") hascuda = true @@ -22,10 +22,13 @@ function functional(::NVIDIABackend; verbose=true) return hascuda end -function clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) +function GPUInspector.clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true) device!(device) do gc && GC.gc() CUDA.reclaim() end return nothing end + +GPUInspector.device(::NVIDIABackend) = CUDA.device() +GPUInspector.devices(::NVIDIABackend) = CUDA.devices() diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index 7fbc0ce..73a1fda 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -1,8 +1,8 @@ -function ngpus(::NVIDIABackend) +function GPUInspector.ngpus(::NVIDIABackend) length(CUDA.devices()) end -function gpus(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = devices() if isempty(devs) @@ 
-43,13 +43,13 @@ Print out detailed information about the NVIDIA GPU with the given `deviceid`. Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(CuDevice(deviceid); io) end -function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) # query mp = nmultiprocessors(dev) cores = ncudacores(dev) @@ -216,7 +216,7 @@ function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) return nothing end -function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) # check p2p access ndevs = ngpus(NVIDIABackend()) if ndevs <= 1 diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl index 82f2f5d..d3b747c 100644 --- a/ext/CUDAExt/implementations/host2device_bandwidth.jl +++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl @@ -1,4 +1,4 @@ -function host2device_bandwidth(::NVIDIABackend; +function GPUInspector.host2device_bandwidth(::NVIDIABackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, DtoDfactor=true, diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index 003f106..58d9b7b 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -1,4 +1,4 @@ -function theoretical_memory_bandwidth( +function GPUInspector.theoretical_memory_bandwidth( ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout ) max_mem_clock_rate = @@ -14,7 +14,7 @@ function theoretical_memory_bandwidth( return max_bw end -function memory_bandwidth( +function GPUInspector.memory_bandwidth( ::NVIDIABackend; memsize::UnitPrefixedBytes=GiB(0.5), dtype=Cchar, @@ -41,7 +41,7 @@ function memory_bandwidth( end end -function memory_bandwidth_scaling( +function GPUInspector.memory_bandwidth_scaling( ::NVIDIABackend; device=CUDA.device(), sizes=logspace(1, exp2(30), 10), @@ -81,9 +81,9 @@ end Extra keyword arguments: * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`. -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) 
""" -function memory_bandwidth_saxpy( +function GPUInspector.memory_bandwidth_saxpy( ::NVIDIABackend; device=CUDA.device(), size=2^20 * 10, @@ -131,7 +131,7 @@ function _saxpy_gpu_kernel!(z, a, x, y) return nothing end -function memory_bandwidth_saxpy_scaling( +function GPUInspector.memory_bandwidth_saxpy_scaling( ::NVIDIABackend; device=CUDA.device(), sizes=[2^20 * i for i in 10:10:300], diff --git a/ext/CUDAExt/implementations/monitoring.jl b/ext/CUDAExt/implementations/monitoring.jl index 483da9b..54c24f5 100644 --- a/ext/CUDAExt/implementations/monitoring.jl +++ b/ext/CUDAExt/implementations/monitoring.jl @@ -1,4 +1,4 @@ -function monitoring_start( +function GPUInspector.monitoring_start( ::NVIDIABackend; freq=1, devices=CUDA.devices(), thread=Threads.nthreads(), verbose=true ) if ismonitoring() @@ -54,9 +54,9 @@ Specifically, `results` is a named tuple with the following keys: * `time`: the (relative) times at which we measured * `temperature`, `power`, `compute`, `mem` -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults +function GPUInspector.monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults if ismonitoring() verbose && @info("Stopping monitoring and fetching results...") _monitoring!(false) @@ -67,7 +67,7 @@ function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults end end -function livemonitor_temperature(::NVIDIABackend, duration; kwargs...) +function GPUInspector.livemonitor_temperature(::NVIDIABackend, duration; kwargs...) return livemonitor_something( NVIDIABackend(), get_temperatures, @@ -78,7 +78,7 @@ function livemonitor_temperature(::NVIDIABackend, duration; kwargs...) ) end -function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) +function GPUInspector.livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) return livemonitor_something( NVIDIABackend(), get_power_usages, @@ -89,7 +89,7 @@ function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...) ) end -function livemonitor_something( +function GPUInspector.livemonitor_something( ::NVIDIABackend, f::F, duration; diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl index 6b5a83e..61cb1af 100644 --- a/ext/CUDAExt/implementations/p2p_bandwidth.jl +++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl @@ -1,4 +1,4 @@ -function p2p_bandwidth( +function GPUInspector.p2p_bandwidth( ::NVIDIABackend; memsize::UnitPrefixedBytes=B(40_000_000), nbench=5, @@ -66,7 +66,7 @@ function p2p_bandwidth( return bw_max end -function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) +function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -82,7 +82,7 @@ function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs ] end -function p2p_bandwidth_bidirectional( +function GPUInspector.p2p_bandwidth_bidirectional( ::NVIDIABackend; memsize::UnitPrefixedBytes=B(40_000_000), nbench=20, @@ -142,7 +142,7 @@ function p2p_bandwidth_bidirectional( return bw_max end -function p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...) +function GPUInspector.p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...) 
ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl index 0d6bb2e..6751cbf 100644 --- a/ext/CUDAExt/implementations/peakflops_gpu.jl +++ b/ext/CUDAExt/implementations/peakflops_gpu.jl @@ -8,9 +8,9 @@ Estimates the theoretical peak performance of a CUDA device in TFLOP/s. * `dtype` (default: `tensorcores ? Float16 : Float32`): element type of the matrices * `io` (default: `stdout`): set the stream where the results should be printed. -(This method is from the CUDA backend.) +(This method is from the NVIDIA Backend.) """ -function theoretical_peakflops_gpu( +function GPUInspector.theoretical_peakflops_gpu( ::NVIDIABackend; device=CUDA.device(), tensorcores=hastensorcores(), @@ -104,7 +104,7 @@ it takes to perform For more keyword argument options see [`peakflops_gpu_fmas`](@ref) and [`peakflops_gpu_wmmas`](@ref). """ -function peakflops_gpu( +function GPUInspector.peakflops_gpu( ::NVIDIABackend; tensorcores=hastensorcores(), verbose=true, diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl index 358d9ca..41288ea 100644 --- a/ext/CUDAExt/implementations/stresstest.jl +++ b/ext/CUDAExt/implementations/stresstest.jl @@ -1,4 +1,4 @@ -function stresstest( +function GPUInspector.stresstest( ::NVIDIABackend; devices=[CUDA.device()], mem=nothing, diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index fe4f641..19ba114 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -9,8 +9,7 @@ using Base: UUID using Pkg: Pkg # external -using Reexport -@reexport using ThreadPinning +using ThreadPinning using DocStringExtensions using UnicodePlots using CpuId: cachesize @@ -42,7 +41,7 @@ include("stubs/stubs_peakflops_gpu.jl") # backends export Backend, NoBackend, NVIDIABackend, AMDBackend, backend, backend!, backendinfo -export CUDAExt +export CUDAExt, AMDGPUExt # monitoring io+plotting export plot_monitoring_results, load_monitoring_results, save_monitoring_results diff --git a/src/backends.jl b/src/backends.jl index bd5d1de..ae147fc 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -56,6 +56,7 @@ function check_backend(b::Backend) end CUDAExt::Union{Nothing,Module} = nothing +AMDGPUExt::Union{Nothing,Module} = nothing """ Query information about a specific backend, e.g., what functionality the backend currently diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl index 57b7136..219d185 100644 --- a/src/stubs/stubs_general.jl +++ b/src/stubs/stubs_general.jl @@ -12,3 +12,11 @@ Reclaim the unused memory of a GPU """ clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...) clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet() + +"Return the current device of the active backend." +device() = device(backend()) +device(::Backend) = not_implemented_yet() + +"Return the devices of the active backend." 
+devices() = devices(backend()) +devices(::Backend) = not_implemented_yet() diff --git a/test/gpuinfo_tests.jl b/test/gpuinfo_tests.jl deleted file mode 100644 index 699a766..0000000 --- a/test/gpuinfo_tests.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testitem "gpuinfo / gpus" begin - using CUDA - @test isnothing(gpus()) - @test isnothing(gpuinfo()) - @test isnothing(gpuinfo(0)) - @test isnothing(gpuinfo(device())) -end diff --git a/test/runtests.jl b/test/runtests.jl index 111ff35..1707833 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,24 +1,75 @@ -using TestItemRunner using GPUInspector -using CUDA +using Test +using LinearAlgebra -if !GPUInspector.functional() - error("Can't run testsuite since CUDA/GPU not present or not functional!") +# figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA) +if haskey(ENV, "TEST_BACKEND") + if lowercase(ENV["TEST_BACKEND"]) in ("nvidia", "cuda", "nvidiabackend") + using CUDA + TEST_BACKEND = NVIDIABackend() + elseif lowercase(ENV["TEST_BACKEND"]) in ("amd", "amdgpu", "amdbackend") + using AMDGPU + TEST_BACKEND = AMDBackend() + else + error(""" + TEST_BACKEND environment variable contains unsupported value. + """) + end +else + using CUDA + using AMDGPU + if CUDA.functional() + @info("NVIDIA GPUs detected.", CUDA.devices()) + TEST_BACKEND = NVIDIABackend() + elseif AMDGPU.functional() + @info("AMD GPUs detected.", AMDGPU.devices()) + TEST_BACKEND = AMDBackend() + else + error(""" + Aborting because neither CUDA.jl nor AMDGPU.jl are functional. + Are there any GPUs in the system? + """) + end end -if Threads.nthreads() == 1 || (Threads.nthreads() < length(CUDA.devices()) + 1) - # we should have at least one thread per gpu + one monitoring thread - @warn( - "You should run the tests with at least $(length(CUDA.devices()) + 1) Julia threads.", - Threads.nthreads(), - length(CUDA.devices()) - ) +backend!(TEST_BACKEND) +@info "Running tests with the following backend: $TEST_BACKEND." + +const TEST_NAMES = ["bandwidth", "peakflops", "stresstest", "gpuinfo", "core"] +if haskey(ENV, "TESTS") + tests = split(ENV["TESTS"], ",") + if !all(t -> t in TEST_NAMES, tests) + error(""" + TESTS environment variable contains unknown test names. + Valid test names are: $(TEST_NAMES) + """) + end + TARGET_TESTS = tests +else + # run all tests + const TARGET_TESTS = TEST_NAMES end +@info "Running following tests: $TARGET_TESTS." 
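# A hedged usage sketch of the TEST_BACKEND/TESTS switches above (hypothetical
# values; the strings must match the supported backend spellings and TEST_NAMES):
#
#     ENV["TEST_BACKEND"] = "amd"            # or e.g. "cuda" / "nvidia"
#     ENV["TESTS"] = "gpuinfo,bandwidth"     # comma-separated subset of TEST_NAMES
#     import Pkg; Pkg.test("GPUInspector")   # the test process inherits both variables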
+ -@run_package_tests +if "stresstest" in TARGET_TESTS + # error if we aren't running with enough threads + if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1) + # we should have at least one thread per gpu + one monitoring thread + error("You should run the tests with at least $(ngpus() + 1) Julia threads.") + end +end -include("backend_tests.jl") -include("utility_tests.jl") -include("stresstest_tests.jl") -include("bandwidth_tests.jl") -include("peakflops_tests.jl") -include("gpuinfo_tests.jl") +if "gpuinfo" in TARGET_TESTS + include("tests_gpuinfo.jl") +end +# if "stresstest" in TARGET_TESTS +# include("tests_stresstest.jl") +# end +# if "peakflops" in TARGET_TESTS +# include("tests_peakflops.jl") +# end +# if "bandwidth" in TARGET_TESTS +# include("tests_bandwidth.jl") +# end +# include("tests_backend.jl") +# include("tests_utility.jl") diff --git a/test/backend_tests.jl b/test/tests_backend.jl similarity index 79% rename from test/backend_tests.jl rename to test/tests_backend.jl index d772080..c6bed8c 100644 --- a/test/backend_tests.jl +++ b/test/tests_backend.jl @@ -1,5 +1,5 @@ -@testitem "CUDA backend" begin - using CUDA +@testset "Backend switching" begin + @test GPUInspector.is_cuda_loaded() @test GPUInspector.is_cuda_loaded() @test GPUInspector.is_backend_loaded(NVIDIABackend()) @test backend() == NVIDIABackend() diff --git a/test/bandwidth_tests.jl b/test/tests_bandwidth.jl similarity index 89% rename from test/bandwidth_tests.jl rename to test/tests_bandwidth.jl index a68ea19..9d174ae 100644 --- a/test/bandwidth_tests.jl +++ b/test/tests_bandwidth.jl @@ -1,14 +1,13 @@ -@testitem "p2p_bandwidth" begin - using LinearAlgebra - using CUDA - +@testset "p2p_bandwidth" begin @testset "unidirectional" begin # p2p_bandwidth @test typeof(p2p_bandwidth(; verbose=false)) == Float64 @test 0 ≤ p2p_bandwidth(; verbose=false) # options @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64 - @test typeof(p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1), verbose=false)) == + dev_src = GPUInspector.devices()[1] + dev_dst = GPUInspector.devices()[2] + @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64 @@ -41,15 +40,13 @@ end end -@testitem "host2device_bandwidth" begin - using CUDA +@testset "host2device_bandwidth" begin @test isnothing(host2device_bandwidth()) @test isnothing(host2device_bandwidth(; memsize=MB(100))) @test isnothing(host2device_bandwidth(; dtype=Float16)) end -@testitem "memory_bandwidth" begin - using CUDA +@testset "memory_bandwidth" begin @test typeof(memory_bandwidth()) == Float64 @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64 @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 diff --git a/test/tests_core.jl b/test/tests_core.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/tests_gpuinfo.jl b/test/tests_gpuinfo.jl new file mode 100644 index 0000000..9ea54c7 --- /dev/null +++ b/test/tests_gpuinfo.jl @@ -0,0 +1,8 @@ +@testset "gpuinfo / gpus" begin + @test isnothing(gpus()) + @test isnothing(gpuinfo()) + @test isnothing(gpuinfo(GPUInspector.device())) + if ngpus() > 1 + @test isnothing(gpuinfo_p2p_access()) + end +end diff --git a/test/peakflops_tests.jl b/test/tests_peakflops.jl similarity index 100% rename from test/peakflops_tests.jl rename to test/tests_peakflops.jl diff --git a/test/stresstest_tests.jl b/test/tests_stresstest.jl 
similarity index 100% rename from test/stresstest_tests.jl rename to test/tests_stresstest.jl diff --git a/test/utility_tests.jl b/test/tests_utility.jl similarity index 100% rename from test/utility_tests.jl rename to test/tests_utility.jl From 5c155247a2543c1a5718b3630069a6d2ec2c6b81 Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Fri, 18 Aug 2023 19:55:51 +0200 Subject: [PATCH 3/5] new testing infrastructure works for NVIDIABackend() --- .gitlab-ci.yml | 22 ++++++-- ext/AMDGPUExt/AMDGPUExt.jl | 3 +- ext/AMDGPUExt/implementations/gpuinfo.jl | 8 +-- .../implementations/host2device_bandwidth.jl | 4 +- ext/AMDGPUExt/implementations/membw.jl | 10 ++-- ext/CUDAExt/CUDAExt.jl | 3 +- ext/CUDAExt/implementations/gpuinfo.jl | 8 +-- .../implementations/host2device_bandwidth.jl | 4 +- ext/CUDAExt/implementations/membw.jl | 10 ++-- ext/CUDAExt/implementations/p2p_bandwidth.jl | 6 +-- ext/CUDAExt/implementations/peakflops_gpu.jl | 4 +- ext/CUDAExt/implementations/stresstest.jl | 4 +- ext/CUDAExt/peakflops_gpu_fmas.jl | 2 +- ext/CUDAExt/peakflops_gpu_matmul.jl | 8 +-- ext/CUDAExt/peakflops_gpu_wmmas.jl | 2 +- src/GPUInspector.jl | 3 ++ test/runtests.jl | 51 ++++++++++++++----- test/tests_amd_only.jl | 0 test/tests_backend.jl | 10 ---- test/tests_bandwidth.jl | 9 ++-- test/tests_core.jl | 10 ++++ test/tests_nvidia_only.jl | 11 ++++ test/tests_peakflops.jl | 45 ++++++++-------- test/tests_stresstest.jl | 28 +++++----- test/tests_utility.jl | 15 +----- 25 files changed, 162 insertions(+), 118 deletions(-) create mode 100644 test/tests_amd_only.jl delete mode 100644 test/tests_backend.jl create mode 100644 test/tests_nvidia_only.jl diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7ae5973..3becffc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,24 +2,23 @@ stages: - test - documentation variables: - SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci" JULIA_NUM_THREADS: "10" JULIA_EXCLUSIVE: "1" JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0" - MKL_DYNAMIC: "false" - MKL_NUM_THREADS: "1" default: tags: - bauerc-noctua2 # Generates code coverage -julia/1.9: +julia/1.9/NVIDIA: stage: test rules: - changes: - "README.md" - when: on_success + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" script: - /bin/bash -l - module load $JULIA_1_9 @@ -28,6 +27,19 @@ julia/1.9: - julia --color=yes --project=test/coverage test/coverage/coverage.jl allow_failure: false +julia/1.9/AMD: + stage: test + rules: + - changes: + - "README.md" + - when: on_success + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive" + script: + - /bin/bash -l + - module load $JULIA_1_9 + - julia --color=yes --project=. 
-e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);' + allow_failure: true # Documentation build-and-deploy-docs: @@ -37,6 +49,8 @@ build-and-deploy-docs: - pushes - tags - external_pull_requests + variables: + SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2" script: - /bin/bash -l - module load $JULIA_1_9 diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl index f6d829a..2edd10c 100644 --- a/ext/AMDGPUExt/AMDGPUExt.jl +++ b/ext/AMDGPUExt/AMDGPUExt.jl @@ -24,7 +24,8 @@ using GPUInspector: MonitoringResults, _defaultylims, @unroll, - AMDBackend + AMDBackend, + getstdout include("utility.jl") # include("stresstests.jl") diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl index b172a52..64c4d70 100644 --- a/ext/AMDGPUExt/implementations/gpuinfo.jl +++ b/ext/AMDGPUExt/implementations/gpuinfo.jl @@ -2,7 +2,7 @@ function GPUInspector.ngpus(::AMDBackend) return length(AMDGPU.devices()) end -function GPUInspector.gpus(::AMDBackend; io::IO=stdout) +function GPUInspector.gpus(::AMDBackend; io=getstdout()) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = AMDGPU.devices() if isempty(devs) @@ -32,18 +32,18 @@ Print out detailed information about the AMD GPU with the given `deviceid`. (This method is from the AMD backend.) """ -function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout()) 0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(HIPDevice(deviceid); io) end -function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout()) # printing println(io, "Device: $dev \n") show(io, AMDGPU.HIP.properties(dev)) return nothing end -function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout()) # check p2p access ndevs = ngpus(AMDBackend()) if ndevs <= 1 diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl index de80ef5..8095f45 100644 --- a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl +++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl @@ -4,7 +4,7 @@ function GPUInspector.host2device_bandwidth( dtype=Cchar, DtoDfactor=true, verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) N = Int(bytes(memsize) ÷ sizeof(dtype)) @@ -36,7 +36,7 @@ function _perform_memcpy( stats=false, DtoDfactor=false, verbose=true, - io::IO=stdout, + io=getstdout(), ) sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") ts = zeros(nbench) diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl index 8ddba6a..d177993 100644 --- a/ext/AMDGPUExt/implementations/membw.jl +++ b/ext/AMDGPUExt/implementations/membw.jl @@ -1,5 +1,5 @@ # function theoretical_memory_bandwidth( -# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout +# ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout() # ) # max_mem_clock_rate = # CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz @@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth( verbose=true, DtoDfactor=true, 
device=AMDGPU.device(), - io::IO=stdout, + io=getstdout(), kwargs..., )::Float64 AMDGPU.device!(device) do @@ -40,7 +40,7 @@ function GPUInspector.memory_bandwidth_scaling( device=AMDGPU.device(), sizes=logspace(1, exp2(30), 10), verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) bandwidths = zeros(length(sizes)) @@ -85,7 +85,7 @@ end # dtype=Float32, # cublas=true, # verbose=true, -# io::IO=stdout, +# io=getstdout(), # )::Float64 # device!(device) do # a = dtype(pi) @@ -130,7 +130,7 @@ end # device=CUDA.device(), # sizes=[2^20 * i for i in 10:10:300], # verbose=true, -# io::IO=stdout, +# io=getstdout(), # kwargs..., # ) # # sizes = [2^20 * i for i in 8:128] # V100 diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl index 91081ec..0703604 100644 --- a/ext/CUDAExt/CUDAExt.jl +++ b/ext/CUDAExt/CUDAExt.jl @@ -24,7 +24,8 @@ using GPUInspector: MonitoringResults, _defaultylims, @unroll, - NVIDIABackend + NVIDIABackend, + getstdout # for convenience const BFloat16 = CUDA.BFloat16 diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl index 73a1fda..bff635d 100644 --- a/ext/CUDAExt/implementations/gpuinfo.jl +++ b/ext/CUDAExt/implementations/gpuinfo.jl @@ -2,7 +2,7 @@ function GPUInspector.ngpus(::NVIDIABackend) length(CUDA.devices()) end -function GPUInspector.gpus(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpus(::NVIDIABackend; io=getstdout()) # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69 devs = devices() if isempty(devs) @@ -45,11 +45,11 @@ Heavily inspired by the CUDA sample "deviceQueryDrv.cpp". (This method is from the NVIDIA Backend.) """ -function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout()) 0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id.")) return gpuinfo(CuDevice(deviceid); io) end -function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout) +function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout()) # query mp = nmultiprocessors(dev) cores = ncudacores(dev) @@ -216,7 +216,7 @@ function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io:: return nothing end -function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout) +function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout()) # check p2p access ndevs = ngpus(NVIDIABackend()) if ndevs <= 1 diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl index d3b747c..5b3b31c 100644 --- a/ext/CUDAExt/implementations/host2device_bandwidth.jl +++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl @@ -3,7 +3,7 @@ function GPUInspector.host2device_bandwidth(::NVIDIABackend; dtype=Cchar, DtoDfactor=true, verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) N = Int(bytes(memsize) ÷ sizeof(dtype)) @@ -42,7 +42,7 @@ function _perform_memcpy( stats=false, DtoDfactor=false, verbose=true, - io::IO=stdout, + io=getstdout(), ) NVTX.@range "host2dev: $title" begin sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)") diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl index 58d9b7b..a9dc2bf 100644 --- a/ext/CUDAExt/implementations/membw.jl +++ b/ext/CUDAExt/implementations/membw.jl @@ -1,5 +1,5 @@ function 
GPUInspector.theoretical_memory_bandwidth( - ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout + ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout() ) max_mem_clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz @@ -21,7 +21,7 @@ function GPUInspector.memory_bandwidth( verbose=true, DtoDfactor=true, device=CUDA.device(), - io::IO=stdout, + io=getstdout(), kwargs..., )::Float64 device!(device) do @@ -46,7 +46,7 @@ function GPUInspector.memory_bandwidth_scaling( device=CUDA.device(), sizes=logspace(1, exp2(30), 10), verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) bandwidths = zeros(length(sizes)) @@ -91,7 +91,7 @@ function GPUInspector.memory_bandwidth_saxpy( dtype=Float32, cublas=true, verbose=true, - io::IO=stdout, + io=getstdout(), )::Float64 device!(device) do a = dtype(pi) @@ -136,7 +136,7 @@ function GPUInspector.memory_bandwidth_saxpy_scaling( device=CUDA.device(), sizes=[2^20 * i for i in 10:10:300], verbose=true, - io::IO=stdout, + io=getstdout(), kwargs..., ) # sizes = [2^20 * i for i in 8:128] # V100 diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl index 61cb1af..2734ecb 100644 --- a/ext/CUDAExt/implementations/p2p_bandwidth.jl +++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl @@ -9,7 +9,7 @@ function GPUInspector.p2p_bandwidth( dtype=Float32, src=0, dst=1, - io::IO=stdout, + io=getstdout(), ) if ngpus(NVIDIABackend()) < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -66,7 +66,7 @@ function GPUInspector.p2p_bandwidth( return bw_max end -function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...) +function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...) ngpus = length(CUDA.devices()) if ngpus < 2 error("At least 2 GPUs are needed for the P2P benchmark.") @@ -93,7 +93,7 @@ function GPUInspector.p2p_bandwidth_bidirectional( dev1=0, dev2=1, repeat=100, - io::IO=stdout, + io=getstdout(), ) if ngpus(NVIDIABackend()) < 2 error("At least 2 GPUs are needed for the P2P benchmark.") diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl index 6751cbf..99a117c 100644 --- a/ext/CUDAExt/implementations/peakflops_gpu.jl +++ b/ext/CUDAExt/implementations/peakflops_gpu.jl @@ -16,7 +16,7 @@ function GPUInspector.theoretical_peakflops_gpu( tensorcores=hastensorcores(), dtype=tensorcores ? Float16 : Float32, verbose=true, - io::IO=stdout, + io=getstdout(), ) if tensorcores max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype) @@ -109,7 +109,7 @@ function GPUInspector.peakflops_gpu( tensorcores=hastensorcores(), verbose=true, dtype=tensorcores ? Float16 : Float32, - io::IO=stdout, + io=getstdout(), kwargs..., ) if tensorcores diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl index 41288ea..9c14994 100644 --- a/ext/CUDAExt/implementations/stresstest.jl +++ b/ext/CUDAExt/implementations/stresstest.jl @@ -12,7 +12,7 @@ function GPUInspector.stresstest( clearmem=false, monitoring=false, batch_duration=nothing, - io::IO=stdout, + io=getstdout(), kwargs..., ) logger = ConsoleLogger(io) @@ -69,7 +69,7 @@ function GPUInspector.stresstest( Δt = @elapsed _run_stresstests(ts; verbose, kwargs...) 
if clearmem verbose && @info("Clearing GPU memory.") - clear_all_gpus_memory(devices) + GPUInspector.clear_all_gpus_memory(; devices=devices) end verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.") if monitoring diff --git a/ext/CUDAExt/peakflops_gpu_fmas.jl b/ext/CUDAExt/peakflops_gpu_fmas.jl index a251341..eb3ff6f 100644 --- a/ext/CUDAExt/peakflops_gpu_fmas.jl +++ b/ext/CUDAExt/peakflops_gpu_fmas.jl @@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(; nkernel=5, device::CuDevice=CUDA.device(), verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do d_a = CUDA.rand(dtype, size) diff --git a/ext/CUDAExt/peakflops_gpu_matmul.jl b/ext/CUDAExt/peakflops_gpu_matmul.jl index a081b69..93bf221 100644 --- a/ext/CUDAExt/peakflops_gpu_matmul.jl +++ b/ext/CUDAExt/peakflops_gpu_matmul.jl @@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling( device=CUDA.device(), verbose=true, sizes=2 .^ (10:15), - io::IO=stdout, + io=getstdout(), kwargs..., ) where {F} flops = zeros(length(sizes)) for (i, s) in enumerate(sizes) flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...) - clear_gpu_memory(device) + GPUInspector.clear_gpu_memory(; device=device) end if verbose peak_val, idx = findmax(flops) @@ -64,7 +64,7 @@ function peakflops_gpu_matmul(; nmatmuls=5, nbench=5, verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do C = CUDA.zeros(dtype, size, size) @@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(; nmatmuls=5, nbench=5, verbose=true, - io::IO=stdout, + io=getstdout(), ) device!(device) do C = CUDA.zeros(dtype, size, size) diff --git a/ext/CUDAExt/peakflops_gpu_wmmas.jl b/ext/CUDAExt/peakflops_gpu_wmmas.jl index a12295b..f6c000c 100644 --- a/ext/CUDAExt/peakflops_gpu_wmmas.jl +++ b/ext/CUDAExt/peakflops_gpu_wmmas.jl @@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(; nkernel=10, verbose=true, dtype=Float16, - io::IO=stdout, + io=getstdout(), ) device!(device) do if Symbol(dtype) == :Float16 diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl index 19ba114..7790536 100644 --- a/src/GPUInspector.jl +++ b/src/GPUInspector.jl @@ -16,6 +16,9 @@ using CpuId: cachesize using HDF5: h5open using Glob: glob +const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing) +getstdout() = something(DEFAULT_IO[], stdout) + include("backends.jl") include("UnitPrefixedBytes.jl") include("utility.jl") diff --git a/test/runtests.jl b/test/runtests.jl index 1707833..fb1f3f8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,12 @@ using GPUInspector using Test using LinearAlgebra +using Logging + +# Environment variables: +# - "TEST_BACKEND": can be set to manually specify a backend +# - "TEST_QUIET": can be set to true/false to enable/disable non-verbose testing +# - "TESTS": a comma separated list of test suites to run (see TEST_NAMES below) # figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA) if haskey(ENV, "TEST_BACKEND") @@ -34,7 +40,9 @@ end backend!(TEST_BACKEND) @info "Running tests with the following backend: $TEST_BACKEND." -const TEST_NAMES = ["bandwidth", "peakflops", "stresstest", "gpuinfo", "core"] +const TEST_NAMES = [ + "bandwidth", "peakflops", "stresstest", "gpuinfo", "utility", "backend_specific", "core" +] if haskey(ENV, "TESTS") tests = split(ENV["TESTS"], ",") if !all(t -> t in TEST_NAMES, tests) @@ -50,7 +58,6 @@ else end @info "Running following tests: $TARGET_TESTS." 
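# The `DEFAULT_IO`/`getstdout` hook added to src/GPUInspector.jl above is the basis
# for the quiet-testing setup below. A minimal sketch of the mechanism, using only
# names defined in this patch plus `devnull` from Base:
#
#     GPUInspector.DEFAULT_IO[] = devnull   # output routed via `io=getstdout()` is dropped
#     host2device_bandwidth()               # benchmark prints are now suppressed
#     GPUInspector.DEFAULT_IO[] = nothing   # restore printing to stdout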
- if "stresstest" in TARGET_TESTS # error if we aren't running with enough threads if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1) @@ -59,17 +66,35 @@ if "stresstest" in TARGET_TESTS end end +quiet_testing = parse(Bool, get(ENV, "TEST_QUIET", "true")) +if quiet_testing + GPUInspector.DEFAULT_IO[] = Base.BufferStream() + global_logger(Logging.NullLogger()) +end + +if "core" in TARGET_TESTS + include("tests_core.jl") +end +if "utility" in TARGET_TESTS + include("tests_utility.jl") +end if "gpuinfo" in TARGET_TESTS include("tests_gpuinfo.jl") end -# if "stresstest" in TARGET_TESTS -# include("tests_stresstest.jl") -# end -# if "peakflops" in TARGET_TESTS -# include("tests_peakflops.jl") -# end -# if "bandwidth" in TARGET_TESTS -# include("tests_bandwidth.jl") -# end -# include("tests_backend.jl") -# include("tests_utility.jl") +if "bandwidth" in TARGET_TESTS + include("tests_bandwidth.jl") +end +if "stresstest" in TARGET_TESTS + using CairoMakie + include("tests_stresstest.jl") +end +if "peakflops" in TARGET_TESTS + include("tests_peakflops.jl") +end +if "backend_specific" in TARGET_TESTS + if TEST_BACKEND == NVIDIABackend() + include("tests_nvidia_only.jl") + elseif TEST_BACKEND == AMDBackend() + include("tests_amd_only.jl") + end +end diff --git a/test/tests_amd_only.jl b/test/tests_amd_only.jl new file mode 100644 index 0000000..e69de29 diff --git a/test/tests_backend.jl b/test/tests_backend.jl deleted file mode 100644 index c6bed8c..0000000 --- a/test/tests_backend.jl +++ /dev/null @@ -1,10 +0,0 @@ -@testset "Backend switching" begin - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_backend_loaded(NVIDIABackend()) - @test backend() == NVIDIABackend() - @test isnothing(backend!(NoBackend())) - @test backend() == NoBackend() - @test isnothing(backend!(:cuda)) - @test backend() == NVIDIABackend() -end diff --git a/test/tests_bandwidth.jl b/test/tests_bandwidth.jl index 9d174ae..7f81fc4 100644 --- a/test/tests_bandwidth.jl +++ b/test/tests_bandwidth.jl @@ -5,10 +5,8 @@ @test 0 ≤ p2p_bandwidth(; verbose=false) # options @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64 - dev_src = GPUInspector.devices()[1] - dev_dst = GPUInspector.devices()[2] - @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == - Float64 + dev_src, dev_dst = collect(GPUInspector.devices())[1:2] + @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64 @test typeof(p2p_bandwidth(; hist=true, verbose=true)) == Float64 @@ -25,7 +23,8 @@ @test typeof(p2p_bandwidth_bidirectional(; verbose=false)) == Float64 @test 0 ≤ p2p_bandwidth_bidirectional(; verbose=false) # options - @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == Float64 + @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == + Float64 @test typeof(p2p_bandwidth_bidirectional(; dtype=Float16, verbose=false)) == Float64 @test typeof(p2p_bandwidth_bidirectional(; nbench=10, verbose=false)) == Float64 @test typeof(p2p_bandwidth_bidirectional(; hist=true, verbose=true)) == Float64 diff --git a/test/tests_core.jl b/test/tests_core.jl index e69de29..c6bed8c 100644 --- a/test/tests_core.jl +++ b/test/tests_core.jl @@ -0,0 +1,10 @@ +@testset "Backend switching" begin + @test GPUInspector.is_cuda_loaded() + @test GPUInspector.is_cuda_loaded() + @test 
GPUInspector.is_backend_loaded(NVIDIABackend()) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:cuda)) + @test backend() == NVIDIABackend() +end diff --git a/test/tests_nvidia_only.jl b/test/tests_nvidia_only.jl new file mode 100644 index 0000000..27ffcd7 --- /dev/null +++ b/test/tests_nvidia_only.jl @@ -0,0 +1,11 @@ +@testset "toggle_tensorcoremath" begin + @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false)) + @test CUDA.math_mode() == CUDA.FAST_MATH + @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false)) + @test CUDA.math_mode() == CUDA.DEFAULT_MATH + # test toggle + @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) + @test CUDA.math_mode() == CUDA.FAST_MATH + @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) + @test CUDA.math_mode() == CUDA.DEFAULT_MATH +end diff --git a/test/tests_peakflops.jl b/test/tests_peakflops.jl index 9005fce..0d835e0 100644 --- a/test/tests_peakflops.jl +++ b/test/tests_peakflops.jl @@ -1,23 +1,28 @@ -@testitem "peakflops_gpu (CUDA cores)" begin - using CUDA - @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64 - @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) == - Float64 - @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) == - Float64 -end +if backend() == NVIDIABackend() + @testset "peakflops_gpu (CUDA cores)" begin + @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64 + @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) == + Float64 + @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) == + Float64 + end -@testitem "peakflops_gpu (Tensor cores)" begin - using CUDA - @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64 - @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == Float64 -end + @testset "peakflops_gpu (Tensor cores)" begin + @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64 + @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == + Float64 + end -@testitem "peakflops_gpu_matmul / scaling" begin - using CUDA - @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == Float64 - @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) == - Tuple{Vector{Int64},Vector{Float64}} + @testset "peakflops_gpu_matmul / scaling" begin + @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64 + @test typeof( + CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false) + ) == Float64 + @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == + Float64 + @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) == + Tuple{Vector{Int64},Vector{Float64}} + end +elseif backend() == AMDBackend() + # TODO end diff --git a/test/tests_stresstest.jl b/test/tests_stresstest.jl index 8b99fc0..32152b9 100644 --- a/test/tests_stresstest.jl +++ b/test/tests_stresstest.jl @@ -1,5 +1,4 @@ -@testitem "Stresstest: different kinds" begin - using CUDA +@testset "Stresstest: different kinds" begin @test isnothing(stresstest(; duration=2, verbose=false)) @test isnothing(stresstest(; enforced_duration=2, 
verbose=false)) @test isnothing(stresstest(; approx_duration=2, verbose=false)) @@ -8,10 +7,9 @@ @test isnothing(stresstest(; mem=0.2, verbose=false)) end -@testitem "Stresstest: keyword options" begin - using CUDA +@testset "Stresstest: keyword options" begin @test isnothing(stresstest(; duration=2, verbose=false)) - @test isnothing(stresstest(; duration=2, devices=devices(), verbose=false)) + @test isnothing(stresstest(; duration=2, devices=GPUInspector.devices(), verbose=false)) @test isnothing(stresstest(; duration=2, size=3000, verbose=false)) @test isnothing(stresstest(; duration=2, dtype=Float16, verbose=false)) @test isnothing(stresstest(; duration=2, clearmem=true, verbose=false)) @@ -19,15 +17,16 @@ end # TODO: kwargs: threads, parallel end -@testitem "Stresstest: monitoring" begin - using CUDA +@testset "Stresstest: monitoring" begin @testset "automatic (monitoring=true)" begin @test typeof( - stresstest(; devices=devices(), duration=2, verbose=false, monitoring=true) + stresstest(; + devices=GPUInspector.devices(), duration=2, verbose=false, monitoring=true + ), ) == MonitoringResults end @testset "manual" begin - devs = devices() + devs = GPUInspector.devices() @test isnothing(monitoring_start(; freq=1, devices=devs, verbose=false)) @test isnothing( stresstest(; devices=devs, duration=2, verbose=false, monitoring=false) @@ -41,8 +40,7 @@ end end end -@testitem "Stresstest: monitoring results" begin - using CUDA +@testset "Stresstest: monitoring results" begin @testset "MonitoringResults" begin r = stresstest(; duration=2, verbose=false, monitoring=true) @test typeof(r) == MonitoringResults @@ -53,12 +51,13 @@ end end @testset "save / load" begin d = Dict{Symbol,Vector{Vector{Float64}}}() - ndevs = length(CUDA.devices()) + ndevs = ngpus() d[:asd] = [rand(ndevs) for _ in 1:5] d[:qwe] = [rand(ndevs) for _ in 1:5] d[:jkl] = [rand(ndevs) for _ in 1:5] devices = Tuple{String,Base.UUID}[ - (CUDAExt._device2string(dev), uuid(dev)) for dev in collect(CUDA.devices()) + (CUDAExt._device2string(dev), uuid(dev)) for + dev in collect(GPUInspector.devices()) ] r = MonitoringResults(rand(5), devices, d) cd(mktempdir()) do @@ -74,8 +73,7 @@ end end end -@testitem "Stresstest: monitoring results (CairoMakie)" begin - using CairoMakie +@testset "Stresstest: monitoring results (CairoMakie)" begin r = load_monitoring_results(joinpath(@__DIR__, "test.h5")) @test isnothing(savefig_monitoring_results(r)) @test isnothing(savefig_monitoring_results(r, (:compute, :mem))) diff --git a/test/tests_utility.jl b/test/tests_utility.jl index 1bf5bd3..9ec5c5f 100644 --- a/test/tests_utility.jl +++ b/test/tests_utility.jl @@ -1,4 +1,4 @@ -@testitem "UnitPrefixedBytes" begin +@testset "UnitPrefixedBytes" begin using InteractiveUtils: subtypes # general stuff @@ -64,16 +64,3 @@ end @test B(40_000_000) + MB(3) - 2 * KiB(2) ≈ MB(42.995904) end - -@testitem "toggle_tensorcoremath" begin - using CUDA - @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false)) - @test CUDA.math_mode() == CUDA.FAST_MATH - @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false)) - @test CUDA.math_mode() == CUDA.DEFAULT_MATH - # test toggle - @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) - @test CUDA.math_mode() == CUDA.FAST_MATH - @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false)) - @test CUDA.math_mode() == CUDA.DEFAULT_MATH -end From b7dd03a7237b08ceb0eb69dfc0b2d96b468cd8ae Mon Sep 17 00:00:00 2001 From: Carsten Bauer Date: Fri, 18 Aug 2023 20:07:26 +0200 Subject: [PATCH 4/5] AMD CI: 
 use Julia instead of JuliaHPC

---
 .gitlab-ci.yml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3becffc..5900b30 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,7 +5,8 @@ variables:
   JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
   JULIA_NUM_THREADS: "10"
   JULIA_EXCLUSIVE: "1"
-  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIAHPC_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIA_1_9: "lang/Julia/1.9.2-linux-x86_64"
 default:
   tags:
     - bauerc-noctua2
@@ -18,10 +19,10 @@ julia/1.9/NVIDIA:
       - "README.md"
     - when: on_success
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = true);'
     - julia --color=yes --project=test/coverage -e 'import Pkg; Pkg.instantiate()'
     - julia --color=yes --project=test/coverage test/coverage/coverage.jl
@@ -34,7 +35,7 @@ julia/1.9/AMD:
       - "README.md"
     - when: on_success
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:15:00 -A pc2-mitarbeiter -p hacc --exclusive"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:20:00 -A pc2-mitarbeiter -p hacc --exclusive"
   script:
     - /bin/bash -l
     - module load $JULIA_1_9
@@ -50,10 +51,10 @@ build-and-deploy-docs:
     - tags
     - external_pull_requests
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - cd docs
     - julia --color=yes build_docs.jl
   allow_failure: false

From c8bac39d3fdf47069be5cc1efcdc5119019a16e6 Mon Sep 17 00:00:00 2001
From: Carsten Bauer
Date: Thu, 24 Aug 2023 14:48:41 +0200
Subject: [PATCH 5/5] membw saxpy etc

---
 Project.toml                           |   2 +-
 ext/AMDGPUExt/implementations/membw.jl | 163 ++++++++++++-------------
 test/tests_bandwidth.jl                |  29 +++--
 test/tests_core.jl                     |  27 ++--
 4 files changed, 116 insertions(+), 105 deletions(-)

diff --git a/Project.toml b/Project.toml
index c4a6b50..e478059 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,7 +29,7 @@ AMDGPUExt = "AMDGPU"
 CairoMakieExt = "CairoMakie"
 
 [compat]
-AMDGPU = "0.5"
+AMDGPU = "0.5.5"
 CUDA = "3.8.4, 3.12, 4.4"
 CairoMakie = "0.7, 0.10.7"
 CpuId = "0.3"
diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl
index d177993..b309e05 100644
--- a/ext/AMDGPUExt/implementations/membw.jl
+++ b/ext/AMDGPUExt/implementations/membw.jl
@@ -45,7 +45,7 @@ function GPUInspector.memory_bandwidth_scaling(
 )
     bandwidths = zeros(length(sizes))
     for (i, s) in enumerate(sizes)
-        bandwidths[i] = memory_bandwidth(
+        bandwidths[i] = GPUInspector.memory_bandwidth(
             AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs...
         )
         clear_gpu_memory(AMDBackend(); device=device)
@@ -71,93 +71,82 @@ function GPUInspector.memory_bandwidth_scaling(
     return (sizes=sizes, bandwidths=bandwidths)
 end
 
-# """
-# Extra keyword arguments:
-# * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`.
-
-# (This method is from the NVIDIA Backend.)
-# """ -# function memory_bandwidth_saxpy( -# ::NVIDIABackend; -# device=CUDA.device(), -# size=2^20 * 10, -# nbench=10, -# dtype=Float32, -# cublas=true, -# verbose=true, -# io=getstdout(), -# )::Float64 -# device!(device) do -# a = dtype(pi) -# x = CUDA.rand(dtype, size) -# y = CUDA.rand(dtype, size) -# z = CUDA.zeros(dtype, size) +function GPUInspector.memory_bandwidth_saxpy( + ::AMDBackend; + device=AMDGPU.device(), + size=2^26, + nbench=10, + dtype=Float32, + verbose=true, + io=getstdout(), +)::Float64 + device!(device) do + a = dtype(pi) + x = AMDGPU.rand(dtype, size) + y = AMDGPU.rand(dtype, size) + z = AMDGPU.zeros(dtype, size) -# nthreads = CUDA.attribute(device, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) -# nblocks = cld(size, nthreads) -# t = Inf -# for _ in 1:nbench -# if cublas -# Δt = CUDA.@elapsed CUBLAS.axpy!(size, a, x, y) -# else -# Δt = CUDA.@elapsed @cuda( -# threads = nthreads, blocks = nblocks, _saxpy_gpu_kernel!(z, a, x, y) -# ) -# end -# t = min(t, Δt) -# end + kernel = @roc launch = false _saxpy_gpu_kernel!(z, a, x, y) + occupancy = AMDGPU.launch_configuration(kernel) + t = Inf + for _ in 1:nbench + Δt = AMDGPU.@elapsed @roc( + groupsize = occupancy.groupsize, _saxpy_gpu_kernel!(z, a, x, y) + ) + t = min(t, Δt) + end -# bandwidth = 3.0 * sizeof(dtype) * size * (1024)^(-3) / t -# if verbose -# printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) -# print(io, " └ max: ") -# printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) -# end -# return bandwidth -# end -# end + bandwidth = 3.0 * sizeof(dtype) * size / t / (1024)^3 + if verbose + printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true) + print(io, " └ max: ") + printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true) + end + return bandwidth + end +end -# function _saxpy_gpu_kernel!(z, a, x, y) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# if i <= length(z) -# @inbounds z[i] = a * x[i] + y[i] -# end -# return nothing -# end +function _saxpy_gpu_kernel!(z, a, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i <= length(z) + @inbounds z[i] = a * x[i] + y[i] + end + return nothing +end -# function memory_bandwidth_saxpy_scaling( -# ::NVIDIABackend; -# device=CUDA.device(), -# sizes=[2^20 * i for i in 10:10:300], -# verbose=true, -# io=getstdout(), -# kwargs..., -# ) -# # sizes = [2^20 * i for i in 8:128] # V100 -# bandwidths = zeros(length(sizes)) -# for (i, s) in enumerate(sizes) -# bandwidths[i] = memory_bandwidth_saxpy( -# NVIDIABackend(); device=device, size=s, verbose=false, kwargs... 
-# ) -# clear_gpu_memory(AMDBackend(); device=device) -# end -# if verbose -# peak_val, idx = findmax(bandwidths) -# peak_size = sizes[idx] -# p = UnicodePlots.lineplot( -# sizes, -# bandwidths; -# xlabel="vector length", -# ylabel="GiB/s", -# title=string( -# "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))" -# ), -# xscale=:log2, -# ) -# UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) -# println(io) # top margin -# println(io, p) -# println(io) # bottom margin -# end -# return (sizes=sizes, bandwidths=bandwidths) -# end +function GPUInspector.memory_bandwidth_saxpy_scaling( + ::AMDBackend; + device=AMDGPU.device(), + sizes=[2^20 * i for i in 10:10:300], + verbose=true, + io=getstdout(), + kwargs..., +) + # sizes = [2^20 * i for i in 8:128] # V100 + bandwidths = zeros(length(sizes)) + for (i, s) in enumerate(sizes) + bandwidths[i] = GPUInspector.memory_bandwidth_saxpy( + AMDBackend(); device=device, size=s, verbose=false, kwargs... + ) + clear_gpu_memory(AMDBackend(); device=device) + end + if verbose + peak_val, idx = findmax(bandwidths) + peak_size = sizes[idx] + p = UnicodePlots.lineplot( + sizes, + bandwidths; + xlabel="vector length", + ylabel="GiB/s", + title=string( + "Peak: ", round(peak_val; digits=2), " GiB/s (vector size = $(bytes(peak_size)))" + ), + xscale=:log2, + ) + UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red) + println(io) # top margin + println(io, p) + println(io) # bottom margin + end + return (sizes=sizes, bandwidths=bandwidths) +end diff --git a/test/tests_bandwidth.jl b/test/tests_bandwidth.jl index 7f81fc4..6626617 100644 --- a/test/tests_bandwidth.jl +++ b/test/tests_bandwidth.jl @@ -41,16 +41,27 @@ end @testset "host2device_bandwidth" begin @test isnothing(host2device_bandwidth()) - @test isnothing(host2device_bandwidth(; memsize=MB(100))) - @test isnothing(host2device_bandwidth(; dtype=Float16)) + @test isnothing(host2device_bandwidth(; memsize=MB(1))) + @test isnothing(host2device_bandwidth(; dtype=Float64)) end @testset "memory_bandwidth" begin - @test typeof(memory_bandwidth()) == Float64 - @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64 - @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 - - @test typeof(memory_bandwidth_saxpy()) == Float64 - @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64 - @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64 + @testset "regular" begin + @test typeof(memory_bandwidth()) == Float64 + @test typeof(memory_bandwidth(; memsize=MiB(1))) == Float64 + @test typeof(memory_bandwidth(; dtype=Float32)) == Float64 + end + @testset "regular, scaling" begin + @test typeof(memory_bandwidth_scaling()) == + NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Float64},Vector{Float64}}} + end + @testset "saxpy" begin + @test typeof(memory_bandwidth_saxpy()) == Float64 + @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64 + @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64 + end + @testset "saxpy, scaling" begin + @test typeof(memory_bandwidth_saxpy_scaling()) == + NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Int64},Vector{Float64}}} + end end diff --git a/test/tests_core.jl b/test/tests_core.jl index c6bed8c..1db7309 100644 --- a/test/tests_core.jl +++ b/test/tests_core.jl @@ -1,10 +1,21 @@ @testset "Backend switching" begin - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_cuda_loaded() - @test GPUInspector.is_backend_loaded(NVIDIABackend()) - @test backend() == 
NVIDIABackend() - @test isnothing(backend!(NoBackend())) - @test backend() == NoBackend() - @test isnothing(backend!(:cuda)) - @test backend() == NVIDIABackend() + if TEST_BACKEND == NVIDIABackend() + @test GPUInspector.is_cuda_loaded() + @test GPUInspector.is_backend_loaded(NVIDIABackend()) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:cuda)) + @test backend() == NVIDIABackend() + @test isnothing(backend!(NVIDIABackend())) + elseif TEST_BACKEND == AMDBackend() + @test GPUInspector.is_amdgpu_loaded() + @test GPUInspector.is_backend_loaded(AMDBackend()) + @test backend() == AMDBackend() + @test isnothing(backend!(NoBackend())) + @test backend() == NoBackend() + @test isnothing(backend!(:amd)) + @test backend() == AMDBackend() + @test isnothing(backend!(AMDBackend())) + end end
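To close the loop on what the series enables, here is a minimal sketch of a session on an AMD node, exercising only functionality that this series implements for `AMDBackend` (the sizes are illustrative; loading AMDGPU.jl activates the `AMDGPUExt` extension and selects the backend):

    using GPUInspector
    using AMDGPU                          # backend() is now AMDBackend()

    gpuinfo()                             # HIP properties of the current device
    memory_bandwidth(; memsize=MiB(512))  # memcpy-based bandwidth in GiB/s
    memory_bandwidth_saxpy(; size=2^26)   # kernel reads x and y and writes z,
                                          # hence the factor 3 in the GiB/s formula
    memory_bandwidth_saxpy_scaling()      # sweep over vector lengths (UnicodePlots)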