diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7ae5973..5900b30 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,32 +2,45 @@ stages:
   - test
   - documentation
 variables:
-  SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
   JULIA_NUM_THREADS: "10"
   JULIA_EXCLUSIVE: "1"
-  JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
-  MKL_DYNAMIC: "false"
-  MKL_NUM_THREADS: "1"
+  JULIAHPC_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
+  JULIA_1_9: "lang/Julia/1.9.2-linux-x86_64"
 default:
   tags:
     - bauerc-noctua2

 # Generates code coverage
-julia/1.9:
+julia/1.9/NVIDIA:
   stage: test
   rules:
     - changes:
        - "README.md"
     - when: on_success
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = true);'
     - julia --color=yes --project=test/coverage -e 'import Pkg; Pkg.instantiate()'
     - julia --color=yes --project=test/coverage test/coverage/coverage.jl
   allow_failure: false
+julia/1.9/AMD:
+  stage: test
+  rules:
+    - changes:
+       - "README.md"
+    - when: on_success
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:20:00 -A pc2-mitarbeiter -p hacc --exclusive"
+  script:
+    - /bin/bash -l
+    - module load $JULIA_1_9
+    - julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);'
+  allow_failure: true

 # Documentation
 build-and-deploy-docs:
@@ -37,9 +50,11 @@ build-and-deploy-docs:
     - pushes
     - tags
     - external_pull_requests
+  variables:
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
   script:
     - /bin/bash -l
-    - module load $JULIA_1_9
+    - module load $JULIAHPC_1_9
     - cd docs
     - julia --color=yes build_docs.jl
   allow_failure: false
diff --git a/Project.toml b/Project.toml
index 053af89..e478059 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,20 +14,22 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"

 [extensions]
 CUDAExt = "CUDA"
+AMDGPUExt = "AMDGPU"
 CairoMakieExt = "CairoMakie"

 [compat]
+AMDGPU = "0.5.5"
 CUDA = "3.8.4, 3.12, 4.4"
 CairoMakie = "0.7, 0.10.7"
 CpuId = "0.3"
@@ -35,18 +37,16 @@ DocStringExtensions = "0.9"
 Glob = "1.3"
 HDF5 = "0.16"
 NVTX = "0.3"
-Reexport = "1.2"
-TestItemRunner = "0.2"
 ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7"
 UnicodePlots = "2.8, 3"
 julia = "1.9"

 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

 [targets]
-test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "TestItemRunner"]
+test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU"]
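With AMDGPU registered under [weakdeps] and wired to AMDGPUExt under [extensions], Julia 1.9+ compiles and loads the extension code only once AMDGPU.jl is loaded into the session. A minimal sketch of the intended behavior (the comments are illustrative, not actual REPL output):

    using GPUInspector   # core package only
    using AMDGPU         # triggers loading of the AMDGPUExt extension
    backend()            # AMDBackend(), set in AMDGPUExt.__init__ (see below)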
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
new file mode 100644
index 0000000..2edd10c
--- /dev/null
+++ b/ext/AMDGPUExt/AMDGPUExt.jl
@@ -0,0 +1,76 @@
+module AMDGPUExt
+
+using GPUInspector
+using AMDGPU
+using AMDGPU: device, device!, devices
+
+# stdlibs etc.
+using Base: UUID
+using Statistics
+using Logging
+using LinearAlgebra
+
+# pkgs
+using UnicodePlots
+using ThreadPinning
+
+# for usage in AMDGPUExt
+using GPUInspector:
+    logspace,
+    ismonitoring,
+    _monitoring!,
+    _set_monitoring_task,
+    _get_monitoring_task,
+    MonitoringResults,
+    _defaultylims,
+    @unroll,
+    AMDBackend,
+    getstdout
+
+include("utility.jl")
+# include("stresstests.jl")
+# include("peakflops_gpu_fmas.jl")
+# include("peakflops_gpu_wmmas.jl")
+# include("peakflops_gpu_matmul.jl")
+include("implementations/general.jl")
+include("implementations/gpuinfo.jl")
+# include("implementations/p2p_bandwidth.jl")
+include("implementations/host2device_bandwidth.jl")
+include("implementations/membw.jl")
+# include("implementations/stresstest.jl")
+# include("implementations/monitoring.jl")
+# include("implementations/peakflops_gpu.jl")
+
+function __init__()
+    GPUInspector.AMDGPUJL_LOADED[] = true
+    GPUInspector.backend!(AMDBackend())
+    GPUInspector.AMDGPUExt = Base.get_extension(GPUInspector, :AMDGPUExt)
+    return nothing
+end
+
+function backendinfo(::AMDBackend)
+    # somewhat crude way to figure out which API functions are implemented :)
+    funcs = String[]
+    impl_dir = joinpath(@__DIR__, "implementations/")
+    for f in readdir(impl_dir)
+        lines = readlines(joinpath(impl_dir, f))
+        func_lines = filter(startswith("function"), lines)
+        for fl in func_lines
+            fname = strip(split(split(fl, "function")[2], "(")[1])
+            if startswith(fname, "_") || startswith(fname, "Base")
+                continue
+            end
+            if fname in funcs # avoid duplicates
+                continue
+            end
+            push!(funcs, fname)
+        end
+    end
+    println("Implemented API functions for AMDBackend:")
+    for f in funcs
+        println("\t", f)
+    end
+    return nothing
+end
+
+end # module
diff --git a/ext/AMDGPUExt/implementations/general.jl b/ext/AMDGPUExt/implementations/general.jl
new file mode 100644
index 0000000..4735281
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/general.jl
@@ -0,0 +1,21 @@
+function GPUInspector.functional(::AMDBackend; verbose=true)
+    if AMDGPU.functional()
+        verbose && @info("AMDGPU.jl is functional.")
+        working = true
+    else
+        verbose && @info("AMDGPU.jl is not functional.")
+        working = false
+    end
+    return working
+end
+
+function GPUInspector.clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        AMDGPU.HIP.reclaim()
+    end
+    return nothing
+end
+
+GPUInspector.device(::AMDBackend) = AMDGPU.device()
+GPUInspector.devices(::AMDBackend) = AMDGPU.devices()
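Note the pattern used throughout the new extension: instead of importing names, methods are attached by qualification (GPUInspector.functional, GPUInspector.clear_gpu_memory, ...), and the exported stubs forward to the active backend. A usage sketch, assuming the AMD backend is active:

    using GPUInspector, AMDGPU
    functional()         # -> GPUInspector.functional(AMDBackend(); ...)
    clear_gpu_memory()   # -> GPUInspector.clear_gpu_memory(AMDBackend(); ...)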
diff --git a/ext/AMDGPUExt/implementations/gpuinfo.jl b/ext/AMDGPUExt/implementations/gpuinfo.jl
new file mode 100644
index 0000000..64c4d70
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -0,0 +1,70 @@
+function GPUInspector.ngpus(::AMDBackend)
+    return length(AMDGPU.devices())
+end
+
+function GPUInspector.gpus(::AMDBackend; io=getstdout())
+    # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
+    devs = AMDGPU.devices()
+    if isempty(devs)
+        println(io, "No AMD devices found.")
+    elseif length(devs) == 1
+        println(io, "1 device:")
+    else
+        println(io, length(devs), " devices:")
+    end
+    for (i, dev) in enumerate(devs)
+        mem_free, mem_tot = AMDGPU.device!(dev) do
+            AMDGPU.Runtime.Mem.info()
+        end
+        println(
+            io,
+            "  $(_gpuid(dev)): ",
+            repr(dev),
+            " ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)",
+        )
+    end
+end
+
+"""
+    gpuinfo(deviceid::Integer)
+
+Print out detailed information about the AMD GPU with the given `deviceid`.
+
+(This method is from the AMD backend.)
+"""
+function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout())
+    0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id."))
+    return gpuinfo(HIPDevice(deviceid); io)
+end
+function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout())
+    # printing
+    println(io, "Device: $dev \n")
+    show(io, AMDGPU.HIP.properties(dev))
+    return nothing
+end
+
+function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout())
+    # check p2p access
+    ndevs = ngpus(AMDBackend())
+    if ndevs <= 1
+        error("Only a single GPU available.")
+    else
+        devs = AMDGPU.devices()
+        mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs)
+        for i in 1:ndevs
+            for j in 1:ndevs
+                if i != j
+                    mat_p2p_can_access[i, j] = Bool(AMDGPU.HIP.can_access_peer(devs[i], devs[j]))
+                else
+                    mat_p2p_can_access[i, j] = false
+                end
+            end
+        end
+
+        printstyled(io, "P2P Can Access:\n"; bold=true)
+        show(io, "text/plain", mat_p2p_can_access)
+        println(io)
+        println(io)
+    end
+    return nothing
+end
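With these methods in place the generic inspection calls work unchanged on AMD hardware, e.g. (a sketch; output depends on the machine, and device ids are 0-based as validated above):

    gpus()                 # one line per HIP device incl. free/total memory
    gpuinfo()              # HIP properties of the current device
    gpuinfo(0)             # the same, selected by device id
    gpuinfo_p2p_access()   # Bool matrix of peer access (requires ≥ 2 GPUs)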
diff --git a/ext/AMDGPUExt/implementations/host2device_bandwidth.jl b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl
new file mode 100644
index 0000000..8095f45
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/host2device_bandwidth.jl
@@ -0,0 +1,85 @@
+function GPUInspector.host2device_bandwidth(
+    ::AMDBackend;
+    memsize::UnitPrefixedBytes=GiB(0.5),
+    dtype=Cchar,
+    DtoDfactor=true,
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    N = Int(bytes(memsize) ÷ sizeof(dtype))
+    mem_host = rand(dtype, N)
+    # mem_host_pinned = Mem.pin(rand(dtype, N)) # TODO
+    mem_gpu = AMDGPU.rand(dtype, N)
+
+    _perform_memcpy(mem_host, mem_gpu; title="Host <-> Device", verbose, io=io, kwargs...)
+    verbose && println(io)
+    # _perform_memcpy(
+    #     mem_host_pinned,
+    #     mem_gpu;
+    #     title="Host (pinned) <-> Device",
+    #     verbose,
+    #     io=io,
+    #     kwargs...,
+    # )
+    # verbose && println()
+    # _perform_memcpy(mem_gpu, mem_gpu2; title="Device <-> Device (same device)", DtoDfactor, verbose, kwargs...)
+    return nothing
+end
+
+function _perform_memcpy(
+    mem1,
+    mem2;
+    title="",
+    nbench=10,
+    times=false,
+    stats=false,
+    DtoDfactor=false,
+    verbose=true,
+    io=getstdout(),
+)
+    sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
+    ts = zeros(nbench)
+
+    @inbounds for i in 1:nbench
+        if i % 2 == 0
+            ts[i] = AMDGPU.@elapsed copyto!(mem1, mem2)
+        else
+            ts[i] = AMDGPU.@elapsed copyto!(mem2, mem1)
+        end
+    end
+
+    t_min = minimum(ts)
+    t_max = maximum(ts)
+    t_avg = mean(ts)
+
+    actual_memsize_GiB = sizeof(mem1) * 2^(-30)
+    if DtoDfactor
+        actual_memsize_GiB *= 2 # must count both the read and the write here (taken from the p2pBandwidthLatencyTest CUDA sample)
+    end
+    bws = actual_memsize_GiB ./ ts
+    bw_min = minimum(bws)
+    bw_max = maximum(bws)
+    bw_avg = mean(bws)
+
+    if verbose
+        if times
+            println(io, "t_min: $t_min")
+            println(io, "t_max: $t_max")
+            println(io, "t_avg: $t_avg")
+        end
+        printstyled(io, "$(title) Bandwidth (GiB/s):\n"; bold=true)
+        if stats
+            print(io, " ├ max: ")
+            printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
+            println(io, " ├ min: ", round(bw_min; digits=2))
+            println(io, " ├ avg: ", round(bw_avg; digits=2))
+            print(io, " └ std_dev: ")
+            printstyled(io, round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
+        else
+            print(io, " └ max: ")
+            printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
+        end
+    end
+    return bw_max
+end
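For reference, _perform_memcpy computes the bandwidth as payload size in GiB divided by copy time: copying the default 0.5 GiB buffer in 25 ms yields 0.5 / 0.025 = 20 GiB/s. With DtoDfactor the payload is doubled because a device-to-device copy both reads and writes device memory, following the convention of the p2pBandwidthLatencyTest CUDA sample.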
diff --git a/ext/AMDGPUExt/implementations/membw.jl b/ext/AMDGPUExt/implementations/membw.jl
new file mode 100644
index 0000000..b309e05
--- /dev/null
+++ b/ext/AMDGPUExt/implementations/membw.jl
@@ -0,0 +1,152 @@
+# function theoretical_memory_bandwidth(
+#     ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
+# )
+#     max_mem_clock_rate =
+#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
+#     max_mem_bus_width =
+#         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) / 8.0 # in bytes
+#     max_bw = 2.0 * max_mem_clock_rate * max_mem_bus_width * 2^(-30)
+#     if verbose
+#         printstyled(io, "Theoretical Maximal Memory Bandwidth (GiB/s):\n"; bold=true)
+#         print(io, " └ max: ")
+#         printstyled(io, round(max_bw; digits=1), "\n"; color=:green, bold=true)
+#     end
+#     return max_bw
+# end
+
+function GPUInspector.memory_bandwidth(
+    ::AMDBackend;
+    memsize::UnitPrefixedBytes=GiB(0.5),
+    dtype=Cchar,
+    verbose=true,
+    DtoDfactor=true,
+    device=AMDGPU.device(),
+    io=getstdout(),
+    kwargs...,
+)::Float64
+    AMDGPU.device!(device) do
+        N = Int(bytes(memsize) ÷ sizeof(dtype))
+        mem_gpu = AMDGPU.rand(dtype, N)
+        mem_gpu2 = AMDGPU.rand(dtype, N)
+
+        return _perform_memcpy(
+            mem_gpu, mem_gpu2; title="Memory", DtoDfactor, verbose, io=io, kwargs...
+        )
+    end
+end
+
+function GPUInspector.memory_bandwidth_scaling(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    sizes=logspace(1, exp2(30), 10),
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    bandwidths = zeros(length(sizes))
+    for (i, s) in enumerate(sizes)
+        bandwidths[i] = GPUInspector.memory_bandwidth(
+            AMDBackend(); memsize=B(s), device=device, verbose=false, kwargs...
+        )
+        clear_gpu_memory(AMDBackend(); device=device)
+    end
+    if verbose
+        peak_val, idx = findmax(bandwidths)
+        peak_size = sizes[idx]
+        p = UnicodePlots.lineplot(
+            sizes,
+            bandwidths;
+            xlabel="data size",
+            ylabel="GiB/s",
+            title=string(
+                "Peak: ", round(peak_val; digits=2), " GiB/s (size = $(bytes(peak_size)))"
+            ),
+            xscale=:log2,
+        )
+        UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red)
+        println(io) # top margin
+        println(io, p)
+        println(io) # bottom margin
+    end
+    return (sizes=sizes, bandwidths=bandwidths)
+end
+
+function GPUInspector.memory_bandwidth_saxpy(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    size=2^26,
+    nbench=10,
+    dtype=Float32,
+    verbose=true,
+    io=getstdout(),
+)::Float64
+    device!(device) do
+        a = dtype(pi)
+        x = AMDGPU.rand(dtype, size)
+        y = AMDGPU.rand(dtype, size)
+        z = AMDGPU.zeros(dtype, size)
+
+        kernel = @roc launch = false _saxpy_gpu_kernel!(z, a, x, y)
+        occupancy = AMDGPU.launch_configuration(kernel)
+        t = Inf
+        for _ in 1:nbench
+            Δt = AMDGPU.@elapsed @roc(
+                groupsize = occupancy.groupsize, _saxpy_gpu_kernel!(z, a, x, y)
+            )
+            t = min(t, Δt)
+        end
+
+        bandwidth = 3.0 * sizeof(dtype) * size / t / (1024)^3
+        if verbose
+            printstyled(io, "Memory Bandwidth (GiB/s):\n"; bold=true)
+            print(io, " └ max: ")
+            printstyled(io, round(bandwidth; digits=2), "\n"; color=:green, bold=true)
+        end
+        return bandwidth
+    end
+end
+
+function _saxpy_gpu_kernel!(z, a, x, y)
+    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    if i <= length(z)
+        @inbounds z[i] = a * x[i] + y[i]
+    end
+    return nothing
+end
+
+function GPUInspector.memory_bandwidth_saxpy_scaling(
+    ::AMDBackend;
+    device=AMDGPU.device(),
+    sizes=[2^20 * i for i in 10:10:300],
+    verbose=true,
+    io=getstdout(),
+    kwargs...,
+)
+    # sizes = [2^20 * i for i in 8:128] # V100
+    bandwidths = zeros(length(sizes))
+    for (i, s) in enumerate(sizes)
+        bandwidths[i] = GPUInspector.memory_bandwidth_saxpy(
+            AMDBackend(); device=device, size=s, verbose=false, kwargs...
+        )
+        clear_gpu_memory(AMDBackend(); device=device)
+    end
+    if verbose
+        peak_val, idx = findmax(bandwidths)
+        peak_size = sizes[idx]
+        p = UnicodePlots.lineplot(
+            sizes,
+            bandwidths;
+            xlabel="vector length",
+            ylabel="GiB/s",
+            title=string(
+                "Peak: ", round(peak_val; digits=2), " GiB/s (vector size = $(bytes(peak_size)))"
+            ),
+            xscale=:log2,
+        )
+        UnicodePlots.lineplot!(p, [peak_size, peak_size], [0.0, peak_val]; color=:red)
+        println(io) # top margin
+        println(io, p)
+        println(io) # bottom margin
+    end
+    return (sizes=sizes, bandwidths=bandwidths)
+end
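The factor 3.0 in the SAXPY bandwidth formula reflects that z[i] = a * x[i] + y[i] touches three arrays per element (two reads, one write). Worked example with the defaults dtype=Float32 and size=2^26: 3 * 4 B * 2^26 = 0.75 GiB of traffic, so a kernel time of 1 ms corresponds to 750 GiB/s.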
diff --git a/ext/AMDGPUExt/utility.jl b/ext/AMDGPUExt/utility.jl
new file mode 100644
index 0000000..f9878ea
--- /dev/null
+++ b/ext/AMDGPUExt/utility.jl
@@ -0,0 +1,5 @@
+_device2string(dev::HIPDevice) = "GPU $(_gpuid(dev)): $(_name(dev))"
+
+_gpuid(dev::HIPDevice) = AMDGPU.HIP.device_id(dev) + 1
+
+_name(dev::HIPDevice) = AMDGPU.HIP.name(dev)
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
index 9d2a770..0703604 100644
--- a/ext/CUDAExt/CUDAExt.jl
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -12,6 +12,7 @@ using LinearAlgebra
 # pkgs
 using UnicodePlots
 using NVTX
+using ThreadPinning

 # for usage in CUDAExt
 using GPUInspector:
@@ -23,38 +24,8 @@ using GPUInspector:
     MonitoringResults,
     _defaultylims,
     @unroll,
-    NVIDIABackend
-
-# import stubs to implement them
-import GPUInspector: backendinfo, functional
-# gpuinfo
-import GPUInspector: ngpus, gpuinfo, gpuinfo_p2p_access, gpus
-# p2p bw
-import GPUInspector:
-    p2p_bandwidth,
-    p2p_bandwidth_all,
-    p2p_bandwidth_bidirectional,
-    p2p_bandwidth_bidirectional_all
-# host2device bw
-import GPUInspector: host2device_bandwidth
-# membw
-import GPUInspector:
-    theoretical_memory_bandwidth,
-    memory_bandwidth,
-    memory_bandwidth_scaling,
-    memory_bandwidth_saxpy,
-    memory_bandwidth_saxpy_scaling
-# stresstest
-import GPUInspector: stresstest
-# monitoring
-import GPUInspector:
-    monitoring_start,
-    monitoring_stop,
-    livemonitor_something,
-    livemonitor_powerusage,
-    livemonitor_temperature
-# peakflops_gpu
-import GPUInspector: peakflops_gpu, theoretical_peakflops_gpu
+    NVIDIABackend,
+    getstdout

 # for convenience
 const BFloat16 = CUDA.BFloat16
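This drops the long import list in favor of defining the API methods fully qualified, matching the style of the new AMDGPUExt. Both forms extend the same functions; the qualified form simply avoids keeping an import list in sync:

    # before: import, then extend the bare name
    import GPUInspector: ngpus
    ngpus(::NVIDIABackend) = length(CUDA.devices())

    # after: extend by qualification, no import needed
    GPUInspector.ngpus(::NVIDIABackend) = length(CUDA.devices())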
diff --git a/ext/CUDAExt/implementations/general.jl b/ext/CUDAExt/implementations/general.jl
index b44e5fd..f147679 100644
--- a/ext/CUDAExt/implementations/general.jl
+++ b/ext/CUDAExt/implementations/general.jl
@@ -1,4 +1,4 @@
-function functional(::NVIDIABackend; verbose=true)
+function GPUInspector.functional(::NVIDIABackend; verbose=true)
     if CUDA.functional()
         verbose && @info("CUDA/GPU available.")
         hascuda = true
@@ -21,3 +21,14 @@ function functional(::NVIDIABackend; verbose=true)
     end
     return hascuda
 end
+
+function GPUInspector.clear_gpu_memory(::NVIDIABackend; device=CUDA.device(), gc=true)
+    device!(device) do
+        gc && GC.gc()
+        CUDA.reclaim()
+    end
+    return nothing
+end
+
+GPUInspector.device(::NVIDIABackend) = CUDA.device()
+GPUInspector.devices(::NVIDIABackend) = CUDA.devices()
diff --git a/ext/CUDAExt/implementations/gpuinfo.jl b/ext/CUDAExt/implementations/gpuinfo.jl
index b960b08..bff635d 100644
--- a/ext/CUDAExt/implementations/gpuinfo.jl
+++ b/ext/CUDAExt/implementations/gpuinfo.jl
@@ -1,6 +1,8 @@
-ngpus(::NVIDIABackend) = length(CUDA.devices())
+function GPUInspector.ngpus(::NVIDIABackend)
+    length(CUDA.devices())
+end

-function gpus(::NVIDIABackend; io::IO=stdout)
+function GPUInspector.gpus(::NVIDIABackend; io=getstdout())
     # Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
     devs = devices()
     if isempty(devs)
@@ -41,13 +43,13 @@ Print out detailed information about the NVIDIA GPU with the given `deviceid`.

 Heavily inspired by the CUDA sample "deviceQueryDrv.cpp".

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function gpuinfo(::NVIDIABackend, deviceid::Integer; io::IO=stdout)
+function GPUInspector.gpuinfo(::NVIDIABackend, deviceid::Integer; io=getstdout())
     0 <= deviceid <= ngpus(NVIDIABackend()) - 1 || throw(ArgumentError("Invalid device id."))
     return gpuinfo(CuDevice(deviceid); io)
 end
-function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
+function GPUInspector.gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io=getstdout())
     # query
     mp = nmultiprocessors(dev)
     cores = ncudacores(dev)
@@ -214,7 +216,7 @@ function gpuinfo(::NVIDIABackend, dev::CuDevice=CUDA.device(); io::IO=stdout)
     return nothing
 end

-function gpuinfo_p2p_access(::NVIDIABackend; io::IO=stdout)
+function GPUInspector.gpuinfo_p2p_access(::NVIDIABackend; io=getstdout())
     # check p2p access
     ndevs = ngpus(NVIDIABackend())
     if ndevs <= 1
diff --git a/ext/CUDAExt/implementations/host2device_bandwidth.jl b/ext/CUDAExt/implementations/host2device_bandwidth.jl
index 82f2f5d..5b3b31c 100644
--- a/ext/CUDAExt/implementations/host2device_bandwidth.jl
+++ b/ext/CUDAExt/implementations/host2device_bandwidth.jl
@@ -1,9 +1,9 @@
-function host2device_bandwidth(::NVIDIABackend;
+function GPUInspector.host2device_bandwidth(::NVIDIABackend;
     memsize::UnitPrefixedBytes=GiB(0.5),
     dtype=Cchar,
     DtoDfactor=true,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     N = Int(bytes(memsize) ÷ sizeof(dtype))
@@ -42,7 +42,7 @@ function _perform_memcpy(
     stats=false,
     DtoDfactor=false,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     NVTX.@range "host2dev: $title" begin
         sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
diff --git a/ext/CUDAExt/implementations/membw.jl b/ext/CUDAExt/implementations/membw.jl
index c0b68f1..a9dc2bf 100644
--- a/ext/CUDAExt/implementations/membw.jl
+++ b/ext/CUDAExt/implementations/membw.jl
@@ -1,5 +1,5 @@
-function theoretical_memory_bandwidth(
-    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io::IO=stdout
+function GPUInspector.theoretical_memory_bandwidth(
+    ::NVIDIABackend; device::CuDevice=CUDA.device(), verbose=true, io=getstdout()
 )
     max_mem_clock_rate =
         CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) * 1000 # in Hz
@@ -14,14 +14,14 @@ function theoretical_memory_bandwidth(
     return max_bw
 end

-function memory_bandwidth(
+function GPUInspector.memory_bandwidth(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=GiB(0.5),
     dtype=Cchar,
     verbose=true,
     DtoDfactor=true,
     device=CUDA.device(),
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )::Float64
     device!(device) do
@@ -41,12 +41,12 @@ function memory_bandwidth(
     end
 end

-function memory_bandwidth_scaling(
+function GPUInspector.memory_bandwidth_scaling(
     ::NVIDIABackend;
     device=CUDA.device(),
     sizes=logspace(1, exp2(30), 10),
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     bandwidths = zeros(length(sizes))
@@ -54,7 +54,7 @@ function memory_bandwidth_scaling(
         bandwidths[i] = memory_bandwidth(
             NVIDIABackend(); memsize=B(s), device=device, verbose=false, kwargs...
         )
-        clear_gpu_memory(device)
+        clear_gpu_memory(NVIDIABackend(); device=device)
     end
     if verbose
         peak_val, idx = findmax(bandwidths)
@@ -81,9 +81,9 @@ end
 Extra keyword arguments:
 * `cublas` (default: `true`): toggle between `CUDA.axpy!` and a custom `_saxpy_gpu_kernel!`.

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function memory_bandwidth_saxpy(
+function GPUInspector.memory_bandwidth_saxpy(
     ::NVIDIABackend;
     device=CUDA.device(),
     size=2^20 * 10,
@@ -91,7 +91,7 @@ function memory_bandwidth_saxpy(
     dtype=Float32,
     cublas=true,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )::Float64
     device!(device) do
         a = dtype(pi)
@@ -131,12 +131,12 @@ function _saxpy_gpu_kernel!(z, a, x, y)
     return nothing
 end

-function memory_bandwidth_saxpy_scaling(
+function GPUInspector.memory_bandwidth_saxpy_scaling(
     ::NVIDIABackend;
     device=CUDA.device(),
     sizes=[2^20 * i for i in 10:10:300],
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     # sizes = [2^20 * i for i in 8:128] # V100
@@ -145,7 +145,7 @@ function memory_bandwidth_saxpy_scaling(
         bandwidths[i] = memory_bandwidth_saxpy(
             NVIDIABackend(); device=device, size=s, verbose=false, kwargs...
         )
-        clear_gpu_memory(device)
+        clear_gpu_memory(NVIDIABackend(); device=device)
     end
     if verbose
         peak_val, idx = findmax(bandwidths)
diff --git a/ext/CUDAExt/implementations/monitoring.jl b/ext/CUDAExt/implementations/monitoring.jl
index 483da9b..54c24f5 100644
--- a/ext/CUDAExt/implementations/monitoring.jl
+++ b/ext/CUDAExt/implementations/monitoring.jl
@@ -1,4 +1,4 @@
-function monitoring_start(
+function GPUInspector.monitoring_start(
     ::NVIDIABackend; freq=1, devices=CUDA.devices(), thread=Threads.nthreads(), verbose=true
 )
     if ismonitoring()
@@ -54,9 +54,9 @@
 Specifically, `results` is a named tuple with the following keys:
 * `time`: the (relative) times at which we measured
 * `temperature`, `power`, `compute`, `mem`

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
+function GPUInspector.monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
     if ismonitoring()
         verbose && @info("Stopping monitoring and fetching results...")
         _monitoring!(false)
@@ -67,7 +67,7 @@ function monitoring_stop(::NVIDIABackend; verbose=true)::MonitoringResults
     end
 end

-function livemonitor_temperature(::NVIDIABackend, duration; kwargs...)
+function GPUInspector.livemonitor_temperature(::NVIDIABackend, duration; kwargs...)
     return livemonitor_something(
         NVIDIABackend(),
         get_temperatures,
@@ -78,7 +78,7 @@
     )
 end

-function livemonitor_powerusage(::NVIDIABackend, duration; kwargs...)
+function GPUInspector.livemonitor_powerusage(::NVIDIABackend, duration; kwargs...)
     return livemonitor_something(
         NVIDIABackend(),
         get_power_usages,
@@ -89,7 +89,7 @@
     )
 end

-function livemonitor_something(
+function GPUInspector.livemonitor_something(
     ::NVIDIABackend,
     f::F,
     duration;
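The monitoring workflow itself is unchanged; only the definitions move behind the backend dispatch. A typical manual session, as also exercised in tests_stresstest.jl below:

    devs = GPUInspector.devices()
    monitoring_start(; freq=1, devices=devs, verbose=false)
    stresstest(; devices=devs, duration=2, monitoring=false)
    results = monitoring_stop()   # MonitoringResults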
diff --git a/ext/CUDAExt/implementations/p2p_bandwidth.jl b/ext/CUDAExt/implementations/p2p_bandwidth.jl
index 6b5a83e..2734ecb 100644
--- a/ext/CUDAExt/implementations/p2p_bandwidth.jl
+++ b/ext/CUDAExt/implementations/p2p_bandwidth.jl
@@ -1,4 +1,4 @@
-function p2p_bandwidth(
+function GPUInspector.p2p_bandwidth(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=B(40_000_000),
     nbench=5,
@@ -9,7 +9,7 @@
     dtype=Float32,
     src=0,
     dst=1,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if ngpus(NVIDIABackend()) < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -66,7 +66,7 @@ function p2p_bandwidth(
     return bw_max
 end

-function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs...)
+function GPUInspector.p2p_bandwidth_all(::NVIDIABackend; io=getstdout(), verbose=false, kwargs...)
     ngpus = length(CUDA.devices())
     if ngpus < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -82,7 +82,7 @@ function p2p_bandwidth_all(::NVIDIABackend; io::IO=stdout, verbose=false, kwargs
     ]
 end

-function p2p_bandwidth_bidirectional(
+function GPUInspector.p2p_bandwidth_bidirectional(
     ::NVIDIABackend;
     memsize::UnitPrefixedBytes=B(40_000_000),
     nbench=20,
@@ -93,7 +93,7 @@
     dev1=0,
     dev2=1,
     repeat=100,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if ngpus(NVIDIABackend()) < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
@@ -142,7 +142,7 @@ function p2p_bandwidth_bidirectional(
     return bw_max
 end

-function p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...)
+function GPUInspector.p2p_bandwidth_bidirectional_all(::NVIDIABackend; kwargs...)
     ngpus = length(CUDA.devices())
     if ngpus < 2
         error("At least 2 GPUs are needed for the P2P benchmark.")
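Likewise for the P2P benchmarks; with the NVIDIA backend active they can still be called with the default device pair or explicitly (a sketch):

    p2p_bandwidth(; verbose=false)                      # src=0, dst=1 by default
    p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1))   # explicit devices
    p2p_bandwidth_bidirectional_all()                   # all device pairs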
diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl
index 0d6bb2e..99a117c 100644
--- a/ext/CUDAExt/implementations/peakflops_gpu.jl
+++ b/ext/CUDAExt/implementations/peakflops_gpu.jl
@@ -8,15 +8,15 @@ Estimates the theoretical peak performance of a CUDA device in TFLOP/s.
 * `dtype` (default: `tensorcores ? Float16 : Float32`): element type of the matrices
 * `io` (default: `stdout`): set the stream where the results should be printed.

-(This method is from the CUDA backend.)
+(This method is from the NVIDIA backend.)
 """
-function theoretical_peakflops_gpu(
+function GPUInspector.theoretical_peakflops_gpu(
     ::NVIDIABackend;
     device=CUDA.device(),
     tensorcores=hastensorcores(),
     dtype=tensorcores ? Float16 : Float32,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     if tensorcores
         max_peakflops = _theoretical_peakflops_gpu_tensorcores(; device, dtype)
@@ -104,12 +104,12 @@ it takes to perform
 For more keyword argument options see [`peakflops_gpu_fmas`](@ref) and
 [`peakflops_gpu_wmmas`](@ref).
 """
-function peakflops_gpu(
+function GPUInspector.peakflops_gpu(
     ::NVIDIABackend;
     tensorcores=hastensorcores(),
     verbose=true,
     dtype=tensorcores ? Float16 : Float32,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     if tensorcores
diff --git a/ext/CUDAExt/implementations/stresstest.jl b/ext/CUDAExt/implementations/stresstest.jl
index 358d9ca..9c14994 100644
--- a/ext/CUDAExt/implementations/stresstest.jl
+++ b/ext/CUDAExt/implementations/stresstest.jl
@@ -1,4 +1,4 @@
-function stresstest(
+function GPUInspector.stresstest(
     ::NVIDIABackend;
     devices=[CUDA.device()],
     mem=nothing,
@@ -12,7 +12,7 @@
     clearmem=false,
     monitoring=false,
     batch_duration=nothing,
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 )
     logger = ConsoleLogger(io)
@@ -69,7 +69,7 @@ function stresstest(
     Δt = @elapsed _run_stresstests(ts; verbose, kwargs...)
     if clearmem
         verbose && @info("Clearing GPU memory.")
-        clear_all_gpus_memory(devices)
+        GPUInspector.clear_all_gpus_memory(; devices=devices)
     end
     verbose && @info("Took $(round(Δt; digits=2)) seconds to run the tests.")
     if monitoring
diff --git a/ext/CUDAExt/peakflops_gpu_fmas.jl b/ext/CUDAExt/peakflops_gpu_fmas.jl
index a251341..eb3ff6f 100644
--- a/ext/CUDAExt/peakflops_gpu_fmas.jl
+++ b/ext/CUDAExt/peakflops_gpu_fmas.jl
@@ -48,7 +48,7 @@ function _peakflops_gpu_fmas(;
     nkernel=5,
     device::CuDevice=CUDA.device(),
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         d_a = CUDA.rand(dtype, size)
diff --git a/ext/CUDAExt/peakflops_gpu_matmul.jl b/ext/CUDAExt/peakflops_gpu_matmul.jl
index a081b69..93bf221 100644
--- a/ext/CUDAExt/peakflops_gpu_matmul.jl
+++ b/ext/CUDAExt/peakflops_gpu_matmul.jl
@@ -9,13 +9,13 @@ function peakflops_gpu_matmul_scaling(
     device=CUDA.device(),
     verbose=true,
     sizes=2 .^ (10:15),
-    io::IO=stdout,
+    io=getstdout(),
     kwargs...,
 ) where {F}
     flops = zeros(length(sizes))
     for (i, s) in enumerate(sizes)
         flops[i] = peakflops_func(; device=device, size=s, verbose=false, kwargs...)
-        clear_gpu_memory(device)
+        GPUInspector.clear_gpu_memory(; device=device)
     end
     if verbose
         peak_val, idx = findmax(flops)
@@ -64,7 +64,7 @@ function peakflops_gpu_matmul(;
     nmatmuls=5,
     nbench=5,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         C = CUDA.zeros(dtype, size, size)
@@ -108,7 +108,7 @@ function peakflops_gpu_matmul_graphs(;
     nmatmuls=5,
     nbench=5,
     verbose=true,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         C = CUDA.zeros(dtype, size, size)
diff --git a/ext/CUDAExt/peakflops_gpu_wmmas.jl b/ext/CUDAExt/peakflops_gpu_wmmas.jl
index a12295b..f6c000c 100644
--- a/ext/CUDAExt/peakflops_gpu_wmmas.jl
+++ b/ext/CUDAExt/peakflops_gpu_wmmas.jl
@@ -91,7 +91,7 @@ function _peakflops_gpu_wmmas(;
     nkernel=10,
     verbose=true,
     dtype=Float16,
-    io::IO=stdout,
+    io=getstdout(),
 )
     device!(device) do
         if Symbol(dtype) == :Float16
diff --git a/ext/CUDAExt/utility.jl b/ext/CUDAExt/utility.jl
index 7f6a504..3c9681d 100644
--- a/ext/CUDAExt/utility.jl
+++ b/ext/CUDAExt/utility.jl
@@ -22,26 +22,6 @@ function alloc_mem(memsize::UnitPrefixedBytes; devs=(CUDA.device(),), dtype=Floa
     return mem_handles
 end

-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of the currently active GPU (i.e. `device()`)."
-function clear_gpu_memory(device::CuDevice=CUDA.device(); gc=true)
-    device!(device) do
-        gc && GC.gc()
-        CUDA.reclaim()
-    end
-    return nothing
-end
-
-# TODO: Maybe make API/stub?
-"Reclaim the unused memory of all available GPUs."
-function clear_all_gpus_memory(devices=CUDA.devices(); gc=true)
-    gc && GC.gc()
-    for dev in devices
-        clear_gpu_memory(dev; gc=false)
-    end
-    return nothing
-end
-
 """
     toggle_tensorcoremath([enable::Bool; verbose=true])

 Switches the `CUDA.math_mode` between `CUDA.FAST_MATH` (`enable=true`) and `CUDA.DEFAULT_MATH` (`enable=false`).
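clear_gpu_memory and clear_all_gpus_memory leave CUDAExt and become backend-agnostic API (see src/utility.jl and src/stubs/stubs_general.jl below); note the keyword-based call style replacing the old positional device argument:

    clear_gpu_memory(; device=GPUInspector.device(), gc=true)
    clear_all_gpus_memory(; devices=GPUInspector.devices())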
diff --git a/src/GPUInspector.jl b/src/GPUInspector.jl
index 9dbe9e9..7790536 100644
--- a/src/GPUInspector.jl
+++ b/src/GPUInspector.jl
@@ -9,14 +9,16 @@ using Base: UUID
 using Pkg: Pkg

 # external
-using Reexport
-@reexport using ThreadPinning
+using ThreadPinning
 using DocStringExtensions
 using UnicodePlots
 using CpuId: cachesize
 using HDF5: h5open
 using Glob: glob

+const DEFAULT_IO = Ref{Union{IO, Nothing}}(nothing)
+getstdout() = something(DEFAULT_IO[], stdout)
+
 include("backends.jl")
 include("UnitPrefixedBytes.jl")
 include("utility.jl")
@@ -27,8 +29,8 @@ include("monitoring_io.jl")

 function not_implemented_yet()
     return error(
-        "Not implemented yet. You either haven't loaded a backend (like CUDA.jl) yet, or" *
-        " the loaded backend doesn't provide this functionality.",
+        "Not implemented yet. You either haven't loaded a backend yet (e.g. CUDA.jl or " *
+        "AMDGPU.jl), or the loaded backend doesn't provide this functionality.",
     )
 end
 include("stubs/stubs_general.jl")
@@ -42,7 +44,7 @@ include("stubs/stubs_peakflops_gpu.jl")

 # backends
 export Backend, NoBackend, NVIDIABackend, AMDBackend, backend, backend!, backendinfo
-export CUDAExt
+export CUDAExt, AMDGPUExt

 # monitoring io+plotting
 export plot_monitoring_results, load_monitoring_results, save_monitoring_results
@@ -50,11 +52,14 @@ export plot_monitoring_results, load_monitoring_results, save_monitoring_results

 # utilities
 export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, bytes, simplify, change_base, value
-export logspace
+export logspace, clear_all_gpus_memory

 # Let's currently not export the CPU tests. After all, this is GPUInspector.jl :)
 # export stresstest_cpu

+# stubs general
+export clear_gpu_memory
+
 # stubs gpuinfo
 export ngpus, gpuinfo, gpuinfo_p2p_access, gpus
 # stubs p2p bandwidth
diff --git a/src/backends.jl b/src/backends.jl
index bd5d1de..ae147fc 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -56,6 +56,7 @@ function check_backend(b::Backend)
 end

 CUDAExt::Union{Nothing,Module} = nothing
+AMDGPUExt::Union{Nothing,Module} = nothing

 """
 Query information about a specific backend, e.g., what functionality the backend currently
diff --git a/src/stubs/stubs_general.jl b/src/stubs/stubs_general.jl
index eee42ed..219d185 100644
--- a/src/stubs/stubs_general.jl
+++ b/src/stubs/stubs_general.jl
@@ -4,3 +4,19 @@ If not, print some hopefully useful debug information (or turn it off with `verb
 """
 functional(; kwargs...) = functional(backend(); kwargs...)
 functional(::Backend; kwargs...) = not_implemented_yet()
+
+"""
+    clear_gpu_memory(; device, gc)
+
+Reclaim the unused memory of a GPU.
+"""
+clear_gpu_memory(; kwargs...) = clear_gpu_memory(backend(); kwargs...)
+clear_gpu_memory(::Backend; kwargs...) = not_implemented_yet()
+
+"Return the current device of the active backend."
+device() = device(backend())
+device(::Backend) = not_implemented_yet()
+
+"Return the devices of the active backend."
+devices() = devices(backend())
+devices(::Backend) = not_implemented_yet()
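DEFAULT_IO plus getstdout() replaces the hard io::IO=stdout defaults used before: since the Ref is dereferenced at call time, all default printing can be redirected globally, which the quiet test mode below relies on. A sketch:

    GPUInspector.DEFAULT_IO[] = Base.BufferStream()   # silence default output
    gpus()                                            # now prints into the buffer
    GPUInspector.DEFAULT_IO[] = nothing               # back to stdout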
diff --git a/src/utility.jl b/src/utility.jl
index bba0d55..0b38851 100644
--- a/src/utility.jl
+++ b/src/utility.jl
@@ -2,6 +2,15 @@ function logspace(start, stop, length)
     return exp2.(range(log2(start), log2(stop); length=length))
 end

+"Reclaim the unused memory of all available GPUs."
+function clear_all_gpus_memory(; gc=true, devices)
+    gc && GC.gc()
+    for dev in devices
+        clear_gpu_memory(; device=dev, gc=false)
+    end
+    return nothing
+end
+
 # L2_cachesize() = cachesize()[2]

 # """
diff --git a/test/backend_tests.jl b/test/backend_tests.jl
deleted file mode 100644
index d772080..0000000
--- a/test/backend_tests.jl
+++ /dev/null
@@ -1,10 +0,0 @@
-@testitem "CUDA backend" begin
-    using CUDA
-    @test GPUInspector.is_cuda_loaded()
-    @test GPUInspector.is_backend_loaded(NVIDIABackend())
-    @test backend() == NVIDIABackend()
-    @test isnothing(backend!(NoBackend()))
-    @test backend() == NoBackend()
-    @test isnothing(backend!(:cuda))
-    @test backend() == NVIDIABackend()
-end
diff --git a/test/gpuinfo_tests.jl b/test/gpuinfo_tests.jl
deleted file mode 100644
index 699a766..0000000
--- a/test/gpuinfo_tests.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-@testitem "gpuinfo / gpus" begin
-    using CUDA
-    @test isnothing(gpus())
-    @test isnothing(gpuinfo())
-    @test isnothing(gpuinfo(0))
-    @test isnothing(gpuinfo(device()))
-end
diff --git a/test/peakflops_tests.jl b/test/peakflops_tests.jl
deleted file mode 100644
index 9005fce..0000000
--- a/test/peakflops_tests.jl
+++ /dev/null
@@ -1,23 +0,0 @@
-@testitem "peakflops_gpu (CUDA cores)" begin
-    using CUDA
-    @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64
-    @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) ==
-        Float64
-    @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) ==
-        Float64
-end
-
-@testitem "peakflops_gpu (Tensor cores)" begin
-    using CUDA
-    @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64
-    @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) == Float64
-end
-
-@testitem "peakflops_gpu_matmul / scaling" begin
-    using CUDA
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) == Float64
-    @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) ==
-        Tuple{Vector{Int64},Vector{Float64}}
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index 111ff35..fb1f3f8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,24 +1,100 @@
-using TestItemRunner
 using GPUInspector
-using CUDA
+using Test
+using LinearAlgebra
+using Logging

-if !GPUInspector.functional()
-    error("Can't run testsuite since CUDA/GPU not present or not functional!")
+# Environment variables:
+# - "TEST_BACKEND": can be set to manually specify a backend
+# - "TEST_QUIET": can be set to true/false to enable/disable non-verbose testing
+# - "TESTS": a comma-separated list of test suites to run (see TEST_NAMES below)
+
+# figure out which backend to use (if both CUDA and AMDGPU are functional we use CUDA)
+if haskey(ENV, "TEST_BACKEND")
+    if lowercase(ENV["TEST_BACKEND"]) in ("nvidia", "cuda", "nvidiabackend")
+        using CUDA
+        TEST_BACKEND = NVIDIABackend()
+    elseif lowercase(ENV["TEST_BACKEND"]) in ("amd", "amdgpu", "amdbackend")
+        using AMDGPU
+        TEST_BACKEND = AMDBackend()
+    else
+        error("""
+              TEST_BACKEND environment variable contains unsupported value.
+              """)
+    end
+else
+    using CUDA
+    using AMDGPU
+    if CUDA.functional()
+        @info("NVIDIA GPUs detected.", CUDA.devices())
+        TEST_BACKEND = NVIDIABackend()
+    elseif AMDGPU.functional()
+        @info("AMD GPUs detected.", AMDGPU.devices())
+        TEST_BACKEND = AMDBackend()
+    else
+        error("""
+              Aborting because neither CUDA.jl nor AMDGPU.jl is functional.
+              Are there any GPUs in the system?
+              """)
+    end
+end
+backend!(TEST_BACKEND)
+@info "Running tests with the following backend: $TEST_BACKEND."
+
+const TEST_NAMES = [
+    "bandwidth", "peakflops", "stresstest", "gpuinfo", "utility", "backend_specific", "core"
+]
+if haskey(ENV, "TESTS")
+    tests = split(ENV["TESTS"], ",")
+    if !all(t -> t in TEST_NAMES, tests)
+        error("""
+              TESTS environment variable contains unknown test names.
+              Valid test names are: $(TEST_NAMES)
+              """)
+    end
+    TARGET_TESTS = tests
+else
+    # run all tests
+    const TARGET_TESTS = TEST_NAMES
 end

-if Threads.nthreads() == 1 || (Threads.nthreads() < length(CUDA.devices()) + 1)
-    # we should have at least one thread per gpu + one monitoring thread
-    @warn(
-        "You should run the tests with at least $(length(CUDA.devices()) + 1) Julia threads.",
-        Threads.nthreads(),
-        length(CUDA.devices())
-    )
+@info "Running following tests: $TARGET_TESTS."
+
+if "stresstest" in TARGET_TESTS
+    # error if we aren't running with enough threads
+    if Threads.nthreads() == 1 || (Threads.nthreads() < ngpus() + 1)
+        # we should have at least one thread per gpu + one monitoring thread
+        error("You should run the tests with at least $(ngpus() + 1) Julia threads.")
+    end
 end

-@run_package_tests
+quiet_testing = parse(Bool, get(ENV, "TEST_QUIET", "true"))
+if quiet_testing
+    GPUInspector.DEFAULT_IO[] = Base.BufferStream()
+    global_logger(Logging.NullLogger())
+end

-include("backend_tests.jl")
-include("utility_tests.jl")
-include("stresstest_tests.jl")
-include("bandwidth_tests.jl")
-include("peakflops_tests.jl")
-include("gpuinfo_tests.jl")
+if "core" in TARGET_TESTS
+    include("tests_core.jl")
+end
+if "utility" in TARGET_TESTS
+    include("tests_utility.jl")
+end
+if "gpuinfo" in TARGET_TESTS
+    include("tests_gpuinfo.jl")
+end
+if "bandwidth" in TARGET_TESTS
+    include("tests_bandwidth.jl")
+end
+if "stresstest" in TARGET_TESTS
+    using CairoMakie
+    include("tests_stresstest.jl")
+end
+if "peakflops" in TARGET_TESTS
+    include("tests_peakflops.jl")
+end
+if "backend_specific" in TARGET_TESTS
+    if TEST_BACKEND == NVIDIABackend()
+        include("tests_nvidia_only.jl")
+    elseif TEST_BACKEND == AMDBackend()
+        include("tests_amd_only.jl")
+    end
+end
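With these hooks, backend and test selection can be driven from the environment, e.g. (a sketch; any names from TEST_NAMES are valid values for TESTS):

    using Pkg
    withenv("TEST_BACKEND" => "cuda", "TESTS" => "core,utility") do
        Pkg.test("GPUInspector")
    end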
diff --git a/test/tests_amd_only.jl b/test/tests_amd_only.jl
new file mode 100644
index 0000000..e69de29
diff --git a/test/bandwidth_tests.jl b/test/tests_bandwidth.jl
similarity index 60%
rename from test/bandwidth_tests.jl
rename to test/tests_bandwidth.jl
index a68ea19..6626617 100644
--- a/test/bandwidth_tests.jl
+++ b/test/tests_bandwidth.jl
@@ -1,15 +1,12 @@
-@testitem "p2p_bandwidth" begin
-    using LinearAlgebra
-    using CUDA
-
+@testset "p2p_bandwidth" begin
     @testset "unidirectional" begin
         # p2p_bandwidth
         @test typeof(p2p_bandwidth(; verbose=false)) == Float64
         @test 0 ≤ p2p_bandwidth(; verbose=false)
         # options
         @test typeof(p2p_bandwidth(; memsize=MB(100), verbose=false)) == Float64
-        @test typeof(p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1), verbose=false)) ==
-            Float64
+        dev_src, dev_dst = collect(GPUInspector.devices())[1:2]
+        @test typeof(p2p_bandwidth(; src=dev_src, dst=dev_dst, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; dtype=Float16, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; nbench=10, verbose=false)) == Float64
         @test typeof(p2p_bandwidth(; hist=true, verbose=true)) == Float64
@@ -26,7 +23,8 @@
         @test typeof(p2p_bandwidth_bidirectional(; verbose=false)) == Float64
         @test 0 ≤ p2p_bandwidth_bidirectional(; verbose=false)
         # options
-        @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) == Float64
+        @test typeof(p2p_bandwidth_bidirectional(; memsize=MB(100), verbose=false)) ==
+            Float64
         @test typeof(p2p_bandwidth_bidirectional(; dtype=Float16, verbose=false)) == Float64
         @test typeof(p2p_bandwidth_bidirectional(; nbench=10, verbose=false)) == Float64
         @test typeof(p2p_bandwidth_bidirectional(; hist=true, verbose=true)) == Float64
@@ -41,20 +39,29 @@
     end
 end

-@testitem "host2device_bandwidth" begin
-    using CUDA
+@testset "host2device_bandwidth" begin
     @test isnothing(host2device_bandwidth())
-    @test isnothing(host2device_bandwidth(; memsize=MB(100)))
-    @test isnothing(host2device_bandwidth(; dtype=Float16))
+    @test isnothing(host2device_bandwidth(; memsize=MB(1)))
+    @test isnothing(host2device_bandwidth(; dtype=Float64))
 end

-@testitem "memory_bandwidth" begin
-    using CUDA
-    @test typeof(memory_bandwidth()) == Float64
-    @test typeof(memory_bandwidth(; memsize=MiB(10))) == Float64
-    @test typeof(memory_bandwidth(; dtype=Float32)) == Float64
-
-    @test typeof(memory_bandwidth_saxpy()) == Float64
-    @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64
-    @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64
+@testset "memory_bandwidth" begin
+    @testset "regular" begin
+        @test typeof(memory_bandwidth()) == Float64
+        @test typeof(memory_bandwidth(; memsize=MiB(1))) == Float64
+        @test typeof(memory_bandwidth(; dtype=Float32)) == Float64
+    end
+    @testset "regular, scaling" begin
+        @test typeof(memory_bandwidth_scaling()) ==
+            NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Float64},Vector{Float64}}}
+    end
+    @testset "saxpy" begin
+        @test typeof(memory_bandwidth_saxpy()) == Float64
+        @test typeof(memory_bandwidth_saxpy(; size=2^20 * 2)) == Float64
+        @test typeof(memory_bandwidth_saxpy(; dtype=Float32)) == Float64
+    end
+    @testset "saxpy, scaling" begin
+        @test typeof(memory_bandwidth_saxpy_scaling()) ==
+            NamedTuple{(:sizes, :bandwidths),Tuple{Vector{Int64},Vector{Float64}}}
+    end
 end
diff --git a/test/tests_core.jl b/test/tests_core.jl
new file mode 100644
index 0000000..1db7309
--- /dev/null
+++ b/test/tests_core.jl
@@ -0,0 +1,21 @@
+@testset "Backend switching" begin
+    if TEST_BACKEND == NVIDIABackend()
+        @test GPUInspector.is_cuda_loaded()
+        @test GPUInspector.is_backend_loaded(NVIDIABackend())
+        @test backend() == NVIDIABackend()
+        @test isnothing(backend!(NoBackend()))
+        @test backend() == NoBackend()
+        @test isnothing(backend!(:cuda))
+        @test backend() == NVIDIABackend()
+        @test isnothing(backend!(NVIDIABackend()))
+    elseif TEST_BACKEND == AMDBackend()
+        @test GPUInspector.is_amdgpu_loaded()
+        @test GPUInspector.is_backend_loaded(AMDBackend())
+        @test backend() == AMDBackend()
+        @test isnothing(backend!(NoBackend()))
+        @test backend() == NoBackend()
+        @test isnothing(backend!(:amd))
+        @test backend() == AMDBackend()
+        @test isnothing(backend!(AMDBackend()))
+    end
+end
diff --git a/test/tests_gpuinfo.jl b/test/tests_gpuinfo.jl
new file mode 100644
index 0000000..9ea54c7
--- /dev/null
+++ b/test/tests_gpuinfo.jl
@@ -0,0 +1,8 @@
+@testset "gpuinfo / gpus" begin
+    @test isnothing(gpus())
+    @test isnothing(gpuinfo())
+    @test isnothing(gpuinfo(GPUInspector.device()))
+    if ngpus() > 1
+        @test isnothing(gpuinfo_p2p_access())
+    end
+end
diff --git a/test/tests_nvidia_only.jl b/test/tests_nvidia_only.jl
new file mode 100644
index 0000000..27ffcd7
--- /dev/null
+++ b/test/tests_nvidia_only.jl
@@ -0,0 +1,11 @@
+@testset "toggle_tensorcoremath" begin
+    @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false))
+    @test CUDA.math_mode() == CUDA.FAST_MATH
+    @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false))
+    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
+    # test toggle
+    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
+    @test CUDA.math_mode() == CUDA.FAST_MATH
+    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
+    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
+end
diff --git a/test/tests_peakflops.jl b/test/tests_peakflops.jl
new file mode 100644
index 0000000..0d835e0
--- /dev/null
+++ b/test/tests_peakflops.jl
@@ -0,0 +1,28 @@
+if backend() == NVIDIABackend()
+    @testset "peakflops_gpu (CUDA cores)" begin
+        @test typeof(peakflops_gpu(; verbose=false, tensorcores=false)) == Float64
+        @test typeof(peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false)) ==
+            Float64
+        @test typeof(peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false)) ==
+            Float64
+    end
+
+    @testset "peakflops_gpu (Tensor cores)" begin
+        @test typeof(peakflops_gpu(; verbose=false, tensorcores=true)) == Float64
+        @test typeof(peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true)) ==
+            Float64
+    end
+
+    @testset "peakflops_gpu_matmul / scaling" begin
+        @test typeof(CUDAExt.peakflops_gpu_matmul(; verbose=false)) == Float64
+        @test typeof(
+            CUDAExt.peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false)
+        ) == Float64
+        @test typeof(CUDAExt.peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false)) ==
+            Float64
+        @test typeof(CUDAExt.peakflops_gpu_matmul_scaling(; verbose=false)) ==
+            Tuple{Vector{Int64},Vector{Float64}}
+    end
+elseif backend() == AMDBackend()
+    # TODO
+end
diff --git a/test/stresstest_tests.jl b/test/tests_stresstest.jl
similarity index 81%
rename from test/stresstest_tests.jl
rename to test/tests_stresstest.jl
index 8b99fc0..32152b9 100644
--- a/test/stresstest_tests.jl
+++ b/test/tests_stresstest.jl
@@ -1,5 +1,4 @@
-@testitem "Stresstest: different kinds" begin
-    using CUDA
+@testset "Stresstest: different kinds" begin
     @test isnothing(stresstest(; duration=2, verbose=false))
     @test isnothing(stresstest(; enforced_duration=2, verbose=false))
     @test isnothing(stresstest(; approx_duration=2, verbose=false))
@@ -8,10 +7,9 @@
     @test isnothing(stresstest(; mem=0.2, verbose=false))
 end

-@testitem "Stresstest: keyword options" begin
-    using CUDA
+@testset "Stresstest: keyword options" begin
     @test isnothing(stresstest(; duration=2, verbose=false))
-    @test isnothing(stresstest(; duration=2, devices=devices(), verbose=false))
+    @test isnothing(stresstest(; duration=2, devices=GPUInspector.devices(), verbose=false))
     @test isnothing(stresstest(; duration=2, size=3000, verbose=false))
     @test isnothing(stresstest(; duration=2, dtype=Float16, verbose=false))
     @test isnothing(stresstest(; duration=2, clearmem=true, verbose=false))
     @test ...
     # TODO: kwargs: threads, parallel
 end

-@testitem "Stresstest: monitoring" begin
-    using CUDA
+@testset "Stresstest: monitoring" begin
     @testset "automatic (monitoring=true)" begin
         @test typeof(
             stresstest(;
                 devices=GPUInspector.devices(), duration=2, verbose=false, monitoring=true
+            ),
         ) == MonitoringResults
     end
     @testset "manual" begin
-        devs = devices()
+        devs = GPUInspector.devices()
         @test isnothing(monitoring_start(; freq=1, devices=devs, verbose=false))
         @test isnothing(
             stresstest(; devices=devs, duration=2, verbose=false, monitoring=false)
         )
@@ -41,8 +40,7 @@ end
     end
 end

-@testitem "Stresstest: monitoring results" begin
-    using CUDA
+@testset "Stresstest: monitoring results" begin
     @testset "MonitoringResults" begin
         r = stresstest(; duration=2, verbose=false, monitoring=true)
         @test typeof(r) == MonitoringResults
@@ -53,12 +51,13 @@ end
     @testset "save / load" begin
         d = Dict{Symbol,Vector{Vector{Float64}}}()
-        ndevs = length(CUDA.devices())
+        ndevs = ngpus()
         d[:asd] = [rand(ndevs) for _ in 1:5]
         d[:qwe] = [rand(ndevs) for _ in 1:5]
         d[:jkl] = [rand(ndevs) for _ in 1:5]
         devices = Tuple{String,Base.UUID}[
-            (CUDAExt._device2string(dev), uuid(dev)) for dev in collect(CUDA.devices())
+            (CUDAExt._device2string(dev), uuid(dev)) for
+            dev in collect(GPUInspector.devices())
         ]
         r = MonitoringResults(rand(5), devices, d)
         cd(mktempdir()) do
@@ -74,8 +73,7 @@ end
         end
     end
 end

-@testitem "Stresstest: monitoring results (CairoMakie)" begin
-    using CairoMakie
+@testset "Stresstest: monitoring results (CairoMakie)" begin
     r = load_monitoring_results(joinpath(@__DIR__, "test.h5"))
     @test isnothing(savefig_monitoring_results(r))
     @test isnothing(savefig_monitoring_results(r, (:compute, :mem)))
diff --git a/test/utility_tests.jl b/test/tests_utility.jl
similarity index 78%
rename from test/utility_tests.jl
rename to test/tests_utility.jl
index 1bf5bd3..9ec5c5f 100644
--- a/test/utility_tests.jl
+++ b/test/tests_utility.jl
@@ -1,4 +1,4 @@
-@testitem "UnitPrefixedBytes" begin
+@testset "UnitPrefixedBytes" begin
     using InteractiveUtils: subtypes

     # general stuff
@@ -64,16 +64,3 @@ end

     @test B(40_000_000) + MB(3) - 2 * KiB(2) ≈ MB(42.995904)
 end
-
-@testitem "toggle_tensorcoremath" begin
-    using CUDA
-    @test isnothing(CUDAExt.toggle_tensorcoremath(true; verbose=false))
-    @test CUDA.math_mode() == CUDA.FAST_MATH
-    @test isnothing(CUDAExt.toggle_tensorcoremath(false; verbose=false))
-    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
-    # test toggle
-    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
-    @test CUDA.math_mode() == CUDA.FAST_MATH
-    @test isnothing(CUDAExt.toggle_tensorcoremath(; verbose=false))
-    @test CUDA.math_mode() == CUDA.DEFAULT_MATH
-end