From 0696123125dabb81b3312fa1f7a8409dc0ef2f97 Mon Sep 17 00:00:00 2001 From: Graeme A Stewart Date: Fri, 3 May 2024 15:12:14 +0200 Subject: [PATCH] Nvidia monitor fixes (#239) * Update output parsing for nvidiamon * Update test files for nvidia-smi parsing Update to the new nvidia-smi pmon output fields Add a pycuda GPU burner script for tests * Parse ccpm field as string This can be a "-" instead of 0 * Update precook script and precooked outputs Ensure precooked values are fixed to what we want * Fix hash-bang and mode on GPU burner * Python reformatting With latest versions of black and flake8 There is one import in gpu-burner.py that is needed (pycuda.autoinit) as it has side effects, so this is marked as exempt for flake8 --------- Co-authored-by: Johannes Elmsheuser Co-authored-by: Graeme Stewart --- README.md | 2 +- package/scripts/gpu-burner.py | 41 +++++++++++++++++++ package/scripts/precook_test.py | 15 ++++--- .../scripts/precooked_tests/drop/1/nvidia/smi | 2 +- .../scripts/precooked_tests/drop/2/nvidia/smi | 2 +- .../scripts/precooked_tests/drop/3/nvidia/smi | 2 +- package/scripts/prmon_compress_output.py | 8 ++-- package/src/nvidiamon.cpp | 10 ++--- 8 files changed, 65 insertions(+), 17 deletions(-) create mode 100755 package/scripts/gpu-burner.py diff --git a/README.md b/README.md index 00746b4..1cf45bc 100644 --- a/README.md +++ b/README.md @@ -254,4 +254,4 @@ to CMake using `Gperftools_ROOT_DIR`. # Copyright -Copyright (c) 2018-2023 CERN. +Copyright (c) 2018-2024 CERN. diff --git a/package/scripts/gpu-burner.py b/package/scripts/gpu-burner.py new file mode 100755 index 0000000..bc51372 --- /dev/null +++ b/package/scripts/gpu-burner.py @@ -0,0 +1,41 @@ +#! /usr/bin/env python3 +# +# This is a slightly adapted "hello, world" script from +# pycuda, that can be used for stressing a CUDA GPU for +# tests +# +# pycuda is required! 
+# + +import pycuda.autoinit # noqa: F401 +import pycuda.driver as drv +import numpy +from time import time + +from pycuda.compiler import SourceModule + +mod = SourceModule( + """ +__global__ void multiply_them(float *dest, float *a, float *b, float *c) +{ + const int i = threadIdx.x; + dest[i] = a[i] * b[i] + c[i]; +} +""" +) + +multiply_them = mod.get_function("multiply_them") + +a = numpy.random.randn(1024).astype(numpy.float32) +b = numpy.random.randn(1024).astype(numpy.float32) +c = numpy.random.randn(1024).astype(numpy.float32) + +dest = numpy.zeros_like(a) + +start = time() +while time() - start < 20: + multiply_them( + drv.Out(dest), drv.In(a), drv.In(b), drv.In(c), block=(1024, 1, 1), grid=(1, 1) + ) + +print(dest - a * b + c) diff --git a/package/scripts/precook_test.py b/package/scripts/precook_test.py index 7a9f52e..131006b 100755 --- a/package/scripts/precook_test.py +++ b/package/scripts/precook_test.py @@ -95,19 +95,24 @@ def make_net(proc_net, fixed_value, rand=False): def make_nvidia(proc_nvidia, fixed_value, rand=False): # idx + print(proc_nvidia, fixed_value, rand) smi_fname = os.path.join(proc_nvidia, "smi") + pct_lim = 100 memory_lim = 10000 with open(smi_fname, "w") as f: params = [ 0, # idx pid, # pid "G", # type - random.randint(0, memory_lim) if rand else fixed_value, # sm - random.randint(0, memory_lim) if rand else fixed_value, # mem - # enc, dec are not monitored metrics - 0, # enc - 0, # dec + random.randint(0, pct_lim) if rand else fixed_value, # sm + random.randint(0, pct_lim) if rand else fixed_value, # mem + # The following are not monitored metrics + "-", # enc + "-", # dec + "-", # jpg + "-", # ofa random.randint(0, memory_lim) if rand else fixed_value, # fb + 0, # ccpm "python3", # command ] for param in params: diff --git a/package/scripts/precooked_tests/drop/1/nvidia/smi b/package/scripts/precooked_tests/drop/1/nvidia/smi index 2e17e65..835afdf 100644 --- a/package/scripts/precooked_tests/drop/1/nvidia/smi +++ 
b/package/scripts/precooked_tests/drop/1/nvidia/smi @@ -1 +1 @@ -0 1729 G 50 50 0 0 50 python3 +0 1729 G 50 50 - - - - 50 0 python3 diff --git a/package/scripts/precooked_tests/drop/2/nvidia/smi b/package/scripts/precooked_tests/drop/2/nvidia/smi index e27a02f..b9e438f 100644 --- a/package/scripts/precooked_tests/drop/2/nvidia/smi +++ b/package/scripts/precooked_tests/drop/2/nvidia/smi @@ -1 +1 @@ -0 1729 G 100 100 0 0 100 python3 +0 1729 G 100 100 - - - - 100 0 python3 diff --git a/package/scripts/precooked_tests/drop/3/nvidia/smi b/package/scripts/precooked_tests/drop/3/nvidia/smi index 9dc7ae5..30220eb 100644 --- a/package/scripts/precooked_tests/drop/3/nvidia/smi +++ b/package/scripts/precooked_tests/drop/3/nvidia/smi @@ -1 +1 @@ -0 1729 G 20 20 0 0 20 python3 +0 1729 G 20 20 - - - - 20 0 python3 diff --git a/package/scripts/prmon_compress_output.py b/package/scripts/prmon_compress_output.py index ef4daf5..6ef7801 100755 --- a/package/scripts/prmon_compress_output.py +++ b/package/scripts/prmon_compress_output.py @@ -122,9 +122,11 @@ def main(): parser.add_argument( "--precision", - type=lambda x: float(x) - if 0 < float(x) < 1 - else parser.exit(-1, "Precision must be strictly between 0 and 1"), + type=lambda x: ( + float(x) + if 0 < float(x) < 1 + else parser.exit(-1, "Precision must be strictly between 0 and 1") + ), default=0.05, help="precision value for interpolation threshold", ) diff --git a/package/src/nvidiamon.cpp b/package/src/nvidiamon.cpp index 04c538a..947c65c 100644 --- a/package/src/nvidiamon.cpp +++ b/package/src/nvidiamon.cpp @@ -76,20 +76,20 @@ void nvidiamon::update_stats(const std::vector& pids, // Loop over output unsigned int gpu_idx{}, sm{}, mem{}, fb_mem{}; pid_t pid{}; - std::string enc{}, dec{}, cg_type{}, cmd_name{}; + std::string enc{}, dec{}, jpg{}, ofa{}, cg_type{}, ccpm{}, cmd_name{}; std::unordered_map activegpus{}; // Avoid double counting active GPUs for (const auto& s : cmd_result.second) { if (s[0] == '#') continue; 
std::istringstream instr(s); - instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> fb_mem >> - cmd_name; + instr >> gpu_idx >> pid >> cg_type >> sm >> mem >> enc >> dec >> jpg >> ofa >> fb_mem >> + ccpm >> cmd_name; auto read_ok = !(instr.fail() || instr.bad()); // eof() is ok if (read_ok) { if (log_level <= spdlog::level::debug) { std::stringstream strm; strm << "Good read: " << gpu_idx << " " << pid << " " << cg_type << " " - << sm << " " << mem << " " << enc << " " << dec << " " << fb_mem + << sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm << " " << cmd_name << std::endl; debug(strm.str()); } @@ -115,7 +115,7 @@ void nvidiamon::update_stats(const std::vector& pids, std::stringstream strm; strm << "Bad read of line: " << s << std::endl; strm << "Parsed to: " << gpu_idx << " " << pid << " " << cg_type << " " - << sm << " " << mem << " " << enc << " " << dec << " " << fb_mem + << sm << " " << mem << " " << enc << " " << dec << " " << jpg << " " << ofa << " " << fb_mem << " " << ccpm << " " << cmd_name << std::endl; strm << "StringStream status: good()=" << instr.good();