From 07aff911eb8aaf9d691c48b1b10df1cbaf21346a Mon Sep 17 00:00:00 2001 From: Luke Weiler Date: Fri, 22 Apr 2022 15:02:04 -0400 Subject: [PATCH] Added cluster_info (#752) Add a `cluster_info` API to adapters that reports the current cluster status: the number of active and total nodes, CPUs, and GPUs. --- lib/ood_core.rb | 1 + lib/ood_core/job/adapter.rb | 9 ++++++++ lib/ood_core/job/adapters/slurm.rb | 33 ++++++++++++++++++++++++++---- lib/ood_core/job/cluster_info.rb | 32 +++++++++++++++++++++++++++++ spec/job/adapters/slurm_spec.rb | 2 +- 5 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 lib/ood_core/job/cluster_info.rb diff --git a/lib/ood_core.rb b/lib/ood_core.rb index ce51ab194..aa25ad73c 100644 --- a/lib/ood_core.rb +++ b/lib/ood_core.rb @@ -11,6 +11,7 @@ module Job require "ood_core/job/node_info" require "ood_core/job/script" require "ood_core/job/info" + require "ood_core/job/cluster_info" require "ood_core/job/status" require "ood_core/job/adapter" require "ood_core/job/factory" diff --git a/lib/ood_core/job/adapter.rb b/lib/ood_core/job/adapter.rb index ecdd54315..6fc36f51b 100644 --- a/lib/ood_core/job/adapter.rb +++ b/lib/ood_core/job/adapter.rb @@ -33,6 +33,15 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: []) raise NotImplementedError, "subclass did not define #submit" end + # Retrieve the number of active and total cpus, nodes, and gpus + # @abstract Subclass is expected to implement {#cluster_info} + # @raise [NotImplementedError] if subclass did not define {#cluster_info} + # @return [ClusterInfo] Object containing quantified statistics about the + # cluster's active/total cpus, nodes, and gpus + def cluster_info + raise NotImplementedError, "subclass did not define #cluster_stats" + end + # Retrieve info for all jobs from the resource manager # @abstract Subclass is expected to implement {#info_all} # @raise [NotImplementedError] if subclass did not define {#info_all} diff --git 
a/lib/ood_core/job/adapters/slurm.rb b/lib/ood_core/job/adapters/slurm.rb index b91027ada..5d5d1545d 100644 --- a/lib/ood_core/job/adapters/slurm.rb +++ b/lib/ood_core/job/adapters/slurm.rb @@ -36,6 +36,13 @@ class Slurm < Adapter using Refinements::HashExtensions using Refinements::ArrayExtensions + # Get integer representing the number of gpus used by a node or job, + # calculated from gres string + # @return [Integer] the number of gpus in gres + def gpus_from_gres(gres) + gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum + end + # Object used for simplified communication with a Slurm batch server # @api private class Batch @@ -98,6 +105,22 @@ def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host @strict_host_checking = strict_host_checking end + # Get a ClusterInfo object containing information about the given cluster + # @return [ClusterInfo] object containing cluster details + def get_cluster_info + node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/') + gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2 + gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}") + .lines.uniq.map(&:split) + ClusterInfo.new(active_nodes: node_cpu_info[0].to_i, + total_nodes: node_cpu_info[2].to_i, + active_processors: node_cpu_info[3].to_i, + total_processors: node_cpu_info[6].to_i, + active_gpus: gres_lines.sum { |line| gpus_from_gres(line[2]) }, + total_gpus: gres_lines.sum { |line| gpus_from_gres(line[1]) } + ) + end + # Get a list of hashes detailing each of the jobs on the batch server # @example Status info for all jobs # my_batch.get_jobs @@ -454,6 +477,12 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: []) raise JobAdapterError, e.message end + # Retrieve info about active and total cpus, gpus, and nodes + # @return [ClusterInfo] information about cluster usage + def cluster_info + @slurm.get_cluster_info + end + # Retrieve info for all jobs from the 
resource manager # @raise [JobAdapterError] if something goes wrong getting job info # @return [Array] information describing submitted jobs @@ -617,10 +646,6 @@ def get_state(st) STATE_MAP.fetch(st, :undetermined) end - def gpus_from_gres(gres) - gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum - end - # Parse hash describing Slurm job status def parse_job_info(v) allocated_nodes = parse_nodes(v[:node_list]) diff --git a/lib/ood_core/job/cluster_info.rb b/lib/ood_core/job/cluster_info.rb new file mode 100644 index 000000000..3b458c66a --- /dev/null +++ b/lib/ood_core/job/cluster_info.rb @@ -0,0 +1,32 @@ +module OodCore + module Job + # An object that contains details about the cluster's active and total nodes, processors, and gpus + class ClusterInfo + using Refinements::HashExtensions + + attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes, + :total_gpu_nodes, :active_gpus, :total_gpus + + def initialize(opts = {}) + opts = opts.transform_keys(&:to_sym) + @active_nodes = opts.fetch(:active_nodes, nil).to_i + @total_nodes = opts.fetch(:total_nodes, nil).to_i + @active_processors = opts.fetch(:active_processors, nil).to_i + @total_processors = opts.fetch(:total_processors, nil).to_i + @active_gpus = opts.fetch(:active_gpus, nil).to_i + @total_gpus = opts.fetch(:total_gpus, nil).to_i + end + + def to_h + { + active_nodes: active_nodes, + total_nodes: total_nodes, + active_processors: active_processors, + total_processors: total_processors, + active_gpus: active_gpus, + total_gpus: total_gpus + } + end + end + end +end diff --git a/spec/job/adapters/slurm_spec.rb b/spec/job/adapters/slurm_spec.rb index 6c2a3b33f..351f7b352 100644 --- a/spec/job/adapters/slurm_spec.rb +++ b/spec/job/adapters/slurm_spec.rb @@ -1223,7 +1223,7 @@ def job_info(opts = {}) ] gres_cases.each do |gc| it "does not return the correct number of gpus when gres=\"#{gc[0]}\"" do - gpus = adapter.send(:gpus_from_gres, gc[0]) + gpus = 
adapter.gpus_from_gres(gc[0]); expect(gpus).to be(gc[1]); end end