Skip to content

Commit

Permalink
Added cluster_info (#752)
Browse files Browse the repository at this point in the history
Add a `cluster_info` API to adapters to report the current cluster status like how many nodes, cpus and GPUs are active among other stuff.
  • Loading branch information
lukew3 authored Apr 22, 2022
1 parent 6fcc24c commit 07aff91
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 5 deletions.
1 change: 1 addition & 0 deletions lib/ood_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ module Job
require "ood_core/job/node_info"
require "ood_core/job/script"
require "ood_core/job/info"
require "ood_core/job/cluster_info"
require "ood_core/job/status"
require "ood_core/job/adapter"
require "ood_core/job/factory"
Expand Down
9 changes: 9 additions & 0 deletions lib/ood_core/job/adapter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
raise NotImplementedError, "subclass did not define #submit"
end

# Retrieve the number of active and total cpus, nodes, and gpus
# @abstract Subclass is expected to implement {#cluster_stats}
# @raise [NotImplementedError] if subclass did not define {#cluster_stats}
# @return [ClusterInfo] Object containing quantified statistics about the
# cluster's active/total cpus, nodes, and gpus
def cluster_info
raise NotImplementedError, "subclass did not define #cluster_stats"
end

# Retrieve info for all jobs from the resource manager
# @abstract Subclass is expected to implement {#info_all}
# @raise [NotImplementedError] if subclass did not define {#info_all}
Expand Down
33 changes: 29 additions & 4 deletions lib/ood_core/job/adapters/slurm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ class Slurm < Adapter
using Refinements::HashExtensions
using Refinements::ArrayExtensions

# Get integer representing the number of gpus used by a node or job,
# calculated from gres string
# @return [Integer] the number of gpus in gres
def gpus_from_gres(gres)
gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
end

# Object used for simplified communication with a Slurm batch server
# @api private
class Batch
Expand Down Expand Up @@ -98,6 +105,22 @@ def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host
@strict_host_checking = strict_host_checking
end

# Get a ClusterInfo object containing information about the given cluster
# @return [ClusterInfo] object containing cluster details
def get_cluster_info
node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/')
gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2
gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}")
.lines.uniq.map(&:split)
ClusterInfo.new(active_nodes: node_cpu_info[0].to_i,
total_nodes: node_cpu_info[2].to_i,
active_processors: node_cpu_info[3].to_i,
total_processors: node_cpu_info[6].to_i,
active_gpus: gres_lines.sum { |line| gpus_from_gres(line[2]) },
total_gpus: gres_lines.sum { |line| gpus_from_gres(line[1]) }
)
end

# Get a list of hashes detailing each of the jobs on the batch server
# @example Status info for all jobs
# my_batch.get_jobs
Expand Down Expand Up @@ -454,6 +477,12 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
raise JobAdapterError, e.message
end

# Retrieve info about active and total cpus, gpus, and nodes
# @return [Hash] information about cluster usage
def cluster_info
@slurm.get_cluster_info
end

# Retrieve info for all jobs from the resource manager
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
Expand Down Expand Up @@ -617,10 +646,6 @@ def get_state(st)
STATE_MAP.fetch(st, :undetermined)
end

def gpus_from_gres(gres)
gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
end

# Parse hash describing Slurm job status
def parse_job_info(v)
allocated_nodes = parse_nodes(v[:node_list])
Expand Down
32 changes: 32 additions & 0 deletions lib/ood_core/job/cluster_info.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module OodCore
module Job
# An object that contains details about the cluster's active and total nodes, processors, and gpus
class ClusterInfo
using Refinements::HashExtensions

attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes,
:total_gpu_nodes, :active_gpus, :total_gpus

def initialize(opts = {})
opts = opts.transform_keys(&:to_sym)
@active_nodes = opts.fetch(:active_nodes, nil).to_i
@total_nodes = opts.fetch(:total_nodes, nil).to_i
@active_processors = opts.fetch(:active_processors, nil).to_i
@total_processors = opts.fetch(:total_processors, nil).to_i
@active_gpus = opts.fetch(:active_gpus, nil).to_i
@total_gpus = opts.fetch(:total_gpus, nil).to_i
end

def to_h
{
active_nodes: active_nodes,
total_nodes: total_nodes,
active_processors: active_processors,
total_processors: total_processors,
active_gpus: active_gpus,
total_gpus: total_gpus
}
end
end
end
end
2 changes: 1 addition & 1 deletion spec/job/adapters/slurm_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1223,7 +1223,7 @@ def job_info(opts = {})
]
gres_cases.each do |gc|
it "does not return the correct number of gpus when gres=\"#{gc[0]}\"" do
gpus = adapter.send(:gpus_from_gres, gc[0])
gpus = adapter.gpus_from_gres(gc[0]);
expect(gpus).to be(gc[1]);
end
end
Expand Down

0 comments on commit 07aff91

Please sign in to comment.