Skip to content

Commit

Permalink
Fix duplicate SLURM statistics
Browse files Browse the repository at this point in the history
* Add --clusters="cluster-name" to all SLURM commands
  • Loading branch information
msquee committed Nov 19, 2020
1 parent 4ecde89 commit fb092ac
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions lib/slurm_squeue_client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def initialize(cluster)
end

@cluster_id = cluster.id
@canonical_cluster_id = @cluster_id.to_s.partition('-').first
@cluster_title = cluster.metadata.title || cluster.id.titleize
@job_scheduler = cluster.job_config[:adapter]

Expand All @@ -40,7 +41,7 @@ def job_scheduler_name
# Returns the raw `squeue` listing of PENDING jobs, restricted to this
# client's cluster.
#
# The command is executed at most once per instance; its stdout is memoized
# in @squeue_jobs_pending and returned on subsequent calls.
#
# @return [String] stdout of the squeue invocation
# @raise [CommandFailed] carrying squeue's stderr when the exit status is non-zero
def squeue_jobs_pending
  return @squeue_jobs_pending if defined?(@squeue_jobs_pending)

  # Exactly one invocation, scoped with --clusters. The multi-argument form of
  # Open3.capture3 bypasses the shell, so the value is passed without
  # surrounding quote characters (they would otherwise reach squeue verbatim).
  o, e, s = Open3.capture3(
    {}, squeue_cmd, '-h', '--all', '--states=PENDING', "--clusters=#{@canonical_cluster_id}"
  )

  raise CommandFailed, e unless s.success?

  @squeue_jobs_pending = o
end
Expand All @@ -49,7 +50,7 @@ def squeue_jobs_pending
# Returns the raw `squeue` listing of RUNNING jobs, restricted to this
# client's cluster.
#
# The command is executed at most once per instance; its stdout is memoized
# in @squeue_jobs_running and returned on subsequent calls.
#
# @return [String] stdout of the squeue invocation
# @raise [CommandFailed] carrying squeue's stderr when the exit status is non-zero
def squeue_jobs_running
  return @squeue_jobs_running if defined?(@squeue_jobs_running)

  # Exactly one invocation, scoped with --clusters. The multi-argument form of
  # Open3.capture3 bypasses the shell, so the value is passed without
  # surrounding quote characters (they would otherwise reach squeue verbatim).
  o, e, s = Open3.capture3(
    {}, squeue_cmd, '-h', '--all', '--states=RUNNING', "--clusters=#{@canonical_cluster_id}"
  )

  raise CommandFailed, e unless s.success?

  @squeue_jobs_running = o
end
Expand All @@ -59,9 +60,9 @@ def sinfo
return @sinfo if defined?(@sinfo)

cmd = '/usr/bin/sinfo'
args = ["-a", "-h", "-o=\"%C/%A/%D\""]
o, e, s = Open3.capture3({}, sinfo_cmd, *%w[-a -h -o="%C/%A/%D"])
args = ["-a", "-h", "-o=\"%C/%A/%D\"", "--clusters=\"#{@canonical_cluster_id}\""]

o, e, s = Open3.capture3({}, sinfo_cmd, *args)

s.success? ? @sinfo = o : raise(CommandFailed, e)
end
Expand All @@ -71,7 +72,7 @@ def sinfo
def gres_length
return @gres_length if defined?(@gres_length)

o, e, s = Open3.capture3("#{sinfo_cmd} -o '%G' | awk '{ print length }' | sort -n | tail -1")
o, e, s = Open3.capture3("#{sinfo_cmd} --clusters=\"#{@canonical_cluster_id}\" -o '%G' | awk '{ print length }' | sort -n | tail -1")

if s.success?
@gres_length = o.to_i
Expand All @@ -87,7 +88,7 @@ def gres_length
def gpu_nodes
return @available_gpu_nodes if defined?(@available_gpu_nodes)

o, e, s = Open3.capture3("#{sinfo_cmd} -N -h -a --Format='nodehost,gres:#{gres_length}' | uniq | grep gpu: | wc -l")
o, e, s = Open3.capture3("#{sinfo_cmd} --clusters=\"#{@canonical_cluster_id}\" -N -h -a --Format='nodehost,gres:#{gres_length}' | uniq | grep gpu: | wc -l")

if s.success?
@available_gpu_nodes = o.to_i
Expand All @@ -104,7 +105,7 @@ def gpu_nodes
def gpu_nodes_free
return @gpu_nodes_free if defined?(@gpu_nodes_free)

o, e, s = Open3.capture3("#{sinfo_cmd} -a -h --Node --Format='nodehost,gres:#{gres_length},statelong' | uniq | grep gpu: | egrep 'idle' | wc -l")
o, e, s = Open3.capture3("#{sinfo_cmd} --clusters=\"#{@canonical_cluster_id}\" -a -h --Node --Format='nodehost,gres:#{gres_length},statelong' | uniq | grep gpu: | egrep 'idle' | wc -l")

if s.success?
@gpu_nodes_free = o.to_i
Expand Down Expand Up @@ -137,7 +138,7 @@ def gpu_nodes_available_percent
def gpu_jobs_pending
return @gpu_jobs_pending if defined?(@gpu_jobs_pending)

o, e, s = Open3.capture3("#{squeue_cmd} --states=PENDING -O 'jobid,tres-pefr-job:#{gres_length},tres-per-node:#{gres_length},tres-per-socket:#{gres_length},tres-per-task:#{gres_length}' -h | grep gpu: | wc -l")
o, e, s = Open3.capture3("#{squeue_cmd} --clusters=\"#{@canonical_cluster_id}\" --states=PENDING -O 'jobid,tres-pefr-job:#{gres_length},tres-per-node:#{gres_length},tres-per-socket:#{gres_length},tres-per-task:#{gres_length}' -h | grep gpu: | wc -l")

if s.success?
@gpu_jobs_pending = o.to_i
Expand Down

0 comments on commit fb092ac

Please sign in to comment.