Skip to content

Commit

Permalink
Update squeue parsing to support 24.05.
Browse files Browse the repository at this point in the history
  • Loading branch information
dougnd committed Aug 22, 2024
1 parent 05901b9 commit 5d3d810
Showing 1 changed file with 114 additions and 31 deletions.
145 changes: 114 additions & 31 deletions slurm/squeue.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log/slog"
"os/exec"
"regexp"
"slices"
"strconv"
"strings"
"time"
Expand All @@ -15,8 +16,6 @@ import (
"gopkg.in/yaml.v3"
)

// Note: these were automatically generated with json-to-go. They could use some cleanup.

type squeueResponse struct {
Meta meta `json:"meta" yaml:"meta"`
Jobs []jobs `json:"jobs" yaml:"jobs"`
Expand All @@ -36,30 +35,118 @@ type meta struct {
Command []string `json:"command" yaml:"command"`
Slurm slurmInfo `json:"Slurm" yaml:"Slurm"`
}
type Cores struct {
Num0 string `json:"0" yaml:"0"`
Num1 string `json:"1" yaml:"1"`

// slurm23Socket is a single socket entry of an allocated node in the
// Slurm 23 job resources layout. Cores is decoded from the "cores" key as a
// string-to-string map; the keys are presumably core indices and the values
// the per-core state reported by squeue (verify against real output).
type slurm23Socket struct {
	Cores map[string]string `json:"cores" yaml:"cores"`
}

// slurm23Sockets is the "sockets" object of an allocated node in the
// Slurm 23 layout: a map from socket key to that socket's core listing.
type slurm23Sockets map[string]slurm23Socket
type Num0 struct {
Cores Cores `json:"cores" yaml:"cores"`

// slurm23AllocatedNodes is one entry of the "allocated_nodes" list in the
// Slurm 23 job resources layout. Each value describes a single node.
type slurm23AllocatedNodes struct {
// Sockets is the per-socket core listing for this node; the converter in
// this file sums the number of core entries across all sockets.
Sockets slurm23Sockets `json:"sockets" yaml:"sockets"`
// Nodename is the node's name as reported under "nodename".
Nodename string `json:"nodename" yaml:"nodename"`
// CpusUsed is decoded from "cpus_used".
CpusUsed int `json:"cpus_used" yaml:"cpus_used"`
// MemoryUsed is decoded from "memory_used".
MemoryUsed int `json:"memory_used" yaml:"memory_used"`
// MemoryAllocated is decoded from "memory_allocated". It is multiplied by
// 1024*1024 when converted to bytes elsewhere in this file, so the unit is
// presumably MiB — TODO confirm against Slurm documentation.
MemoryAllocated int `json:"memory_allocated" yaml:"memory_allocated"`
}
type sockets map[string]struct {
Cores map[string]string `json:"cores" yaml:"cores"`
// slurm23JobResources mirrors the job resources object in the Slurm 23
// layout. Note that Nodes is a plain string here, whereas the Slurm 24 layout
// uses an object under the same "nodes" key.
type slurm23JobResources struct {
// Nodes is decoded from "nodes" as a string (presumably the node list
// expression — verify against real squeue output).
Nodes string `json:"nodes" yaml:"nodes"`
// AllocatedCores is decoded from "allocated_cores".
AllocatedCores int `json:"allocated_cores" yaml:"allocated_cores"`
// AllocatedCpus is decoded from "allocated_cpus".
AllocatedCpus int `json:"allocated_cpus" yaml:"allocated_cpus"`
// AllocatedHosts is decoded from "allocated_hosts".
AllocatedHosts int `json:"allocated_hosts" yaml:"allocated_hosts"`
// AllocatedNodes holds one entry per node listed under "allocated_nodes".
AllocatedNodes []slurm23AllocatedNodes `json:"allocated_nodes" yaml:"allocated_nodes"`
}

type AllocatedNodes struct {
Sockets sockets `json:"sockets" yaml:"sockets"`
Nodename string `json:"nodename" yaml:"nodename"`
CpusUsed int `json:"cpus_used" yaml:"cpus_used"`
MemoryUsed int `json:"memory_used" yaml:"memory_used"`
MemoryAllocated int `json:"memory_allocated" yaml:"memory_allocated"`
// slurm24Core is one core entry of a socket in the Slurm 24 layout. Status
// is the list of status strings reported for the core.
type slurm24Core struct {
	Index  int      `json:"index" yaml:"index"`
	Status []string `json:"status" yaml:"status"`
}

// slurm24Socket is one socket entry of an allocated node in the Slurm 24
// layout.
type slurm24Socket struct {
	Index int           `json:"index" yaml:"index"`
	Cores []slurm24Core `json:"cores" yaml:"cores"`
}

// slurm24Memory is the "memory" object of an allocated node in the Slurm 24
// layout.
type slurm24Memory struct {
	Allocated int `json:"allocated" yaml:"allocated"`
}

// slurm24NodeAllocation is one entry of the "allocation" list in the
// Slurm 24 layout and describes a single node.
type slurm24NodeAllocation struct {
	Name    string          `json:"name" yaml:"name"`
	Memory  slurm24Memory   `json:"memory" yaml:"memory"`
	Sockets []slurm24Socket `json:"sockets" yaml:"sockets"`
}

// slurm24Nodes is the "nodes" object of the Slurm 24 job resources layout.
type slurm24Nodes struct {
	Count      int                     `json:"count" yaml:"count"`
	Allocation []slurm24NodeAllocation `json:"allocation" yaml:"allocation"`
}

// slurm24JobResources mirrors the job resources object in the Slurm 24
// layout, where "nodes" is an object carrying a per-node allocation list.
type slurm24JobResources struct {
	Nodes slurm24Nodes `json:"nodes" yaml:"nodes"`
}

// jobResourcesNode is the version-independent description of one node of a
// job. It is filled in by the Slurm 23 and Slurm 24 converters in this file
// and carries no JSON/YAML tags because it is never decoded directly.
type jobResourcesNode struct {
// Hostname is the node name taken from the squeue output.
Hostname string
// NCores is the number of cores counted for this node by the converter.
NCores int
// Memory is the node's allocated memory, already converted to bytes.
Memory jobperf.Bytes
}

type jobResources struct {
Nodes string `json:"nodes" yaml:"nodes"`
AllocatedCores int `json:"allocated_cores" yaml:"allocated_cores"`
AllocatedCpus int `json:"allocated_cpus" yaml:"allocated_cpus"`
AllocatedHosts int `json:"allocated_hosts" yaml:"allocated_hosts"`
AllocatedNodes []AllocatedNodes `json:"allocated_nodes" yaml:"allocated_nodes"`
Nodes []jobResourcesNode
}

// fromSlurm23JobResources replaces jr.Nodes with one entry per node found in
// s23jr.AllocatedNodes (Slurm 23 layout).
//
// For every node the core count is the total number of core entries over all
// of its sockets, and the memory is memory_allocated scaled by 1024*1024
// (i.e. the input is presumably in MiB — confirm against Slurm docs).
//
// The returned error is always nil; the error result exists so the method has
// the same shape as the other converter.
func (jr *jobResources) fromSlurm23JobResources(s23jr *slurm23JobResources) error {
	converted := make([]jobResourcesNode, len(s23jr.AllocatedNodes))
	for i := range s23jr.AllocatedNodes {
		alloc := &s23jr.AllocatedNodes[i]

		coreCount := 0
		for _, socket := range alloc.Sockets {
			coreCount += len(socket.Cores)
		}

		converted[i] = jobResourcesNode{
			Hostname: alloc.Nodename,
			NCores:   coreCount,
			Memory:   jobperf.Bytes(1024 * 1024 * alloc.MemoryAllocated),
		}
	}
	jr.Nodes = converted
	return nil
}
// fromSlurm24JobResources replaces jr.Nodes with one entry per node found in
// s24jr.Nodes.Allocation (Slurm 24 layout).
//
// Only cores whose status list contains the exact string "ALLOCATED" are
// counted towards a node's core count. Memory is the node's allocated memory
// scaled by 1024*1024 (i.e. the input is presumably in MiB — confirm against
// Slurm docs).
//
// The returned error is always nil; the error result exists so the method has
// the same shape as the other converter.
func (jr *jobResources) fromSlurm24JobResources(s24jr *slurm24JobResources) error {
	allocations := s24jr.Nodes.Allocation
	converted := make([]jobResourcesNode, len(allocations))
	for i := range allocations {
		alloc := &allocations[i]

		allocatedCores := 0
		for _, socket := range alloc.Sockets {
			for _, core := range socket.Cores {
				if !slices.Contains(core.Status, "ALLOCATED") {
					continue
				}
				allocatedCores++
			}
		}

		converted[i] = jobResourcesNode{
			Hostname: alloc.Name,
			NCores:   allocatedCores,
			Memory:   jobperf.Bytes(1024 * 1024 * alloc.Memory.Allocated),
		}
	}
	jr.Nodes = converted
	return nil
}

// UnmarshalJSON implements json.Unmarshaler for jobResources, accepting the
// job resources object in either the Slurm 23 or the Slurm 24 layout.
//
// The Slurm 23 layout is tried first. The two layouts disagree on the type of
// the "nodes" key (a string in Slurm 23, an object in Slurm 24), so Slurm 24
// input fails the first decode and falls through to the second attempt.
//
// If neither layout decodes, the returned error wraps both decode errors, so
// errors.Is / errors.As can inspect either one.
func (jr *jobResources) UnmarshalJSON(b []byte) error {
	var s23jr slurm23JobResources
	s23err := json.Unmarshal(b, &s23jr)
	if s23err == nil {
		return jr.fromSlurm23JobResources(&s23jr)
	}

	var s24jr slurm24JobResources
	s24err := json.Unmarshal(b, &s24jr)
	if s24err == nil {
		return jr.fromSlurm24JobResources(&s24jr)
	}

	// Both errors are wrapped with %w (Go 1.20+) so neither cause is lost
	// from the error chain; the rendered message is unchanged.
	return fmt.Errorf("failed to parse job resource using both slurm23 format: %w and slurm24: %w", s23err, s24err)
}

// UnmarshalYAML implements the yaml.v3 Unmarshaler interface for
// jobResources, accepting the job resources node in either the Slurm 23 or
// the Slurm 24 layout.
//
// The Slurm 23 layout is tried first. The two layouts disagree on the type of
// the "nodes" key (a string in Slurm 23, a mapping in Slurm 24), so Slurm 24
// input is expected to fail the first decode and fall through to the second.
//
// If neither layout decodes, the returned error wraps both decode errors, so
// errors.Is / errors.As can inspect either one.
func (jr *jobResources) UnmarshalYAML(n *yaml.Node) error {
	var s23jr slurm23JobResources
	s23err := n.Decode(&s23jr)
	if s23err == nil {
		return jr.fromSlurm23JobResources(&s23jr)
	}

	var s24jr slurm24JobResources
	s24err := n.Decode(&s24jr)
	if s24err == nil {
		return jr.fromSlurm24JobResources(&s24jr)
	}

	// Both errors are wrapped with %w (Go 1.20+) so neither cause is lost
	// from the error chain; the rendered message is unchanged.
	return fmt.Errorf("failed to parse job resource using both slurm23 format: %w and slurm24: %w", s23err, s24err)
}

type optionalValue struct {
Expand Down Expand Up @@ -190,20 +277,16 @@ func (e jobEngine) squeueGetJobByID(jobID string) (*jobperf.Job, error) {
return nil, fmt.Errorf("unexpected number of jobs returned from squeue: %v", len(parsed.Jobs))
}
parsedJob := parsed.Jobs[0]
if len(parsedJob.GresDetail) > 0 && len(parsedJob.GresDetail) != len(parsedJob.JobResources.AllocatedNodes) {
return nil, fmt.Errorf("expected gres_detail to be empty or equal to number of nodes: %v != %v", len(parsedJob.GresDetail), len(parsedJob.JobResources.AllocatedNodes))
if len(parsedJob.GresDetail) > 0 && len(parsedJob.GresDetail) != len(parsedJob.JobResources.Nodes) {
return nil, fmt.Errorf("expected gres_detail to be empty or equal to number of nodes: %v != %v", len(parsedJob.GresDetail), len(parsedJob.JobResources.Nodes))
}

var nodes []jobperf.Node

totalCores := 0
totalGPUs := 0
var totalMemoryBytes jobperf.Bytes = 0
for i, n := range parsedJob.JobResources.AllocatedNodes {
nCores := 0
for _, s := range n.Sockets {
nCores += len(s.Cores)
}
for i, n := range parsedJob.JobResources.Nodes {
nGPUs := 0
if len(parsedJob.GresDetail) > 0 {
nGPUs, err = nGPUFromGRESDetails(parsedJob.GresDetail[i])
Expand All @@ -212,14 +295,14 @@ func (e jobEngine) squeueGetJobByID(jobID string) (*jobperf.Job, error) {
}
}
node := jobperf.Node{
Hostname: n.Nodename,
Memory: jobperf.Bytes(1024 * 1024 * n.MemoryAllocated),
NCores: nCores,
Hostname: n.Hostname,
Memory: n.Memory,
NCores: n.NCores,
NGPUs: nGPUs,
}
nodes = append(nodes, node)

totalCores += nCores
totalCores += n.NCores
totalGPUs += nGPUs
totalMemoryBytes += node.Memory
}
Expand Down

0 comments on commit 5d3d810

Please sign in to comment.