Add logic to is_backing_instance_valid to check the IP to make sure the instance matches what is being tracked

The missing instance map did not track the IP address associated with each Slurm node.
Because of this, if a new instance was launched before the previous one became healthy, the counter
in the instance count map was never reset. This change uses a class to track the data, linking the node name to its IP.
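
To make the fix concrete, here is a minimal, self-contained sketch of the counting behavior introduced below (a simplification, not the real SlurmNode method: the node argument only needs name and nodeaddr attributes, and the helper name is made up for illustration):

class MissingInstance:
    """Tracks how many iterations a Slurm node has had no backing EC2 instance, and at which IP."""

    def __init__(self, name, ip, count):
        self.name = name
        self.ip = ip
        self.count = count


def track_missing_instance(node, count_map, max_count):
    # Simplified stand-in for the bookkeeping inside is_backing_instance_valid.
    entry = count_map.get(node.name)
    count = entry.count if entry else 0
    if entry and node.nodeaddr != entry.ip:
        # The node now points at a different instance (a new launch), so restart the count.
        count = 0
    if count >= max_count:
        # The grace period is exhausted; treat the backing instance as invalid.
        count_map.pop(node.name, None)
        return False
    # Still within the allowed iterations: record the current IP and increment the count.
    count_map[node.name] = MissingInstance(node.name, node.nodeaddr, count + 1)
    return True

With the IP recorded alongside the count, a node whose nodeaddr changes to a new instance before max_count is reached gets a full grace window again instead of inheriting the previous instance's count.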
dreambeyondorange committed Feb 28, 2024
1 parent 293efb7 commit 3952665
Showing 2 changed files with 38 additions and 9 deletions.
23 changes: 20 additions & 3 deletions src/slurm_plugin/slurm_resources.py
@@ -180,6 +180,17 @@ class SlurmReservation:
users: str


class MissingInstance:
name: str
ip: str
count: int

def __init__(self, name, ip, count):
self.name = name
self.ip = ip
self.count = count


class SlurmNode(metaclass=ABCMeta):
SLURM_SCONTROL_COMPLETING_STATE = "COMPLETING"
SLURM_SCONTROL_BUSY_STATES = {"MIXED", "ALLOCATED", SLURM_SCONTROL_COMPLETING_STATE}
@@ -427,7 +438,7 @@ def is_powering_down_with_nodeaddr(self):
def is_backing_instance_valid(
self,
ec2_instance_missing_max_count,
nodes_without_backing_instance_count_map: dict,
nodes_without_backing_instance_count_map: dict[str, MissingInstance],
log_warn_if_unhealthy=True,
):
"""Check if a slurm node's addr is set, it points to a valid instance in EC2."""
@@ -443,9 +454,14 @@ def is_backing_instance_valid(
self,
self.state_string,
)

# Allow a few iterations for the eventual consistency of EC2 data
logger.debug(f"Map of slurm nodes without backing instances {nodes_without_backing_instance_count_map}")
missing_instance_loop_count = nodes_without_backing_instance_count_map.get(self.name, 0)
missing_instance = nodes_without_backing_instance_count_map.get(self.name, None)
missing_instance_loop_count = missing_instance.count if missing_instance else 0
if missing_instance and self.nodeaddr != missing_instance.ip:
# Reset the loop count since the nodeaddr has changed
missing_instance_loop_count = 0
# If the loop count has been reached, the instance is unhealthy and will be terminated
if missing_instance_loop_count >= ec2_instance_missing_max_count:
if log_warn_if_unhealthy:
@@ -454,7 +470,8 @@ def is_backing_instance_valid(
nodes_without_backing_instance_count_map.pop(self.name, None)
self.ec2_backing_instance_valid = False
else:
nodes_without_backing_instance_count_map[self.name] = missing_instance_loop_count + 1
instance_to_add = MissingInstance(self.name, self.nodeaddr, missing_instance_loop_count + 1)
nodes_without_backing_instance_count_map[self.name] = instance_to_add
if log_warn_if_unhealthy:
logger.warning(
f"Incrementing missing EC2 instance count for node {self.name} to "
24 changes: 18 additions & 6 deletions tests/slurm_plugin/slurm_resources/test_slurm_resources.py
Expand Up @@ -18,6 +18,7 @@
DynamicNode,
EC2InstanceHealthState,
InvalidNodenameError,
MissingInstance,
SlurmPartition,
SlurmResumeJob,
StaticNode,
@@ -1185,26 +1186,34 @@ def test_slurm_node_is_powering_down_with_nodeaddr(node, expected_result):
StaticNode("queue1-st-c5xlarge-1", "ip-1", "hostname", "IDLE+CLOUD+POWER", "queue1"),
None,
2,
{"queue1-st-c5xlarge-1": 1},
{"queue1-st-c5xlarge-1": 2},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 1)},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 2)},
True,
),
(
StaticNode("queue1-st-c5xlarge-1", "ip-1", "hostname", "IDLE+CLOUD+POWER", "queue1"),
None,
2,
{"queue1-st-c5xlarge-1": 2},
{"queue1-st-c5xlarge-1": 2},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 2)},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 2)},
False,
),
(
StaticNode("queue1-st-c5xlarge-1", "ip-1", "hostname", "IDLE+CLOUD+POWER", "queue1"),
"Instance",
2,
{"queue1-st-c5xlarge-1": 3},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 3)},
{},
True,
),
(
StaticNode("queue1-st-c5xlarge-1", "ip-1", "hostname", "IDLE+CLOUD+POWER", "queue1"),
"Instance",
3,
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-2", 2)},
{"queue1-st-c5xlarge-1": MissingInstance("queue1-st-c5xlarge-1", "ip-1", 1)},
True,
),
],
ids=[
"static_no_backing_zero_max_count",
@@ -1214,6 +1223,7 @@ def test_slurm_node_is_powering_down_with_nodeaddr(node, expected_result):
"static_no_backing_count_not_exceeded",
"static_no_backing_with_count_exceeded",
"static_backed_with_count_exceeded",
"static_no_backing_count_not_exceeded_with_wrong_ip",
],
)
def test_slurm_node_is_backing_instance_valid(node, instance, max_count, count_map, final_map, expected_result):
@@ -1225,7 +1235,9 @@ def test_slurm_node_is_backing_instance_valid(node, instance, max_count, count_m
).is_equal_to(expected_result)
assert_that(node.ec2_backing_instance_valid).is_equal_to(expected_result)
if count_map:
assert_that(count_map[node.name]).is_equal_to(final_map.get(node.name, None))
assert_that(count_map[node.name].count).is_equal_to(final_map.get(node.name, None).count)
assert_that(count_map[node.name].ip).is_equal_to(final_map.get(node.name, None).ip)
assert_that(count_map[node.name].ip).is_equal_to(node.nodeaddr)


@pytest.mark.parametrize(
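
For illustration, a hedged sketch of caller-side usage across health-check iterations. The surrounding loop and function name are assumptions, not part of this commit; only the method's signature and the map's element type come from the diff above, and the boolean return value is inferred from the tests.

# Hypothetical caller; the shared map persists between health-check iterations.
nodes_without_backing_instance_count_map = {}  # keyed by node name, values are MissingInstance

def find_nodes_without_backing_instance(nodes, ec2_instance_missing_max_count):
    unhealthy = []
    for node in nodes:
        if not node.is_backing_instance_valid(
            ec2_instance_missing_max_count,
            nodes_without_backing_instance_count_map,
            log_warn_if_unhealthy=True,
        ):
            unhealthy.append(node)
    return unhealthy

Because the map persists between calls, each MissingInstance entry carries both the iteration count and the IP forward, which is what allows the count to reset when a node's address changes to a newly launched instance.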
