Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(operator/inventory): reset node inventory on watcher reconnect #228

Merged
merged 1 commit into from
Apr 10, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 13 additions & 18 deletions operator/inventory/node-discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,6 @@
gpusIDs := make(RegistryGPUVendors)
currLabels := make(map[string]string)
currPods := make(map[string]corev1.Pod)
currPodsInitCount := 0

select {
case <-dp.ctx.Done():
Expand Down Expand Up @@ -429,18 +428,13 @@
pods, terr := kc.CoreV1().Pods(corev1.NamespaceAll).List(dp.ctx, metav1.ListOptions{
FieldSelector: fields.OneTermEqualSelector("spec.nodeName", dp.name).String(),
})

if terr != nil {
return terr
}

for name := range currPods {
pod := currPods[name]

subPodAllocatedResources(&node, &pod)
nodeResetAllocated(&node)

delete(currPods, name)
}
currPods = make(map[string]corev1.Pod)

for name := range pods.Items {
pod := pods.Items[name].DeepCopy()
Expand All @@ -449,8 +443,6 @@
currPods[pod.Name] = *pod
}

currPodsInitCount = len(currPods)

return nil
}

Expand Down Expand Up @@ -535,9 +527,9 @@
if _, exists := currPods[obj.Name]; !exists {
currPods[obj.Name] = *obj.DeepCopy()
addPodAllocatedResources(&node, obj)
} else {
currPodsInitCount--
}

signalState()
case watch.Deleted:
pod, exists := currPods[obj.Name]
if !exists {
Expand All @@ -547,14 +539,8 @@

subPodAllocatedResources(&node, &pod)

if currPodsInitCount > 0 {
currPodsInitCount--
}

delete(currPods, obj.Name)
}

if currPodsInitCount == 0 {
signalState()
}
case <-statech:
Expand Down Expand Up @@ -660,6 +646,15 @@
return res, nil
}

func nodeResetAllocated(node *v1.Node) {
node.Resources.CPU.Quantity.Allocated = resource.NewMilliQuantity(0, resource.DecimalSI)
node.Resources.GPU.Quantity.Allocated = resource.NewQuantity(0, resource.DecimalSI)
node.Resources.Memory.Quantity.Allocated = resource.NewMilliQuantity(0, resource.DecimalSI)
node.Resources.EphemeralStorage.Allocated = resource.NewMilliQuantity(0, resource.DecimalSI)
node.Resources.VolumesAttached.Allocated = resource.NewMilliQuantity(0, resource.DecimalSI)
node.Resources.VolumesMounted.Allocated = resource.NewMilliQuantity(0, resource.DecimalSI)
}

func addPodAllocatedResources(node *v1.Node, pod *corev1.Pod) {
for _, container := range pod.Spec.Containers {
for name, quantity := range container.Resources.Requests {
Expand Down Expand Up @@ -777,7 +772,7 @@
return res, node
}

res[builder.AkashManagedLabelName] = "true"

Check failure on line 775 in operator/inventory/node-discovery.go

View workflow job for this annotation

GitHub Actions / lint

string `true` has 3 occurrences, make it a constant (goconst)

allowedSc := adjConfig.StorageClassesForNode(knode.Name)
for _, class := range allowedSc {
Expand Down
Loading