Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix Cluster Autoscaler Evictions #100

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Kubernetes Cluster Autoscaler evicting pods on scale down

### Security

## 0.3.2
Expand Down
24 changes: 22 additions & 2 deletions prefect_kubernetes/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,15 @@ def _get_default_job_manifest_template() -> Dict[str, Any]:
"generateName": "{{ name }}-",
},
"spec": {
"backoffLimit": 0,
"backoffLimit": "{{ backoff_limit }}",
"ttlSecondsAfterFinished": "{{ finished_job_ttl }}",
"template": {
"metadata": {
"annotations": {
"cluster-autoscaler.kubernetes.io/"
"safe-to-evict": "{{ safe_to_evict }}"
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line break was introduced because of conflicting rules between black and flake8 (see the linked docs for reference). I am open to alternatives, but the options I saw were: keeping the line break, arbitrarily increasing the maximum line length, or ignoring the E501 rule.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to add a comment (e.g. `# noqa: E501`) to ignore the rule on that line!

}
},
"spec": {
"parallelism": 1,
"completions": 1,
Expand All @@ -189,7 +195,7 @@ def _get_default_job_manifest_template() -> Dict[str, Any]:
"args": "{{ command }}",
}
],
}
},
},
},
}
Expand Down Expand Up @@ -516,6 +522,20 @@ class KubernetesWorkerVariables(BaseVariables):
default=None,
description="The Kubernetes cluster config to use for job creation.",
)
safe_to_evict: bool = Field(
default=False,
description="If set to True and using a Cluster Autoscaler, "
"the Pod for a job is allowing rescheduling on a different node. "
"Should only be used if your workloads are fault tolerant and you have "
"increased the backoff limit. Not doing so will cause the Flow to exhibit "
"a Crashed state",
)
backoff_limit: int = Field(
default=0,
description="If value is not 0, then the job may recreate the pod without "
"failing the kubernetes Job. This will cause a flow to restart from the "
"beginning. Should only be used if your workloads are fault tolerant.",
)


class KubernetesWorkerResult(BaseWorkerResult):
Expand Down
34 changes: 28 additions & 6 deletions tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ def enable_store_api_key_in_secret(monkeypatch):
"spec": {
"backoffLimit": 0,
"template": {
"metadata": {
"annotations": {
"cluster-autoscaler.kubernetes.io/safe-to-evict": False
}
},
"spec": {
"parallelism": 1,
"completions": 1,
Expand All @@ -191,7 +196,7 @@ def enable_store_api_key_in_secret(monkeypatch):
"imagePullPolicy": "IfNotPresent",
}
],
}
},
},
},
},
Expand Down Expand Up @@ -236,6 +241,11 @@ def enable_store_api_key_in_secret(monkeypatch):
"spec": {
"backoffLimit": 0,
"template": {
"metadata": {
"annotations": {
"cluster-autoscaler.kubernetes.io/safe-to-evict": False
}
},
"spec": {
"parallelism": 1,
"completions": 1,
Expand All @@ -262,7 +272,7 @@ def enable_store_api_key_in_secret(monkeypatch):
"args": ["python", "-m", "prefect.engine"],
}
],
}
},
},
},
},
Expand Down Expand Up @@ -578,6 +588,8 @@ def enable_store_api_key_in_secret(monkeypatch):
"image": "test-image:latest",
"finished_job_ttl": 60,
"namespace": "test-namespace",
"safe_to_evict": True,
"backoff_limit": 6,
},
KubernetesWorkerJobConfiguration(
command="echo hello",
Expand All @@ -598,9 +610,14 @@ def enable_store_api_key_in_secret(monkeypatch):
"generateName": "test-",
},
"spec": {
"backoffLimit": 0,
"backoffLimit": 6,
"ttlSecondsAfterFinished": 60,
"template": {
"metadata": {
"annotations": {
"cluster-autoscaler.kubernetes.io/safe-to-evict": True
}
},
"spec": {
"parallelism": 1,
"completions": 1,
Expand All @@ -617,7 +634,7 @@ def enable_store_api_key_in_secret(monkeypatch):
"args": "echo hello",
}
],
}
},
},
},
},
Expand Down Expand Up @@ -663,9 +680,14 @@ def enable_store_api_key_in_secret(monkeypatch):
},
},
"spec": {
"backoffLimit": 0,
"backoffLimit": 6,
"ttlSecondsAfterFinished": 60,
"template": {
"metadata": {
"annotations": {
"cluster-autoscaler.kubernetes.io/safe-to-evict": True
}
},
"spec": {
"parallelism": 1,
"completions": 1,
Expand Down Expand Up @@ -697,7 +719,7 @@ def enable_store_api_key_in_secret(monkeypatch):
"args": ["echo", "hello"],
}
],
}
},
},
},
},
Expand Down
Loading