Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements for K8S deployment (especially ITs) #18514

Merged
merged 14 commits into from
Sep 10, 2024
62 changes: 61 additions & 1 deletion .github/workflows/build_container_image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,67 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
ghcrbuild:
name: Build container image for GHCR
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# https://stackoverflow.com/questions/59810838/how-to-get-the-short-sha-for-the-github-workflow
- name: Set outputs
id: commit
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Set branch name
id: branch
run: |
if [[ "$GITHUB_REF" == "refs/tags/"* ]]; then
echo "name=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF" == "refs/heads/dev" ]]; then
echo "name=dev" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF" == "refs/heads/release_"* ]]; then
echo "name=${GITHUB_REF#refs/heads/release_}-auto" >> $GITHUB_OUTPUT
fi
shell: bash
- name: Extract metadata for container image
id: meta
uses: docker/metadata-action@v4
with:
images: ghcr.io/${{ github.repository }}
tags: |
type=raw,value=${{steps.branch.outputs.name}}
- name: Build args
id: buildargs
run: |
echo "gitcommit=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
echo "builddate=$(date -u +'%Y-%m-%dT%H:%M:%SZ')"

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
platforms: linux/amd64

- name: Login to GHCR
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build and push container image to ghcr
uses: docker/build-push-action@v4
with:
build-args: |
GIT_COMMIT=${{ steps.buildargs.outputs.gitcommit }}
BUILD_DATE=${{ steps.buildargs.outputs.builddate }}
IMAGE_TAG=${{ steps.branch.outputs.name }}
file: .k8s_ci.Dockerfile
push: true
context: .
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
platforms: linux/amd64

build:
name: Build container image
name: Build container image for Galaxy repos
runs-on: ubuntu-latest
if: github.repository_owner == 'galaxyproject'
steps:
Expand Down Expand Up @@ -58,3 +117,4 @@ jobs:
uses: actions-hub/docker@master
with:
args: push galaxy/galaxy-min:${{ steps.branch.outputs.name }}

2 changes: 1 addition & 1 deletion lib/galaxy/dependencies/conditional-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ custos-sdk
chronos-python==1.2.1

# Kubernetes job runner
pykube-ng==21.3.0
pykube-ng==23.6.0

# Synnefo / Pithos+ object store client
kamaki
Expand Down
107 changes: 79 additions & 28 deletions lib/galaxy/jobs/runners/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
)
from galaxy.jobs.runners.util.pykube_util import (
deduplicate_entries,
DEFAULT_INGRESS_API_VERSION,
DEFAULT_JOB_API_VERSION,
delete_ingress,
delete_job,
Expand All @@ -34,6 +35,7 @@
HTTPError,
Ingress,
ingress_object_dict,
is_pod_running,
is_pod_unschedulable,
Job,
job_object_dict,
Expand Down Expand Up @@ -109,6 +111,8 @@ def __init__(self, app, nworkers, **kwargs):
k8s_interactivetools_use_ssl=dict(map=bool, default=False),
k8s_interactivetools_ingress_annotations=dict(map=str),
k8s_interactivetools_ingress_class=dict(map=str, default=None),
k8s_interactivetools_tls_secret=dict(map=str, default=None),
k8s_ingress_api_version=dict(map=str, default=DEFAULT_INGRESS_API_VERSION),
)

if "runner_param_specs" not in kwargs:
Expand Down Expand Up @@ -249,6 +253,7 @@ def __configure_port_routing(self, ajs):
service = Service(self._pykube_api, k8s_service_obj)
service.create()
ingress = Ingress(self._pykube_api, k8s_ingress_obj)
ingress.version = self.runner_params["k8s_ingress_api_version"]
ingress.create()

def __get_overridable_params(self, job_wrapper, param_key):
Expand Down Expand Up @@ -429,6 +434,54 @@ def __get_k8s_service_spec(self, ajs):
}
return k8s_spec_template

def __get_k8s_ingress_rules_spec(self, ajs, entry_points):
"""This represents the template for the "rules" portion of the Ingress spec."""
if "v1beta1" in self.runner_params["k8s_ingress_api_version"]:
rules_spec = [
{
"host": ep["domain"],
"http": {
"paths": [
{
"backend": {
"serviceName": self.__get_k8s_job_name(
self.__produce_k8s_job_prefix(), ajs.job_wrapper
),
"servicePort": int(ep["tool_port"]),
},
"path": ep.get("entry_path", "/"),
"pathType": "Prefix",
}
]
},
}
for ep in entry_points
]
else:
rules_spec = [
{
"host": ep["domain"],
"http": {
"paths": [
{
"backend": {
"service": {
"name": self.__get_k8s_job_name(
self.__produce_k8s_job_prefix(), ajs.job_wrapper
),
"port": {"number": int(ep["tool_port"])},
}
},
"path": ep.get("entry_path", "/"),
"pathType": "ImplementationSpecific",
}
]
},
}
for ep in entry_points
]
return rules_spec

def __get_k8s_ingress_spec(self, ajs):
"""The k8s spec template is nothing but a Ingress spec, except that it is nested and does not have an apiversion
nor kind."""
Expand Down Expand Up @@ -466,39 +519,22 @@ def __get_k8s_ingress_spec(self, ajs):
},
"annotations": {"app.galaxyproject.org/tool_id": ajs.job_wrapper.tool.id},
},
"spec": {
"rules": [
{
"host": ep["domain"],
"http": {
"paths": [
{
"backend": {
"service": {
"name": self.__get_k8s_job_name(
self.__produce_k8s_job_prefix(), ajs.job_wrapper
),
"port": {"number": int(ep["tool_port"])},
}
},
"path": ep.get("entry_path", "/"),
"pathType": "Prefix",
}
]
},
}
for ep in entry_points
],
},
"spec": {"rules": self.__get_k8s_ingress_rules_spec(ajs, entry_points)},
}
default_ingress_class = self.runner_params.get("k8s_interactivetools_ingress_class")
if default_ingress_class:
k8s_spec_template["spec"]["ingressClassName"] = default_ingress_class
if self.runner_params.get("k8s_interactivetools_use_ssl"):
domains = list({e["domain"] for e in entry_points})
k8s_spec_template["spec"]["tls"] = [
{"hosts": [domain], "secretName": re.sub("[^a-z0-9-]", "-", domain)} for domain in domains
]
override_secret = self.runner_params.get("k8s_interactivetools_tls_secret")
if override_secret:
k8s_spec_template["spec"]["tls"] = [
{"hosts": [domain], "secretName": override_secret} for domain in domains
]
else:
k8s_spec_template["spec"]["tls"] = [
{"hosts": [domain], "secretName": re.sub("[^a-z0-9-]", "-", domain)} for domain in domains
]
if self.runner_params.get("k8s_interactivetools_ingress_annotations"):
new_ann = yaml.safe_load(self.runner_params.get("k8s_interactivetools_ingress_annotations"))
k8s_spec_template["metadata"]["annotations"].update(new_ann)
Expand Down Expand Up @@ -767,10 +803,14 @@ def check_watched_item(self, job_state):
pass
else:
pass
else:
elif self.__check_job_pod_running(job_state):
log.debug("Job set to running...")
job_state.running = True
job_state.job_wrapper.change_state(model.Job.states.RUNNING)
else:
log.debug(
f"Job id: {job_state.job_id} with k8s id: {job.name} scheduled and is waiting to start..."
)
return job_state
elif job_persisted_state == model.Job.states.DELETED:
# Job has been deleted via stop_job and job has not been deleted,
Expand Down Expand Up @@ -921,6 +961,17 @@ def __job_failed_due_to_low_memory(self, job_state):

return False

def __check_job_pod_running(self, job_state):
"""
checks the state of the pod to see if it is running.
"""
pods = find_pod_object_by_name(self._pykube_api, job_state.job_id, self.runner_params["k8s_namespace"])
if not pods.response["items"]:
return False

pod = Pod(self._pykube_api, pods.response["items"][0])
return is_pod_running(self._pykube_api, pod, self.runner_params["k8s_namespace"])

def __job_pending_due_to_unschedulable_pod(self, job_state):
"""
checks the state of the pod to see if it is unschedulable.
Expand Down
8 changes: 8 additions & 0 deletions lib/galaxy/jobs/runners/util/pykube_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def find_pod_object_by_name(pykube_api, job_name, namespace=None):
return Pod.objects(pykube_api).filter(selector=f"job-name={job_name}", namespace=namespace)


def is_pod_running(pykube_api, pod, namespace=None):
if pod.obj["status"].get("phase") == "Running":
return True

return False


def is_pod_unschedulable(pykube_api, pod, namespace=None):
is_unschedulable = any(c.get("reason") == "Unschedulable" for c in pod.obj["status"].get("conditions", []))
if pod.obj["status"].get("phase") == "Pending" and is_unschedulable:
Expand Down Expand Up @@ -311,6 +318,7 @@ def galaxy_instance_id(params):
"find_pod_object_by_name",
"galaxy_instance_id",
"HTTPError",
"is_pod_running",
"is_pod_unschedulable",
"Job",
"Service",
Expand Down
3 changes: 1 addition & 2 deletions lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,8 +2912,7 @@ def __init__(
@property
def active(self):
if self.configured and not self.deleted:
# FIXME: don't included queued?
return not self.job.finished
return self.job.running
nuwang marked this conversation as resolved.
Show resolved Hide resolved
return False

@property
Expand Down
Loading