Skip to content

Commit

Permalink
Merge pull request #141 from Cray-HPE/release/3.17.0
Browse files Browse the repository at this point in the history
Release/3.17.0
  • Loading branch information
dlaine-hpe authored Aug 29, 2024
2 parents 9e6d672 + 8930914 commit 65afcb4
Show file tree
Hide file tree
Showing 15 changed files with 309 additions and 65 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [3.17.0] - 2024-08-29
### Added
- CASMCMS-8979 - add a status endpoint for the remote build nodes.
- CASMCMS-8977 - check that the ssh key is present each time a remote job is spawned.
- CASMINST-6602 - enable dkms by default.
- CASMTRIAGE-7169 - job memory size was not getting picked up correctly from the ims configuration settings.
- CASMCMS-9040 - change read/write permissions of recipe config files output in image.

### Dependencies
- CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatibility

## [3.16.2] - 2024-07-25
### Dependencies
- Resolved CVE: Require `setuptools` >= 70.0
Expand Down
76 changes: 76 additions & 0 deletions api/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,46 @@ paths:
$ref: '#/components/responses/NotFound'
'500':
$ref: '#/components/responses/InternalServerError'
/v3/remote-build-nodes/status/{remote_build_node_xname}:
parameters:
- $ref: '#/components/parameters/remote_build_node_xname'
get:
summary: Retrieve the status of a single remote build node
operationId: get_all_v3_remote_build_status
tags:
- remote build node status
- v3
description: Retrieve the status of the remote build node identified by the given xname.
responses:
'200':
description: A collection of the status of each remote build node
content:
application/json:
schema:
items:
$ref: '#/components/schemas/RemoteBuildNodeStatus'
type: array
'500':
$ref: '#/components/responses/InternalServerError'
/v3/remote-build-nodes/status:
get:
summary: List remote build node status objects
operationId: get_all_v3_remote_build_status
tags:
- remote build node status
- v3
description: Retrieve the status of all remote build nodes that are registered with IMS.
responses:
'200':
description: A collection of the status of each remote build node
content:
application/json:
schema:
items:
$ref: '#/components/schemas/RemoteBuildNodeStatus'
type: array
'500':
$ref: '#/components/responses/InternalServerError'
/v3/jobs:
get:
summary: Retrieve a list of JobRecords that are registered with IMS
Expand Down Expand Up @@ -2072,6 +2112,42 @@ components:
example: x3000c1s10b1n0
type: string
minLength: 1
RemoteBuildNodeStatus:
description: A Remote Build Node Status
type: object
required:
- xname
properties:
xname:
description: Xname of the remote build node
example: x3000c1s10b1n0
type: string
minLength: 1
nodeArch:
description: Architecture of the remote build node
example: x86_64
type: string
minLength: 1
numCurrentJobs:
description: Number of current jobs running on the remote build node
example: 15
type: integer
minLength: 1
podmanStatus:
description: Status of the podman executable on the remote build node
example: Podman present at /usr/bin/podman
type: string
minLength: 1
sshStatus:
description: Status of the ssh connection to the remote build node
example: SSH connection established
type: string
minLength: 1
ableToRunJobs:
description: If the node is able to run new jobs
example: True
type: boolean
minLength: 1
ArtifactLinkRecord:
description: An Artifact Link Record
type: object
Expand Down
6 changes: 3 additions & 3 deletions constraints.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Constraints updated 5/29/2024
# Constraints updated 8/27/2024
aniso8601==9.0.1
boto3==1.34.114
botocore==1.34.114
Expand All @@ -18,8 +18,8 @@ idna==3.7
itsdangerous==2.2.0
Jinja2==3.1.4
jmespath==1.0.1
# CSM 1.6 uses Kubernetes 1.22, so use client v22.x to ensure compatability
kubernetes==22.6.0
# CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatibility
kubernetes>=24.2,<24.3
MarkupSafe==2.1.5
marshmallow==3.21.2
oauthlib==3.2.2
Expand Down
4 changes: 2 additions & 2 deletions kubernetes/cray-ims/values.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2021-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2021-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -88,7 +88,7 @@ customer_access:
subnet_name: "cmn"

jobs:
enable_dkms: false
enable_dkms: true
kata_runtime: "kata-qemu"
aarch64_runtime: "kata-qemu"

Expand Down
35 changes: 25 additions & 10 deletions src/server/models/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
from marshmallow.validate import Length, OneOf, Range

from src.server.helper import ARCH_ARM64, ARCH_X86_64
from src.server.vault import test_private_key_file
from src.server.models.remote_build_nodes import RemoteNodeStatus

JOB_TYPE_CREATE = 'create'
JOB_TYPE_CUSTOMIZE = 'customize'
Expand Down Expand Up @@ -83,7 +85,7 @@ def __init__(self, job_type, artifact_id, id=None, created=None, status=None,
kubernetes_configmap=None, enable_debug=False,
build_env_size=None, image_root_archive_name=None, kernel_file_name=None,
initrd_file_name=None, resultant_image_id=None, ssh_containers=None,
kubernetes_namespace=None, kernel_parameters_file_name=None, require_dkms=False,
kubernetes_namespace=None, kernel_parameters_file_name=None, require_dkms=True,
arch=None, job_mem_size=None, kubernetes_pvc=None, remote_build_node=""):
# Supplied
# v2.0
Expand Down Expand Up @@ -161,11 +163,11 @@ class V2JobRecordInputSchema(Schema):
ssh_containers = fields.List(fields.Nested(SshContainerInputSchema()), allow_none=True)

# v2.1
require_dkms = fields.Boolean(required=False, load_default=False, dump_default=False,
require_dkms = fields.Boolean(required=False, load_default=True, dump_default=True,
metadata={"metadata": {"description": "Job requires the use of dkms"}})

# v2.2
job_mem_size = fields.Integer(load_default=1, dump_default=1,
job_mem_size = fields.Integer(dump_default=1, required=False,
validate=Range(min=1, error="build_env_size must be greater than or equal to 1"),
metadata={"metadata": {"description": "Approximate working memory in GiB to reserve for the build job "
"environment (loosely proportional to the final image size)"}})
Expand Down Expand Up @@ -259,13 +261,26 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str:
"""
app.logger.info(f"Checking for remote build node for job")
best_node = ""
best_node_job_count = 10000
best_node_job_count = 10000 # seed with a really big number of jobs

# make sure the ssh key was set up correctly
if not test_private_key_file(app):
app.logger.error("Problem with ssh key - unable to create remote jobs")
return best_node

# Since the ssh key is good - look for a valid node
for xname, remote_node in app.data['remote_build_nodes'].items():
arch, numJobs = remote_node.getStatus()
if arch != None and arch == job.arch:
app.logger.info(f"Matching remote node: {xname}, current jobs on node: {numJobs}")
# matching arch - can use the node, now pick the best
if best_node == "" or numJobs < best_node_job_count:
nodeStatus = remote_node.getStatus()
if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch:
app.logger.info(f"Matching remote node: {xname}, current jobs on node: {nodeStatus.numCurrentJobs}")

# -1 means no job information, make sure we don't prefer those nodes
numNodeJobs = nodeStatus.numCurrentJobs
if numNodeJobs == -1:
numNodeJobs = 10000

# matching arch - can use the node, now pick the node with the least jobs running
if best_node == "" or numNodeJobs < best_node_job_count:
best_node = remote_node.xname
best_node_job_count = numJobs
best_node_job_count = numNodeJobs
return best_node
8 changes: 4 additions & 4 deletions src/server/models/recipes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2018-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2018-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -56,7 +56,7 @@ class V2RecipeRecord:

# pylint: disable=W0622
def __init__(self, name, recipe_type, linux_distribution, link=None, id=None, created=None,
template_dictionary=None, require_dkms=False, arch=ARCH_X86_64):
template_dictionary=None, require_dkms=True, arch=ARCH_X86_64):
# Supplied
self.name = name
self.link = link
Expand Down Expand Up @@ -91,7 +91,7 @@ class V2RecipeRecordInputSchema(Schema):
template_dictionary = fields.List(fields.Nested(RecipeKeyValuePair()), required=False, allow_none=True)

# v2.2
require_dkms = fields.Boolean(load_default=False, dump_default=False,
require_dkms = fields.Boolean(load_default=True, dump_default=True,
metadata={"metadata": {"description": "Recipe requires the use of dkms"}})
arch = fields.Str(required=False, metadata={"metadata": {"description": "Architecture of the recipe"}},
validate=OneOf([ARCH_ARM64,ARCH_X86_64]), load_default=ARCH_X86_64, dump_default=ARCH_X86_64)
Expand Down Expand Up @@ -126,6 +126,6 @@ class V2RecipeRecordPatchSchema(Schema):
arch = fields.Str(required=False, validate=OneOf([ARCH_ARM64,ARCH_X86_64]),
load_default=ARCH_X86_64, dump_default=ARCH_X86_64,
metadata={"metadata": {"description": "Architecture of the recipe"}})
require_dkms = fields.Boolean(required=False, load_default=False, dump_default=False,
require_dkms = fields.Boolean(required=False, load_default=True, dump_default=True,
metadata={"metadata": {"description": "Recipe requires the use of dkms"}})
template_dictionary = fields.List(fields.Nested(RecipeKeyValuePair()), required=False, allow_none=True)
71 changes: 48 additions & 23 deletions src/server/models/remote_build_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"""

import socket
import json
from flask import current_app as app

from marshmallow import Schema, fields, post_load, RAISE
Expand All @@ -38,6 +39,20 @@

from src.server.helper import ARCH_ARM64, ARCH_X86_64

class RemoteNodeStatus:
    """Hold the current status of a remote build node.

    A freshly created status is pessimistic: the ssh, podman and
    architecture fields read "Unknown", the job count is -1 and the node
    is flagged as unable to run jobs. Whoever probes the node is expected
    to overwrite the individual fields as each check completes.
    """

    def __init__(self, xname: str) -> None:
        """Create a status record for the node identified by ``xname``."""
        self.xname = xname
        self.sshStatus = "Unknown"
        self.podmanStatus = "Unknown"
        self.nodeArch = "Unknown"
        # -1 marks the number of running jobs as not (yet) determined
        self.numCurrentJobs = -1
        self.ableToRunJobs = False

    def toJson(self):
        """Return the status fields as a plain, JSON-serializable dict.

        A shallow copy of the instance attributes is returned rather than
        the live ``__dict__``, so callers may freely modify the payload
        without altering this status object.
        """
        return dict(self.__dict__)

class V3RemoteBuildNodeRecord:
""" The RemoteBuildNodeRecord object """

Expand All @@ -49,21 +64,19 @@ def __init__(self, xname):
def __repr__(self):
return '<V3RemoteBuildNodeRecord(xname={self.xname!r})>'.format(self=self)

def getStatus(self) -> (str, int): #(arch, current jobs)
def getStatus(self) -> RemoteNodeStatus:
"""
Utility function to verify that a node is set up and available for remote
builds. If the node can not be contacted or is not set up for running IMS
jobs, this will return (None,None)
Returns:
Archetecture of the node if it can be determined
Number of jobs currently running on the node
RemoteNodeStatus object with details about the current state of the
remote build node.
"""

# start with status Invalid
arch = None
numJobs = None
status = RemoteNodeStatus(self.xname)

# connect to the remote node
connect_kwargs = {"key_filename": "/app/id_ecdsa"}
Expand All @@ -75,7 +88,9 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
except (BadHostKeyException, AuthenticationException, NoValidConnectionsError,
SSHException, socket.error) as error:
app.logger.error(f"Unable to connect to node: {self.xname}, Error: {error}")
return arch, numJobs
status.sshStatus = f"Unable to connect to node. Error: {error}"
return status
status.sshStatus = "SSH connection established."

# make sure the above connection gets closed on exit
try:
Expand All @@ -86,20 +101,23 @@ def getStatus(self) -> (str, int): #(arch, current jobs)

# check result
if result.exited != 0:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout} {result.stderr}")
return arch, numJobs
app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {result.stdout} {result.stderr}")
status.nodeArch = f"Unable to determine architecture of node. Error: {result.stdout} {result.stderr}"
return status

# see if we can pull out a known arch type
if "aarch64" in result.stdout:
arch = ARCH_ARM64
status.nodeArch = ARCH_ARM64
elif "x86" in result.stdout:
arch = ARCH_X86_64
status.nodeArch = ARCH_X86_64
else:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout}")
return arch, numJobs
app.logger.error(f"Undefined architecture type for node: {self.xname}, Error: {result.stdout}")
status.nodeArch = f"Undefined architecture type for node, result: {result.stdout}"
return status
except (UnexpectedExit, Failure) as error:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {error}")
return arch, numJobs
app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {error}")
status.nodeArch = f"Unable to determine architecture of node. Error: {error}"
return status

# ensure it has podman installed
try:
Expand All @@ -109,16 +127,26 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
# check result
if result.exited != 0:
app.logger.error(f"Unable to determine if podman is installed on node: {self.xname}, Error: {result.stdout} {result.stderr}")
return None,None
status.podmanStatus = f"Unable to determine if podman is installed on node. Error: {result.stdout} {result.stderr}"
return status

# make sure the expected podman executable path was reported
if "/usr/bin/podman" not in result.stdout:
app.logger.error(f"Podman not installed on node: {self.xname}, Error: {result.stdout}")
return
status.podmanStatus = f"Podman not installed on node."
return status

# report podman is present
status.podmanStatus = f"Podman present at /usr/bin/podman"
except (UnexpectedExit, Failure) as error:
app.logger.error(f"Unable determine if tools are installed on node: {self.xname}, Error: {error}")
return None,None
status.podmanStatus = f"Unable determine if tools are installed on node. Error: {error}"
return status

# Don't fail the remote node over gathering number of current jobs - mark
# the node as valid now.
status.ableToRunJobs = True

# Every running IMS job will create a working directory '/tmp/ims_(IMS_JOB_ID)'.
# Count the number of these directories to find the number of running jobs on
# the node - they are cleaned up when the job is complete on the node.
Expand All @@ -128,19 +156,16 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
if result.exited != 0:
# let this go through and schedule a job on the node
app.logger.error(f"Unable to determine number of jobs on node: {self.xname}, Error: {result.stdout} {result.stderr}")
numJobs = 0
else:
numJobs = int(result.stdout)
status.numCurrentJobs = int(result.stdout)
except (UnexpectedExit, Failure) as error:
# Just log this, but allow the job to run
app.logger.error(f"Unable determine number of running jobs on node: {self.xname}, Error: {error}")
numJobs = 0
finally:
# close the active connection
c.close()

return arch, numJobs

return status

class V3RemoteBuildNodeRecordInputSchema(Schema):
""" A schema specifically for defining and validating user input """
Expand Down
Loading

0 comments on commit 65afcb4

Please sign in to comment.