From d0f0bca2e8918a88de8ce2a20216c19af17ee4e6 Mon Sep 17 00:00:00 2001 From: David Laine Date: Tue, 13 Aug 2024 11:58:43 -0500 Subject: [PATCH 1/8] CASMCMS-8979 - add remote build node status endpoint. --- CHANGELOG.md | 2 + api/openapi.yaml | 76 +++++++++++++++++++ src/server/models/jobs.py | 16 ++-- src/server/models/remote_build_nodes.py | 75 ++++++++++++------ src/server/v3/__init__.py | 10 ++- src/server/v3/resources/remote_build_nodes.py | 40 +++++++++- tests/v3/test_v3_remote_build_nodes.py | 50 +++++++++++- 7 files changed, 234 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8419877..9578e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- CASMCMS-8979 - add a status endpoint for the remote build nodes. ## [3.16.2] - 2024-07-25 ### Dependencies diff --git a/api/openapi.yaml b/api/openapi.yaml index cf2e6ab..b7f048e 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -629,6 +629,46 @@ paths: $ref: '#/components/responses/NotFound' '500': $ref: '#/components/responses/InternalServerError' + /v3/remote-build-nodes/status/{remote_build_node_xname}: + parameters: + - $ref: '#/components/parameters/remote_build_node_xname' + get: + summary: List remote build node status objects + operationId: get_all_v3_remote_build_status + tags: + - remote build node status + - v3 + description: Retrieve the status of all remote build nodes that are registered with IMS. + responses: + '200': + description: A collection of the status of each remote build node + content: + application/json: + schema: + items: + $ref: '#/components/schemas/RemoteBuildNodeStatus' + type: array + '500': + $ref: '#/components/responses/InternalServerError' + /v3/remote-build-nodes/status: + get: + summary: List remote build node status objects + operationId: get_all_v3_remote_build_status + tags: + - remote build node status + - v3 + description: Retrieve the status of all remote build nodes that are registered with IMS. + responses: + '200': + description: A collection of the status of each remote build node + content: + application/json: + schema: + items: + $ref: '#/components/schemas/RemoteBuildNodeStatus' + type: array + '500': + $ref: '#/components/responses/InternalServerError' /v3/jobs: get: summary: Retrieve a list of JobRecords that are registered with IMS @@ -2072,6 +2112,42 @@ components: example: x3000c1s10b1n0 type: string minLength: 1 + RemoteBuildNodeStatus: + description: A Remote Build Node Status + type: object + required: + - xname + properties: + xname: + description: Xname of the remote build node + example: x3000c1s10b1n0 + type: string + minLength: 1 + nodeArch: + description: Architecture of the remote build node + example: x86_64 + type: string + minLength: 1 + numCurrentJobs: + description: Number of current jobs running on the remote build node + example: 15 + type: integer + minLength: 1 + podmanStatus: + description: Status of the podman executable on the remote build node + example: Podman present at /usr/bin/podman + type: string + minLength: 1 + sshStatus: + description: Status of the ssh connection to the remote build node + example: SSH connection established + type: string + minLength: 1 + ableToRunJobs: + description: If the node is able to run new jobs + example: True + type: boolean + minLength: 1 ArtifactLinkRecord: description: An Artifact Link Record type: object diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index 8a0642f..076226e 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -34,6 +34,7 @@ from marshmallow.validate import Length, OneOf, Range from src.server.helper import ARCH_ARM64, ARCH_X86_64 +from src.server.models.remote_build_nodes import RemoteNodeStatus JOB_TYPE_CREATE = 'create' JOB_TYPE_CUSTOMIZE = 'customize' @@ -259,13 +260,14 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str: """ app.logger.info(f"Checking for remote build node for job") best_node = "" - best_node_job_count = 10000 + best_node_job_count = RemoteNodeStatus.UNKNOWN_NUM_JOBS - 1 + for xname, remote_node in app.data['remote_build_nodes'].items(): - arch, numJobs = remote_node.getStatus() - if arch != None and arch == job.arch: - app.logger.info(f"Matching remote node: {xname}, current jobs on node: {numJobs}") - # matching arch - can use the node, now pick the best - if best_node == "" or numJobs < best_node_job_count: + nodeStatus = remote_node.getStatus() + if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch: + app.logger.info(f"Matching remote node: {xname}, current jobs on node: {nodeStatus.numCurrentJobs}") + # matching arch - can use the node, now pick the node with the least jobs running + if best_node == "" or nodeStatus.numCurrentJobs < best_node_job_count: best_node = remote_node.xname - best_node_job_count = numJobs + best_node_job_count = nodeStatus.numCurrentJobs return best_node diff --git a/src/server/models/remote_build_nodes.py b/src/server/models/remote_build_nodes.py index 7130250..9ee3998 100644 --- a/src/server/models/remote_build_nodes.py +++ b/src/server/models/remote_build_nodes.py @@ -26,6 +26,7 @@ """ import socket +import json from flask import current_app as app from marshmallow import Schema, fields, post_load, RAISE @@ -38,6 +39,24 @@ from src.server.helper import ARCH_ARM64, ARCH_X86_64 +class RemoteNodeStatus: + """ Object to hold the current status of a remote build node """ + + # status variable to represent and unknown number of jobs on a node + UNKNOWN_NUM_JOBS = 10000 + + def __init__(self, xname: str) -> None: + self.xname = xname + self.sshStatus = "Unknown" + self.podmanStatus = "Unknown" + self.nodeArch = "Unknown" + self.numCurrentJobs = self.UNKNOWN_NUM_JOBS + self.ableToRunJobs = False + + def toJson(self): + return self.__dict__ + #return json.dumps(self, default=lambda o: o.__dict__) + class V3RemoteBuildNodeRecord: """ The RemoteBuildNodeRecord object """ @@ -49,21 +68,19 @@ def __init__(self, xname): def __repr__(self): return ''.format(self=self) - def getStatus(self) -> (str, int): #(arch, current jobs) + def getStatus(self) -> RemoteNodeStatus: """ Utility function to verify that a node is set up and available for remote builds. If the node can not be contacted or is not set up for running IMS jobs, this will return (None,None) Returns: - Archetecture of the node if it can be determined - Number of jobs currently running on the node - + RemoteNodeStatus object with details about the current state of the + remote build node. """ # start with status Invalid - arch = None - numJobs = None + status = RemoteNodeStatus(self.xname) # connect to the remote node connect_kwargs = {"key_filename": "/app/id_ecdsa"} @@ -75,7 +92,9 @@ def getStatus(self) -> (str, int): #(arch, current jobs) except (BadHostKeyException, AuthenticationException, NoValidConnectionsError, SSHException, socket.error) as error: app.logger.error(f"Unable to connect to node: {self.xname}, Error: {error}") - return arch, numJobs + status.sshStatus = f"Unable to connect to node. Error: {error}" + return status + status.sshStatus = "SSH connection established." # make sure the above connection gets closed on exit try: @@ -86,20 +105,23 @@ def getStatus(self) -> (str, int): #(arch, current jobs) # check result if result.exited != 0: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout} {result.stderr}") - return arch, numJobs + app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {result.stdout} {result.stderr}") + status.nodeArch = f"Unable to determine architecture of node. Error: {result.stdout} {result.stderr}" + return status # see if we can pull out a known arch type if "aarch64" in result.stdout: - arch = ARCH_ARM64 + status.nodeArch = ARCH_ARM64 elif "x86" in result.stdout: - arch = ARCH_X86_64 + status.nodeArch = ARCH_X86_64 else: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout}") - return arch, numJobs + app.logger.error(f"Undefined architecture type for node: {self.xname}, Error: {result.stdout}") + status.nodeArch = f"Undefined architecture type for node, result: {result.stdout}" + return status except (UnexpectedExit, Failure) as error: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {error}") - return arch, numJobs + app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {error}") + status.nodeArch = f"Unable to determine architecture of node. Error: {error}" + return status # insure it has podman installed try: @@ -109,16 +131,26 @@ def getStatus(self) -> (str, int): #(arch, current jobs) # check result if result.exited != 0: app.logger.error(f"Unable to determine if podman is installed on node: {self.xname}, Error: {result.stdout} {result.stderr}") - return None,None + status.podmanStatus = f"Unable to determine if podman is installed on node. Error: {result.stdout} {result.stderr}" + return status # see if we can pull out a known arch type if "/usr/bin/podman" not in result.stdout: app.logger.error(f"Podman not installed on node: {self.xname}, Error: {result.stdout}") - return + status.podmanStatus = f"Podman not installed on node." + return status + + # report podman is present + status.podmanStatus = f"Podman present at /usr/bin/podman" except (UnexpectedExit, Failure) as error: app.logger.error(f"Unable determine if tools are installed on node: {self.xname}, Error: {error}") - return None,None + status.podmanStatus = f"Unable determine if tools are installed on node. Error: {error}" + return status + # Don't fail the remote node over gathering number of current jobs - mark + # the node as valid now. + status.ableToRunJobs = True + # Every running IMS job will create a working directory '/tmp/ims_(IMS_JOB_ID)'. # Count the number of these directories to find the number of running jobs on # the node - they are cleaned up when the job is complete on the node. @@ -128,19 +160,16 @@ def getStatus(self) -> (str, int): #(arch, current jobs) if result.exited != 0: # let this go through and schedule a job on the node app.logger.error(f"Unable to determine number of jobs on node: {self.xname}, Error: {result.stdout} {result.stderr}") - numJobs = 0 else: - numJobs = int(result.stdout) + status.numCurrentJobs = int(result.stdout) except (UnexpectedExit, Failure) as error: # Just log this, but allow the job to run app.logger.error(f"Unable determine number of running jobs on node: {self.xname}, Error: {error}") - numJobs = 0 finally: # close tha active connection c.close() - return arch, numJobs - + return status class V3RemoteBuildNodeRecordInputSchema(Schema): """ A schema specifically for defining and validating user input """ diff --git a/src/server/v3/__init__.py b/src/server/v3/__init__.py index 39792d8..1ec3e5c 100644 --- a/src/server/v3/__init__.py +++ b/src/server/v3/__init__.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2022 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2022, 2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -41,7 +41,7 @@ V3RecipeResource, V3RecipeCollection, \ V3DeletedRecipeResource, V3DeletedRecipeCollection from src.server.v3.resources.remote_build_nodes import V3RemoteBuildNodeResource, \ - V3RemoteBuildNodeCollection + V3RemoteBuildNodeCollection, V3RemoteBuildStatus, V3RemoteBuildStatusCollection app_errors = { # Custom 405 error format to conform to RFC 7807 'MethodNotAllowed': json.loads( @@ -60,6 +60,12 @@ apiv3.add_resource(V3RemoteBuildNodeCollection, '/'.join([uri_prefix, 'remote-build-nodes']), endpoint='_'.join([endpoint_prefix, 'remote_build_nodes_collection'])) + apiv3.add_resource(V3RemoteBuildStatus, + '/'.join([uri_prefix, 'remote-build-nodes/status/']), + endpoint='_'.join([endpoint_prefix, 'remote_build_status'])) + apiv3.add_resource(V3RemoteBuildStatusCollection, + '/'.join([uri_prefix, 'remote-build-nodes/status']), + endpoint='_'.join([endpoint_prefix, 'remote_build_status_collection'])) apiv3.add_resource(V3PublicKeyResource, '/'.join([uri_prefix, 'public-keys/']), diff --git a/src/server/v3/resources/remote_build_nodes.py b/src/server/v3/resources/remote_build_nodes.py index 0ff0a3d..6c122be 100644 --- a/src/server/v3/resources/remote_build_nodes.py +++ b/src/server/v3/resources/remote_build_nodes.py @@ -32,12 +32,48 @@ from src.server.errors import problemify, generate_missing_input_response, generate_data_validation_failure, \ generate_resource_not_found_response from src.server.helper import get_log_id -from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord +from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord, RemoteNodeStatus from src.server.v3.models import PATCH_OPERATION_UNDELETE remote_build_node_user_input_schema = V3RemoteBuildNodeRecordInputSchema() remote_build_node_schema = V3RemoteBuildNodeRecordSchema() +class V3RemoteBuildStatus(Resource): + """ + Class for querying the current status of the remote build nodes + """ + + def get(self, remote_build_node_xname): + """ Retrieve a remote build node. """ + log_id = get_log_id() + current_app.logger.info("%s ++ remote_build_status.v3.GET %s", log_id, remote_build_node_xname) + + if remote_build_node_xname not in current_app.data['remote_build_nodes']: + current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname) + return generate_resource_not_found_response() + + return_json = current_app.data['remote_build_nodes'][remote_build_node_xname].getStatus().toJson() + #return_json = current_app.data['remote_build_nodes'][remote_build_node_xname].getStatus() + current_app.logger.info("%s Returning json response: %s", log_id, return_json) + return jsonify(return_json) + +class V3RemoteBuildStatusCollection(Resource): + """ + Class for querying the current status of all the remote build nodes + """ + + def get(self): + """ Retrieve a remote build node. """ + log_id = get_log_id() + current_app.logger.info("%s ++ remote_build_status_collection.v3.GET", log_id) + + return_json = [] + for remote_node in current_app.data['remote_build_nodes'].values(): + return_json.append(remote_node.getStatus().toJson()) + + current_app.logger.info("%s Returning json response: %s", log_id, return_json) + return jsonify(return_json) + class V3RemoteBuildNodeCollection(Resource): """ Class representing the operations that can be taken on a collection of remote builds nodes @@ -113,7 +149,7 @@ def get(self, remote_build_node_xname): current_app.logger.info("%s ++ remote_build_nodes.v3.GET %s", log_id, remote_build_node_xname) if remote_build_node_xname not in current_app.data['remote_build_nodes']: - current_app.logger.info("%s no IMS remote bild node matches xname=%s", log_id, remote_build_node_xname) + current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname) return generate_resource_not_found_response() return_json = remote_build_node_schema.dump(current_app.data['remote_build_nodes'][remote_build_node_xname]) diff --git a/tests/v3/test_v3_remote_build_nodes.py b/tests/v3/test_v3_remote_build_nodes.py index 55e1cba..be14a70 100644 --- a/tests/v3/test_v3_remote_build_nodes.py +++ b/tests/v3/test_v3_remote_build_nodes.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2023-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -143,5 +143,53 @@ def test_post_422_name_is_blank(self): check_error_responses(self, response, 422, ['status', 'title', 'detail', 'errors']) self.assertIn("xname", response.json["errors"], "Expected xname to be listed in error detail") +class TestV3RemoteBuildStatusEndpoint(TestCase): + """ + Test the remote-build-nodes/status/{remote_build_node_xname} endpoint (ims.v3.resources.remote_build_node.RemoteBuildStatus) + """ + + def setUp(self): + super(TestV3RemoteBuildStatusEndpoint, self).setUp() + self.app = self.useFixture(V3FlaskTestClientFixture()).client + self.data = { + 'xname': self.getUniqueString() + } + self.useFixture(V3RemoteBuildNodesDataFixture(initial_data=self.data)) + self.test_uri = '/v3/remote-build-nodes/status/{}'.format(self.data['xname']) + + def test_get(self): + """ Test the remote-build-nodes/status/{remote_build_node_xname} resource retrieval """ + response = self.app.get(self.test_uri) + self.assertEqual(response.status_code, 200, 'status code was not 200') + response_data = json.loads(response.data) + self.assertEqual(response_data['xname'], self.data['xname']) + + def test_get_404_bad_id(self): + """ Test the remote-build-nodes/status/{remote_build_node_xname} resource retrieval with an unknown id """ + response = self.app.get('/v3/remote-build-nodes/status/{}'.format(str(uuid.uuid4()))) + check_error_responses(self, response, 404, ['status', 'title', 'detail']) + +class TestV3RemoteBuildStatusCollectionEndpoint(TestCase): + """ + Test the remote-build-nodes/ collection endpoint (ims.v3.resources.remote_build_nodes.RemoteBuildStatusCollection) + """ + + def setUp(self): + super(TestV3RemoteBuildStatusCollectionEndpoint, self).setUp() + self.test_uri = '/v3/remote-build-nodes/status' + self.app = self.useFixture(V3FlaskTestClientFixture()).client + self.data = { + 'xname': self.getUniqueString() + } + self.test_remote_build_nodes = self.useFixture(V3RemoteBuildNodesDataFixture(initial_data=self.data)).datastore + + def test_get(self): + """ Test happy path GET """ + response = self.app.get(self.test_uri) + self.assertEqual(response.status_code, 200, 'status code was not 200') + self.assertThat(json.loads(response.data), HasLength(1), 'collection did not have an entry') + response_data = json.loads(response.data)[0] + self.assertEqual(response_data['xname'], self.data['xname']) + if __name__ == '__main__': unittest.main() From 79efb943e9f0bd1c50871332eef64f79547f8f7c Mon Sep 17 00:00:00 2001 From: David Laine Date: Wed, 21 Aug 2024 14:49:30 -0500 Subject: [PATCH 2/8] CASMCMS-8977 - refetch ssh key if not present. --- CHANGELOG.md | 1 + src/server/models/jobs.py | 7 +++++++ src/server/v3/resources/remote_build_nodes.py | 10 +++++++++- src/server/vault.py | 13 +++++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9578e8c..a47ea25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - CASMCMS-8979 - add a status endpoint for the remote build nodes. +- CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. ## [3.16.2] - 2024-07-25 ### Dependencies diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index 076226e..49d2360 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -34,6 +34,7 @@ from marshmallow.validate import Length, OneOf, Range from src.server.helper import ARCH_ARM64, ARCH_X86_64 +from src.server.vault import test_private_key_file from src.server.models.remote_build_nodes import RemoteNodeStatus JOB_TYPE_CREATE = 'create' @@ -262,6 +263,12 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str: best_node = "" best_node_job_count = RemoteNodeStatus.UNKNOWN_NUM_JOBS - 1 + # make sure the ssh key was set up correctly + if not test_private_key_file(app): + app.logger.error("Problem with ssh key - unable to create remote jobs") + return best_node + + # Since the ssh key is good - look for a valid node for xname, remote_node in app.data['remote_build_nodes'].items(): nodeStatus = remote_node.getStatus() if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch: diff --git a/src/server/v3/resources/remote_build_nodes.py b/src/server/v3/resources/remote_build_nodes.py index 6c122be..4acd4e0 100644 --- a/src/server/v3/resources/remote_build_nodes.py +++ b/src/server/v3/resources/remote_build_nodes.py @@ -32,6 +32,7 @@ from src.server.errors import problemify, generate_missing_input_response, generate_data_validation_failure, \ generate_resource_not_found_response from src.server.helper import get_log_id +from src.server.vault import test_private_key_file from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord, RemoteNodeStatus from src.server.v3.models import PATCH_OPERATION_UNDELETE @@ -48,12 +49,15 @@ def get(self, remote_build_node_xname): log_id = get_log_id() current_app.logger.info("%s ++ remote_build_status.v3.GET %s", log_id, remote_build_node_xname) + # verify that the remote build node ssh key is present + if not test_private_key_file(current_app): + current_app.logger.info("SSH key not present for remote build nodes") + if remote_build_node_xname not in current_app.data['remote_build_nodes']: current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname) return generate_resource_not_found_response() return_json = current_app.data['remote_build_nodes'][remote_build_node_xname].getStatus().toJson() - #return_json = current_app.data['remote_build_nodes'][remote_build_node_xname].getStatus() current_app.logger.info("%s Returning json response: %s", log_id, return_json) return jsonify(return_json) @@ -67,6 +71,10 @@ def get(self): log_id = get_log_id() current_app.logger.info("%s ++ remote_build_status_collection.v3.GET", log_id) + # verify that the remote build node ssh key is present + if not test_private_key_file(current_app): + current_app.logger.info("SSH key not present for remote build nodes") + return_json = [] for remote_node in current_app.data['remote_build_nodes'].values(): return_json.append(remote_node.getStatus().toJson()) diff --git a/src/server/vault.py b/src/server/vault.py index a0ac31e..9ec08b8 100644 --- a/src/server/vault.py +++ b/src/server/vault.py @@ -109,6 +109,19 @@ def get_exportable_key(app): app.logger.error("Failed to get exportable key from vault: %s", err) return None +def test_private_key_file(app) -> bool: + # If the private key is present, just return + if os.path.isfile('id_ecdsa'): + app.logger.info("Private ssh key file present") + return True + + # Private key is not present, try to fetch or regenerate it + app.logger.info("Private ssh key file not present - attempting to refetch...") + remote_node_key_setup(app) + + # return if the key file is present + return os.path.isfile('id_ecdsa') + def export_private_key(app, private_key): # This will throw an error on attempting to write a null if private_key == None: From 37b0624710d0a33fdb2dfee567380d7b12e672e3 Mon Sep 17 00:00:00 2001 From: David Laine Date: Mon, 26 Aug 2024 11:27:25 -0500 Subject: [PATCH 3/8] CASMCMS-8979 - update status unknown jobs. --- CHANGELOG.md | 1 + src/server/models/jobs.py | 12 +++++++++--- src/server/models/remote_build_nodes.py | 6 +----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a47ea25..a9b0715 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - CASMCMS-8979 - add a status endpoint for the remote build nodes. +- CASMCMS-8979-v2 - clean up status object. - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. ## [3.16.2] - 2024-07-25 diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index 49d2360..c3f7bfe 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -261,7 +261,7 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str: """ app.logger.info(f"Checking for remote build node for job") best_node = "" - best_node_job_count = RemoteNodeStatus.UNKNOWN_NUM_JOBS - 1 + best_node_job_count = 10000 # seed with a really big number of jobs # make sure the ssh key was set up correctly if not test_private_key_file(app): @@ -273,8 +273,14 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str: nodeStatus = remote_node.getStatus() if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch: app.logger.info(f"Matching remote node: {xname}, current jobs on node: {nodeStatus.numCurrentJobs}") + + # -1 means no job information, make sure we don't prefer those nodes + numNodeJobs = nodeStatus.numCurrentJobs + if numNodeJobs == -1: + numNodeJobs = 10000 + # matching arch - can use the node, now pick the node with the least jobs running - if best_node == "" or nodeStatus.numCurrentJobs < best_node_job_count: + if best_node == "" or numNodeJobs < best_node_job_count: best_node = remote_node.xname - best_node_job_count = nodeStatus.numCurrentJobs + best_node_job_count = numNodeJobs return best_node diff --git a/src/server/models/remote_build_nodes.py b/src/server/models/remote_build_nodes.py index 9ee3998..b3b6907 100644 --- a/src/server/models/remote_build_nodes.py +++ b/src/server/models/remote_build_nodes.py @@ -42,20 +42,16 @@ class RemoteNodeStatus: """ Object to hold the current status of a remote build node """ - # status variable to represent and unknown number of jobs on a node - UNKNOWN_NUM_JOBS = 10000 - def __init__(self, xname: str) -> None: self.xname = xname self.sshStatus = "Unknown" self.podmanStatus = "Unknown" self.nodeArch = "Unknown" - self.numCurrentJobs = self.UNKNOWN_NUM_JOBS + self.numCurrentJobs = -1 self.ableToRunJobs = False def toJson(self): return self.__dict__ - #return json.dumps(self, default=lambda o: o.__dict__) class V3RemoteBuildNodeRecord: """ The RemoteBuildNodeRecord object """ From e0800e07ad349b1477728807365b6b8e6495785a Mon Sep 17 00:00:00 2001 From: "Mitch Harding (the weird one)" Date: Tue, 27 Aug 2024 13:57:44 -0400 Subject: [PATCH 4/8] Update Kubernetes Python client version to match CSM 1.6 Kubernetes version --- CHANGELOG.md | 3 +++ constraints.txt | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b0715..700dcdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - CASMCMS-8979-v2 - clean up status object. - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. +### Dependencies +- CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatability + ## [3.16.2] - 2024-07-25 ### Dependencies - Resolved CVE: Require `setuptools` >= 70.0 diff --git a/constraints.txt b/constraints.txt index 3be531c..270b8fb 100644 --- a/constraints.txt +++ b/constraints.txt @@ -1,4 +1,4 @@ -# Constraints updated 5/29/2024 +# Constraints updated 8/27/2024 aniso8601==9.0.1 boto3==1.34.114 botocore==1.34.114 @@ -18,8 +18,8 @@ idna==3.7 itsdangerous==2.2.0 Jinja2==3.1.4 jmespath==1.0.1 -# CSM 1.6 uses Kubernetes 1.22, so use client v22.x to ensure compatability -kubernetes==22.6.0 +# CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatability +kubernetes>=24.2,<24.3 MarkupSafe==2.1.5 marshmallow==3.21.2 oauthlib==3.2.2 From 1f25c8fa94d7ee56bac64410299ba2f5c0586f4f Mon Sep 17 00:00:00 2001 From: David Laine Date: Wed, 28 Aug 2024 11:53:16 -0500 Subject: [PATCH 5/8] CASMINST-6602 - enable dkms by default. --- CHANGELOG.md | 1 + kubernetes/cray-ims/values.yaml | 4 ++-- src/server/models/jobs.py | 4 ++-- src/server/models/recipes.py | 8 ++++---- src/server/v2/resources/jobs.py | 17 ++++++++++------- src/server/v3/models/recipes.py | 4 ++-- src/server/v3/resources/jobs.py | 19 +++++++++++-------- 7 files changed, 32 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 700dcdf..713772f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - CASMCMS-8979 - add a status endpoint for the remote build nodes. - CASMCMS-8979-v2 - clean up status object. - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. +- CASMINST-6602 - enable dkms by default. ### Dependencies - CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatability diff --git a/kubernetes/cray-ims/values.yaml b/kubernetes/cray-ims/values.yaml index d0b8c3a..a86e7e3 100644 --- a/kubernetes/cray-ims/values.yaml +++ b/kubernetes/cray-ims/values.yaml @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2021-2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2021-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -88,7 +88,7 @@ customer_access: subnet_name: "cmn" jobs: - enable_dkms: false + enable_dkms: true kata_runtime: "kata-qemu" aarch64_runtime: "kata-qemu" diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index c3f7bfe..218189c 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -85,7 +85,7 @@ def __init__(self, job_type, artifact_id, id=None, created=None, status=None, kubernetes_configmap=None, enable_debug=False, build_env_size=None, image_root_archive_name=None, kernel_file_name=None, initrd_file_name=None, resultant_image_id=None, ssh_containers=None, - kubernetes_namespace=None, kernel_parameters_file_name=None, require_dkms=False, + kubernetes_namespace=None, kernel_parameters_file_name=None, require_dkms=True, arch=None, job_mem_size=None, kubernetes_pvc=None, remote_build_node=""): # Supplied # v2.0 @@ -163,7 +163,7 @@ class V2JobRecordInputSchema(Schema): ssh_containers = fields.List(fields.Nested(SshContainerInputSchema()), allow_none=True) # v2.1 - require_dkms = fields.Boolean(required=False, load_default=False, dump_default=False, + require_dkms = fields.Boolean(required=False, load_default=True, dump_default=True, metadata={"metadata": {"description": "Job requires the use of dkms"}}) # v2.2 diff --git a/src/server/models/recipes.py b/src/server/models/recipes.py index 1d49c96..498f0ef 100644 --- a/src/server/models/recipes.py +++ b/src/server/models/recipes.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2018-2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2018-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -56,7 +56,7 @@ class V2RecipeRecord: # pylint: disable=W0622 def __init__(self, name, recipe_type, linux_distribution, link=None, id=None, created=None, - template_dictionary=None, require_dkms=False, arch=ARCH_X86_64): + template_dictionary=None, require_dkms=True, arch=ARCH_X86_64): # Supplied self.name = name self.link = link @@ -91,7 +91,7 @@ class V2RecipeRecordInputSchema(Schema): template_dictionary = fields.List(fields.Nested(RecipeKeyValuePair()), required=False, allow_none=True) # v2.2 - require_dkms = fields.Boolean(load_default=False, dump_default=False, + require_dkms = fields.Boolean(load_default=True, dump_default=True, metadata={"metadata": {"description": "Recipe requires the use of dkms"}}) arch = fields.Str(required=False, metadata={"metadata": {"description": "Architecture of the recipe"}}, validate=OneOf([ARCH_ARM64,ARCH_X86_64]), load_default=ARCH_X86_64, dump_default=ARCH_X86_64) @@ -126,6 +126,6 @@ class V2RecipeRecordPatchSchema(Schema): arch = fields.Str(required=False, validate=OneOf([ARCH_ARM64,ARCH_X86_64]), load_default=ARCH_X86_64, dump_default=ARCH_X86_64, metadata={"metadata": {"description": "Architecture of the recipe"}}) - require_dkms = fields.Boolean(required=False, load_default=False, dump_default=False, + require_dkms = fields.Boolean(required=False, load_default=True, dump_default=True, metadata={"metadata": {"description": "Recipe requires the use of dkms"}}) template_dictionary = fields.List(fields.Nested(RecipeKeyValuePair()), required=False, allow_none=True) diff --git a/src/server/v2/resources/jobs.py b/src/server/v2/resources/jobs.py index 79a345c..bb49f59 100644 --- a/src/server/v2/resources/jobs.py +++ b/src/server/v2/resources/jobs.py @@ -97,7 +97,7 @@ def __init__(self): # {job.id}.ims.{job_customer_access_subnet_name}.{job_customer_access_network_domain}" self.job_customer_access_subnet_name = os.environ.get("JOB_CUSTOMER_ACCESS_SUBNET_NAME", "cmn") self.job_customer_access_network_domain = os.environ.get("JOB_CUSTOMER_ACCESS_NETWORK_DOMAIN", "shasta.local") - self.job_enable_dkms = os.getenv("JOB_ENABLE_DKMS", 'False').lower() in ('true', '1', 't') + self.job_enable_dkms = os.getenv("JOB_ENABLE_DKMS", 'True').lower() in ('true', '1', 't') # NOTE: make sure this isn't a non-zero length string of spaces self.job_kata_runtime = os.getenv("JOB_KATA_RUNTIME", "kata-qemu").strip() @@ -659,14 +659,17 @@ def post(self): current_app.logger.info(f" NOTE: aarch64 architecture requires dkms") new_job.require_dkms = True elif userSpecifiedDKMS==None: - if self.job_enable_dkms: + # if the user didn't specify for the job, look for defaults + if new_job.job_type == JOB_TYPE_CREATE: + # Let the setting from the recipe flow through if the user has not specified otherwise + if artifact_record.require_dkms != self.job_enable_dkms: + current_app.logger.info(f"Overriding require_dkms based on recipe setting") + current_app.logger.info(f"Setting require_dkms based on recipe setting: {artifact_record.require_dkms}") + new_job.require_dkms = artifact_record.require_dkms + elif not self.job_enable_dkms: # use the default from the ims-config config map current_app.logger.info(f"Setting require_dkms based on ims-config setting") - new_job.require_dkms = True - elif new_job.job_type == JOB_TYPE_CREATE and artifact_record.require_dkms: - # Let the setting from the recipe flow through if the user has not specified otherwise - current_app.logger.info(f"Overriding require_dkms based on recipe setting") - new_job.require_dkms = True + new_job.require_dkms = False # get the public key information public_key_data, problem = V2JobCollection.get_public_key_data(log_id, new_job.public_key_id) diff --git a/src/server/v3/models/recipes.py b/src/server/v3/models/recipes.py index 87e481f..3cb1565 100644 --- a/src/server/v3/models/recipes.py +++ b/src/server/v3/models/recipes.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -38,7 +38,7 @@ class V3DeletedRecipeRecord(V2RecipeRecord): # pylint: disable=W0622 def __init__(self, name, recipe_type, linux_distribution, link=None, id=None, created=None, deleted=None, - template_dictionary=None, require_dkms=False, arch=ARCH_X86_64): + template_dictionary=None, require_dkms=True, arch=ARCH_X86_64): # Supplied self.deleted = deleted or datetime.datetime.now() super().__init__(name, recipe_type=recipe_type, linux_distribution=linux_distribution, diff --git a/src/server/v3/resources/jobs.py b/src/server/v3/resources/jobs.py index a416b51..cb89fa0 100644 --- a/src/server/v3/resources/jobs.py +++ b/src/server/v3/resources/jobs.py @@ -97,7 +97,7 @@ def __init__(self): # {job.id}.ims.{job_customer_access_subnet_name}.{self.job_customer_access_network_domain}" self.job_customer_access_subnet_name = os.environ.get("JOB_CUSTOMER_ACCESS_SUBNET_NAME", "cmn") self.job_customer_access_network_domain = os.environ.get("JOB_CUSTOMER_ACCESS_NETWORK_DOMAIN", "shasta.local") - self.job_enable_dkms = os.getenv("JOB_ENABLE_DKMS", 'False').lower() in ('true', '1', 't') + self.job_enable_dkms = os.getenv("JOB_ENABLE_DKMS", 'True').lower() in ('true', '1', 't') # NOTE: make sure this isn't a non-zero length string of spaces self.job_kata_runtime = os.getenv("JOB_KATA_RUNTIME", "kata-qemu").strip() @@ -658,18 +658,21 @@ def post(self): # Determine cases where the dkms security settings are required without user specifying if new_job.arch == ARCH_ARM64: - # If the architecture is aarch64, then the dkms settings are required + # If the architecture is aarch64, then the dkms settings are required current_app.logger.info(f" NOTE: aarch64 architecture requires dkms") new_job.require_dkms = True elif userSpecifiedDKMS==None: - if self.job_enable_dkms: + # if the user didn't specify for the job, look for defaults + if new_job.job_type == JOB_TYPE_CREATE: + # Let the setting from the recipe flow through if the user has not specified otherwise + if artifact_record.require_dkms != self.job_enable_dkms: + current_app.logger.info(f"Overriding require_dkms based on recipe setting") + current_app.logger.info(f"Setting require_dkms based on recipe setting: {artifact_record.require_dkms}") + new_job.require_dkms = artifact_record.require_dkms + elif not self.job_enable_dkms: # use the default from the ims-config config map current_app.logger.info(f"Setting require_dkms based on ims-config setting") - new_job.require_dkms = True - elif new_job.job_type == JOB_TYPE_CREATE and artifact_record.require_dkms: - # Let the setting from the recipe flow through if the user has not specified otherwise - current_app.logger.info(f"Overriding require_dkms based on recipe setting") - new_job.require_dkms = True + new_job.require_dkms = False # get the public key information public_key_data, problem = V3JobCollection.get_public_key_data(log_id, new_job.public_key_id) From b07ca238ca1749d3a382ba3f9c32267e0f55ab1e Mon Sep 17 00:00:00 2001 From: David Laine Date: Wed, 28 Aug 2024 20:10:18 -0500 Subject: [PATCH 6/8] CASMTRIAGE-7169 - job memory size was not getting picked up correctly from ims configuration. --- CHANGELOG.md | 1 + src/server/models/jobs.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 700dcdf..9b940b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - CASMCMS-8979 - add a status endpoint for the remote build nodes. - CASMCMS-8979-v2 - clean up status object. - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. +- CASMTRIAGE-7169 - job memory size was not getting picked up correctly from the ims configuration settings. ### Dependencies - CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatability diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index c3f7bfe..75d0209 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -167,7 +167,7 @@ class V2JobRecordInputSchema(Schema): metadata={"metadata": {"description": "Job requires the use of dkms"}}) # v2.2 - job_mem_size = fields.Integer(load_default=1, dump_default=1, + job_mem_size = fields.Integer(dump_default=1, required=False, validate=Range(min=1, error="build_env_size must be greater than or equal to 1"), metadata={"metadata": {"description": "Approximate working memory in GiB to reserve for the build job " "environment (loosely proportional to the final image size)"}}) From cea244b71ffd933f71d60601fe012583dfb75675 Mon Sep 17 00:00:00 2001 From: David Laine Date: Thu, 29 Aug 2024 10:24:20 -0500 Subject: [PATCH 7/8] CASMCMS-9040 - pick up new ims-utils to resolve file permissions. --- CHANGELOG.md | 1 + update_external_versions.conf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbea7c1..47a0de5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job. - CASMINST-6602 - enable dkms by default. - CASMTRIAGE-7169 - job memory size was not getting picked up correctly from the ims configuration settings. +- CASMCMS-9040 - change read/write permissions of recipe config files output in image. ### Dependencies - CSM 1.6 moved to Kubernetes 1.24, so use client v24.x to ensure compatibility diff --git a/update_external_versions.conf b/update_external_versions.conf index 23c5845..20d83dd 100644 --- a/update_external_versions.conf +++ b/update_external_versions.conf @@ -1,6 +1,6 @@ image: cray-ims-utils major: 2 - minor: 13 + minor: 14 image: cray-ims-kiwi-ng-opensuse-x86_64-builder major: 1 From 893091442a240f2c9518bf593a800e542908e68f Mon Sep 17 00:00:00 2001 From: David Laine Date: Thu, 29 Aug 2024 11:26:19 -0500 Subject: [PATCH 8/8] Release v3.17.0 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47a0de5..ea71a74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +## [3.17.0] - 2024-08-29 ### Added - CASMCMS-8979 - add a status endpoint for the remote build nodes. - CASMCMS-8977 - check that the ssh key is present each time spawning a remote job.