From eb19b38e88a574d6974d9a26c63a31e839d0a240 Mon Sep 17 00:00:00 2001 From: David Laine Date: Fri, 27 Sep 2024 11:17:59 -0500 Subject: [PATCH 1/2] CASMTRIAGE-7327 - fix reading default values from ims-config. --- CHANGELOG.md | 5 ++++ kubernetes/cray-ims/Chart.yaml | 2 +- ...-v2-image-create-kiwi-ng-job-template.yaml | 30 +++++++++++++++---- ...y-ims-v2-image-customize-job-template.yaml | 23 ++++++++++++-- kubernetes/cray-ims/values.yaml | 4 +-- src/server/models/jobs.py | 4 +-- 6 files changed, 54 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e6de25..7e243a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- CASMTRIAGE-7327 - fix loading default values from ims-config. +- CASMTRIAGE-7274 - fix cpu limits to not overdrive kata vm, add job pod anti-affinity. +- CASMCMS-9147 - stop using alpine:latest image. + ## [3.18.0] - 2024-09-24 ### Changed diff --git a/kubernetes/cray-ims/Chart.yaml b/kubernetes/cray-ims/Chart.yaml index d43741a..508391f 100644 --- a/kubernetes/cray-ims/Chart.yaml +++ b/kubernetes/cray-ims/Chart.yaml @@ -57,5 +57,5 @@ annotations: - name: cray-ims-sshd image: artifactory.algol60.net/csm-docker/stable/cray-ims-sshd:0.0.0-imssshd - name: alpine - image: alpine:latest + image: artifactory.algol60.net/csm-docker/stable/docker.io/library/alpine:3 artifacthub.io/license: MIT diff --git a/kubernetes/cray-ims/templates/cray-ims-v2-image-create-kiwi-ng-job-template.yaml b/kubernetes/cray-ims/templates/cray-ims-v2-image-create-kiwi-ng-job-template.yaml index 7467e35..8a94cc0 100644 --- a/kubernetes/cray-ims/templates/cray-ims-v2-image-create-kiwi-ng-job-template.yaml +++ b/kubernetes/cray-ims/templates/cray-ims-v2-image-create-kiwi-ng-job-template.yaml @@ -20,7 +20,13 @@ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +NOTE: Kata hypervisor setup adds ALL container cpu limits together for + the hardware description. This changes the nproc return of available + cpus in the container, possibly overloading the VM causing it to + crash. Be careful adjusting any cpu limits for the containers. */}} + apiVersion: v1 data: image_configmap_create.yaml.template: | @@ -75,6 +81,18 @@ data: namespace: $namespace spec: backoffLimit: 0 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - topologyKey: kubernetes.io/hostname + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - cray-ims + namespaces: + - $namespace template: metadata: labels: @@ -121,7 +139,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "2" # NOTE: see comment at top of the file # Step 2: Wait for Repos - image: {{ .Values.cray_ims_utils.image.repository }}:{{ .Values.cray_ims_utils.image.tag }} imagePullPolicy: {{ .Values.cray_ims_utils.image.imagePullPolicy }} @@ -151,7 +169,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "1" # NOTE: see comment at top of the file # Step 3: Build a RPM containing the Cray Root CA certificate - image: {{ .Values.cray_ims_utils.image.repository }}:{{ .Values.cray_ims_utils.image.tag }} imagePullPolicy: {{ .Values.cray_ims_utils.image.imagePullPolicy }} @@ -186,7 +204,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "1" # NOTE: see comment at top of the file # Step 4: Build the image - image: {{ .Values.cray_ims_kiwi_ng_opensuse_x86_64_builder.image.repository }}:{{ .Values.cray_ims_kiwi_ng_opensuse_x86_64_builder.image.tag }} imagePullPolicy: {{ .Values.cray_ims_kiwi_ng_opensuse_x86_64_builder.image.imagePullPolicy }} @@ -197,7 +215,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "48" + cpu: "8" # NOTE: see comment 
at top of the file securityContext: privileged: true capabilities: @@ -255,7 +273,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "48" + cpu: "8" # NOTE: see comment at top of the file envFrom: - configMapRef: name: cray-ims-$id-configmap @@ -372,7 +390,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "2" # NOTE: see comment at top of the file volumes: - name: image-vol persistentVolumeClaim: diff --git a/kubernetes/cray-ims/templates/cray-ims-v2-image-customize-job-template.yaml b/kubernetes/cray-ims/templates/cray-ims-v2-image-customize-job-template.yaml index 5453740..8384606 100644 --- a/kubernetes/cray-ims/templates/cray-ims-v2-image-customize-job-template.yaml +++ b/kubernetes/cray-ims/templates/cray-ims-v2-image-customize-job-template.yaml @@ -20,6 +20,11 @@ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +NOTE: Kata hypervisor setup adds ALL container cpu limits together for + the hardware description. This changes the nproc return of available + cpus in the container, possibly overloading the VM causing it to + crash. Be careful adjusting any cpu limits for the containers. 
*/}} apiVersion: v1 data: @@ -73,6 +78,18 @@ data: namespace: $namespace spec: backoffLimit: 0 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - topologyKey: kubernetes.io/hostname + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - cray-ims + namespaces: + - $namespace template: metadata: annotations: @@ -130,7 +147,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "4" # NOTE: see comment at top of the file securityContext: privileged: true capabilities: @@ -147,7 +164,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "48" + cpu: "8" # NOTE: see comment at top of the file env: - name: API_GATEWAY_HOSTNAME value: {{ .Values.api_gw.api_gw_service_name }}.{{ .Values.api_gw.api_gw_service_namespace }}.svc.cluster.local @@ -261,7 +278,7 @@ data: cpu: "500m" limits: memory: "$job_mem_limit" - cpu: "8" + cpu: "8" # NOTE: see comment at top of the file volumeMounts: - name: image-vol mountPath: /mnt/image diff --git a/kubernetes/cray-ims/values.yaml b/kubernetes/cray-ims/values.yaml index a86e7e3..256053b 100644 --- a/kubernetes/cray-ims/values.yaml +++ b/kubernetes/cray-ims/values.yaml @@ -43,8 +43,8 @@ s3: alpine: image: - repository: alpine - tag: latest + repository: artifactory.algol60.net/csm-docker/stable/docker.io/library/alpine + tag: 3 pullPolicy: IfNotPresent ims_config: diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index d99e0c3..ed4151c 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -145,7 +145,7 @@ class V2JobRecordInputSchema(Schema): validate=Length(min=1, error="image_root_archive_name field must not be blank")) enable_debug = fields.Boolean(load_default=False,dump_default=False, metadata={"metadata": {"description": "Whether to enable debugging of the job"}}) - build_env_size = fields.Integer(load_default=60,dump_default=60, + build_env_size = fields.Integer(dump_default=DEFAULT_IMAGE_SIZE, 
metadata={"metadata": {"description": "Approximate disk size in GiB to reserve for the image build environment (usually 2x final image size)"}}, validate=Range(min=1, error="build_env_size must be greater than or equal to 1")) kernel_file_name = fields.Str(metadata={"metadata": {"description": "Name of the kernel file to extract and upload"}}) @@ -166,7 +166,7 @@ class V2JobRecordInputSchema(Schema): metadata={"metadata": {"description": "Job requires the use of dkms"}}) # v2.2 - job_mem_size = fields.Integer(dump_default=8, required=False, + job_mem_size = fields.Integer(dump_default=DEFAULT_JOB_MEM_SIZE, required=False, validate=Range(min=1, error="build_env_size must be greater than or equal to 1"), metadata={"metadata": {"description": "Approximate working memory in GiB to reserve for the build job " "environment (loosely proportional to the final image size)"}}) From d45ef99f117a2e32cf92ddf0bbd6172015c4815f Mon Sep 17 00:00:00 2001 From: David Laine Date: Thu, 3 Oct 2024 13:48:02 -0500 Subject: [PATCH 2/2] Release v3.19.0. --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e243a3..fe840e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.19.0] - 2024-10-03 + ### Changed - CASMTRIAGE-7327 - fix loading default values from ims-config. - CASMTRIAGE-7274 - fix cpu limits to not overdrive kata vm, add job pod anti-affinity.