From faaa1cb7d9d3c24ea649bcc8e44d3be89ab540b3 Mon Sep 17 00:00:00 2001 From: Ho Kim Date: Wed, 21 Aug 2024 07:59:43 +0000 Subject: [PATCH] feat: fine-tune for ubuntu 24.04 on hpc --- templates/contrib/perf-test/Justfile | 28 +++++---- .../contrib/perf-test/deployment-toolkit.yaml | 8 +++ templates/contrib/perf-test/job-disk-io.yaml | 61 +++++++++++++------ templates/contrib/perf-test/pvc.yaml | 3 +- .../contrib/speed-test/iperf3/daemonset.yaml | 7 ++- templates/csi/directpv/install.sh | 33 ++++++++++ templates/csi/minio/install.sh | 44 +++++++++++++ templates/csi/minio/values-operator.yaml | 26 ++++++++ .../csi/rook-ceph/ceph-block-noreplicas.yaml | 37 +++++++++++ .../rook-ceph/ceph-filesystem-noreplicas.yaml | 22 +++++++ templates/csi/rook-ceph/values-cluster.yaml | 4 +- .../fabric/mellanox/values-operator.yaml | 10 ++- templates/kiss/daemonset-optimizer.yaml | 6 +- .../kiss/tasks/commission/network-tuned.yaml | 18 ++++++ templates/kiss/tasks/commission/network.yaml | 3 + .../tasks/commission/system-kernel-tune.yaml | 8 ++- .../template_system_kernel_sysctl.conf | 33 ++++++++++ utils/patch-template-cleanup-devices.sh | 22 ++++++- 18 files changed, 328 insertions(+), 45 deletions(-) create mode 100755 templates/csi/directpv/install.sh create mode 100755 templates/csi/minio/install.sh create mode 100644 templates/csi/minio/values-operator.yaml create mode 100644 templates/csi/rook-ceph/ceph-block-noreplicas.yaml create mode 100644 templates/kiss/tasks/commission/network-tuned.yaml create mode 100644 templates/kiss/tasks/commission/template_system_kernel_sysctl.conf diff --git a/templates/contrib/perf-test/Justfile b/templates/contrib/perf-test/Justfile index 081c0b09..39394f67 100644 --- a/templates/contrib/perf-test/Justfile +++ b/templates/contrib/perf-test/Justfile @@ -5,32 +5,38 @@ # Load environment variables set dotenv-load +clean: + kubectl delete -f job-disk-io.yaml || true + kubectl delete -f deployment-toolkit.yaml -f pvc.yaml || true + deploy: kubectl apply -f deployment-toolkit.yaml -f pvc.yaml + @sleep 1 kubectl rollout status deployment perf-test-toolkit -start *ARGS: +reset: + @just clean + @just deploy + +start *ARGS: deploy kubectl delete -f job-disk-io.yaml || true kubectl apply -f job-disk-io.yaml + @sleep 1 kubectl wait --for=condition=ready pods -l 'app.kubernetes.io/component=perf-test-disk-io' exec *ARGS: @kubectl exec -it -c shell 'deployment/perf-test-toolkit' -- {{ ARGS }} _stat_avg rw: - @echo "1024 * $( \ - just exec cat '/data/perf-test-disk-io_{{ rw }}.log' \ - | grep -P '^ +bw' \ - | grep -Po 'avg=\K[0-9\.]+' \ - | bc \ - )" | bc + @just exec cat '/data/perf-test-disk-io_{{ rw }}.log' \ + | grep -P '^ +bw' stat: @echo -n 'read (bps) ' @just _stat_avg 'read' @echo -n 'write (bps) ' @just _stat_avg 'write' - @echo -n 'randread (bps) ' - @just _stat_avg 'randread' - @echo -n 'randwrite (bps) ' - @just _stat_avg 'randwrite' + @# echo -n 'randread (bps) ' + @# just _stat_avg 'randread' + @# echo -n 'randwrite (bps) ' + @# just _stat_avg 'randwrite' diff --git a/templates/contrib/perf-test/deployment-toolkit.yaml b/templates/contrib/perf-test/deployment-toolkit.yaml index ae985242..46a2f950 100644 --- a/templates/contrib/perf-test/deployment-toolkit.yaml +++ b/templates/contrib/perf-test/deployment-toolkit.yaml @@ -27,6 +27,13 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/kiss + operator: In + values: + - Compute + - weight: 2 preference: matchExpressions: - key: node-role.kubernetes.io/kiss @@ -40,6 +47,7 @@ spec: operator: In values: - Compute + - ControlPlane - Gateway containers: - name: shell diff --git a/templates/contrib/perf-test/job-disk-io.yaml b/templates/contrib/perf-test/job-disk-io.yaml index 380af267..575e765a 100644 --- a/templates/contrib/perf-test/job-disk-io.yaml +++ b/templates/contrib/perf-test/job-disk-io.yaml @@ -22,6 +22,13 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/kiss + operator: In + values: + - Compute + - weight: 2 preference: matchExpressions: - key: node-role.kubernetes.io/kiss @@ -35,6 +42,7 @@ spec: operator: In values: - Compute + - ControlPlane - Gateway containers: - name: shell @@ -78,15 +86,27 @@ spec: FILE_OUTPUT_PREFIX="/out/${NAME}" + ARGS="" + ARGS+="--direct ${DIRECT} " + ARGS+="--fsync ${FSYNC} " + ARGS+="--group_reporting " + ARGS+="--iodepth ${IO_DEPTH} " + ARGS+="--numjobs ${NUM_JOBS} " + ARGS+="--runtime ${DURATION} " + ARGS+="--size ${FILE_SIZE} " + ARGS+="--time_based " + + ARGS+="--directory /data " + # ARGS+="--ioengine rbd " + # ARGS+="--rbdname rbd0 " + ########################################################### # Serial Read # ########################################################### echo '[serial-read]' fio --name "${NAME}_read" --rw 'read' --output "${FILE_OUTPUT_PREFIX}_read.log" \ - --direct "${BUFFERED}" --directory /data --group_reporting --time_based \ - --numjobs "${NUM_JOBS}" --runtime "${DURATION}" \ - --bs "${FILE_BLOCK_UNIT_SIZE}" --size "${FILE_SIZE}" + --bs "${FILE_BLOCK_UNIT_SIZE}" ${ARGS} ########################################################### # Serial Write # @@ -94,46 +114,47 @@ spec: echo '[serial-write]' fio --name "${NAME}_write" --rw 'write' --output "${FILE_OUTPUT_PREFIX}_write.log" \ - --direct "${BUFFERED}" --directory /data --group_reporting --time_based \ - --numjobs "${NUM_JOBS}" --runtime "${DURATION}" \ - --bs "${FILE_BLOCK_UNIT_SIZE}" --size "${FILE_SIZE}" + --bs "${FILE_BLOCK_UNIT_SIZE}" ${ARGS} ########################################################### # Random Read # ########################################################### - echo '[random-read]' - fio --name "${NAME}_randread" --rw 'randread' --output "${FILE_OUTPUT_PREFIX}_randread.log" \ - --direct "${BUFFERED}" --directory /data --group_reporting --time_based \ - --numjobs "${NUM_JOBS}" --runtime "${DURATION}" \ - --bs "${FILE_BLOCK_UNIT_SIZE}" --size "${FILE_SIZE}" + # echo '[random-read]' + # fio --name "${NAME}_randread" --rw 'randread' --output "${FILE_OUTPUT_PREFIX}_randread.log" \ + # --bs '4K' ${ARGS} ########################################################### # Random Write # ########################################################### - echo '[random-write]' - fio --name "${NAME}_randwrite" --rw 'randwrite' --output "${FILE_OUTPUT_PREFIX}_randwrite.log" \ - --direct "${BUFFERED}" --directory /data --group_reporting --time_based \ - --numjobs "${NUM_JOBS}" --runtime "${DURATION}" \ - --bs "${FILE_BLOCK_UNIT_SIZE}" --size "${FILE_SIZE}" + # echo '[random-write]' + # fio --name "${NAME}_randwrite" --rw 'randwrite' --output "${FILE_OUTPUT_PREFIX}_randwrite.log" \ + # --bs '4K' ${ARGS} env: - - name: BUFFERED + - name: DIRECT value: "1" - name: DURATION - value: "30" + value: "60" - name: FILE_BLOCK_UNIT_SIZE - value: 1M + value: 4M - name: FILE_PATH value: /data/testfile - name: FILE_SIZE value: 1G + - name: FSYNC + value: "1" + - name: IO_DEPTH + value: "64" - name: NAME valueFrom: fieldRef: fieldPath: metadata.labels['app.kubernetes.io/component'] - name: NUM_JOBS - value: "8" + value: "12" + # volumeDevices: + # - name: data + # devicePath: /dev/rbd0 volumeMounts: - name: data mountPath: /data diff --git a/templates/contrib/perf-test/pvc.yaml b/templates/contrib/perf-test/pvc.yaml index 118cb024..86be6e13 100644 --- a/templates/contrib/perf-test/pvc.yaml +++ b/templates/contrib/perf-test/pvc.yaml @@ -14,7 +14,8 @@ spec: resources: requests: storage: 1Ti - storageClassName: ceph-block + storageClassName: ceph-filesystem-noreplicas + # volumeMode: Block volumeMode: Filesystem --- apiVersion: v1 diff --git a/templates/contrib/speed-test/iperf3/daemonset.yaml b/templates/contrib/speed-test/iperf3/daemonset.yaml index 1e756a40..320f0fab 100644 --- a/templates/contrib/speed-test/iperf3/daemonset.yaml +++ b/templates/contrib/speed-test/iperf3/daemonset.yaml @@ -13,9 +13,14 @@ spec: labels: app: iperf3 spec: + hostIPC: true containers: - name: iperf3 - image: docker.io/leodotcloud/swiss-army-knife:latest + image: docker.io/library/ubuntu:24.04 imagePullPolicy: Always + command: + - /usr/bin/env + - sleep + - infinity ports: - containerPort: 5201 diff --git a/templates/csi/directpv/install.sh b/templates/csi/directpv/install.sh new file mode 100755 index 00000000..d6465bca --- /dev/null +++ b/templates/csi/directpv/install.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) 2022 Ho Kim (ho.kim@ulagbulag.io). All rights reserved. +# Use of this source code is governed by a GPL-3-style license that can be +# found in the LICENSE file. + +# Prehibit errors +set -e -o pipefail +# Verbose +set -x + +########################################################### +# Install DirectPV # +########################################################### + +echo "- Installing DirectPV ... " + +kubectl krew install directpv + +kubectl directpv install --node-selector node-role.kubernetes.io/kiss=Storage + +########################################################### +# Provision DirectPV Drives # +########################################################### + +echo "- Provisioning DirectPV Drives ... " + +DRIVES_FILE="/tmp/drives.yaml" +kubectl directpv discover --output-file "${DRIVES_FILE}" +kubectl directpv init "${DRIVES_FILE}" --dangerous +rm -f "${DRIVES_FILE}" + +# Finished! +echo "Installed!" diff --git a/templates/csi/minio/install.sh b/templates/csi/minio/install.sh new file mode 100755 index 00000000..ff84c361 --- /dev/null +++ b/templates/csi/minio/install.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright (c) 2022 Ho Kim (ho.kim@ulagbulag.io). All rights reserved. +# Use of this source code is governed by a GPL-3-style license that can be +# found in the LICENSE file. + +# Prehibit errors +set -e -o pipefail +# Verbose +set -x + +########################################################### +# Configuration # +########################################################### + +# Configure default environment variables +HELM_CHART_DEFAULT="https://operator.min.io" +NAMESPACE_DEFAULT="minio-operator" + +# Set environment variables +HELM_CHART="${HELM_CHART:-$HELM_CHART_DEFAULT}" +NAMESPACE="${NAMESPACE:-$NAMESPACE_DEFAULT}" + +########################################################### +# Configure Helm Channel # +########################################################### + +echo "- Configuring Helm channel ... " + +helm repo add "${NAMESPACE}" "${HELM_CHART}" + +########################################################### +# Install Operator # +########################################################### + +echo "- Installing Operator ... " + +helm upgrade --install "minio-operator" \ + "${NAMESPACE}/minio-operator" \ + --create-namespace \ + --namespace "${NAMESPACE}" \ + --values "./values-operator.yaml" + +# Finished! +echo "Installed!" diff --git a/templates/csi/minio/values-operator.yaml b/templates/csi/minio/values-operator.yaml new file mode 100644 index 00000000..59200ff7 --- /dev/null +++ b/templates/csi/minio/values-operator.yaml @@ -0,0 +1,26 @@ +--- +# Root key for Operator Helm Chart +operator: + ### + # + # The `affinity `__ or anti-affinity settings to apply to Operator pods. + # + # These settings determine the distribution of pods across worker nodes and can help prevent or allow colocating pods onto the same worker nodes. + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/kiss + operator: In + values: + - Compute + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/kiss + operator: In + values: + - Compute + - ControlPlane diff --git a/templates/csi/rook-ceph/ceph-block-noreplicas.yaml b/templates/csi/rook-ceph/ceph-block-noreplicas.yaml new file mode 100644 index 00000000..a5775e21 --- /dev/null +++ b/templates/csi/rook-ceph/ceph-block-noreplicas.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: ceph.rook.io/v1 +kind: CephBlockPool +metadata: + name: ceph-blockpool-noreplicas + namespace: csi-rook-ceph +spec: + erasureCoded: + codingChunks: 0 + dataChunks: 0 + failureDomain: host + replicated: + requireSafeReplicaSize: false + size: 1 +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ceph-block-noreplicas + annotations: + storageclass.kubernetes.io/is-default-class: "false" +allowVolumeExpansion: true +provisioner: csi-rook-ceph.rbd.csi.ceph.com +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + clusterID: csi-rook-ceph + csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/controller-expand-secret-namespace: csi-rook-ceph + csi.storage.k8s.io/fstype: ext4 + csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node + csi.storage.k8s.io/node-stage-secret-namespace: csi-rook-ceph + csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/provisioner-secret-namespace: csi-rook-ceph + imageFeatures: layering + imageFormat: "2" + pool: ceph-blockpool-noreplicas diff --git a/templates/csi/rook-ceph/ceph-filesystem-noreplicas.yaml b/templates/csi/rook-ceph/ceph-filesystem-noreplicas.yaml index b46ecabb..59e48f24 100644 --- a/templates/csi/rook-ceph/ceph-filesystem-noreplicas.yaml +++ b/templates/csi/rook-ceph/ceph-filesystem-noreplicas.yaml @@ -66,3 +66,25 @@ spec: memory: 4Gi statusCheck: mirror: {} +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ceph-filesystem-noreplicas + annotations: + storageclass.kubernetes.io/is-default-class: "false" +allowVolumeExpansion: true +provisioner: csi-rook-ceph.cephfs.csi.ceph.com +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + clusterID: csi-rook-ceph + csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner + csi.storage.k8s.io/controller-expand-secret-namespace: csi-rook-ceph + csi.storage.k8s.io/fstype: ext4 + csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node + csi.storage.k8s.io/node-stage-secret-namespace: csi-rook-ceph + csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner + csi.storage.k8s.io/provisioner-secret-namespace: csi-rook-ceph + fsName: ceph-filesystem-noreplicas + pool: ceph-filesystem-noreplicas-data0 diff --git a/templates/csi/rook-ceph/values-cluster.yaml b/templates/csi/rook-ceph/values-cluster.yaml index ee5e2d86..dd675428 100644 --- a/templates/csi/rook-ceph/values-cluster.yaml +++ b/templates/csi/rook-ceph/values-cluster.yaml @@ -148,8 +148,8 @@ cephClusterSpec: cpu: "2" memory: "4Gi" requests: - cpu: "1" - memory: "1Gi" + cpu: "0.5" + memory: "500Mi" # The option to automatically remove OSDs that are out and are safe to destroy. removeOSDsIfOutAndSafeToRemove: false diff --git a/templates/fabric/mellanox/values-operator.yaml b/templates/fabric/mellanox/values-operator.yaml index a1707ef8..391f40ad 100644 --- a/templates/fabric/mellanox/values-operator.yaml +++ b/templates/fabric/mellanox/values-operator.yaml @@ -5,7 +5,7 @@ nfd: sriovNetworkOperator: # deploy SR-IOV Network Operator - enabled: true + enabled: false # SR-IOV Network Operator chart related values sriov-network-operator: @@ -45,7 +45,7 @@ deployCR: true ofedDriver: # deploy Mellanox OFED driver container - deploy: false + deploy: true nvPeerDriver: # deploy NVIDIA Peer memory driver container @@ -91,3 +91,9 @@ nodeAffinity: - matchExpressions: - key: node-role.kubernetes.io/master operator: DoesNotExist + - matchExpressions: + - key: node-role.kubernetes.io/kiss + operator: NotIn + values: + - Dashboard + - Desktop diff --git a/templates/kiss/daemonset-optimizer.yaml b/templates/kiss/daemonset-optimizer.yaml index 8b9a8675..ae3d269f 100644 --- a/templates/kiss/daemonset-optimizer.yaml +++ b/templates/kiss/daemonset-optimizer.yaml @@ -22,11 +22,6 @@ spec: args: - > echo madvise >/host/sys/kernel/mm/transparent_hugepage/enabled - && echo 2099999999 > /host/proc/sys/fs/inotify/max_queued_events - && echo 2099999999 > /host/proc/sys/fs/inotify/max_user_instances - && echo 2099999999 > /host/proc/sys/fs/inotify/max_user_watches - && echo 528482304 > /host/proc/sys/net/core/bpf_jit_limit - && echo 0 > /host/proc/sys/vm/nr_hugepages && exec sleep infinity securityContext: capabilities: @@ -41,6 +36,7 @@ spec: mountPath: /host/sys hostIPC: true hostNetwork: true + terminationGracePeriodSeconds: 1 volumes: - name: host-proc hostPath: diff --git a/templates/kiss/tasks/commission/network-tuned.yaml b/templates/kiss/tasks/commission/network-tuned.yaml new file mode 100644 index 00000000..a990fd3c --- /dev/null +++ b/templates/kiss/tasks/commission/network-tuned.yaml @@ -0,0 +1,18 @@ +--- +- name: Install TuneD + package: + name: tuned + state: present + ignore_errors: true + +- name: Enable TuneD + systemd: + name: tuned.service + state: started + enabled: true + daemon_reload: true + +- name: Change TuneD Profile + vars: + kiss_network_profile: throughput-performance + command: tuned-adm profile {{ kiss_network_profile }} diff --git a/templates/kiss/tasks/commission/network.yaml b/templates/kiss/tasks/commission/network.yaml index 301adb9e..2dc6eafa 100644 --- a/templates/kiss/tasks/commission/network.yaml +++ b/templates/kiss/tasks/commission/network.yaml @@ -4,3 +4,6 @@ - name: Provision Ethernet Interfaces include_tasks: network-ethernet.yaml + +- name: Provision Network Tuning + include_tasks: network-tuned.yaml diff --git a/templates/kiss/tasks/commission/system-kernel-tune.yaml b/templates/kiss/tasks/commission/system-kernel-tune.yaml index fe925944..92adfe1d 100644 --- a/templates/kiss/tasks/commission/system-kernel-tune.yaml +++ b/templates/kiss/tasks/commission/system-kernel-tune.yaml @@ -1,2 +1,8 @@ --- -[] +- name: Provision Kernel | Update sysctl.conf + template: + src: ./template_system_kernel_sysctl.conf + dest: /etc/sysctl.d/90-openark.conf + +- name: Provision Kernel | Apply sysctl.conf + command: sysctl --system diff --git a/templates/kiss/tasks/commission/template_system_kernel_sysctl.conf b/templates/kiss/tasks/commission/template_system_kernel_sysctl.conf new file mode 100644 index 00000000..6e8b8df0 --- /dev/null +++ b/templates/kiss/tasks/commission/template_system_kernel_sysctl.conf @@ -0,0 +1,33 @@ +# +# /etc/sysctl.conf - Configuration file for setting system variables +# See /etc/sysctl.d/ for additional system variables. +# See sysctl.conf (5) for information. +# + +# Configure aio +fs.aio-max-nr = 262144 + +# Adjust notify +fs.inotify.max_queued_events = 2099999999 +fs.inotify.max_user_instances = 2099999999 +fs.inotify.max_user_watches = 2099999999 + +# Configure BPF +net.core.bpf_jit_limit = 528482304 + +# Activate zero-copy +net.core.optmem_max = 1048576 + +# Adjust buffer size +net.core.rmem_max = 2147483647 +net.core.wmem_max = 2147483647 + +# Activate jumbo frames +net.ipv4.tcp_mtu_probing = 1 + +# Adjust TCP buffer size +net.ipv4.tcp_rmem = 4096 131072 1073741824 +net.ipv4.tcp_wmem = 4096 16384 1073741824 + +# Adjust pages +vm.nr_hugepages = 0 diff --git a/utils/patch-template-cleanup-devices.sh b/utils/patch-template-cleanup-devices.sh index c55229de..ee2e65a0 100644 --- a/utils/patch-template-cleanup-devices.sh +++ b/utils/patch-template-cleanup-devices.sh @@ -4,7 +4,7 @@ # found in the LICENSE file. # Cleanup all unused disks. -# It is compatiable with Ceph OSD. +# It is compatiable with Ceph OSD and DirectPV. # Prehibit errors set -e -o pipefail @@ -20,6 +20,11 @@ for disk in $( sort | uniq ); do + # Unmount all directpv volumes + if findmnt -S "${disk}" | grep -Pq '^/var/lib/directpv'; then + umount "${disk}" + fi + # Skip if mounted partiton if findmnt -S "${disk}" >/dev/null 2>/dev/null; then echo "Skipping mounted partition: ${disk}" @@ -29,8 +34,9 @@ for disk in $( # Skip if mounted disk if [ "$( lsblk --noheadings "${disk}" 2>/dev/null | + grep -P 'part +/.*$' | wc -l - )" != "1" ]; then + )" != "0" ]; then echo "Skipping mounted disk: ${disk}" continue fi @@ -44,6 +50,12 @@ for disk in $( continue fi + # Skip if logical disk + if echo "${disk}" | grep -Pq '^/dev/dm-'; then + echo "Skipping logical disk: ${disk}" + continue + fi + # Wipe all data echo "Wiping all: ${disk}" @@ -62,3 +74,9 @@ for disk in $( ## Inform the OS of partition table changes partprobe "${disk}" && sync done + +# Cleanup Rook Ceph +dmsetup remove_all +rm -rf /var/lib/rook +rm -rf /var/lib/kubelet/plugins/csi-rook-ceph.* +rm -rf /var/lib/kubelet/plugins_registry/csi-rook-ceph.*