From eeabdcb5b7179384187c1eeae189152795b57eaf Mon Sep 17 00:00:00 2001
From: Chin-Ya Huang
Date: Mon, 16 Sep 2024 08:06:41 +0800
Subject: [PATCH 1/5] test(robot): v2 volume should block trim when volume is degraded

longhorn/longhorn-8430

Signed-off-by: Chin-Ya Huang
---
 e2e/keywords/workload.resource         | 11 +++++++++++
 e2e/libs/keywords/workload_keywords.py |  4 ++++
 e2e/libs/volume/crd.py                 |  3 +++
 e2e/libs/volume/rest.py                | 17 +++++++++++++++++
 e2e/libs/volume/volume.py              |  3 +++
 e2e/tests/regression/test_v2.robot     | 22 ++++++++++++++++++++++
 6 files changed, 60 insertions(+)

diff --git a/e2e/keywords/workload.resource b/e2e/keywords/workload.resource
index 6b71fdbca5..e3dce4a006 100644
--- a/e2e/keywords/workload.resource
+++ b/e2e/keywords/workload.resource
@@ -189,3 +189,14 @@ Check ${workload_kind} ${workload_id} pod is ${expect_state} on another node
 Delete Longhorn ${workload_kind} ${workload_name} pod on node ${node_id}
     ${node_name} =    get_node_by_index    ${node_id}
     delete_workload_pod_on_node    ${workload_name}    ${node_name}    longhorn-system
+
+Trim ${workload_kind} ${workload_id} volume should ${condition}
+    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
+
+    IF    $condition == "fail"
+        trim_workload_volume_filesystem    ${workload_name}    is_expect_fail=True
+    ELSE IF    $condition == "pass"
+        trim_workload_volume_filesystem    ${workload_name}    is_expect_fail=False
+    ELSE
+        Fail    "Invalid condition value: ${condition}"
+    END
diff --git a/e2e/libs/keywords/workload_keywords.py b/e2e/libs/keywords/workload_keywords.py
index 6f9175bd7b..d27845d91a 100644
--- a/e2e/libs/keywords/workload_keywords.py
+++ b/e2e/libs/keywords/workload_keywords.py
@@ -192,3 +192,7 @@ def is_workloads_pods_has_annotations(self, workload_names, annotation_key, name
             if not is_workload_pods_has_annotations(workload_name, annotation_key, namespace=namespace, label_selector=label_selector):
                 return False
         return True
+
+    def trim_workload_volume_filesystem(self, workload_name, is_expect_fail=False):
+        volume_name = get_workload_volume_name(workload_name)
+        self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py
index eea996d79b..b8ff66f586 100644
--- a/e2e/libs/volume/crd.py
+++ b/e2e/libs/volume/crd.py
@@ -511,3 +511,6 @@ def validate_volume_setting(self, volume_name, setting_name, value):
         volume = self.get(volume_name)
         assert str(volume["spec"][setting_name]) == value, \
             f"Expected volume {volume_name} setting {setting_name} is {value}, but it's {str(volume['spec'][setting_name])}"
+
+    def trim_filesystem(self, volume_name, is_expect_fail=False):
+        return Rest(self).trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py
index 502d8f64e7..9cce306a46 100644
--- a/e2e/libs/volume/rest.py
+++ b/e2e/libs/volume/rest.py
@@ -370,3 +370,20 @@ def wait_for_replica_ready_to_rw(self, volume_name):
                 break
             time.sleep(self.retry_interval)
         assert ready, f"Failed to get volume {volume_name} replicas ready: {replicas}"
+
+    def trim_filesystem(self, volume_name, is_expect_fail=False):
+        is_unexpected_pass = False
+        try:
+            self.get(volume_name).trimFilesystem(name=volume_name)
+
+            if is_expect_fail:
+                is_unexpected_pass = True
+
+        except Exception as e:
+            if is_expect_fail:
+                logging(f"Failed to trim filesystem: {e}")
+            else:
+                raise e
+
+        if is_unexpected_pass:
+            raise Exception(f"Expected volume {volume_name} trim filesystem to fail")
diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py
index bbfb2832bf..a6f5da7a85 100644
--- a/e2e/libs/volume/volume.py
+++ b/e2e/libs/volume/volume.py
@@ -154,3 +154,6 @@ def wait_for_engine_image_upgrade_completed(self, volume_name, engine_image_name

     def validate_volume_setting(self, volume_name, setting_name, value):
         return self.volume.validate_volume_setting(volume_name, setting_name, value)
+
+    def trim_filesystem(self, volume_name, is_expect_fail=False):
+        return self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
diff --git a/e2e/tests/regression/test_v2.robot b/e2e/tests/regression/test_v2.robot
index 27c3831665..137d7eb7c3 100644
--- a/e2e/tests/regression/test_v2.robot
+++ b/e2e/tests/regression/test_v2.robot
@@ -11,6 +11,8 @@ Resource    ../keywords/workload.resource
 Resource    ../keywords/volume.resource
 Resource    ../keywords/setting.resource
 Resource    ../keywords/node.resource
+Resource    ../keywords/host.resource
+Resource    ../keywords/longhorn.resource

 Test Setup    Set test environment
 Test Teardown    Cleanup test resources
@@ -50,3 +52,23 @@ Degraded Volume Replica Rebuilding
         And Wait for deployment 0 pods stable
         Then Check deployment 0 data in file data.txt is intact
     END
+
+V2 Volume Should Block Trim When Volume Is Degraded
+    Given Set setting auto-salvage to true
+    And Create storageclass longhorn-test with    dataEngine=v2
+    And Create persistentvolumeclaim 0    using RWO volume with longhorn-test storageclass
+    And Create deployment 0 with persistentvolumeclaim 0
+
+    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+        And Keep writing data to pod of deployment 0
+
+        When Restart cluster
+        And Wait for longhorn ready
+        And Wait for volume of deployment 0 attached and degraded
+        Then Trim deployment 0 volume should fail
+
+        When Wait for workloads pods stable
+        ...    deployment 0
+        And Check deployment 0 works
+        Then Trim deployment 0 volume should pass
+    END

From 07472cfa8a1fe86108d4db2933da4f98dc5b5e89 Mon Sep 17 00:00:00 2001
From: Chin-Ya Huang
Date: Tue, 29 Oct 2024 15:00:03 +0800
Subject: [PATCH 2/5] test(integration/system-backup): check outdated backup in if-not-present volume backup policy

longhorn/longhorn-6027

Signed-off-by: Chin-Ya Huang
---
 .../tests/test_system_backup_restore.py | 41 +++++++++++--------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/manager/integration/tests/test_system_backup_restore.py b/manager/integration/tests/test_system_backup_restore.py
index c77092d461..9703ed4b59 100644
--- a/manager/integration/tests/test_system_backup_restore.py
+++ b/manager/integration/tests/test_system_backup_restore.py
@@ -31,6 +31,7 @@ from common import check_pv_existence
 from common import check_backing_image_disk_map_status
 from common import wait_for_backup_restore_completed
+from common import write_volume_random_data

 from common import SETTING_BACKUPSTORE_POLL_INTERVAL
 from common import SIZE

@@ -207,10 +208,11 @@ def test_system_backup_and_restore_volume_with_backingimage(client, core_api, vo

 def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_name, set_random_backupstore):  # NOQA
     """
     Scenario: system backup with volume backup policy (if-not-present) should
-              only create volume backup when there is no existing backup in
-              the volume.
+              create volume backup when no backup exists for the volume or when
+              the last backup is outdated.

     Issue: https://github.com/longhorn/longhorn/issues/5011
+           https://github.com/longhorn/longhorn/issues/6027

     Given a volume is created.
@@ -225,6 +227,13 @@ def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_n
     And system backup (system-backup-2) created.
     Then system backup is in state (Ready).
     And volume has backup count (1).
+
+    When system backup (system-backup-3) has volume backup policy
+         (if-not-present).
+    And write data to volume.
+    And system backup (system-backup-3) created.
+    Then system backup is in state (Ready).
+    And volume has backup count (2).
     """

     host_id = get_self_host_id()

     volume = create_and_check_volume(client, volume_name)
     volume.attach(hostId=host_id)
     volume = wait_for_volume_healthy(client, volume_name)

-    system_backup_name_1 = system_backup_random_name()
-    client.create_system_backup(Name=system_backup_name_1)
+    def create_system_backup_and_assert_volume_backup_count(count):
+        system_backup_name = system_backup_random_name()
+        client.create_system_backup(Name=system_backup_name,
+                                    VolumeBackupPolicy=IF_NOT_PRESENT)

-    system_backup = client.by_id_system_backup(system_backup_name_1)
-    assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT
+        system_backup = client.by_id_system_backup(system_backup_name)
+        assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT

-    system_backup_wait_for_state("Ready", system_backup_name_1, client)
+        system_backup_wait_for_state("Ready", system_backup_name, client)

-    backup_volume = client.by_id_backupVolume(volume_name)
-    wait_for_backup_count(backup_volume, 1)
-
-    system_backup_name_2 = system_backup_random_name()
-    client.create_system_backup(Name=system_backup_name_2,
-                                VolumeBackupPolicy=IF_NOT_PRESENT)
+        backup_volume = client.by_id_backupVolume(volume_name)
+        wait_for_backup_count(backup_volume, count)

-    system_backup_wait_for_state("Ready", system_backup_name_2, client)
-
-    backup_volume = client.by_id_backupVolume(volume_name)
-    wait_for_backup_count(backup_volume, 1)
+    create_system_backup_and_assert_volume_backup_count(1)
+    create_system_backup_and_assert_volume_backup_count(1)
+    write_volume_random_data(volume)
+    create_system_backup_and_assert_volume_backup_count(2)


 @pytest.mark.system_backup_restore  # NOQA

From f0a878d7df7619fb5453e72a81697182b880784b Mon Sep 17 00:00:00 2001
From: Roger Yao
Date: Fri, 18 Oct 2024 10:12:39 +0800
Subject: [PATCH 3/5] Add case test_metric_longhorn_backup

longhorn/longhorn#9430

Signed-off-by: Roger Yao
---
 manager/integration/tests/test_metric.py | 167 ++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 5 deletions(-)

diff --git a/manager/integration/tests/test_metric.py b/manager/integration/tests/test_metric.py
index e35a72ebb8..164043e010 100644
--- a/manager/integration/tests/test_metric.py
+++ b/manager/integration/tests/test_metric.py
@@ -6,7 +6,7 @@ from kubernetes.stream import stream
 from prometheus_client.parser import text_string_to_metric_families

-from common import client, core_api, pod, volume_name  # NOQA
+from common import client, core_api, pod, volume_name, batch_v1_api  # NOQA

 from common import crash_engine_process_with_sigkill
 from common import delete_replica_processes
@@ -35,6 +35,25 @@
 from common import DEFAULT_DISK_PATH
 from common import Gi

+from backupstore import set_random_backupstore  # NOQA
+from common import create_recurring_jobs
+from common import check_recurring_jobs
+from common import wait_for_cron_job_count
+from common import create_backup
+from common import wait_for_backup_count
+from common import delete_backup_volume
+
+RECURRING_JOB_NAME = "recurring-test"
+TASK = "task"
+GROUPS = "groups"
+CRON = "cron"
+RETAIN = "retain"
+BACKUP = "backup"
+CONCURRENCY = "concurrency"
+LABELS = "labels"
+DEFAULT = "default"
+SCHEDULE_1MIN = "* * * * *"
+
 # The dictionaries use float type of value because the value obtained from
 # prometheus_client is in float type.
 # https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994
@@ -138,6 +157,21 @@ def examine_metric_value(found_metric, metric_labels, expected_value=None):
         assert found_metric.value >= 0.0


+def wait_for_metric_sum_on_all_nodes(client, core_api, metric_name, metric_labels, expected_value):  # NOQA
+    for _ in range(RETRY_COUNTS):
+        time.sleep(RETRY_INTERVAL)
+
+        try:
+            check_metric_sum_on_all_nodes(client, core_api, metric_name,
+                                          metric_labels, expected_value)
+            return
+        except AssertionError:
+            continue
+
+    check_metric_sum_on_all_nodes(client, core_api, metric_name,
+                                  metric_labels, expected_value)
+
+
 def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None):  # NOQA
     # Initialize total_metrics to store the sum of the metric values.
     total_metrics = {"labels": defaultdict(None), "value": 0.0}
@@ -440,12 +474,12 @@ def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_nam

     When 1 snapshot is created by user
     And 1 snapshot is created by system

-    Then has a metric longhorn_snapshot_actual_size_bytes value equals to the
-         size of the user created snapshot,
+    Then has a metric longhorn_snapshot_actual_size_bytes value
+         equals to the size of the user created snapshot,
          and volume label is the volume name
          and user_created label is true
-    And has a metric longhorn_snapshot_actual_size_bytes value equals to the
-         size of the system created snapshot,
+    And has a metric longhorn_snapshot_actual_size_bytes value
+         equals to the size of the system created snapshot,
          and volume label is the volume name
          and user_created label is false
@@ -615,3 +649,126 @@ def test_node_metrics(client, core_api):  # NOQA
     wait_for_node_update(client, lht_hostId, "allowScheduling", False)
     check_metric_with_condition(core_api, "longhorn_node_status",
                                 metric_labels, 0.0)
+
+
+def test_metric_longhorn_backup(set_random_backupstore, client, core_api, batch_v1_api, volume_name):  # NOQA
+    """
+    Scenario: test metric longhorn_backup_actual_size_bytes and
+              longhorn_backup_state
+
+    Issue: https://github.com/longhorn/longhorn/issues/9429
+
+    Given a volume
+
+    When a backup is created by user
+    Then has a metric longhorn_backup_actual_size_bytes value
+         equals to the size of the backup,
+         and volume label is the volume name
+         and recurring_job label is empty
+    And has a metric longhorn_backup_state value equals to 3 (Completed),
+         and volume label is the volume name
+         and recurring_job label is empty
+
+    When a recurring backup job is created
+    Then should have a metric longhorn_backup_actual_size_bytes value
+         equals to the size of the backup,
+         and volume label is the volume name
+         and recurring_job label is the job name
+    And should have a metric longhorn_backup_state
+        value equals to 3 (Completed),
+         and volume label is the volume name
+         and recurring_job label is the job name
+    """
+    self_hostId = get_self_host_id()
+
+    # create a volume and attach it to a node.
+    volume_size = 50 * Mi
+    client.create_volume(name=volume_name,
+                         numberOfReplicas=1,
+                         size=str(volume_size))
+    volume = wait_for_volume_detached(client, volume_name)
+    volume.attach(hostId=self_hostId)
+    volume = wait_for_volume_healthy(client, volume_name)
+
+    # create the user backup.
+    data_size = 10 * Mi
+    backup_data = {'pos': 0,
+                   'len': data_size,
+                   'content': generate_random_data(data_size)}
+    write_volume_data(volume, backup_data)
+    create_backup(client, volume_name)
+    bv = client.by_id_backupVolume(volume_name)
+    wait_for_backup_count(bv, 1)
+
+    # get the backup size.
+    backup_size = 0
+    backups = bv.backupList().data
+    for backup in backups:
+        if backup['snapshotName'] == "volume-head":
+            continue
+
+        backup_size = int(backup['size'])
+        assert backup_size > 0
+
+    # assert the metric values for the user backup.
+    user_backup_metric_labels = {
+        "volume": volume_name,
+        "recurring_job": "",
+    }
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_actual_size_bytes",
+                                     user_backup_metric_labels,
+                                     backup_size)
+
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_state",
+                                     user_backup_metric_labels,
+                                     3)
+
+    # delete the existing backup before creating a recurring backup job.
+    delete_backup_volume(client, volume_name)
+
+    # create a recurring backup job.
+    recurring_jobs = {
+        RECURRING_JOB_NAME: {
+            TASK: BACKUP,
+            GROUPS: [DEFAULT],
+            CRON: SCHEDULE_1MIN,
+            RETAIN: 1,
+            CONCURRENCY: 1,
+            LABELS: {},
+        },
+    }
+    create_recurring_jobs(client, recurring_jobs)
+    check_recurring_jobs(client, recurring_jobs)
+    wait_for_cron_job_count(batch_v1_api, 1)
+
+    # wait for the recurring backup job to run.
+    time.sleep(60)
+    bv = client.by_id_backupVolume(volume_name)
+    wait_for_backup_count(bv, 1)
+
+    # get the recurring backup size.
+    recurring_backup_size = 0
+    backups = bv.backupList().data
+    for backup in backups:
+        if backup['snapshotName'] == "volume-head":
+            continue
+
+        recurring_backup_size = int(backup['size'])
+        assert recurring_backup_size > 0
+
+    # assert the metric values for the recurring backup.
+    recurring_backup_metric_labels = {
+        "volume": volume_name,
+        "recurring_job": RECURRING_JOB_NAME,
+    }
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_actual_size_bytes",
+                                     recurring_backup_metric_labels,
+                                     recurring_backup_size)
+
+    wait_for_metric_sum_on_all_nodes(client, core_api,
+                                     "longhorn_backup_state",
+                                     recurring_backup_metric_labels,
+                                     3)

From fa8abb495b39089fef3bd2277dd1b68eb8f4f603 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Mon, 11 Nov 2024 00:55:37 +0000
Subject: [PATCH 4/5] chore(deps): update terraform azurerm to v3.117.0

---
 test_framework/terraform/azure/aks/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_framework/terraform/azure/aks/main.tf b/test_framework/terraform/azure/aks/main.tf
index 0d5870bdb2..fb01aab3d2 100644
--- a/test_framework/terraform/azure/aks/main.tf
+++ b/test_framework/terraform/azure/aks/main.tf
@@ -2,7 +2,7 @@ terraform {
   required_providers {
     azurerm = {
       source  = "hashicorp/azurerm"
-      version = "3.116.0"
+      version = "3.117.0"
     }
   }
 }

From 95b2041cca5d42ee2c09397105476845ffd4eef3 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Mon, 11 Nov 2024 00:55:30 +0000
Subject: [PATCH 5/5] chore(deps): update dependency boto3 to v1.35.57

---
 e2e/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/e2e/requirements.txt b/e2e/requirements.txt
index d877954200..099d7c8efd 100644
--- a/e2e/requirements.txt
+++ b/e2e/requirements.txt
@@ -4,6 +4,6 @@ directio==1.3
 flake8
 kubernetes==27.2.0
 requests==2.32.3
-boto3==1.35.54
+boto3==1.35.57
 pyyaml==6.0.2
 minio==5.0.10