Merge branch 'master' into migrate-replica-rebuild-per-volume-limit
yangchiu authored Nov 11, 2024
2 parents cf7e52d + 95b2041 commit 69877c2
Showing 10 changed files with 248 additions and 24 deletions.
11 changes: 11 additions & 0 deletions e2e/keywords/workload.resource
@@ -189,3 +189,14 @@ Check ${workload_kind} ${workload_id} pod is ${expect_state} on another node
Delete Longhorn ${workload_kind} ${workload_name} pod on node ${node_id}
${node_name} = get_node_by_index ${node_id}
delete_workload_pod_on_node ${workload_name} ${node_name} longhorn-system

Trim ${workload_kind} ${workload_id} volume should ${condition}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}

IF $condition == "fail"
trim_workload_volume_filesystem ${workload_name} is_expect_fail=True
ELSE IF $condition == "pass"
trim_workload_volume_filesystem ${workload_name} is_expect_fail=False
ELSE
Fail "Invalid condition value: ${condition}"
END
4 changes: 4 additions & 0 deletions e2e/libs/keywords/workload_keywords.py
@@ -192,3 +192,7 @@ def is_workloads_pods_has_annotations(self, workload_names, annotation_key, name
if not is_workload_pods_has_annotations(workload_name, annotation_key, namespace=namespace, label_selector=label_selector):
return False
return True

def trim_workload_volume_filesystem(self, workload_name, is_expect_fail=False):
volume_name = get_workload_volume_name(workload_name)
self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
3 changes: 3 additions & 0 deletions e2e/libs/volume/crd.py
@@ -528,3 +528,6 @@ def validate_volume_setting(self, volume_name, setting_name, value):
volume = self.get(volume_name)
assert str(volume["spec"][setting_name]) == value, \
f"Expected volume {volume_name} setting {setting_name} is {value}, but it's {str(volume['spec'][setting_name])}"

def trim_filesystem(self, volume_name, is_expect_fail=False):
return Rest(self).trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
17 changes: 17 additions & 0 deletions e2e/libs/volume/rest.py
@@ -401,3 +401,20 @@ def wait_for_replica_ready_to_rw(self, volume_name):
break
time.sleep(self.retry_interval)
assert ready, f"Failed to get volume {volume_name} replicas ready: {replicas}"

def trim_filesystem(self, volume_name, is_expect_fail=False):
is_unexpected_pass = False
try:
self.get(volume_name).trimFilesystem(name=volume_name)

if is_expect_fail:
is_unexpected_pass = True

except Exception as e:
if is_expect_fail:
logging(f"Failed to trim filesystem: {e}")
else:
raise e

if is_unexpected_pass:
raise Exception(f"Expected volume {volume_name} trim filesystem to fail")
3 changes: 3 additions & 0 deletions e2e/libs/volume/volume.py
@@ -157,3 +157,6 @@ def wait_for_engine_image_upgrade_completed(self, volume_name, engine_image_name

def validate_volume_setting(self, volume_name, setting_name, value):
return self.volume.validate_volume_setting(volume_name, setting_name, value)

def trim_filesystem(self, volume_name, is_expect_fail=False):
return self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
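
Taken together, the changes above extend a single call chain: the Robot keyword calls trim_workload_volume_filesystem in workload_keywords.py, which resolves the workload's volume name and calls the Volume facade, which forwards to the CRD implementation, which in turn delegates the trim to the REST client. A minimal sketch of that layering, using simplified stand-in classes (the names and the placeholder volume-name lookup are illustrative, not the real library code), might look like this:

class RestVolume:
    # Bottom layer: would talk to the Longhorn API endpoint.
    def trim_filesystem(self, volume_name, is_expect_fail=False):
        print(f"REST: trim {volume_name}, expect_fail={is_expect_fail}")

class CRDVolume:
    # CRD layer: most operations act on the custom resource; trim is delegated to REST.
    def trim_filesystem(self, volume_name, is_expect_fail=False):
        return RestVolume().trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

class Volume:
    # Facade used by the keyword libraries.
    def __init__(self):
        self.volume = CRDVolume()

    def trim_filesystem(self, volume_name, is_expect_fail=False):
        return self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

class WorkloadKeywords:
    # Keyword library: maps a workload to its volume, then calls the facade.
    def __init__(self):
        self.volume = Volume()

    def trim_workload_volume_filesystem(self, workload_name, is_expect_fail=False):
        volume_name = f"{workload_name}-volume"  # placeholder for get_workload_volume_name()
        self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

WorkloadKeywords().trim_workload_volume_filesystem("deployment-0", is_expect_fail=True)
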
2 changes: 1 addition & 1 deletion e2e/requirements.txt
@@ -4,6 +4,6 @@ directio==1.3
flake8
kubernetes==27.2.0
requests==2.32.3
boto3==1.35.54
boto3==1.35.57
pyyaml==6.0.2
minio==5.0.10
22 changes: 22 additions & 0 deletions e2e/tests/regression/test_v2.robot
@@ -11,6 +11,8 @@ Resource    ../keywords/workload.resource
Resource ../keywords/volume.resource
Resource ../keywords/setting.resource
Resource ../keywords/node.resource
Resource ../keywords/host.resource
Resource ../keywords/longhorn.resource

Test Setup Set test environment
Test Teardown Cleanup test resources
@@ -50,3 +52,23 @@ Degraded Volume Replica Rebuilding
And Wait for deployment 0 pods stable
Then Check deployment 0 data in file data.txt is intact
END

V2 Volume Should Block Trim When Volume Is Degraded
Given Set setting auto-salvage to true
And Create storageclass longhorn-test with dataEngine=v2
And Create persistentvolumeclaim 0 using RWO volume with longhorn-test storageclass
And Create deployment 0 with persistentvolumeclaim 0

FOR ${i} IN RANGE ${LOOP_COUNT}
And Keep writing data to pod of deployment 0

When Restart cluster
And Wait for longhorn ready
And Wait for volume of deployment 0 attached and degraded
Then Trim deployment 0 volume should fail

When Wait for workloads pods stable
... deployment 0
And Check deployment 0 works
Then Trim deployment 0 volume should pass
END
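
The new test case drives the degraded state indirectly (a cluster restart with auto-salvage enabled) and expects trim to be rejected until the workload is healthy again. As a rough illustration of what "attached and degraded" means at the custom-resource level, the state and robustness fields of the Longhorn Volume CR can be read with the Kubernetes Python client; the snippet below is a sketch only, assuming kubeconfig access, the longhorn-system namespace, and the v1beta2 API version.

from kubernetes import client, config

def get_volume_state(volume_name, namespace="longhorn-system"):
    # Read status.state and status.robustness of a Longhorn Volume custom resource.
    config.load_kube_config()  # or config.load_incluster_config() inside a pod
    api = client.CustomObjectsApi()
    volume = api.get_namespaced_custom_object(
        group="longhorn.io", version="v1beta2",
        namespace=namespace, plural="volumes", name=volume_name)
    return volume["status"]["state"], volume["status"]["robustness"]

# While the test asserts "attached and degraded" (hypothetical volume name):
# state, robustness = get_volume_state("pvc-0")
# assert state == "attached" and robustness == "degraded"
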
167 changes: 162 additions & 5 deletions manager/integration/tests/test_metric.py
@@ -6,7 +6,7 @@
from kubernetes.stream import stream
from prometheus_client.parser import text_string_to_metric_families

from common import client, core_api, pod, volume_name # NOQA
from common import client, core_api, pod, volume_name, batch_v1_api # NOQA

from common import crash_engine_process_with_sigkill
from common import delete_replica_processes
@@ -35,6 +35,25 @@
from common import DEFAULT_DISK_PATH
from common import Gi

from backupstore import set_random_backupstore # NOQA
from common import create_recurring_jobs
from common import check_recurring_jobs
from common import wait_for_cron_job_count
from common import create_backup
from common import wait_for_backup_count
from common import delete_backup_volume

RECURRING_JOB_NAME = "recurring-test"
TASK = "task"
GROUPS = "groups"
CRON = "cron"
RETAIN = "retain"
BACKUP = "backup"
CONCURRENCY = "concurrency"
LABELS = "labels"
DEFAULT = "default"
SCHEDULE_1MIN = "* * * * *"

# The dictionaries use float values because the values obtained from
# prometheus_client are floats.
# https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994
@@ -138,6 +157,21 @@ def examine_metric_value(found_metric, metric_labels, expected_value=None):
assert found_metric.value >= 0.0


def wait_for_metric_sum_on_all_nodes(client, core_api, metric_name, metric_labels, expected_value): # NOQA
for _ in range(RETRY_COUNTS):
time.sleep(RETRY_INTERVAL)

try:
check_metric_sum_on_all_nodes(client, core_api, metric_name,
metric_labels, expected_value)
return
except AssertionError:
continue

check_metric_sum_on_all_nodes(client, core_api, metric_name,
metric_labels, expected_value)


def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None): # NOQA
# Initialize total_metrics to store the sum of the metric values.
total_metrics = {"labels": defaultdict(None), "value": 0.0}
@@ -440,12 +474,12 @@ def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_nam
When 1 snapshot is created by user
And 1 snapshot is created by system
Then has a metric longhorn_snapshot_actual_size_bytes value equals to the
size of the user created snapshot,
Then has a metric longhorn_snapshot_actual_size_bytes value
equals to the size of the user created snapshot,
and volume label is the volume name
and user_created label is true
And has a metric longhorn_snapshot_actual_size_bytes value equals to the
size of the system created snapshot,
And has a metric longhorn_snapshot_actual_size_bytes value
equals to the size of the system created snapshot,
and volume label is the volume name
and user_created label is false
@@ -615,3 +649,126 @@ def test_node_metrics(client, core_api): # NOQA
wait_for_node_update(client, lht_hostId, "allowScheduling", False)
check_metric_with_condition(core_api, "longhorn_node_status",
metric_labels, 0.0)


def test_metric_longhorn_backup(set_random_backupstore, client, core_api, batch_v1_api, volume_name): # NOQA
"""
Scenario: test metric longhorn_backup_actual_size_bytes and
longhorn_backup_state
Issue: https://github.com/longhorn/longhorn/issues/9429
Given a volume.
When a backup is created by the user.
Then the metric longhorn_backup_actual_size_bytes equals the size of
the backup,
its volume label is the volume name,
and its recurring_job label is empty.
And the metric longhorn_backup_state equals 3 (Completed),
its volume label is the volume name,
and its recurring_job label is empty.
When a recurring backup job is created.
Then the metric longhorn_backup_actual_size_bytes equals the size of
the backup,
its volume label is the volume name,
and its recurring_job label is the job name.
And the metric longhorn_backup_state equals 3 (Completed),
its volume label is the volume name,
and its recurring_job label is the job name.
"""
self_hostId = get_self_host_id()

# create a volume and attach it to a node.
volume_size = 50 * Mi
client.create_volume(name=volume_name,
numberOfReplicas=1,
size=str(volume_size))
volume = wait_for_volume_detached(client, volume_name)
volume.attach(hostId=self_hostId)
volume = wait_for_volume_healthy(client, volume_name)

# create the user backup.
data_size = 10 * Mi
backup_data = {'pos': 0,
'len': data_size,
'content': generate_random_data(data_size)}
write_volume_data(volume, backup_data)
create_backup(client, volume_name)
bv = client.by_id_backupVolume(volume_name)
wait_for_backup_count(bv, 1)

# get the backup size.
backup_size = 0
backups = bv.backupList().data
for backup in backups:
if backup['snapshotName'] == "volume-head":
continue

backup_size = int(backup['size'])
assert backup_size > 0

# assert the metric values for the user backup.
user_backup_metric_labels = {
"volume": volume_name,
"recurring_job": "",
}
wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_actual_size_bytes",
user_backup_metric_labels,
backup_size)

wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_state",
user_backup_metric_labels,
3)

# delete the existing backup before creating a recurring backup job.
delete_backup_volume(client, volume_name)

# create a recurring backup job.
recurring_jobs = {
RECURRING_JOB_NAME: {
TASK: BACKUP,
GROUPS: [DEFAULT],
CRON: SCHEDULE_1MIN,
RETAIN: 1,
CONCURRENCY: 1,
LABELS: {},
},
}
create_recurring_jobs(client, recurring_jobs)
check_recurring_jobs(client, recurring_jobs)
wait_for_cron_job_count(batch_v1_api, 1)

# wait for the recurring backup job to run.
time.sleep(60)
bv = client.by_id_backupVolume(volume_name)
wait_for_backup_count(bv, 1)

# get the recurring backup size.
recurring_backup_size = 0
backups = bv.backupList().data
for backup in backups:
if backup['snapshotName'] == "volume-head":
continue

recurring_backup_size = int(backup['size'])
assert recurring_backup_size > 0

# assert the metric values for the recurring backup.
recurring_backup_metric_labels = {
"volume": volume_name,
"recurring_job": RECURRING_JOB_NAME,
}
wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_actual_size_bytes",
recurring_backup_metric_labels,
recurring_backup_size)

wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_state",
recurring_backup_metric_labels,
3)
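
Both new assertions go through wait_for_metric_sum_on_all_nodes, which simply retries check_metric_sum_on_all_nodes until the expected sum appears or the retries run out. The summing itself (collapsed in this diff) scrapes each longhorn-manager's metrics endpoint and adds up the samples whose labels match. A minimal, self-contained sketch of that matching-and-summing step with prometheus_client is shown below; the metrics text is inlined here for illustration instead of being fetched from a manager pod.

from prometheus_client.parser import text_string_to_metric_families

METRICS_TEXT = """\
# TYPE longhorn_backup_state gauge
longhorn_backup_state{volume="vol-0",recurring_job=""} 3
longhorn_backup_state{volume="vol-1",recurring_job="recurring-test"} 3
"""

def sum_metric(metrics_text, metric_name, expected_labels):
    total = 0.0
    for family in text_string_to_metric_families(metrics_text):
        for sample in family.samples:
            if sample.name != metric_name:
                continue
            # Count the sample only if every expected label matches exactly.
            if all(sample.labels.get(k) == v for k, v in expected_labels.items()):
                total += sample.value
    return total

assert sum_metric(METRICS_TEXT, "longhorn_backup_state",
                  {"volume": "vol-0", "recurring_job": ""}) == 3.0
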
41 changes: 24 additions & 17 deletions manager/integration/tests/test_system_backup_restore.py
@@ -31,6 +31,7 @@
from common import check_pv_existence
from common import check_backing_image_disk_map_status
from common import wait_for_backup_restore_completed
from common import write_volume_random_data

from common import SETTING_BACKUPSTORE_POLL_INTERVAL
from common import SIZE
@@ -207,10 +208,11 @@ def test_system_backup_and_restore_volume_with_backingimage(client, core_api, vo
def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_name, set_random_backupstore): # NOQA
"""
Scenario: system backup with volume backup policy (if-not-present) should
only create volume backup when there is no existing backup in
the volume.
create volume backup when no backup exists for the volume or when
the last backup is outdated.
Issue: https://github.com/longhorn/longhorn/issues/5011
https://github.com/longhorn/longhorn/issues/6027
Given a volume is created.
@@ -225,32 +227,37 @@ def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_n
And system backup (system-backup-2) created.
Then system backup is in state (Ready).
And volume has backup count (1).
When system backup (system-backup-3) has volume backup policy
(if-not-present).
And write data to volume.
And system backup (system-backup-3) created.
Then system backup is in state (Ready).
And volume has backup count (2).
"""
host_id = get_self_host_id()

volume = create_and_check_volume(client, volume_name)
volume.attach(hostId=host_id)
volume = wait_for_volume_healthy(client, volume_name)

system_backup_name_1 = system_backup_random_name()
client.create_system_backup(Name=system_backup_name_1)
def create_system_backup_and_assert_volume_backup_count(count):
system_backup_name = system_backup_random_name()
client.create_system_backup(Name=system_backup_name,
VolumeBackupPolicy=IF_NOT_PRESENT)

system_backup = client.by_id_system_backup(system_backup_name_1)
assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT
system_backup = client.by_id_system_backup(system_backup_name)
assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT

system_backup_wait_for_state("Ready", system_backup_name_1, client)
system_backup_wait_for_state("Ready", system_backup_name, client)

backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, 1)

system_backup_name_2 = system_backup_random_name()
client.create_system_backup(Name=system_backup_name_2,
VolumeBackupPolicy=IF_NOT_PRESENT)
backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, count)

system_backup_wait_for_state("Ready", system_backup_name_2, client)

backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, 1)
create_system_backup_and_assert_volume_backup_count(1)
create_system_backup_and_assert_volume_backup_count(1)
write_volume_random_data(volume)
create_system_backup_and_assert_volume_backup_count(2)
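
The reworked scenario pins down the if-not-present semantics: a system backup reuses the volume's latest backup unless none exists or the volume has newer data than that backup, in which case a fresh volume backup is taken. A rough sketch of that decision rule, written for illustration and not taken from the longhorn-manager implementation, could read:

def needs_volume_backup_if_not_present(last_backup_at, last_data_written_at):
    # Illustrative decision rule for the if-not-present volume backup policy.
    if last_backup_at is None:
        return True  # no backup exists for the volume yet
    return last_data_written_at > last_backup_at  # the last backup is outdated

# Mirrors the test: the first system backup creates a backup, the second reuses it,
# and the third creates a new one because data was written in between.
assert needs_volume_backup_if_not_present(None, 10)    # backup count becomes 1
assert not needs_volume_backup_if_not_present(10, 5)   # backup count stays 1
assert needs_volume_backup_if_not_present(10, 20)      # backup count becomes 2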


@pytest.mark.system_backup_restore # NOQA
2 changes: 1 addition & 1 deletion test_framework/terraform/azure/aks/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
azurerm = {
source = "hashicorp/azurerm"
version = "3.116.0"
version = "3.117.0"
}
}
}