Merge branch 'master' into migrate-replica-rebuild-per-volume-limit
yangchiu authored Nov 11, 2024
2 parents cf7e52d + 95b2041 commit 69877c2
Showing 10 changed files with 248 additions and 24 deletions.
11 changes: 11 additions & 0 deletions e2e/keywords/workload.resource
@@ -189,3 +189,14 @@ Check ${workload_kind} ${workload_id} pod is ${expect_state} on another node
Delete Longhorn ${workload_kind} ${workload_name} pod on node ${node_id}
${node_name} = get_node_by_index ${node_id}
delete_workload_pod_on_node ${workload_name} ${node_name} longhorn-system

Trim ${workload_kind} ${workload_id} volume should ${condition}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}

IF $condition == "fail"
trim_workload_volume_filesystem ${workload_name} is_expect_fail=True
ELSE IF $condition == "pass"
trim_workload_volume_filesystem ${workload_name} is_expect_fail=False
ELSE
Fail "Invalid condition value: ${condition}"
END
4 changes: 4 additions & 0 deletions e2e/libs/keywords/workload_keywords.py
@@ -192,3 +192,7 @@ def is_workloads_pods_has_annotations(self, workload_names, annotation_key, name
if not is_workload_pods_has_annotations(workload_name, annotation_key, namespace=namespace, label_selector=label_selector):
return False
return True

def trim_workload_volume_filesystem(self, workload_name, is_expect_fail=False):
volume_name = get_workload_volume_name(workload_name)
self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
3 changes: 3 additions & 0 deletions e2e/libs/volume/crd.py
@@ -528,3 +528,6 @@ def validate_volume_setting(self, volume_name, setting_name, value):
volume = self.get(volume_name)
assert str(volume["spec"][setting_name]) == value, \
f"Expected volume {volume_name} setting {setting_name} is {value}, but it's {str(volume['spec'][setting_name])}"

def trim_filesystem(self, volume_name, is_expect_fail=False):
return Rest(self).trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
17 changes: 17 additions & 0 deletions e2e/libs/volume/rest.py
@@ -401,3 +401,20 @@ def wait_for_replica_ready_to_rw(self, volume_name):
break
time.sleep(self.retry_interval)
assert ready, f"Failed to get volume {volume_name} replicas ready: {replicas}"

def trim_filesystem(self, volume_name, is_expect_fail=False):
is_unexpected_pass = False
try:
self.get(volume_name).trimFilesystem(name=volume_name)

if is_expect_fail:
is_unexpected_pass = True

except Exception as e:
if is_expect_fail:
logging(f"Failed to trim filesystem: {e}")
else:
raise e

if is_unexpected_pass:
raise Exception(f"Expected volume {volume_name} trim filesystem to fail")
3 changes: 3 additions & 0 deletions e2e/libs/volume/volume.py
@@ -157,3 +157,6 @@ def wait_for_engine_image_upgrade_completed(self, volume_name, engine_image_name

def validate_volume_setting(self, volume_name, setting_name, value):
return self.volume.validate_volume_setting(volume_name, setting_name, value)

def trim_filesystem(self, volume_name, is_expect_fail=False):
return self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)
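
Taken together, the changes above extend a single call chain: the Robot keyword calls trim_workload_volume_filesystem in workload_keywords.py, which resolves the workload's volume name and calls the Volume facade, which forwards to the CRD implementation, which in turn delegates the trim to the REST client. A minimal sketch of that layering, using simplified stand-in classes (the names and the placeholder volume-name lookup are illustrative, not the real library code), might look like this:

class RestVolume:
    # Bottom layer: would talk to the Longhorn API endpoint.
    def trim_filesystem(self, volume_name, is_expect_fail=False):
        print(f"REST: trim {volume_name}, expect_fail={is_expect_fail}")

class CRDVolume:
    # CRD layer: most operations act on the custom resource; trim is delegated to REST.
    def trim_filesystem(self, volume_name, is_expect_fail=False):
        return RestVolume().trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

class Volume:
    # Facade used by the keyword libraries.
    def __init__(self):
        self.volume = CRDVolume()

    def trim_filesystem(self, volume_name, is_expect_fail=False):
        return self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

class WorkloadKeywords:
    # Keyword library: maps a workload to its volume, then calls the facade.
    def __init__(self):
        self.volume = Volume()

    def trim_workload_volume_filesystem(self, workload_name, is_expect_fail=False):
        volume_name = f"{workload_name}-volume"  # placeholder for get_workload_volume_name()
        self.volume.trim_filesystem(volume_name, is_expect_fail=is_expect_fail)

WorkloadKeywords().trim_workload_volume_filesystem("deployment-0", is_expect_fail=True)
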
2 changes: 1 addition & 1 deletion e2e/requirements.txt
@@ -4,6 +4,6 @@ directio==1.3
flake8
kubernetes==27.2.0
requests==2.32.3
boto3==1.35.54
boto3==1.35.57
pyyaml==6.0.2
minio==5.0.10
22 changes: 22 additions & 0 deletions e2e/tests/regression/test_v2.robot
@@ -11,6 +11,8 @@ Resource    ../keywords/workload.resource
Resource ../keywords/volume.resource
Resource ../keywords/setting.resource
Resource ../keywords/node.resource
Resource ../keywords/host.resource
Resource ../keywords/longhorn.resource

Test Setup Set test environment
Test Teardown Cleanup test resources
@@ -50,3 +52,23 @@ Degraded Volume Replica Rebuilding
And Wait for deployment 0 pods stable
Then Check deployment 0 data in file data.txt is intact
END

V2 Volume Should Block Trim When Volume Is Degraded
Given Set setting auto-salvage to true
And Create storageclass longhorn-test with dataEngine=v2
And Create persistentvolumeclaim 0 using RWO volume with longhorn-test storageclass
And Create deployment 0 with persistentvolumeclaim 0

FOR ${i} IN RANGE ${LOOP_COUNT}
And Keep writing data to pod of deployment 0

When Restart cluster
And Wait for longhorn ready
And Wait for volume of deployment 0 attached and degraded
Then Trim deployment 0 volume should fail

When Wait for workloads pods stable
... deployment 0
And Check deployment 0 works
Then Trim deployment 0 volume should pass
END
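
The new test case drives the degraded state indirectly (a cluster restart with auto-salvage enabled) and expects trim to be rejected until the workload is healthy again. As a rough illustration of what "attached and degraded" means at the custom-resource level, the state and robustness fields of the Longhorn Volume CR can be read with the Kubernetes Python client; the snippet below is a sketch only, assuming kubeconfig access, the longhorn-system namespace, and the v1beta2 API version.

from kubernetes import client, config

def get_volume_state(volume_name, namespace="longhorn-system"):
    # Read status.state and status.robustness of a Longhorn Volume custom resource.
    config.load_kube_config()  # or config.load_incluster_config() inside a pod
    api = client.CustomObjectsApi()
    volume = api.get_namespaced_custom_object(
        group="longhorn.io", version="v1beta2",
        namespace=namespace, plural="volumes", name=volume_name)
    return volume["status"]["state"], volume["status"]["robustness"]

# While the test asserts "attached and degraded" (hypothetical volume name):
# state, robustness = get_volume_state("pvc-0")
# assert state == "attached" and robustness == "degraded"
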
167 changes: 162 additions & 5 deletions manager/integration/tests/test_metric.py
@@ -6,7 +6,7 @@
from kubernetes.stream import stream
from prometheus_client.parser import text_string_to_metric_families

from common import client, core_api, pod, volume_name # NOQA
from common import client, core_api, pod, volume_name, batch_v1_api # NOQA

from common import crash_engine_process_with_sigkill
from common import delete_replica_processes
@@ -35,6 +35,25 @@
from common import DEFAULT_DISK_PATH
from common import Gi

from backupstore import set_random_backupstore # NOQA
from common import create_recurring_jobs
from common import check_recurring_jobs
from common import wait_for_cron_job_count
from common import create_backup
from common import wait_for_backup_count
from common import delete_backup_volume

RECURRING_JOB_NAME = "recurring-test"
TASK = "task"
GROUPS = "groups"
CRON = "cron"
RETAIN = "retain"
BACKUP = "backup"
CONCURRENCY = "concurrency"
LABELS = "labels"
DEFAULT = "default"
SCHEDULE_1MIN = "* * * * *"

# The dictionaries use float values because the values obtained from
# prometheus_client are floats.
# https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994
@@ -138,6 +157,21 @@ def examine_metric_value(found_metric, metric_labels, expected_value=None):
assert found_metric.value >= 0.0


def wait_for_metric_sum_on_all_nodes(client, core_api, metric_name, metric_labels, expected_value): # NOQA
for _ in range(RETRY_COUNTS):
time.sleep(RETRY_INTERVAL)

try:
check_metric_sum_on_all_nodes(client, core_api, metric_name,
metric_labels, expected_value)
return
except AssertionError:
continue

check_metric_sum_on_all_nodes(client, core_api, metric_name,
metric_labels, expected_value)


def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None): # NOQA
# Initialize total_metrics to store the sum of the metric values.
total_metrics = {"labels": defaultdict(None), "value": 0.0}
@@ -440,12 +474,12 @@ def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_nam
When 1 snapshot is created by user
And 1 snapshot is created by system
Then has a metric longhorn_snapshot_actual_size_bytes value equals to the
size of the user created snapshot,
Then has a metric longhorn_snapshot_actual_size_bytes value
equals to the size of the user created snapshot,
and volume label is the volume name
and user_created label is true
And has a metric longhorn_snapshot_actual_size_bytes value equals to the
size of the system created snapshot,
And has a metric longhorn_snapshot_actual_size_bytes value
equals to the size of the system created snapshot,
and volume label is the volume name
and user_created label is false
@@ -615,3 +649,126 @@ def test_node_metrics(client, core_api): # NOQA
wait_for_node_update(client, lht_hostId, "allowScheduling", False)
check_metric_with_condition(core_api, "longhorn_node_status",
metric_labels, 0.0)


def test_metric_longhorn_backup(set_random_backupstore, client, core_api, batch_v1_api, volume_name): # NOQA
"""
Scenario: test metric longhorn_backup_actual_size_bytes and
longhorn_backup_state
Issue: https://github.com/longhorn/longhorn/issues/9429
Given a volume.
When a backup is created by the user.
Then the metric longhorn_backup_actual_size_bytes equals the size of
the backup,
its volume label is the volume name,
and its recurring_job label is empty.
And the metric longhorn_backup_state equals 3 (Completed),
its volume label is the volume name,
and its recurring_job label is empty.
When a recurring backup job is created.
Then the metric longhorn_backup_actual_size_bytes equals the size of
the backup,
its volume label is the volume name,
and its recurring_job label is the job name.
And the metric longhorn_backup_state equals 3 (Completed),
its volume label is the volume name,
and its recurring_job label is the job name.
"""
self_hostId = get_self_host_id()

# create a volume and attach it to a node.
volume_size = 50 * Mi
client.create_volume(name=volume_name,
numberOfReplicas=1,
size=str(volume_size))
volume = wait_for_volume_detached(client, volume_name)
volume.attach(hostId=self_hostId)
volume = wait_for_volume_healthy(client, volume_name)

# create the user backup.
data_size = 10 * Mi
backup_data = {'pos': 0,
'len': data_size,
'content': generate_random_data(data_size)}
write_volume_data(volume, backup_data)
create_backup(client, volume_name)
bv = client.by_id_backupVolume(volume_name)
wait_for_backup_count(bv, 1)

# get the backup size.
backup_size = 0
backups = bv.backupList().data
for backup in backups:
if backup['snapshotName'] == "volume-head":
continue

backup_size = int(backup['size'])
assert backup_size > 0

# assert the metric values for the user backup.
user_backup_metric_labels = {
"volume": volume_name,
"recurring_job": "",
}
wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_actual_size_bytes",
user_backup_metric_labels,
backup_size)

wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_state",
user_backup_metric_labels,
3)

# delete the existing backup before creating a recurring backup job.
delete_backup_volume(client, volume_name)

# create a recurring backup job.
recurring_jobs = {
RECURRING_JOB_NAME: {
TASK: BACKUP,
GROUPS: [DEFAULT],
CRON: SCHEDULE_1MIN,
RETAIN: 1,
CONCURRENCY: 1,
LABELS: {},
},
}
create_recurring_jobs(client, recurring_jobs)
check_recurring_jobs(client, recurring_jobs)
wait_for_cron_job_count(batch_v1_api, 1)

# wait for the recurring backup job to run.
time.sleep(60)
bv = client.by_id_backupVolume(volume_name)
wait_for_backup_count(bv, 1)

# get the recurring backup size.
recurring_backup_size = 0
backups = bv.backupList().data
for backup in backups:
if backup['snapshotName'] == "volume-head":
continue

recurring_backup_size = int(backup['size'])
assert recurring_backup_size > 0

# assert the metric values for the recurring backup.
recurring_backup_metric_labels = {
"volume": volume_name,
"recurring_job": RECURRING_JOB_NAME,
}
wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_actual_size_bytes",
recurring_backup_metric_labels,
recurring_backup_size)

wait_for_metric_sum_on_all_nodes(client, core_api,
"longhorn_backup_state",
recurring_backup_metric_labels,
3)
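
Both new assertions go through wait_for_metric_sum_on_all_nodes, which simply retries check_metric_sum_on_all_nodes until the expected sum appears or the retries run out. The summing itself (collapsed in this diff) scrapes each longhorn-manager's metrics endpoint and adds up the samples whose labels match. A minimal, self-contained sketch of that matching-and-summing step with prometheus_client is shown below; the metrics text is inlined here for illustration instead of being fetched from a manager pod.

from prometheus_client.parser import text_string_to_metric_families

METRICS_TEXT = """\
# TYPE longhorn_backup_state gauge
longhorn_backup_state{volume="vol-0",recurring_job=""} 3
longhorn_backup_state{volume="vol-1",recurring_job="recurring-test"} 3
"""

def sum_metric(metrics_text, metric_name, expected_labels):
    total = 0.0
    for family in text_string_to_metric_families(metrics_text):
        for sample in family.samples:
            if sample.name != metric_name:
                continue
            # Count the sample only if every expected label matches exactly.
            if all(sample.labels.get(k) == v for k, v in expected_labels.items()):
                total += sample.value
    return total

assert sum_metric(METRICS_TEXT, "longhorn_backup_state",
                  {"volume": "vol-0", "recurring_job": ""}) == 3.0
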
41 changes: 24 additions & 17 deletions manager/integration/tests/test_system_backup_restore.py
@@ -31,6 +31,7 @@
from common import check_pv_existence
from common import check_backing_image_disk_map_status
from common import wait_for_backup_restore_completed
from common import write_volume_random_data

from common import SETTING_BACKUPSTORE_POLL_INTERVAL
from common import SIZE
@@ -207,10 +208,11 @@ def test_system_backup_and_restore_volume_with_backingimage(client, core_api, vo
def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_name, set_random_backupstore): # NOQA
"""
Scenario: system backup with volume backup policy (if-not-present) should
only create volume backup when there is no existing backup in
the volume.
create volume backup when no backup exists for the volume or when
the last backup is outdated.
Issue: https://github.com/longhorn/longhorn/issues/5011
https://github.com/longhorn/longhorn/issues/6027
Given a volume is created.
@@ -225,32 +227,37 @@ def test_system_backup_with_volume_backup_policy_if_not_present(client, volume_n
And system backup (system-backup-2) created.
Then system backup is in state (Ready).
And volume has backup count (1).
When system backup (system-backup-3) has volume backup policy
(if-not-present).
And write data to volume.
And system backup (system-backup-3) created.
Then system backup is in state (Ready).
And volume has backup count (2).
"""
host_id = get_self_host_id()

volume = create_and_check_volume(client, volume_name)
volume.attach(hostId=host_id)
volume = wait_for_volume_healthy(client, volume_name)

system_backup_name_1 = system_backup_random_name()
client.create_system_backup(Name=system_backup_name_1)
def create_system_backup_and_assert_volume_backup_count(count):
system_backup_name = system_backup_random_name()
client.create_system_backup(Name=system_backup_name,
VolumeBackupPolicy=IF_NOT_PRESENT)

system_backup = client.by_id_system_backup(system_backup_name_1)
assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT
system_backup = client.by_id_system_backup(system_backup_name)
assert system_backup.volumeBackupPolicy == IF_NOT_PRESENT

system_backup_wait_for_state("Ready", system_backup_name_1, client)
system_backup_wait_for_state("Ready", system_backup_name, client)

backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, 1)

system_backup_name_2 = system_backup_random_name()
client.create_system_backup(Name=system_backup_name_2,
VolumeBackupPolicy=IF_NOT_PRESENT)
backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, count)

system_backup_wait_for_state("Ready", system_backup_name_2, client)

backup_volume = client.by_id_backupVolume(volume_name)
wait_for_backup_count(backup_volume, 1)
create_system_backup_and_assert_volume_backup_count(1)
create_system_backup_and_assert_volume_backup_count(1)
write_volume_random_data(volume)
create_system_backup_and_assert_volume_backup_count(2)
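
The reworked scenario pins down the if-not-present semantics: a system backup reuses the volume's latest backup unless none exists or the volume has newer data than that backup, in which case a fresh volume backup is taken. A rough sketch of that decision rule, written for illustration and not taken from the longhorn-manager implementation, could read:

def needs_volume_backup_if_not_present(last_backup_at, last_data_written_at):
    # Illustrative decision rule for the if-not-present volume backup policy.
    if last_backup_at is None:
        return True  # no backup exists for the volume yet
    return last_data_written_at > last_backup_at  # the last backup is outdated

# Mirrors the test: the first system backup creates a backup, the second reuses it,
# and the third creates a new one because data was written in between.
assert needs_volume_backup_if_not_present(None, 10)    # backup count becomes 1
assert not needs_volume_backup_if_not_present(10, 5)   # backup count stays 1
assert needs_volume_backup_if_not_present(10, 20)      # backup count becomes 2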


@pytest.mark.system_backup_restore # NOQA
2 changes: 1 addition & 1 deletion test_framework/terraform/azure/aks/main.tf
@@ -2,7 +2,7 @@ terraform {
required_providers {
azurerm = {
source = "hashicorp/azurerm"
version = "3.116.0"
version = "3.117.0"
}
}
}