diff --git a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md b/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md deleted file mode 100644 index 2702e1fa72..0000000000 --- a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "[#1366](https://github.com/longhorn/longhorn/issues/1366) && [#1328](https://github.com/longhorn/longhorn/issues/1328) The node the DR volume attached to is rebooted" ---- -#### Scenario 1 -1. Create a pod with Longhorn volume. -2. Write data to the volume and get the md5sum. -3. Create the 1st backup for the volume. -4. Create a DR volume from the backup. -5. Wait for the DR volume starting the initial restore. Then reboot the DR volume attached node immediately. -6. Wait for the DR volume detached then reattached. -7. Wait for the DR volume restore complete after the reattachment. -8. Activate the DR volume and check the data md5sum. -#### Scenario 2 -1. Create a pod with Longhorn volume. -2. Write data to the volume and get the md5sum. -3. Create the 1st backup for the volume. -4. Create a DR volume from the backup. -5. Wait for the DR volume to complete the initial restore. -6. Write more data to the original volume and get the md5sum. -7. Create the 2nd backup for the volume. -8. Wait for the DR volume incremental restore getting triggered. Then reboot the DR volume attached node immediately. -9. Wait for the DR volume detached then reattached. -10. Wait for the DR volume restore complete after the reattachment. -11. Activate the DR volume and check the data md5sum. diff --git a/e2e/README.md b/e2e/README.md index e2933e0924..898ace2ba2 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -14,10 +14,14 @@ curl -sSfL https://raw.githubusercontent.com/longhorn/longhorn/master/scripts/en ### Run the test -1. Deploy all backupstore servers (including `NFS` server and `Minio` as s3 server) for test purposes. +1. Deploy all backupstore servers (including `NFS` server and `Minio` as s3 server, `CIFS` and `Azurite` server) for test purposes. + + For Azurite, there are some manual steps need to be done after manifest deployed(https://github.com/longhorn/longhorn-tests/wiki/Setup-Azurite-Backupstore-For-Testing). ``` -kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml \ - -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml +kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/cifs-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/azurite-backupstore.yaml ``` 1. 
Expose Longhorn API: diff --git a/e2e/keywords/backup.resource b/e2e/keywords/backup.resource index e0a14133e6..309a94c821 100644 --- a/e2e/keywords/backup.resource +++ b/e2e/keywords/backup.resource @@ -47,3 +47,8 @@ Check volume ${volume_id} data is backup ${backup_id} Check backup synced from backupstore assert_all_backups_before_uninstall_exist ${backups_before_uninstall} + +Volume ${volume_id} backup ${backup_id} should be able to create + Create backup ${backup_id} for volume ${volume_id} + Verify backup list contains no error for volume ${volume_id} + Verify backup list contains backup ${backup_id} of volume ${volume_id} diff --git a/e2e/keywords/longhorn.resource b/e2e/keywords/longhorn.resource index f413066528..af4475b20a 100644 --- a/e2e/keywords/longhorn.resource +++ b/e2e/keywords/longhorn.resource @@ -82,3 +82,11 @@ Delete instance-manager of deployment ${deployment_id} volume Wait for Longhorn components all running wait_for_namespace_pods_running longhorn-system + +Install Longhorn stable version + install_longhorn_system is_stable_version=True + +Uninstall Longhorn stable version + ${backups_before_uninstall} = list_all_backups + uninstall_longhorn_system is_stable_version=True + Set Test Variable ${backups_before_uninstall} diff --git a/e2e/keywords/sharemanager.resource b/e2e/keywords/sharemanager.resource index 6fe84fda83..3e8026de2d 100644 --- a/e2e/keywords/sharemanager.resource +++ b/e2e/keywords/sharemanager.resource @@ -21,12 +21,12 @@ Check sharemanager ${condition} using headless service Wait for all sharemanager to be deleted wait_for_sharemanagers_deleted -Delete sharemanager of deployment ${deployment_id} and wait for recreation +Delete sharemanager pod of deployment ${deployment_id} and wait for recreation ${deployment_name} = generate_name_with_suffix deployment ${deployment_id} ${volume_name} = get_workload_volume_name ${deployment_name} - delete_sharemanager_and_wait_for_recreation ${volume_name} + delete_sharemanager_pod_and_wait_for_recreation ${volume_name} -Wait for sharemanager of deployment ${deployment_id} running +Wait for sharemanager pod of deployment ${deployment_id} running ${deployment_name} = generate_name_with_suffix deployment ${deployment_id} ${volume_name} = get_workload_volume_name ${deployment_name} - wait_for_share_manager_running ${volume_name} + wait_for_share_manager_pod_running ${volume_name} diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource index c9bc00ec11..6dfe25423a 100644 --- a/e2e/keywords/volume.resource +++ b/e2e/keywords/volume.resource @@ -91,6 +91,10 @@ Write data ${data_id} to volume ${volume_id} ${volume_name} = generate_name_with_suffix volume ${volume_id} write_volume_random_data ${volume_name} 2048 ${data_id} +Write data ${data_id} ${size} MB to volume ${volume_id} + ${volume_name} = generate_name_with_suffix volume ${volume_id} + write_volume_random_data ${volume_name} ${size} ${data_id} + Keep writing data to volume ${volume_id} ${volume_name} = generate_name_with_suffix volume ${volume_id} keep_writing_data ${volume_name} @@ -177,6 +181,11 @@ Wait for volume ${volume_id} restoration from backup ${backup_id} completed ${backup_name} = get_backup_name ${backup_id} wait_for_volume_restoration_completed ${volume_name} ${backup_name} +Wait for volume ${volume_id} restoration from backup ${backup_id} start + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${backup_name} = get_backup_name ${backup_id} + wait_for_volume_restoration_start ${volume_name} ${backup_name} + Wait until 
volume ${volume_id} replica rebuilding started on ${replica_locality} ${volume_name} = generate_name_with_suffix volume ${volume_id} wait_for_replica_rebuilding_to_start_on_node ${volume_name} ${replica_locality} @@ -238,6 +247,44 @@ Check volume ${volume_id} replica on node ${node_id} exist ${replica_name} get_replica_name_on_node ${volume_name} ${node_name} Should Not Be Equal ${replica_name} ${None} +Volume ${volume_id} should have ${expected_replica_count} replicas running + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${replica_count} = wait_for_replica_count ${volume_name} node_name= replica_count=${expected_replica_count} + +Volume ${volume_id} should have ${expected_replica_count} replicas running on node ${node_id} + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${node_name} = get_node_by_index ${node_id} + ${replica_count} = wait_for_replica_count ${volume_name} node_name=${node_name} replica_count=${expected_replica_count} + Set Test Variable ${volume_name} + Set Test Variable ${node_name} + Set Test Variable ${replica_count} + +Volume ${volume_id} should have replicas running on node ${node_id} + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${node_name} = get_node_by_index ${node_id} + ${replica_count} = wait_for_replica_count ${volume_name} node_name=${node_name} + Set Test Variable ${volume_name} + Set Test Variable ${node_name} + Set Test Variable ${replica_count} + +Volume ${volume_id} should have ${expected_replica_count} replicas running on node ${node_id} and no additional scheduling occurs + Volume ${volume_id} should have ${expected_replica_count} replicas running on node ${node_id} + FOR ${i} IN RANGE 3 + Log to console Ensuring there's no additional scheduling for node ${node_name} ... (${i}) + ${new_replica_count} = wait_for_replica_count ${volume_name} node_name=${node_name} replica_count=${expected_replica_count} + Should Be Equal As Integers ${replica_count} ${new_replica_count} + Sleep 5 + END + +Volume ${volume_id} should have replicas running on node ${node_id} and no additional scheduling occurs + Volume ${volume_id} should have replicas running on node ${node_id} + FOR ${i} IN RANGE 3 + Log to console Ensuring there's no additional scheduling for node ${node_name} ... 
(${i}) + ${new_replica_count} = wait_for_replica_count ${volume_name} node_name=${node_name} + Should Be Equal As Integers ${replica_count} ${new_replica_count} + Sleep 5 + END + Check volume ${volume_id} data is intact ${volume_name} = generate_name_with_suffix volume ${volume_id} check_data_checksum ${volume_name} @@ -299,6 +346,11 @@ Check volume ${volume_id} data is backup ${backup_id} created in another cluster ${backup_data} = get_backup_data_from_backup_list ${backups_before_uninstall} ${backup_id} Should Be Equal ${current_checksum} ${backup_data} +Create volume ${volume_id} from backup ${backup_id} in another cluster + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${backup_url} = get_backup_url_from_backup_list ${backups_before_uninstall} ${backup_id} + create_volume ${volume_name} fromBackup=${backup_url} + Create DR volume ${volume_id} from backup ${backup_id} in another cluster ${volume_name} = generate_name_with_suffix volume ${volume_id} ${backup_url} = get_backup_url_from_backup_list ${backups_before_uninstall} ${backup_id} diff --git a/e2e/libs/backup/rest.py b/e2e/libs/backup/rest.py index 977c13d60f..7fe9fa3dfe 100644 --- a/e2e/libs/backup/rest.py +++ b/e2e/libs/backup/rest.py @@ -53,8 +53,13 @@ def get(self, backup_id, volume_name): def get_from_list(self, backup_list, backup_id): for backup in backup_list["items"]: - if backup['metadata']['annotations']['test.longhorn.io/backup-id'] == backup_id: - return backup + try: + if backup['metadata']['annotations']['test.longhorn.io/backup-id'] == backup_id: + return backup + except KeyError as e: + logging(f"Missing key in backup metadata: {str(e)} for backup {backup['metadata']['name']}") + except Exception as e: + logging(f"Unexpected error accessing backup {backup['metadata']['name']}: {str(e)}") return None def get_by_snapshot(self, volume_name, snapshot_name): diff --git a/e2e/libs/keywords/longhorn_deploy_keywords.py b/e2e/libs/keywords/longhorn_deploy_keywords.py index 9043815788..bd8c596b56 100644 --- a/e2e/libs/keywords/longhorn_deploy_keywords.py +++ b/e2e/libs/keywords/longhorn_deploy_keywords.py @@ -6,11 +6,11 @@ class longhorn_deploy_keywords: def __init__(self): self.longhorn = LonghornDeploy() - def uninstall_longhorn_system(self): - self.longhorn.uninstall() + def uninstall_longhorn_system(self, is_stable_version=False): + self.longhorn.uninstall(is_stable_version) def check_longhorn_crd_removed(self): self.longhorn.check_longhorn_crd_removed() - def install_longhorn_system(self): - self.longhorn.install() + def install_longhorn_system(self, is_stable_version=False): + self.longhorn.install(is_stable_version) diff --git a/e2e/libs/keywords/sharemanager_keywords.py b/e2e/libs/keywords/sharemanager_keywords.py index f9d501f349..b541f5b26b 100644 --- a/e2e/libs/keywords/sharemanager_keywords.py +++ b/e2e/libs/keywords/sharemanager_keywords.py @@ -7,7 +7,7 @@ from utility.utility import get_retry_count_and_interval from utility.utility import logging - +from utility.utility import get_pod, delete_pod class sharemanager_keywords: @@ -48,14 +48,32 @@ def wait_for_sharemanagers_deleted(self, name=[]): assert AssertionError, f"Failed to wait for all sharemanagers to be deleted" - def delete_sharemanager(self, name): - return self.sharemanager.delete(name) - def delete_sharemanager_and_wait_for_recreation(self, name): - sharemanager = self.sharemanager.get(name) - last_creation_time = sharemanager["metadata"]["creationTimestamp"] - self.sharemanager.delete(name) - 
self.sharemanager.wait_for_restart(name, last_creation_time) + def delete_sharemanager_pod_and_wait_for_recreation(self, name): + sharemanager_pod_name = "share-manager-" + name + sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system") + last_creation_time = sharemanager_pod.metadata.creation_timestamp + delete_pod(sharemanager_pod_name, "longhorn-system") + + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + time.sleep(retry_interval) + sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system") + if sharemanager_pod == None: + continue + creation_time = sharemanager_pod.metadata.creation_timestamp + if creation_time > last_creation_time: + return + + assert False, f"sharemanager pod {sharemanager_pod_name} not recreated" + + + def wait_for_share_manager_pod_running(self, name): + sharemanager_pod_name = "share-manager-" + name + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system") + if sharemanager_pod.status.phase == "Running": + return - def wait_for_share_manager_running(self, name): - return self.sharemanager.wait_for_running(name) + assert False, f"sharemanager pod {sharemanager_pod_name} not running" diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index 2ef9c77ac0..829c73e6c1 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -236,6 +236,9 @@ def wait_for_replica_running(self, volume_name, node_name): def get_replica_name_on_node(self, volume_name, node_name): return self.volume.get_replica_name_on_node(volume_name, node_name) + def wait_for_replica_count(self, volume_name, node_name=None, replica_count=None): + return self.volume.wait_for_replica_count(volume_name, node_name, replica_count) + def wait_for_replica_rebuilding_to_stop_on_node(self, volume_name, replica_locality): node_id = self.get_node_id_by_replica_locality(volume_name, replica_locality) retry_count, retry_interval = get_retry_count_and_interval() @@ -280,6 +283,10 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name): logging(f'Waiting for volume {volume_name} restoration from {backup_name} completed') self.volume.wait_for_volume_restoration_completed(volume_name, backup_name) + def wait_for_volume_restoration_start(self, volume_name, backup_name): + logging(f'Waiting for volume {volume_name} restoration from {backup_name} start') + self.volume.wait_for_volume_restoration_start(volume_name, backup_name) + def validate_volume_replicas_anti_affinity(self, volume_name): self.volume.validate_volume_replicas_anti_affinity(volume_name) diff --git a/e2e/libs/longhorn_deploy/base.py b/e2e/libs/longhorn_deploy/base.py index 1ba6468cbf..e0162b45c2 100644 --- a/e2e/libs/longhorn_deploy/base.py +++ b/e2e/libs/longhorn_deploy/base.py @@ -19,7 +19,7 @@ def install(self): return NotImplemented @abstractmethod - def uninstall(self, longhorn_branch=None): + def uninstall(self, is_stable_version=False): return NotImplemented def check_longhorn_crd_removed(self): @@ -29,17 +29,27 @@ def check_longhorn_crd_removed(self): def check_longhorn_uninstall_pod_log(self): logs = k8s.get_pod_logs(LONGHORN_NAMESPACE, LONGHORN_UNINSTALL_JOB_LABEL) - assert "error" not in logs - assert "level=fatal" not in logs + assert "level=error" not in logs, f"find string 'level=error' in uninstall log {logs}" + assert "level=fatal" not in logs, f"find string 'level=fatal' in 
uninstall log {logs}" - def install_longhorn(self): + def install_longhorn(self, is_stable_version=False): current_path=os.getcwd() full_path = os.path.join(current_path, LONGHORN_INSTALL_SCRIPT_PATH) + if is_stable_version is True: + cmd = ['bash', '-c', f'IS_INSTALL_STABLE_VERSION=true {full_path}'] + else: + cmd = ['bash', full_path] + try: - output = subprocess.check_output(['bash', full_path], timeout=LONGHORN_INSTALL_TIMEOUT) + output = subprocess.check_output(cmd, timeout=LONGHORN_INSTALL_TIMEOUT) logging(output) except subprocess.CalledProcessError as e: - logging(f"Error: {e.stderr}") + logging(f"Command failed with exit code {e.returncode}") + logging(f"stdout: {e.output}") + logging(f"stderr: {e.stderr}") + raise except subprocess.TimeoutExpired as e: logging(f"Command timed out after {e.timeout} seconds") + logging(f"stdout: {e.output}") + raise diff --git a/e2e/libs/longhorn_deploy/longhorn_deploy.py b/e2e/libs/longhorn_deploy/longhorn_deploy.py index 2023e7d73d..47fed5fadb 100644 --- a/e2e/libs/longhorn_deploy/longhorn_deploy.py +++ b/e2e/libs/longhorn_deploy/longhorn_deploy.py @@ -14,11 +14,11 @@ def __init__(self): elif self._method == "helm": self.longhorn = LonghornHelmChart() - def uninstall(self): - return self.longhorn.uninstall() + def uninstall(self, is_stable_version=False): + return self.longhorn.uninstall(is_stable_version) def check_longhorn_crd_removed(self): return self.longhorn.check_longhorn_crd_removed() - def install(self): - return self.longhorn.install() + def install(self, is_stable_version=False): + return self.longhorn.install(is_stable_version) diff --git a/e2e/libs/longhorn_deploy/longhorn_helm_chart.py b/e2e/libs/longhorn_deploy/longhorn_helm_chart.py index 47f3cd345e..67193bc1ee 100644 --- a/e2e/libs/longhorn_deploy/longhorn_helm_chart.py +++ b/e2e/libs/longhorn_deploy/longhorn_helm_chart.py @@ -8,7 +8,7 @@ class LonghornHelmChart(Base): - def uninstall(self): + def uninstall(self, is_stable_version=False): control_plane_nodes = Node.list_node_names_by_role(self, role="control-plane") control_plane_node = control_plane_nodes[0] @@ -19,5 +19,5 @@ def uninstall(self): k8s.delete_namespace(namespace=LONGHORN_NAMESPACE) k8s.wait_namespace_terminated(namespace=LONGHORN_NAMESPACE) - def install(self): - self.install_longhorn() + def install(self, is_stable_version=False): + self.install_longhorn(is_stable_version) diff --git a/e2e/libs/longhorn_deploy/longhorn_kubectl.py b/e2e/libs/longhorn_deploy/longhorn_kubectl.py index 960088ed99..d915bae2e7 100644 --- a/e2e/libs/longhorn_deploy/longhorn_kubectl.py +++ b/e2e/libs/longhorn_deploy/longhorn_kubectl.py @@ -9,8 +9,11 @@ class LonghornKubectl(Base): - def uninstall(self): - longhorn_branch = os.getenv("LONGHORN_REPO_BRANCH") + def uninstall(self, is_stable_version=False): + env_var = "LONGHORN_STABLE_VERSION" if is_stable_version else "LONGHORN_REPO_BRANCH" + longhorn_branch = os.getenv(env_var) + if not longhorn_branch: + raise ValueError(f"Required environment variable {env_var} is not set") control_plane_nodes = Node.list_node_names_by_role(self, role="control-plane") control_plane_node = control_plane_nodes[0] @@ -30,5 +33,5 @@ def uninstall(self): assert res, "delete uninstallation components failed" k8s.wait_namespace_terminated(namespace=LONGHORN_NAMESPACE) - def install(self): - self.install_longhorn() + def install(self, is_stable_version=False): + self.install_longhorn(is_stable_version) diff --git a/e2e/libs/volume/base.py b/e2e/libs/volume/base.py index ef94d19b30..c58489e866 100644 --- 
a/e2e/libs/volume/base.py +++ b/e2e/libs/volume/base.py @@ -92,6 +92,10 @@ def wait_for_volume_migration_completed(self, volume_name, node_name): def wait_for_volume_restoration_completed(self, volume_name, backup_name): return NotImplemented + @abstractmethod + def wait_for_volume_restoration_start(self, volume_name, backup_name): + return NotImplemented + @abstractmethod def get_endpoint(self, volume_name): return NotImplemented diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 349e9a8a82..84acc0fab9 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -262,6 +262,9 @@ def is_replica_running(self, volume_name, node_name, is_running): def get_replica_name_on_node(self, volume_name, node_name): return Rest().get_replica_name_on_node(volume_name, node_name) + def wait_for_replica_count(self, volume_name, node_name, replica_count): + return Rest().wait_for_replica_count(volume_name, node_name, replica_count) + def wait_for_volume_keep_in_state(self, volume_name, desired_state): self.wait_for_volume_state(volume_name, desired_state) @@ -355,6 +358,30 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name): time.sleep(self.retry_interval) assert updated + def wait_for_volume_restoration_start(self, volume_name, backup_name, + progress=0): + started = False + for i in range(self.retry_count): + try: + engines = self.engine.get_engines(volume_name) + for engine in engines: + for status in engine['status']['restoreStatus'].values(): + if status['state'] == "in_progress" and status['progress'] > progress: + started = True + break + # Sometime the restore time is pretty short + # and the test may not be able to catch the intermediate status. + if engine['status']['lastRestoredBackup'] == backup_name: + started = True + if started: + break + if started: + break + except Exception as e: + logging(f"Getting volume {volume_name} engines error: {e}") + time.sleep(self.retry_interval) + assert started + def wait_for_volume_expand_to_size(self, volume_name, expected_size): engine = None engine_current_size = 0 diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py index 3d6a4225a3..b048f34db2 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -80,6 +80,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name): def wait_for_volume_restoration_completed(self, volume_name): return NotImplemented + def wait_for_volume_restoration_start(self, volume_name): + return NotImplemented + def get_endpoint(self, volume_name): endpoint = "" v = self.get(volume_name) @@ -231,6 +234,23 @@ def get_replica_name_on_node(self, volume_name, node_name): if r.hostId == node_name: return r.name + def wait_for_replica_count(self, volume_name, node_name, replica_count): + for i in range(self.retry_count): + running_replica_count = 0 + volume = get_longhorn_client().by_id_volume(volume_name) + for r in volume.replicas: + if node_name and r.hostId == node_name and r.running: + running_replica_count += 1 + elif not node_name and r.running: + running_replica_count += 1 + logging(f"Waiting for {replica_count if replica_count else ''} replicas for volume {volume_name} running on {node_name if node_name else 'nodes'}, currently it's {running_replica_count} ... 
({i})") + if replica_count and running_replica_count == int(replica_count): + break + elif not replica_count and running_replica_count: + break + time.sleep(self.retry_interval) + return running_replica_count + def wait_for_replica_rebuilding_complete(self, volume_name, node_name=None): completed = False for i in range(self.retry_count): diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py index b039545a45..578480c22c 100644 --- a/e2e/libs/volume/volume.py +++ b/e2e/libs/volume/volume.py @@ -78,6 +78,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name): def wait_for_volume_restoration_completed(self, volume_name, backup_name): self.volume.wait_for_volume_restoration_completed(volume_name, backup_name) + def wait_for_volume_restoration_start(self, volume_name, backup_name): + self.volume.wait_for_volume_restoration_start(volume_name, backup_name) + def wait_for_volume_expand_to_size(self, volume_name, size): return self.volume.wait_for_volume_expand_to_size(volume_name, size) @@ -125,6 +128,9 @@ def wait_for_replica_running(self, volume_name, node_name): def get_replica_name_on_node(self, volume_name, node_name): return self.volume.get_replica_name_on_node(volume_name, node_name) + def wait_for_replica_count(self, volume_name, node_name, replica_count): + return self.volume.wait_for_replica_count(volume_name, node_name, replica_count) + def wait_for_replica_rebuilding_complete(self, volume_name, node_name=None): return self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name) diff --git a/e2e/requirements.txt b/e2e/requirements.txt index 099d7c8efd..ed4bf78099 100644 --- a/e2e/requirements.txt +++ b/e2e/requirements.txt @@ -4,6 +4,6 @@ directio==1.3 flake8 kubernetes==27.2.0 requests==2.32.3 -boto3==1.35.57 +boto3==1.35.71 pyyaml==6.0.2 minio==5.0.10 diff --git a/e2e/tests/negative/component_resilience.robot b/e2e/tests/negative/component_resilience.robot index 4c5cc50596..fa45633760 100644 --- a/e2e/tests/negative/component_resilience.robot +++ b/e2e/tests/negative/component_resilience.robot @@ -174,8 +174,8 @@ Test Longhorn dynamic provisioned RWX volume recovery And Wait until volume of deployment 0 replica rebuilding started on replica node Then Delete instance-manager of deployment 0 volume and wait for recover - When Delete sharemanager of deployment 0 and wait for recreation - And Wait for sharemanager of deployment 0 running + When Delete sharemanager pod of deployment 0 and wait for recreation + And Wait for sharemanager pod of deployment 0 running And Wait for deployment 0 pods stable And Check deployment 0 data in file data.txt is intact END diff --git a/e2e/tests/negative/pull_backup_from_another_longhorn.robot b/e2e/tests/negative/pull_backup_from_another_longhorn.robot new file mode 100644 index 0000000000..819350ad68 --- /dev/null +++ b/e2e/tests/negative/pull_backup_from_another_longhorn.robot @@ -0,0 +1,98 @@ +*** Settings *** +Documentation Uninstallation Checks + +Test Tags negative + +Resource ../keywords/common.resource +Resource ../keywords/setting.resource +Resource ../keywords/volume.resource +Resource ../keywords/persistentvolume.resource +Resource ../keywords/persistentvolumeclaim.resource +Resource ../keywords/workload.resource +Resource ../keywords/backup.resource +Resource ../keywords/snapshot.resource +Resource ../keywords/backupstore.resource +Resource ../keywords/longhorn.resource +Library ../libs/keywords/setting_keywords.py + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** 
Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 +${DATA_ENGINE} v1 + +*** Test Cases *** +Pull backup created by another Longhorn system + [Documentation] Pull backup created by another Longhorn system + ... 1. Install test version of Longhorn. + ... 2. Create volume, write data, and take backup. + ... 3. Uninstall Longhorn. + ... 4. Install test version of Longhorn. + ... 5. Restore the backup create in step 2 and verify the data. + ... 6. Uninstall Longhorn. + ... 7. Install previous version of Longhorn. + ... 8. Create volume, write data, and take backup. + ... 9. Uninstall Longhorn. + ... 10. Install test version of Longhorn. + ... 11. Restore the backup create in step 8 and verify the data. + ... + ... Important + ... - This test case need have set environment variable manually first if not run on Jenkins + ... - LONGHORN_INSTALL_METHOD : helm or manifest + ... - LONGHORN_REPO_BRANCH (ex:master) + ... - CUSTOM_LONGHORN_MANAGER_IMAGE (if not using master-head) + ... - CUSTOM_LONGHORN_ENGINE_IMAGE (if not using master-head) + ... - CUSTOM_LONGHORN_INSTANCE_MANAGER_IMAGE (if not using master-head) + ... - CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE (if not using master-head) + ... - CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE (if not using master-head) + ... - LONGHORN_STABLE_VERSION (ex:v1.6.3) + Given Set setting deleting-confirmation-flag to true + And Create volume 0 with dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data 0 300 MB to volume 0 + When Create backup 0 for volume 0 + Then Verify backup list contains no error for volume 0 + And Verify backup list contains backup 0 of volume 0 + Then Uninstall Longhorn + And Check Longhorn CRD removed + + # Install current version then pull backup and verify data + Then Install Longhorn + And Set setting deleting-confirmation-flag to true + And Set backupstore + And Check backup synced from backupstore + And Create volume 1 from backup 0 in another cluster + And Wait for volume 1 detached + And Attach volume 1 + And Wait for volume 1 healthy + Then Check volume 1 data is backup 0 created in another cluster + Then Uninstall Longhorn + And Check Longhorn CRD removed + + # Install previous version and create backup + Then Install Longhorn stable version + And Set setting deleting-confirmation-flag to true + And Set backupstore + And Create volume 2 with dataEngine=${DATA_ENGINE} + And Attach volume 2 + And Wait for volume 2 healthy + And Write data 1 300 MB to volume 2 + When Create backup 1 for volume 2 + Then Verify backup list contains no error for volume 2 + And Verify backup list contains backup 1 of volume 2 + Then Uninstall Longhorn stable version + And Check Longhorn CRD removed + + # Install current version then pull backup and verify data + Then Install Longhorn + And Set backupstore + And Check backup synced from backupstore + And Create volume 3 from backup 1 in another cluster + And Wait for volume 3 detached + And Attach volume 3 + And Wait for volume 3 healthy + Then Check volume 3 data is backup 1 created in another cluster diff --git a/e2e/tests/negative/test_backup_listing.robot b/e2e/tests/negative/test_backup_listing.robot index 0d292c7141..6e0c921b25 100644 --- a/e2e/tests/negative/test_backup_listing.robot +++ b/e2e/tests/negative/test_backup_listing.robot @@ -101,11 +101,6 @@ Create pod ${pod_id} mount ${size} GB volume ${volume_id} Create pod ${pod_id} using volume ${volume_id} Wait for pod ${pod_id} running -Volume ${volume_id} backup ${backup_id} should be 
able to create - Create backup ${backup_id} for volume ${volume_id} - Verify backup list contains no error for volume ${volume_id} - Verify backup list contains backup ${backup_id} of volume ${volume_id} - Write data to file in deployment 0 Write 100 MB data to file data in deployment 0 diff --git a/e2e/tests/negative/test_dr_volume_node_reboot.robot b/e2e/tests/negative/test_dr_volume_node_reboot.robot new file mode 100644 index 0000000000..2a8d2c7f30 --- /dev/null +++ b/e2e/tests/negative/test_dr_volume_node_reboot.robot @@ -0,0 +1,91 @@ +*** Settings *** +Documentation Test DR volume node reboot +... https://github.com/longhorn/longhorn/issues/8425 + +Test Tags manual longhorn-8425 + +Resource ../keywords/common.resource +Resource ../keywords/deployment.resource +Resource ../keywords/workload.resource +Resource ../keywords/longhorn.resource +Resource ../keywords/host.resource +Resource ../keywords/storageclass.resource +Resource ../keywords/persistentvolumeclaim.resource +Resource ../keywords/recurringjob.resource +Resource ../keywords/statefulset.resource +Resource ../keywords/volume.resource +Resource ../keywords/snapshot.resource +Resource ../keywords/backup.resource + + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${RETRY_COUNT} 400 +${LOOP_COUNT} 5 +${RETRY_INTERVAL} 1 +${DATA_ENGINE} v1 + +*** Test Cases *** +DR Volume Node Reboot During Initial Restoration + [Tags] manual longhorn-8425 + [Documentation] Test DR volume node reboot during initial restoration + ... Create a pod with Longhorn volume. + ... Write data to the volume and get the md5sum. + ... Create the 1st backup for the volume. + ... Create a DR volume from the backup. + ... Wait for the DR volume starting the initial restore. + ... Then reboot the DR volume attached node immediately. + ... Wait for the DR volume detached then reattached. + ... Wait for the DR volume restore complete after the reattachment. + ... Activate the DR volume and check the data md5sum. + Given Create volume 0 with dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data 0 to volume 0 + Then Volume 0 backup 0 should be able to create + FOR ${i} IN RANGE ${LOOP_COUNT} + Then Create DR volume 1 from backup 0 dataEngine=${DATA_ENGINE} + And Wait for volume 1 restoration from backup 0 start + Then Reboot volume 1 volume node + And Wait for volume 1 restoration from backup 0 completed + When Activate DR volume 1 + And Attach volume 1 + And Wait for volume 1 healthy + Then Check volume 1 data is backup 0 + Then Detach volume 1 + And Delete volume 1 + END + +DR Volume Node Reboot During Incremental Restoration + [Tags] manual longhorn-8425 + [Documentation] Test DR volume node reboot During Incremental Restoration + ... Create a pod with Longhorn volume. + ... Write data to the volume and get the md5sum. + ... Create the 1st backup for the volume. + ... Create a DR volume from the backup. + ... Wait for the DR volume to complete the initial restore. + ... Write more data to the original volume and get the md5sum. + ... Create the 2nd backup for the volume. + ... Wait for the DR volume incremental restore getting triggered. + ... Then reboot the DR volume attached node immediately. + ... Wait for the DR volume detached then reattached. + ... Wait for the DR volume restore complete after the reattachment. + ... Activate the DR volume and check the data md5sum. 
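For reference, a minimal Python sketch of the polling behind the "Wait for volume ... restoration from backup ... start" step used below, mirroring the `wait_for_volume_restoration_start` helper added to `e2e/libs/volume/crd.py` in this change. The `get_engines` callable, the retry values, and the dict layout are simplified assumptions here, not the exact library code:

```python
import time

# Assumption: get_engines(volume_name) returns the volume's engine CRs as dicts,
# in the same shape the crd.py helper reads; retry values are illustrative.
def wait_for_restoration_start(get_engines, volume_name, backup_name,
                               retry_count=150, retry_interval=2):
    for _ in range(retry_count):
        try:
            for engine in get_engines(volume_name):
                status = engine["status"]
                restores = (status.get("restoreStatus") or {}).values()
                # Restoration has started once any entry reports in_progress
                # with non-zero progress.
                if any(s.get("state") == "in_progress" and s.get("progress", 0) > 0
                       for s in restores):
                    return
                # A short restore can finish before polling catches the
                # intermediate state, so accept a completed restore as well.
                if status.get("lastRestoredBackup") == backup_name:
                    return
        except Exception as e:
            print(f"error reading engines of {volume_name}, retrying: {e}")
        time.sleep(retry_interval)
    raise AssertionError(f"restoration from {backup_name} never started on {volume_name}")
```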
+ Given Create volume 0 with dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data 0 to volume 0 + Then Volume 0 backup 0 should be able to create + Then Create DR volume 1 from backup 0 dataEngine=${DATA_ENGINE} + And Wait for volume 1 restoration from backup 0 completed + Then Write data 1 to volume 0 + And Volume 0 backup 1 should be able to create + And Wait for volume 1 restoration from backup 1 start + Then Reboot volume 1 volume node + Then Wait for volume 1 restoration from backup 1 completed + And Activate DR volume 1 + And Attach volume 1 + And Wait for volume 1 healthy + And Check volume 1 data is backup 1 \ No newline at end of file diff --git a/e2e/tests/regression/test_scheduling.robot b/e2e/tests/regression/test_scheduling.robot index d5375b5290..b9520edd8a 100644 --- a/e2e/tests/regression/test_scheduling.robot +++ b/e2e/tests/regression/test_scheduling.robot @@ -100,3 +100,44 @@ Test Replica Auto Balance Disk In Pressure And Check statefulset 0 data in file data.bin is intact And Check statefulset 1 data in file data.bin is intact And Check statefulset 2 data in file data.bin is intact + +Test Replica Auto Balance Node Least Effort + [Tags] coretest + [Documentation] Scenario: replica auto-balance nodes with `least_effort` + Given Set setting replica-soft-anti-affinity to true + And Set setting replica-auto-balance to least-effort + + When Disable node 1 scheduling + And Disable node 2 scheduling + And Create volume 0 with numberOfReplicas=6 dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data to volume 0 + Then Volume 0 should have 6 replicas running on node 0 + And Volume 0 should have 0 replicas running on node 1 + And Volume 0 should have 0 replicas running on node 2 + + When Enable node 1 scheduling + # wait for auto balance + Then Volume 0 should have replicas running on node 1 + And Volume 0 should have 6 replicas running + # loop 3 times with 5-second wait and compare the replica count to: + # ensure no additional scheduling occurs + # the replica count remains unchanged + And Volume 0 should have 5 replicas running on node 0 and no additional scheduling occurs + And Volume 0 should have 1 replicas running on node 1 and no additional scheduling occurs + And Volume 0 should have 0 replicas running on node 2 and no additional scheduling occurs + + When Enable node 2 scheduling + # wait for auto balance + Then Volume 0 should have replicas running on node 2 + And Volume 0 should have 6 replicas running + # loop 3 times with 5-second wait and compare the replica count to: + # ensure no additional scheduling occurs + # the replica count remains unchanged + And Volume 0 should have 4 replicas running on node 0 and no additional scheduling occurs + And Volume 0 should have 1 replicas running on node 1 and no additional scheduling occurs + And Volume 0 should have 1 replicas running on node 2 and no additional scheduling occurs + + And Wait for volume 0 healthy + And Check volume 0 data is intact diff --git a/e2e/utilities/longhorn-install.sh b/e2e/utilities/longhorn-install.sh old mode 100644 new mode 100755 index 9cd0d428c2..eafdd6beb0 --- a/e2e/utilities/longhorn-install.sh +++ b/e2e/utilities/longhorn-install.sh @@ -13,6 +13,8 @@ source ../pipelines/utilities/longhorn_manifest.sh # create and clean tmpdir TMPDIR="/tmp/longhorn" LONGHORN_NAMESPACE="longhorn-system" +LONGHORN_REPO_DIR="${TMPDIR}/longhorn" 
+LONGHORN_REPO_URI=${LONGHORN_REPO_URI:-"https://github.com/longhorn/longhorn.git"} mkdir -p ${TMPDIR} rm -rf "${TMPDIR}/" @@ -23,19 +25,48 @@ install_longhorn_by_chart(){ wait_longhorn_status_running } +install_longhorn_stable_by_chart(){ + git clone --single-branch \ + --branch "${LONGHORN_STABLE_VERSION}" \ + "${LONGHORN_REPO_URI}" \ + "${LONGHORN_REPO_DIR}" + helm upgrade --install longhorn "${LONGHORN_REPO_DIR}/chart/" --namespace "${LONGHORN_NAMESPACE}" + wait_longhorn_status_running +} + +install_longhorn_stable_by_manifest(){ + LONGHORN_STABLE_VERSION=${LONGHORN_STABLE_VERSION} + LONGHORN_STABLE_MANIFEST_URL="https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_STABLE_VERSION}/deploy/longhorn.yaml" + kubectl apply -f "${LONGHORN_STABLE_MANIFEST_URL}" + wait_longhorn_status_running +} + install_longhorn(){ create_longhorn_namespace install_backupstores if [[ "${LONGHORN_INSTALL_METHOD}" == "helm" ]]; then - LONGHORN_REPO_URI=${LONGHORN_REPO_URI:-"https://github.com/longhorn/longhorn.git"} - LONGHORN_REPO_DIR="${TMPDIR}/longhorn" install_longhorn_by_chart elif [[ "${LONGHORN_INSTALL_METHOD}" == "manifest" ]]; then generate_longhorn_yaml_manifest "${TF_VAR_tf_workspace}" install_longhorn_by_manifest "${TF_VAR_tf_workspace}/longhorn.yaml" fi setup_longhorn_ui_nodeport +} +install_longhorn_stable_version(){ + create_longhorn_namespace + install_backupstores + if [[ "${LONGHORN_INSTALL_METHOD}" == "helm" ]]; then + install_longhorn_stable_by_chart + elif [[ "${LONGHORN_INSTALL_METHOD}" == "manifest" ]]; then + install_longhorn_stable_by_manifest + fi + setup_longhorn_ui_nodeport } -install_longhorn +IS_INSTALL_STABLE_VERSION="${IS_INSTALL_STABLE_VERSION:-false}" +if [[ "${IS_INSTALL_STABLE_VERSION}" == "true" ]]; then + install_longhorn_stable_version +else + install_longhorn +fi diff --git a/manager/integration/README.md b/manager/integration/README.md index 77fcef7cf5..3cdd7c3677 100644 --- a/manager/integration/README.md +++ b/manager/integration/README.md @@ -16,10 +16,14 @@ Requirement: 6. Make sure `nfs-common` or equivalent has been installed on the node to allow the NFS client to work. Run the test: -1. Deploy all backupstore servers(including `NFS` server and `Minio` as s3 server) for test purposes. +1. Deploy all backupstore servers(including `NFS` server and `Minio` as s3 server `CIFS` and `Azurite` server) for test purposes. + + For Azurite, there are some manual steps need to be done after manifest deployed(https://github.com/longhorn/longhorn-tests/wiki/Setup-Azurite-Backupstore-For-Testing). ``` -kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml \ - -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml +kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/cifs-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/azurite-backupstore.yaml ``` 2. Deploy the test script to the Kubernetes cluster. 
``` diff --git a/manager/integration/deploy/backupstores/minio-backupstore.yaml b/manager/integration/deploy/backupstores/minio-backupstore.yaml new file mode 100644 index 0000000000..398bc6a765 --- /dev/null +++ b/manager/integration/deploy/backupstores/minio-backupstore.yaml @@ -0,0 +1,112 @@ +apiVersion: v1 +kind: Secret +metadata: + name: minio-secret + namespace: default +type: Opaque +data: + AWS_ACCESS_KEY_ID: bG9uZ2hvcm4tdGVzdC1hY2Nlc3Mta2V5 # longhorn-test-access-key + AWS_SECRET_ACCESS_KEY: bG9uZ2hvcm4tdGVzdC1zZWNyZXQta2V5 # longhorn-test-secret-key + AWS_ENDPOINTS: aHR0cHM6Ly9taW5pby1zZXJ2aWNlLmRlZmF1bHQ6OTAwMA== # https://minio-service.default:9000 + AWS_CERT: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQU1kbzQycGhUZXlrMTcvYkxyWjVZRHN3RFFZSktvWklodmNOQVFFTEJRQXcKR2pFWU1CWUdBMVVFQ2hNUFRHOXVaMmh2Y200Z0xTQlVaWE4wTUNBWERUSXdNRFF5TnpJek1EQXhNVm9ZRHpJeApNakF3TkRBek1qTXdNREV4V2pBYU1SZ3dGZ1lEVlFRS0V3OU1iMjVuYUc5eWJpQXRJRlJsYzNRd2dnRWlNQTBHCkNTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWHpVdXJnUFpEZ3pUM0RZdWFlYmdld3Fvd2RlQUQKODRWWWF6ZlN1USs3K21Oa2lpUVBvelVVMmZvUWFGL1BxekJiUW1lZ29hT3l5NVhqM1VFeG1GcmV0eDBaRjVOVgpKTi85ZWFJNWRXRk9teHhpMElPUGI2T0RpbE1qcXVEbUVPSXljdjRTaCsvSWo5Zk1nS0tXUDdJZGxDNUJPeThkCncwOVdkckxxaE9WY3BKamNxYjN6K3hISHd5Q05YeGhoRm9tb2xQVnpJbnlUUEJTZkRuSDBuS0lHUXl2bGhCMGsKVHBHSzYxc2prZnFTK3hpNTlJeHVrbHZIRXNQcjFXblRzYU9oaVh6N3lQSlorcTNBMWZoVzBVa1JaRFlnWnNFbQovZ05KM3JwOFhZdURna2kzZ0UrOElXQWRBWHExeWhqRDdSSkI4VFNJYTV0SGpKUUtqZ0NlSG5HekFnTUJBQUdqCmF6QnBNQTRHQTFVZER3RUIvd1FFQXdJQ3BEQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBVEFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTURFR0ExVWRFUVFxTUNpQ0NXeHZZMkZzYUc5emRJSVZiV2x1YVc4dGMyVnlkbWxqWlM1awpaV1poZFd4MGh3Ui9BQUFCTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFDbUZMMzlNSHVZMzFhMTFEajRwMjVjCnFQRUM0RHZJUWozTk9kU0dWMmQrZjZzZ3pGejFXTDhWcnF2QjFCMVM2cjRKYjJQRXVJQkQ4NFlwVXJIT1JNU2MKd3ViTEppSEtEa0Jmb2U5QWI1cC9VakpyS0tuajM0RGx2c1cvR3AwWTZYc1BWaVdpVWorb1JLbUdWSTI0Q0JIdgpnK0JtVzNDeU5RR1RLajk0eE02czNBV2xHRW95YXFXUGU1eHllVWUzZjFBWkY5N3RDaklKUmVWbENtaENGK0JtCmFUY1RSUWN3cVdvQ3AwYmJZcHlERFlwUmxxOEdQbElFOW8yWjZBc05mTHJVcGFtZ3FYMmtYa2gxa3lzSlEralAKelFadHJSMG1tdHVyM0RuRW0yYmk0TktIQVFIcFc5TXUxNkdRakUxTmJYcVF0VEI4OGpLNzZjdEg5MzRDYWw2VgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0t + AWS_CERT_KEY: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRRFh6VXVyZ1BaRGd6VDMKRFl1YWViZ2V3cW93ZGVBRDg0VllhemZTdVErNyttTmtpaVFQb3pVVTJmb1FhRi9QcXpCYlFtZWdvYU95eTVYagozVUV4bUZyZXR4MFpGNU5WSk4vOWVhSTVkV0ZPbXh4aTBJT1BiNk9EaWxNanF1RG1FT0l5Y3Y0U2grL0lqOWZNCmdLS1dQN0lkbEM1Qk95OGR3MDlXZHJMcWhPVmNwSmpjcWIzeit4SEh3eUNOWHhoaEZvbW9sUFZ6SW55VFBCU2YKRG5IMG5LSUdReXZsaEIwa1RwR0s2MXNqa2ZxUyt4aTU5SXh1a2x2SEVzUHIxV25Uc2FPaGlYejd5UEpaK3EzQQoxZmhXMFVrUlpEWWdac0VtL2dOSjNycDhYWXVEZ2tpM2dFKzhJV0FkQVhxMXloakQ3UkpCOFRTSWE1dEhqSlFLCmpnQ2VIbkd6QWdNQkFBRUNnZ0VBZlVyQ1hrYTN0Q2JmZjNpcnp2cFFmZnVEbURNMzV0TmlYaDJTQVpSVW9FMFYKbSsvZ1UvdnIrN2s2eUgvdzhMOXhpZXFhQTljVkZkL0JuTlIrMzI2WGc2dEpCNko2ZGZxODJZdmZOZ0VDaUFMaQpqalNGemFlQmhnT3ZsWXZHbTR5OTU1Q0FGdjQ1cDNac1VsMTFDRXJlL1BGbGtaWHRHeGlrWFl6NC85UTgzblhZCnM2eDdPYTgyUjdwT2lraWh3Q0FvVTU3Rjc4ZWFKOG1xTmkwRlF2bHlxSk9QMTFCbVp4dm54ZU11S2poQjlPTnAKTFNwMWpzZXk5bDZNR2pVbjBGTG53RHZkVWRiK0ZlUEkxTjdWYUNBd3hJK3JHa3JTWkhnekhWWE92VUpON2t2QQpqNUZPNW9uNGgvK3hXbkYzM3lxZ0VvWWZ0MFFJL2pXS2NOV1d1a2pCd1FLQmdRRGVFNlJGRUpsT2Q1aVcxeW1qCm45RENnczVFbXFtRXN3WU95bkN3U2RhK1lNNnZVYmlac1k4WW9wMVRmVWN4cUh2NkFQWGpVd2NBUG1QVE9KRW8KMlJtS0xTYkhsTnc4bFNOMWJsWDBEL3Mzamc1R3VlVW9nbW5TVnhMa0h1OFhKR0o3VzFReEUzZG9IUHRrcTNpagpoa09QTnJpZFM0UmxqNTJwYkhscjUvQzRjUUtCZ1FENHhFYmpuck1heFV2b0xxVTRvT2xiOVc5UytSUllTc0cxCmxJUmgzNzZTV0ZuTTlSdGoyMTI0M1hkaE4zUFBtSTNNeiswYjdyMnZSUi9LMS9Cc1JUQnlrTi9kbkVuNVUxQkEKYm90cGZIS1Jvc1FUR1hIQkEvM0JrNC9qOWplU3RmVXgzZ2x3eUI0L2hORy9KM1ZVV2FXeURTRm5qZFEvcGJsRwp6VWlsSVBmK1l3S0JnUUNwMkdYYmVJMTN5TnBJQ3psS2JqRlFncEJWUWVDQ29CVHkvUHRncUtoM3BEeVBNN1kyCnZla09VMWgyQVN1UkhDWHRtQXgzRndvVXNxTFFhY1FEZEw4bXdjK1Y5eERWdU02TXdwMDBjNENVQmE1L2d5OXoKWXdLaUgzeFFRaVJrRTZ6S1laZ3JqSkxYYXNzT1BHS2cxbEFYV1NlckRaV3R3MEEyMHNLdXQ0NlEwUUtCZ0hGZQpxZHZVR0ZXcjhvTDJ0dzlPcmVyZHVJVTh4RnZVZmVFdHRRTVJ2N3pjRE5qT0gxUnJ4Wk9aUW0ySW92dkp6MTIyCnFKMWhPUXJtV3EzTHFXTCtTU3o4L3pqMG4vWERWVUIzNElzTFR2ODJDVnVXN2ZPRHlTSnVDRlpnZ0VVWkxZd3oKWDJRSm4xZGRSV1Z6S3hKczVJbDNXSERqL3dXZWxnaEJSOGtSZEZOM0FvR0FJNldDdjJQQ1lUS1ZZNjAwOFYwbgpyTDQ3YTlPanZ0Yy81S2ZxSjFpMkpKTUgyQi9jbU1WRSs4M2dpODFIU1FqMWErNnBjektmQVppZWcwRk9nL015ClB6VlZRYmpKTnY0QzM5KzdxSDg1WGdZTXZhcTJ0aDFEZWUvQ3NsMlM4QlV0cW5mc0VuMUYwcWhlWUJZb2RibHAKV3RUaE5oRi9oRVhzbkJROURyWkJKT1U9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +# same secret for longhorn-system namespace +apiVersion: v1 +kind: Secret +metadata: + name: minio-secret + namespace: longhorn-system +type: Opaque +data: + AWS_ACCESS_KEY_ID: bG9uZ2hvcm4tdGVzdC1hY2Nlc3Mta2V5 # longhorn-test-access-key + AWS_SECRET_ACCESS_KEY: bG9uZ2hvcm4tdGVzdC1zZWNyZXQta2V5 # longhorn-test-secret-key + AWS_ENDPOINTS: aHR0cHM6Ly9taW5pby1zZXJ2aWNlLmRlZmF1bHQ6OTAwMA== # https://minio-service.default:9000 + AWS_CERT: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQU1kbzQycGhUZXlrMTcvYkxyWjVZRHN3RFFZSktvWklodmNOQVFFTEJRQXcKR2pFWU1CWUdBMVVFQ2hNUFRHOXVaMmh2Y200Z0xTQlVaWE4wTUNBWERUSXdNRFF5TnpJek1EQXhNVm9ZRHpJeApNakF3TkRBek1qTXdNREV4V2pBYU1SZ3dGZ1lEVlFRS0V3OU1iMjVuYUc5eWJpQXRJRlJsYzNRd2dnRWlNQTBHCkNTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWHpVdXJnUFpEZ3pUM0RZdWFlYmdld3Fvd2RlQUQKODRWWWF6ZlN1USs3K21Oa2lpUVBvelVVMmZvUWFGL1BxekJiUW1lZ29hT3l5NVhqM1VFeG1GcmV0eDBaRjVOVgpKTi85ZWFJNWRXRk9teHhpMElPUGI2T0RpbE1qcXVEbUVPSXljdjRTaCsvSWo5Zk1nS0tXUDdJZGxDNUJPeThkCncwOVdkckxxaE9WY3BKamNxYjN6K3hISHd5Q05YeGhoRm9tb2xQVnpJbnlUUEJTZkRuSDBuS0lHUXl2bGhCMGsKVHBHSzYxc2prZnFTK3hpNTlJeHVrbHZIRXNQcjFXblRzYU9oaVh6N3lQSlorcTNBMWZoVzBVa1JaRFlnWnNFbQovZ05KM3JwOFhZdURna2kzZ0UrOElXQWRBWHExeWhqRDdSSkI4VFNJYTV0SGpKUUtqZ0NlSG5HekFnTUJBQUdqCmF6QnBNQTRHQTFVZER3RUIvd1FFQXdJQ3BEQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBVEFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTURFR0ExVWRFUVFxTUNpQ0NXeHZZMkZzYUc5emRJSVZiV2x1YVc4dGMyVnlkbWxqWlM1awpaV1poZFd4MGh3Ui9BQUFCTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFDbUZMMzlNSHVZMzFhMTFEajRwMjVjCnFQRUM0RHZJUWozTk9kU0dWMmQrZjZzZ3pGejFXTDhWcnF2QjFCMVM2cjRKYjJQRXVJQkQ4NFlwVXJIT1JNU2MKd3ViTEppSEtEa0Jmb2U5QWI1cC9VakpyS0tuajM0RGx2c1cvR3AwWTZYc1BWaVdpVWorb1JLbUdWSTI0Q0JIdgpnK0JtVzNDeU5RR1RLajk0eE02czNBV2xHRW95YXFXUGU1eHllVWUzZjFBWkY5N3RDaklKUmVWbENtaENGK0JtCmFUY1RSUWN3cVdvQ3AwYmJZcHlERFlwUmxxOEdQbElFOW8yWjZBc05mTHJVcGFtZ3FYMmtYa2gxa3lzSlEralAKelFadHJSMG1tdHVyM0RuRW0yYmk0TktIQVFIcFc5TXUxNkdRakUxTmJYcVF0VEI4OGpLNzZjdEg5MzRDYWw2VgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0t +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: longhorn-test-minio + namespace: default + labels: + app: longhorn-test-minio +spec: + replicas: 1 + selector: + matchLabels: + app: longhorn-test-minio + template: + metadata: + labels: + app: longhorn-test-minio + spec: + nodeSelector: + node-role.kubernetes.io/control-plane: "true" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoExecute" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoExecute" + - key: "node-role.kubernetes.io/etcd" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/etcd" + operator: "Exists" + effect: "NoExecute" + volumes: + - name: minio-volume + emptyDir: {} + - name: minio-certificates + secret: + secretName: minio-secret + items: + - key: AWS_CERT + path: public.crt + - key: AWS_CERT_KEY + path: private.key + containers: + - name: minio + image: minio/minio:RELEASE.2022-02-01T18-00-14Z + command: ["sh", "-c", "mkdir -p /storage/backupbucket && mkdir -p /root/.minio/certs && ln -s /root/certs/private.key /root/.minio/certs/private.key && ln -s /root/certs/public.crt /root/.minio/certs/public.crt && exec minio server /storage"] + env: + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-secret + key: AWS_ACCESS_KEY_ID + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secret + key: AWS_SECRET_ACCESS_KEY + ports: + - containerPort: 9000 + volumeMounts: + - name: minio-volume + mountPath: "/storage" + - name: minio-certificates + mountPath: "/root/certs" + readOnly: true +--- +apiVersion: v1 +kind: Service +metadata: + name: minio-service + namespace: default +spec: + selector: + app: longhorn-test-minio + ports: + - port: 9000 + targetPort: 9000 + protocol: TCP + 
sessionAffinity: ClientIP diff --git a/manager/integration/deploy/backupstores/nfs-backupstore.yaml b/manager/integration/deploy/backupstores/nfs-backupstore.yaml new file mode 100644 index 0000000000..548cb7b884 --- /dev/null +++ b/manager/integration/deploy/backupstores/nfs-backupstore.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: longhorn-test-nfs + namespace: default + labels: + app: longhorn-test-nfs +spec: + selector: + matchLabels: + app: longhorn-test-nfs + template: + metadata: + labels: + app: longhorn-test-nfs + spec: + nodeSelector: + node-role.kubernetes.io/control-plane: "true" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoExecute" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoExecute" + - key: "node-role.kubernetes.io/etcd" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/etcd" + operator: "Exists" + effect: "NoExecute" + volumes: + - name: nfs-volume + emptyDir: {} + containers: + - name: longhorn-test-nfs-container + image: longhornio/nfs-ganesha:latest + imagePullPolicy: Always + env: + - name: EXPORT_ID + value: "14" + - name: EXPORT_PATH + value: /opt/backupstore + - name: PSEUDO_PATH + value: /opt/backupstore + - name: NFS_DISK_IMAGE_SIZE_MB + value: "4096" + command: ["bash", "-c", "chmod 700 /opt/backupstore && /opt/start_nfs.sh | tee /var/log/ganesha.log"] + securityContext: + privileged: true + capabilities: + add: ["SYS_ADMIN", "DAC_READ_SEARCH"] + volumeMounts: + - name: nfs-volume + mountPath: "/opt/backupstore" + livenessProbe: + exec: + command: ["bash", "-c", "grep \"No export entries found\" /var/log/ganesha.log > /dev/null 2>&1 ; [ $? 
-ne 0 ]"] + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 4 +--- +kind: Service +apiVersion: v1 +metadata: + name: longhorn-test-nfs-svc + namespace: default +spec: + selector: + app: longhorn-test-nfs + clusterIP: None + ports: + - name: notnecessary + port: 1234 + targetPort: 1234 diff --git a/manager/integration/deploy/test.yaml b/manager/integration/deploy/test.yaml index eb3a44c8f5..df56d679db 100644 --- a/manager/integration/deploy/test.yaml +++ b/manager/integration/deploy/test.yaml @@ -42,7 +42,7 @@ spec: - name: LONGHORN_JUNIT_REPORT_PATH value: /tmp/test-report/longhorn-test-junit-report.xml - name: LONGHORN_BACKUPSTORES - value: "s3://backupbucket@us-east-1/backupstore$minio-secret, nfs://longhorn-test-nfs-svc.default:/opt/backupstore" + value: "s3://backupbucket@us-east-1/backupstore$minio-secret, nfs://longhorn-test-nfs-svc.default:/opt/backupstore, cifs://longhorn-test-cifs-svc.default/backupstore$cifs-secret, azblob://longhorn-test-azurite@core.windows.net/$azblob-secret" - name: LONGHORN_BACKUPSTORE_POLL_INTERVAL value: "30" - name: LONGHORN_DISK_TYPE diff --git a/manager/integration/tests/backupstore.py b/manager/integration/tests/backupstore.py index ac74615c6e..b938636d85 100644 --- a/manager/integration/tests/backupstore.py +++ b/manager/integration/tests/backupstore.py @@ -17,6 +17,8 @@ from common import cleanup_all_volumes from common import is_backupTarget_s3 from common import is_backupTarget_nfs +from common import is_backupTarget_cifs +from common import is_backupTarget_azurite from common import get_longhorn_api_client from common import delete_backup_volume from common import delete_backup_backing_image @@ -64,8 +66,12 @@ def set_random_backupstore(request, client): elif request.param == "nfs": set_backupstore_nfs(client) mount_nfs_backupstore(client) + elif request.param == "cifs": + set_backupstore_cifs(client) + elif request.param == "azblob": + set_backupstore_azurite(client) - yield + yield request.param cleanup_all_volumes(client) backupstore_cleanup(client) system_backups_cleanup(client) @@ -116,6 +122,30 @@ def set_backupstore_nfs(client): break +def set_backupstore_cifs(client): + backupstores = get_backupstore_url() + poll_interval = get_backupstore_poll_interval() + for backupstore in backupstores: + if is_backupTarget_cifs(backupstore): + backupsettings = backupstore.split("$") + set_backupstore_url(client, backupsettings[0]) + set_backupstore_credential_secret(client, backupsettings[1]) + set_backupstore_poll_interval(client, poll_interval) + break + + +def set_backupstore_azurite(client): + backupstores = get_backupstore_url() + poll_interval = get_backupstore_poll_interval() + for backupstore in backupstores: + if is_backupTarget_azurite(backupstore): + backupsettings = backupstore.split("$") + set_backupstore_url(client, backupsettings[0]) + set_backupstore_credential_secret(client, backupsettings[1]) + set_backupstore_poll_interval(client, poll_interval) + break + + def set_backupstore_url(client, url): backup_target_setting = client.by_id_setting(SETTING_BACKUP_TARGET) backup_target_setting = client.update(backup_target_setting, @@ -274,7 +304,7 @@ def backupstore_get_backup_volume_prefix(client, volume_name): return nfs_get_backup_volume_prefix(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def minio_get_backup_volume_prefix(volume_name): @@ -311,7 +341,7 @@ def backupstore_get_backup_cfg_file_path(client, volume_name, backup_name): 
return nfs_get_backup_cfg_file_path(client, volume_name, backup_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def minio_get_backup_cfg_file_path(volume_name, backup_name): @@ -334,7 +364,7 @@ def backupstore_get_volume_cfg_file_path(client, volume_name): return nfs_get_volume_cfg_file_path(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_get_volume_cfg_file_path(client, volume_name): @@ -357,7 +387,7 @@ def backupstore_get_backup_blocks_dir(client, volume_name): return nfs_get_backup_blocks_dir(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def minio_get_backup_blocks_dir(volume_name): @@ -383,7 +413,7 @@ def backupstore_create_file(client, core_api, file_path, data={}): return nfs_create_file_in_backupstore(file_path, data={}) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def mino_create_file_in_backupstore(client, core_api, file_path, data={}): # NOQA @@ -433,7 +463,7 @@ def backupstore_write_backup_cfg_file(client, core_api, volume_name, backup_name data) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_write_backup_cfg_file(client, volume_name, backup_name, data): @@ -481,7 +511,7 @@ def backupstore_delete_file(client, core_api, file_path): return nfs_delete_file_in_backupstore(file_path) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def mino_delete_file_in_backupstore(client, core_api, file_path): @@ -521,7 +551,7 @@ def backupstore_delete_backup_cfg_file(client, core_api, volume_name, backup_nam nfs_delete_backup_cfg_file(client, volume_name, backup_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_delete_backup_cfg_file(client, volume_name, backup_name): @@ -563,7 +593,7 @@ def backupstore_delete_volume_cfg_file(client, core_api, volume_name): # NOQA nfs_delete_volume_cfg_file(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_delete_volume_cfg_file(client, volume_name): @@ -632,7 +662,7 @@ def backupstore_delete_random_backup_block(client, core_api, volume_name): nfs_delete_random_backup_block(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_delete_random_backup_block(client, volume_name): @@ -681,7 +711,7 @@ def backupstore_count_backup_block_files(client, core_api, volume_name): return nfs_count_backup_block_files(client, volume_name) else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA def nfs_count_backup_block_files(client, volume_name): diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 439129e0ab..dfbda10f89 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -2536,8 +2536,9 @@ def wait_for_replica_scheduled(client, volume_name, to_nodes, assert volume.robustness == VOLUME_ROBUSTNESS_HEALTHY scheduled = 0 - unexpect_fail = expect_fail - expect_nodes = [n for 
n in to_nodes] + unexpect_fail = max(0, expect_fail) + + expect_nodes = set(to_nodes) for r in volume.replicas: try: assert r.hostId in expect_nodes @@ -2551,7 +2552,8 @@ def wait_for_replica_scheduled(client, volume_name, to_nodes, scheduled += 1 except AssertionError: - unexpect_fail -= 1 + if expect_fail >= 0: + unexpect_fail -= 1 if scheduled == expect_success and unexpect_fail == 0: break @@ -2559,9 +2561,12 @@ def wait_for_replica_scheduled(client, volume_name, to_nodes, time.sleep(RETRY_INTERVAL) assert scheduled == expect_success, f" Volume = {volume}" - assert unexpect_fail == 0, f" Volume = {volume}" - assert len(volume.replicas) == expect_success + expect_fail, \ - f" Volume = {volume}" + assert unexpect_fail == 0, f"Got {unexpect_fail} unexpected fail" + + if expect_fail >= 0: + assert len(volume.replicas) == expect_success + expect_fail, \ + f" Volume = {volume}" + return volume @@ -3895,6 +3900,14 @@ def is_backupTarget_nfs(s): return s.startswith("nfs://") +def is_backupTarget_cifs(s): + return s.startswith("cifs://") + + +def is_backupTarget_azurite(s): + return s.startswith("azblob://") + + def wait_for_backup_volume(client, vol_name, backing_image=""): for _ in range(RETRY_BACKUP_COUNTS): bv = client.by_id_backupVolume(vol_name) diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index aa531040f5..6c1f756072 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -673,6 +673,10 @@ def test_backup_block_deletion(set_random_backupstore, client, core_api, volume_ 17. Delete the backup volume 18. Cleanup the volume """ + backup_store_type = set_random_backupstore + if backup_store_type not in ["nfs", "s3"]: + pytest.skip("Skip test case because the backup store type is not supported") # NOQA + backupstore_cleanup(client) volume = create_and_check_volume(client, volume_name) @@ -1106,6 +1110,10 @@ def test_backup_volume_list(set_random_backupstore, client, core_api): # NOQA 11. delete backup volumes(1 & 2) 12. cleanup """ + backup_store_type = set_random_backupstore + if backup_store_type not in ["nfs", "s3"]: + pytest.skip("Skip test case because the backup store type is not supported") # NOQA + backupstore_cleanup(client) # create 2 volumes. @@ -1200,6 +1208,10 @@ def test_backup_metadata_deletion(set_random_backupstore, client, core_api, volu 18. verify that volume(1) has been deleted in the backupstore. 19. 
cleanup """ + backup_store_type = set_random_backupstore + if backup_store_type not in ["nfs", "s3"]: + pytest.skip("Skip test case because the backup store type is not supported") # NOQA + backupstore_cleanup(client) volume1_name = volume_name + "-1" @@ -3791,7 +3803,7 @@ def test_allow_volume_creation_with_degraded_availability_restore(set_random_bac to_nodes=[node1.name, node2.name], expect_success=2, - expect_fail=0, + expect_fail=-1, chk_vol_healthy=False, chk_replica_running=False) @@ -4392,7 +4404,7 @@ def test_backuptarget_available_during_engine_image_not_ready(client, apps_api): url = backupstore cred_secret = "" else: - raise NotImplementedError + pytest.skip("Skip test case because the backup store type is not supported") # NOQA poll_intervals = ["0", "300"] for poll_interval in poll_intervals: diff --git a/manager/integration/tests/test_ha.py b/manager/integration/tests/test_ha.py index 7deafc060d..40747344d4 100644 --- a/manager/integration/tests/test_ha.py +++ b/manager/integration/tests/test_ha.py @@ -1409,6 +1409,10 @@ def test_all_replica_restore_failure(set_random_backupstore, client, core_api, v 15. Verify the faulted volume cannot be attached to a node. 16. Verify this faulted volume can be deleted. """ + backup_store_type = set_random_backupstore + if backup_store_type not in ["nfs", "s3"]: + pytest.skip("Skip test case because the backup store type is not supported") # NOQA + auto_salvage_setting = client.by_id_setting(SETTING_AUTO_SALVAGE) assert auto_salvage_setting.name == SETTING_AUTO_SALVAGE assert auto_salvage_setting.value == "true" diff --git a/pipelines/e2e/scripts/longhorn-setup.sh b/pipelines/e2e/scripts/longhorn-setup.sh index dac0b3ae9c..7f7f317bd6 100755 --- a/pipelines/e2e/scripts/longhorn-setup.sh +++ b/pipelines/e2e/scripts/longhorn-setup.sh @@ -8,6 +8,7 @@ source pipelines/utilities/install_csi_snapshotter.sh source pipelines/utilities/create_aws_secret.sh source pipelines/utilities/create_harvester_secret.sh source pipelines/utilities/install_backupstores.sh +source pipelines/utilities/install_metrics_server.sh source pipelines/utilities/create_longhorn_namespace.sh source pipelines/utilities/longhorn_manifest.sh source pipelines/utilities/longhorn_ui.sh @@ -60,6 +61,10 @@ main(){ longhornctl_check fi + if [[ "${DISTRO}" == "talos" ]]; then + install_metrics_server + fi + generate_longhorn_yaml_manifest install_longhorn_by_manifest diff --git a/pipelines/gke/terraform/main.tf b/pipelines/gke/terraform/main.tf index 471028798e..519d676777 100644 --- a/pipelines/gke/terraform/main.tf +++ b/pipelines/gke/terraform/main.tf @@ -42,7 +42,7 @@ resource "google_container_cluster" "cluster" { network = google_compute_network.vpc_network.id subnetwork = google_compute_subnetwork.subnetwork.id location = data.google_compute_zones.available.names[0] - min_master_version = "1.30.2-gke.1587003" + min_master_version = "1.31.1-gke.2105000" remove_default_node_pool = true deletion_protection = false initial_node_count = 1 diff --git a/pipelines/utilities/install_backupstores.sh b/pipelines/utilities/install_backupstores.sh index 7f043e8a8d..3355931f79 100755 --- a/pipelines/utilities/install_backupstores.sh +++ b/pipelines/utilities/install_backupstores.sh @@ -1,6 +1,34 @@ install_backupstores(){ - MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml" - NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml" + 
MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml" + NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml" + CIFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/cifs-backupstore.yaml" + AZURITE_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/azurite-backupstore.yaml" kubectl create -f ${MINIO_BACKUPSTORE_URL} \ - -f ${NFS_BACKUPSTORE_URL} + -f ${NFS_BACKUPSTORE_URL} \ + -f ${CIFS_BACKUPSTORE_URL} \ + -f ${AZURITE_BACKUPSTORE_URL} + setup_azuitize_backup_store +} + +setup_azuitize_backup_store(){ + RETRY=0 + MAX_RETRY=60 + until (kubectl get pods | grep 'longhorn-test-azblob' | grep 'Running'); do + echo 'Waiting azurite pod running' + sleep 5 + if [ $RETRY -eq $MAX_RETRY ]; then + break + fi + RETRY=$((RETRY+1)) + done + + AZBLOB_ENDPOINT=$(echo -n "http://$(kubectl get svc azblob-service -o jsonpath='{.spec.clusterIP}'):10000/" | base64) + kubectl -n longhorn-system patch secret azblob-secret \ + --type=json \ + -p="[{'op': 'replace', 'path': '/data/AZBLOB_ENDPOINT', 'value': \"${AZBLOB_ENDPOINT}\"}]" + + CONTROL_PLANE_PUBLIC_IP=$(cat /tmp/controlplane_public_ip) + # port forward and az container create need to be run on control node + ssh ec2-user@${CONTROL_PLANE_PUBLIC_IP} "nohup kubectl port-forward --address 0.0.0.0 service/azblob-service 20001:10000 > /dev/null 2>&1 &" + ssh ec2-user@${CONTROL_PLANE_PUBLIC_IP} "az storage container create -n longhorn-test-azurite --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://0.0.0.0:20001/devstoreaccount1;'" } \ No newline at end of file diff --git a/pipelines/utilities/install_metrics_server.sh b/pipelines/utilities/install_metrics_server.sh new file mode 100644 index 0000000000..0554f26c31 --- /dev/null +++ b/pipelines/utilities/install_metrics_server.sh @@ -0,0 +1,4 @@ +install_metrics_server(){ + kubectl apply -f "https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml" + kubectl patch deployment metrics-server -n kube-system --type='json' -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--kubelet-insecure-tls"}]' +} diff --git a/pipelines/utilities/run_longhorn_e2e_test.sh b/pipelines/utilities/run_longhorn_e2e_test.sh index e75b7c3732..9e09a90d80 100755 --- a/pipelines/utilities/run_longhorn_e2e_test.sh +++ b/pipelines/utilities/run_longhorn_e2e_test.sh @@ -1,5 +1,7 @@ S3_BACKUP_STORE='s3://backupbucket@us-east-1/backupstore$minio-secret' NFS_BACKUP_STORE='nfs://longhorn-test-nfs-svc.default:/opt/backupstore' +CIFS_BACKUP_STORE='cifs://longhorn-test-cifs-svc.default/backupstore$cifs-secret' +AZURITE_BACKUP_STORE='azblob://longhorn-test-azurite@core.windows.net/$azblob-secret' run_longhorn_e2e_test(){ @@ -22,6 +24,10 @@ run_longhorn_e2e_test(){ yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${S3_BACKUP_STORE}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${NFS_BACKUP_STORE}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + yq e -i 'select(.spec.containers[0] 
!= null).spec.containers[0].env[1].value="'${CIFS_BACKUP_STORE}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${AZURITE_BACKUP_STORE}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} fi if [[ "${TF_VAR_use_hdd}" == true ]]; then @@ -42,6 +48,7 @@ run_longhorn_e2e_test(){ yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env += {"name": "CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE", "value": "'${CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE}'"}' "${LONGHORN_TESTS_MANIFEST_FILE_PATH}" yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env += {"name": "CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE", "value": "'${CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE}'"}' "${LONGHORN_TESTS_MANIFEST_FILE_PATH}" yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env += {"name": "LONGHORN_INSTALL_METHOD", "value": "'${LONGHORN_INSTALL_METHOD}'"}' "${LONGHORN_TESTS_MANIFEST_FILE_PATH}" + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env += {"name": "LONGHORN_STABLE_VERSION", "value": "'${LONGHORN_STABLE_VERSION}'"}' "${LONGHORN_TESTS_MANIFEST_FILE_PATH}" LONGHORN_TEST_POD_NAME=`yq e 'select(.spec.containers[0] != null).metadata.name' ${LONGHORN_TESTS_MANIFEST_FILE_PATH}` @@ -75,6 +82,10 @@ run_longhorn_e2e_test_out_of_cluster(){ LONGHORN_BACKUPSTORES=${S3_BACKUP_STORE} elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then LONGHORN_BACKUPSTORES=${NFS_BACKUP_STORE} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + LONGHORN_BACKUPSTORES=${CIFS_BACKUP_STORE} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + LONGHORN_BACKUPSTORES=${AZURITE_BACKUP_STORE} fi LONGHORN_BACKUPSTORE_POLL_INTERVAL="30" @@ -106,6 +117,7 @@ run_longhorn_e2e_test_out_of_cluster(){ -e CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE="${CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE}"\ -e CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE="${CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE}"\ -e LONGHORN_INSTALL_METHOD="${LONGHORN_INSTALL_METHOD}"\ + -e LONGHORN_STABLE_VERSION="${LONGHORN_STABLE_VERSION}"\ --mount source="vol-${IMAGE_NAME}",target=/tmp \ "${LONGHORN_TESTS_CUSTOM_IMAGE}" "${ROBOT_COMMAND_ARGS[@]}" docker stop "${CONTAINER_NAME}" diff --git a/pipelines/utilities/run_longhorn_test.sh b/pipelines/utilities/run_longhorn_test.sh index 7ef17dfee9..26fd8f93e8 100755 --- a/pipelines/utilities/run_longhorn_test.sh +++ b/pipelines/utilities/run_longhorn_test.sh @@ -24,6 +24,12 @@ run_longhorn_test(){ elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $2}' | sed 's/ *//'` yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $3}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $4}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != 
null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} fi if [[ "${TF_VAR_use_hdd}" == true ]]; then @@ -107,6 +113,12 @@ run_longhorn_upgrade_test(){ elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $2}' | sed 's/ *//'` yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $3}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $4}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} fi yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[4].value="'${LONGHORN_UPGRADE_TYPE}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} diff --git a/test_framework/Dockerfile.setup b/test_framework/Dockerfile.setup index 8cb07be700..d41043ad1c 100644 --- a/test_framework/Dockerfile.setup +++ b/test_framework/Dockerfile.setup @@ -1,4 +1,4 @@ -FROM golang:1.22-alpine3.19 +FROM golang:1.23-alpine3.19 ARG KUBECTL_VERSION=v1.30.0 diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index 56f4937872..b091c3a9b6 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -5,6 +5,7 @@ set -x source test_framework/scripts/kubeconfig.sh source pipelines/utilities/longhorn_manifest.sh source pipelines/utilities/longhorn_ui.sh +source pipelines/utilities/install_metrics_server.sh # create and clean tmpdir TMPDIR="/tmp/longhorn" @@ -331,12 +332,39 @@ create_longhorn_namespace(){ install_backupstores(){ - MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml" - NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml" + MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml" + NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml" + CIFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/cifs-backupstore.yaml" + AZURITE_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/azurite-backupstore.yaml" kubectl create -f ${MINIO_BACKUPSTORE_URL} \ - -f ${NFS_BACKUPSTORE_URL} + -f ${NFS_BACKUPSTORE_URL} \ + -f ${CIFS_BACKUPSTORE_URL} \ + -f ${AZURITE_BACKUPSTORE_URL} + setup_azuitize_backup_store } +setup_azuitize_backup_store(){ + RETRY=0 + MAX_RETRY=60 + until (kubectl get pods | grep 'longhorn-test-azblob' | grep 'Running'); do + echo 'Waiting azurite pod running' 
+ sleep 5 + if [ $RETRY -eq $MAX_RETRY ]; then + break + fi + RETRY=$((RETRY+1)) + done + + AZBLOB_ENDPOINT=$(echo -n "http://$(kubectl get svc azblob-service -o jsonpath='{.spec.clusterIP}'):10000/" | base64) + kubectl -n longhorn-system patch secret azblob-secret \ + --type=json \ + -p="[{'op': 'replace', 'path': '/data/AZBLOB_ENDPOINT', 'value': \"${AZBLOB_ENDPOINT}\"}]" + + CONTROL_PLANE_PUBLIC_IP=$(cat /tmp/controlplane_public_ip) + # port forward and az container create need to be run on control node + ssh ec2-user@${CONTROL_PLANE_PUBLIC_IP} "nohup kubectl port-forward --address 0.0.0.0 service/azblob-service 20001:10000 > /dev/null 2>&1 &" + ssh ec2-user@${CONTROL_PLANE_PUBLIC_IP} "az storage container create -n longhorn-test-azurite --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://0.0.0.0:20001/devstoreaccount1;'" +} create_aws_secret(){ AWS_ACCESS_KEY_ID_BASE64=`echo -n "${TF_VAR_lh_aws_access_key}" | base64` @@ -396,6 +424,12 @@ run_longhorn_upgrade_test(){ elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $2}' | sed 's/ *//'` yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $3}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $4}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} fi yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[4].value="'${LONGHORN_UPGRADE_TYPE}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} @@ -450,6 +484,12 @@ run_longhorn_tests(){ elif [[ $BACKUP_STORE_TYPE = "nfs" ]]; then BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $2}' | sed 's/ *//'` yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "cifs" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $3}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + elif [[ $BACKUP_STORE_TYPE = "azurite" ]]; then + BACKUP_STORE_FOR_TEST=`yq e 'select(.spec.containers[0] != null).spec.containers[0].env[1].value' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} | awk -F ',' '{print $4}' | sed 's/ *//'` + yq e -i 'select(.spec.containers[0] != 
null).spec.containers[0].env[1].value="'${BACKUP_STORE_FOR_TEST}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} fi if [[ "${TF_VAR_use_hdd}" == true ]]; then @@ -529,6 +569,10 @@ main(){ longhornctl_check fi + if [[ "${DISTRO}" == "talos" ]]; then + install_metrics_server + fi + if [[ "${AIR_GAP_INSTALLATION}" == true ]]; then if [[ "${LONGHORN_INSTALL_METHOD}" == "manifest-file" ]]; then create_registry_secret diff --git a/test_framework/terraform/aws/centos/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/centos/user-data-scripts/provision_k3s_agent.sh.tpl index fe80f7f8a0..8c4c5c2d9b 100755 --- a/test_framework/terraform/aws/centos/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/centos/user-data-scripts/provision_k3s_agent.sh.tpl @@ -14,7 +14,7 @@ sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' / sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/centos/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/centos/user-data-scripts/provision_rke2_agent.sh.tpl index dabe432468..9a1628ff0e 100755 --- a/test_framework/terraform/aws/centos/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/centos/user-data-scripts/provision_rke2_agent.sh.tpl @@ -14,7 +14,7 @@ sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' / sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools nc +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools nc samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl index 821f2b4723..a83a51e404 100755 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl @@ -2,7 +2,7 @@ sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper samba sudo systemctl -q enable iscsid sudo systemctl start iscsid # disable nm-cloud-setup otherwise k3s-agent service won’t start. 
diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl index 19f5b99fc2..6e681cb0ac 100644 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl @@ -2,7 +2,7 @@ sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper nc +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper nc samba sudo systemctl -q enable iscsid sudo systemctl start iscsid sudo systemctl disable nm-cloud-setup.service nm-cloud-setup.timer diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl index 45c9e8580a..a1a8b75d56 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl @@ -10,7 +10,7 @@ fi sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper samba sudo systemctl -q enable iscsid sudo systemctl start iscsid sudo systemctl disable nm-cloud-setup.service nm-cloud-setup.timer diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl index 5c3cebefd9..475a243abd 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl @@ -10,7 +10,7 @@ fi sudo yum update -y sudo yum group install -y "Development Tools" -sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper nc +sudo yum install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper nc samba sudo systemctl -q enable iscsid sudo systemctl start iscsid sudo systemctl disable nm-cloud-setup.service nm-cloud-setup.timer diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl index e5ea498944..a6e8aeb1ca 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl @@ -10,7 +10,7 @@ fi # Do not arbitrarily run "dnf update", as this will effectively move us up to the latest minor release. 
sudo dnf group install -y "Development Tools" -sudo dnf install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper +sudo dnf install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl index d2c2a65f70..6b9732ed14 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl @@ -10,7 +10,7 @@ fi # Do not arbitrarily run "dnf update", as this will effectively move us up to the latest minor release. sudo dnf group install -y "Development Tools" -sudo dnf install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper jq nmap-ncat +sudo dnf install -y iscsi-initiator-utils nfs-utils nfs4-acl-tools cryptsetup device-mapper jq nmap-ncat samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl index d888d29949..579c6565c1 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl @@ -5,7 +5,7 @@ set -e sudo systemctl restart guestregister # Sometimes registration fails on first boot. sudo zypper ref sudo zypper install -y -t pattern devel_basis -sudo zypper install -y open-iscsi nfs-client cryptsetup device-mapper +sudo zypper install -y open-iscsi nfs-client cryptsetup device-mapper samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl index 2a2df03018..3057431068 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl @@ -5,7 +5,7 @@ set -e sudo systemctl restart guestregister # Sometimes registration fails on first boot. sudo zypper ref sudo zypper install -y -t pattern devel_basis -sudo zypper install -y open-iscsi nfs-client jq +sudo zypper install -y open-iscsi nfs-client jq azure-cli sudo systemctl -q enable iscsid sudo systemctl start iscsid @@ -27,4 +27,4 @@ done if [[ -n "${custom_ssh_public_key}" ]]; then echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys -fi +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl index f1c8755125..5e43b9749b 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl @@ -5,7 +5,7 @@ set -e sudo systemctl restart guestregister # Sometimes registration fails on first boot. 
sudo zypper ref sudo zypper install -y -t pattern devel_basis -sudo zypper install -y open-iscsi nfs-client cryptsetup device-mapper +sudo zypper install -y open-iscsi nfs-client cryptsetup device-mapper samba sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl index 6bf855bc44..3f213525aa 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl @@ -5,7 +5,7 @@ set -e sudo systemctl restart guestregister # Sometimes registration fails on first boot. sudo zypper ref sudo zypper install -y -t pattern devel_basis -sudo zypper install -y open-iscsi nfs-client jq +sudo zypper install -y open-iscsi nfs-client jq azure-cli sudo systemctl -q enable iscsid sudo systemctl start iscsid diff --git a/test_framework/terraform/aws/talos/main.tf b/test_framework/terraform/aws/talos/main.tf index b572687ec8..20634e8347 100644 --- a/test_framework/terraform/aws/talos/main.tf +++ b/test_framework/terraform/aws/talos/main.tf @@ -189,14 +189,10 @@ data "talos_machine_configuration" "controlplane" { depends_on = [ aws_instance.lh_aws_instance_controlplane ] - count = var.lh_aws_instance_count_controlplane - cluster_name = "lh-tests-cluster" cluster_endpoint = "https://${aws_instance.lh_aws_instance_controlplane[0].public_ip}:6443" machine_type = "controlplane" machine_secrets = talos_machine_secrets.machine_secrets.machine_secrets - kubernetes_version = var.k8s_distro_version - talos_version = "v${var.os_distro_version}" docs = false examples = false config_patches = [ @@ -208,18 +204,14 @@ data "talos_machine_configuration" "worker" { depends_on = [ aws_instance.lh_aws_instance_controlplane ] - count = var.lh_aws_instance_count_worker - cluster_name = "lh-tests-cluster" cluster_endpoint = "https://${aws_instance.lh_aws_instance_controlplane[0].public_ip}:6443" machine_type = "worker" machine_secrets = talos_machine_secrets.machine_secrets.machine_secrets - kubernetes_version = var.k8s_distro_version - talos_version = "v${var.os_distro_version}" docs = false examples = false config_patches = [ - file("${path.module}/talos-patch.yaml") + file("${path.module}/talos-patch-worker.yaml") ] } @@ -227,7 +219,7 @@ resource "talos_machine_configuration_apply" "controlplane" { count = var.lh_aws_instance_count_controlplane client_configuration = talos_machine_secrets.machine_secrets.client_configuration - machine_configuration_input = data.talos_machine_configuration.controlplane[count.index].machine_configuration + machine_configuration_input = data.talos_machine_configuration.controlplane.machine_configuration endpoint = aws_instance.lh_aws_instance_controlplane[count.index].public_ip node = aws_instance.lh_aws_instance_controlplane[count.index].private_ip } @@ -236,7 +228,7 @@ resource "talos_machine_configuration_apply" "worker" { count = var.lh_aws_instance_count_worker client_configuration = talos_machine_secrets.machine_secrets.client_configuration - machine_configuration_input = data.talos_machine_configuration.worker[count.index].machine_configuration + machine_configuration_input = data.talos_machine_configuration.worker.machine_configuration endpoint = aws_instance.lh_aws_instance_worker[count.index].public_ip node = aws_instance.lh_aws_instance_worker[count.index].private_ip } @@ -260,15 +252,28 @@ 
resource "local_file" "talosconfig" { filename = "talos_k8s_config" } -data "talos_cluster_kubeconfig" "this" { +resource "talos_cluster_kubeconfig" "this" { depends_on = [talos_machine_bootstrap.this] client_configuration = talos_machine_secrets.machine_secrets.client_configuration - endpoint = aws_instance.lh_aws_instance_controlplane[0].public_ip + endpoint = aws_instance.lh_aws_instance_controlplane.0.public_ip node = aws_instance.lh_aws_instance_controlplane.0.private_ip } resource "local_file" "kubeconfig" { - content = nonsensitive(data.talos_cluster_kubeconfig.this.kubeconfig_raw) + content = nonsensitive(talos_cluster_kubeconfig.this.kubeconfig_raw) filename = "kubeconfig" } + +data "talos_cluster_health" "this" { + depends_on = [ + talos_machine_configuration_apply.controlplane, + talos_machine_configuration_apply.worker, + talos_cluster_kubeconfig.this + ] + + client_configuration = talos_machine_secrets.machine_secrets.client_configuration + endpoints = aws_instance.lh_aws_instance_controlplane.*.public_ip + control_plane_nodes = aws_instance.lh_aws_instance_controlplane.*.private_ip + worker_nodes = aws_instance.lh_aws_instance_worker.*.private_ip +} diff --git a/test_framework/terraform/aws/talos/variables.tf b/test_framework/terraform/aws/talos/variables.tf index 89a763ebfd..e838ce737a 100644 --- a/test_framework/terraform/aws/talos/variables.tf +++ b/test_framework/terraform/aws/talos/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "os_distro_version" { type = string - default = "1.7.6" + default = "1.8.3" } variable "aws_ami_talos_account_number" { diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl index cb13a443c8..0366fae36d 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl @@ -1,7 +1,7 @@ #!/bin/bash apt-get update -apt-get install -y nfs-common cryptsetup dmsetup linux-modules-extra-`uname -r` +apt-get install -y nfs-common cryptsetup dmsetup samba linux-modules-extra-`uname -r` modprobe uio modprobe uio_pci_generic diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl index 5de3d402aa..642485c5ef 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl @@ -1,7 +1,7 @@ #!/bin/bash apt-get update -apt-get install -y nfs-common cryptsetup dmsetup linux-modules-extra-`uname -r` +apt-get install -y nfs-common cryptsetup dmsetup samba linux-modules-extra-`uname -r` modprobe uio modprobe uio_pci_generic diff --git a/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl index 2bb4fe66d2..2e0832903e 100644 --- a/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl @@ -4,7 +4,7 @@ set -e set -x apt-get update -apt-get install -y nfs-common cryptsetup dmsetup linux-modules-extra-`uname -r` +apt-get install -y nfs-common cryptsetup dmsetup samba linux-modules-extra-`uname -r` modprobe uio modprobe uio_pci_generic diff --git 
a/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl index 39da2ad640..f3bb124b88 100755 --- a/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/equinix/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl @@ -4,7 +4,7 @@ set -e set -x apt-get update -apt-get install -y nfs-common cryptsetup dmsetup linux-modules-extra-`uname -r` +apt-get install -y nfs-common cryptsetup dmsetup samba linux-modules-extra-`uname -r` modprobe uio modprobe uio_pci_generic
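
For reference, a minimal standalone sketch (not part of the diffs above) of how the four backupstore entries line up with the awk field indices used by the new cifs/azurite branches in run_longhorn_test.sh and longhorn-setup.sh. The concatenated LONGHORN_BACKUPSTORES value and its field order (s3 first, then nfs, cifs, azurite) are assumptions inferred from the S3/NFS/CIFS/AZURITE constants and the `awk -F ',' '{print $N}'` calls above, not something the patch itself defines.

#!/bin/bash
# Hypothetical check of the field-index mapping assumed by the BACKUP_STORE_TYPE
# branches above: field 1 = s3 (minio), 2 = nfs, 3 = cifs, 4 = azurite.
S3_BACKUP_STORE='s3://backupbucket@us-east-1/backupstore$minio-secret'
NFS_BACKUP_STORE='nfs://longhorn-test-nfs-svc.default:/opt/backupstore'
CIFS_BACKUP_STORE='cifs://longhorn-test-cifs-svc.default/backupstore$cifs-secret'
AZURITE_BACKUP_STORE='azblob://longhorn-test-azurite@core.windows.net/$azblob-secret'

# Assumed comma-separated value carried by env[1].value in the test manifest.
LONGHORN_BACKUPSTORES="${S3_BACKUP_STORE}, ${NFS_BACKUP_STORE}, ${CIFS_BACKUP_STORE}, ${AZURITE_BACKUP_STORE}"

# Same extraction the cifs/azurite branches perform with awk + sed.
for BACKUP_STORE_TYPE in s3 nfs cifs azurite; do
  case ${BACKUP_STORE_TYPE} in
    s3)      FIELD=1 ;;
    nfs)     FIELD=2 ;;
    cifs)    FIELD=3 ;;
    azurite) FIELD=4 ;;
  esac
  BACKUP_STORE_FOR_TEST=$(echo "${LONGHORN_BACKUPSTORES}" | awk -F ',' "{print \$${FIELD}}" | sed 's/ *//')
  echo "${BACKUP_STORE_TYPE} -> ${BACKUP_STORE_FOR_TEST}"
done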