From 41e82af2be7c0e56d1ea0b729b22d7e28faedfb6 Mon Sep 17 00:00:00 2001 From: Aishwarya-Hebbar Date: Mon, 2 Sep 2024 17:47:09 +0530 Subject: [PATCH] vsan stretch automation for TKG, WCP and VMService VMs --- tests/e2e/e2e_common.go | 1 + tests/e2e/file_volume_statefulsets.go | 2 +- tests/e2e/hci.go | 4 +- tests/e2e/hci_mesh_rwx_disruptive.go | 8 +- tests/e2e/multi_vc.go | 2 +- ...i_mesh_rwx_singlevc_topology_disruptive.go | 2 +- tests/e2e/prevent_duplicate_cluster_ids.go | 10 +- tests/e2e/rwx_topology_utils.go | 16 +- tests/e2e/statefulsets.go | 2 +- tests/e2e/util.go | 12 +- tests/e2e/vm_service_vsan_stretch_cluster.go | 1190 ++++++++++++++ tests/e2e/vmservice_utils.go | 142 ++ tests/e2e/vsan_stretched_cluster.go | 1388 ++++++++++++----- tests/e2e/vsan_stretched_cluster_utils.go | 164 +- 14 files changed, 2530 insertions(+), 413 deletions(-) create mode 100644 tests/e2e/vm_service_vsan_stretch_cluster.go diff --git a/tests/e2e/e2e_common.go b/tests/e2e/e2e_common.go index 237ae0bccc..29ff254826 100644 --- a/tests/e2e/e2e_common.go +++ b/tests/e2e/e2e_common.go @@ -34,6 +34,7 @@ const ( nginxImage = "registry.k8s.io/nginx-slim:0.26" nginxImage4upg = "registry.k8s.io/nginx-slim:0.27" retainClaimPolicy = "Retain" + cloudInitLabel = "CloudInit" configSecret = "vsphere-config-secret" contollerClusterKubeConfig = "CONTROLLER_CLUSTER_KUBECONFIG" controlPlaneLabel = "node-role.kubernetes.io/control-plane" diff --git a/tests/e2e/file_volume_statefulsets.go b/tests/e2e/file_volume_statefulsets.go index 5034fd1fac..2298c3ac4b 100644 --- a/tests/e2e/file_volume_statefulsets.go +++ b/tests/e2e/file_volume_statefulsets.go @@ -766,7 +766,7 @@ var _ = ginkgo.Describe("[csi-file-vanilla] File Volume statefulset", func() { }() ginkgo.By("Creating statefulset with replica 3") statefulset, _, volumesBeforeScaleUp := createStsDeployment(ctx, client, namespace, sc, false, - false, 0, "", v1.ReadWriteMany) + false, 3, "", 0, v1.ReadWriteMany) replicas := *(statefulset.Spec.Replicas) //List volume responses will show up in the interval of every 1 minute. 
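Call sites of createStsDeployment throughout this patch gain an extra integer argument (for example `false, 3, "", 0, v1.ReadWriteMany` above, where the older call passed `false, 0, "", v1.ReadWriteMany`). The helper's updated signature is not shown in these hunks; the sketch below uses hypothetical names and only illustrates the assumed intent, namely that the StatefulSet replica count and a separate Deployment replica count are now passed explicitly.

package e2e

import appsv1 "k8s.io/api/apps/v1"

// applyWorkloadReplicas is a hypothetical helper sketching how the two integer
// arguments now passed to createStsDeployment (StatefulSet replicas and
// Deployment replicas) would typically be applied to the generated objects.
func applyWorkloadReplicas(sts *appsv1.StatefulSet, dep *appsv1.Deployment,
	stsReplicas, depReplicas int32) {
	if sts != nil {
		sts.Spec.Replicas = &stsReplicas // e.g. 3 in the updated call sites
	}
	if dep != nil {
		dep.Spec.Replicas = &depReplicas // 0 when the test requests no deployment pods
	}
}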
diff --git a/tests/e2e/hci.go b/tests/e2e/hci.go index 97b46a32f1..d7287d8f6f 100644 --- a/tests/e2e/hci.go +++ b/tests/e2e/hci.go @@ -222,7 +222,7 @@ var _ bool = ginkgo.Describe("hci", func() { ginkgo.By("create a sts with 3 replicas") var replicas int32 = 3 - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, false, replicas, "", "") + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, false, replicas, "", 0, "") defer func() { ginkgo.By(fmt.Sprintf("Deleting all statefulsets in namespace: %v", namespace)) fss.DeleteAllStatefulSets(ctx, client, namespace) @@ -304,7 +304,7 @@ var _ bool = ginkgo.Describe("hci", func() { ginkgo.By("Create a sts with 3 replicas") var replicas int32 = 3 - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, false, replicas, "", "") + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, false, replicas, "", 0, "") defer func() { ginkgo.By(fmt.Sprintf("Deleting all statefulsets in namespace: %v", namespace)) diff --git a/tests/e2e/hci_mesh_rwx_disruptive.go b/tests/e2e/hci_mesh_rwx_disruptive.go index 1cbe55129f..e7c26f8110 100644 --- a/tests/e2e/hci_mesh_rwx_disruptive.go +++ b/tests/e2e/hci_mesh_rwx_disruptive.go @@ -677,7 +677,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in remote cluster4 and when psod is triggered, create new set of rwx pvc") for i := 0; i < len(hostListCluster4); i++ { - err = psodHost(hostListCluster4[i]) + err = psodHost(hostListCluster4[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -729,7 +729,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD again all host in remote cluster4 and perform scaleup " + "operation on deployment and statefulset") for i := 0; i < len(hostListCluster4); i++ { - err = psodHost(hostListCluster4[i]) + err = psodHost(hostListCluster4[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -1873,7 +1873,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in local cluster2 and when psod is triggered, create new set of rwx pvc") for i := 0; i < len(hostListCluster2); i++ { - err = psodHost(hostListCluster2[i]) + err = psodHost(hostListCluster2[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -1929,7 +1929,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in local cluster3 and perform scaleup " + "operation on deployment and statefulset") for i := 0; i < len(hostListCluster3); i++ { - err = psodHost(hostListCluster3[i]) + err = psodHost(hostListCluster3[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { diff --git a/tests/e2e/multi_vc.go b/tests/e2e/multi_vc.go index db0476bb0f..71c0854c71 100644 --- a/tests/e2e/multi_vc.go +++ b/tests/e2e/multi_vc.go @@ -2045,7 +2045,7 @@ var _ = ginkgo.Describe("[multivc-positive] MultiVc-Topology-Positive", func() { }() ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, volumesBeforeScaleUp := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + false, 3, "", 1, "") ginkgo.By("Verify PV node affinity and that the PODS are running on appropriate node") err = verifyPVnodeAffinityAndPODnodedetailsForStatefulsetsLevel5(ctx, client, statefulset, diff --git a/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go 
b/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go index 33ce89e20a..890261ce99 100644 --- a/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go +++ b/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go @@ -752,7 +752,7 @@ var _ = ginkgo.Describe("[rwx-nohci-singlevc-disruptive] RWX-Topology-NoHciMesh- ginkgo.By("PSOD all host") for i := 0; i < len(hostList); i++ { - err = psodHost(hostList[i]) + err = psodHost(hostList[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 2 { diff --git a/tests/e2e/prevent_duplicate_cluster_ids.go b/tests/e2e/prevent_duplicate_cluster_ids.go index c659b2be9b..cf5fd3d23b 100644 --- a/tests/e2e/prevent_duplicate_cluster_ids.go +++ b/tests/e2e/prevent_duplicate_cluster_ids.go @@ -187,7 +187,7 @@ var _ = ginkgo.Describe("Prevent duplicate cluster ID", func() { } ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", accessMode) + false, 3, "", 1, accessMode) replicas := *(statefulset.Spec.Replicas) defer func() { @@ -345,7 +345,7 @@ var _ = ginkgo.Describe("Prevent duplicate cluster ID", func() { } ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", accessMode) + false, 3, "", 1, accessMode) replicas := *(statefulset.Spec.Replicas) defer func() { @@ -503,7 +503,7 @@ var _ = ginkgo.Describe("Prevent duplicate cluster ID", func() { } ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", accessMode) + false, 0, "", 1, accessMode) replicas := *(statefulset.Spec.Replicas) defer func() { @@ -757,7 +757,7 @@ var _ = ginkgo.Describe("Prevent duplicate cluster ID", func() { } ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", accessMode) + false, 3, "", 1, accessMode) replicas := *(statefulset.Spec.Replicas) defer func() { @@ -864,7 +864,7 @@ var _ = ginkgo.Describe("Prevent duplicate cluster ID", func() { } ginkgo.By("Creating statefulset with replica 3 and a deployment") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", accessMode) + false, 3, "", 1, accessMode) replicas := *(statefulset.Spec.Replicas) defer func() { diff --git a/tests/e2e/rwx_topology_utils.go b/tests/e2e/rwx_topology_utils.go index 878ee768a9..6f52b16f56 100644 --- a/tests/e2e/rwx_topology_utils.go +++ b/tests/e2e/rwx_topology_utils.go @@ -893,18 +893,24 @@ func verifyK8sNodeStatusAfterSiteRecovery(client clientset.Interface, ctx contex } /* This util will perform psod operation on a host */ -func psodHost(hostIP string) error { +func psodHost(hostIP string, psodTimeOut string) error { ginkgo.By("PSOD") - sshCmd := fmt.Sprintf("vsish -e set /config/Misc/intOpts/BlueScreenTimeout %s", psodTime) - op, err := runCommandOnESX("root", hostIP, sshCmd) + var timeout string + if psodTimeOut != "" { + timeout = psodTimeOut + } else { + timeout = psodTime + } + sshCmd := fmt.Sprintf("vsish -e set /config/Misc/intOpts/BlueScreenTimeout %s", timeout) + op, err := runCommandOnESX(rootUser, hostIP, sshCmd) framework.Logf(op) if err != nil { return fmt.Errorf("failed to set BlueScreenTimeout: %w", err) } ginkgo.By("Injecting PSOD") - psodCmd := "vsish -e set /reliability/crashMe/Panic 1" - 
op, err = runCommandOnESX("root", hostIP, psodCmd) + psodCmd := "vsish -e set /reliability/crashMe/Panic 1; exit" + op, err = runCommandOnESX(rootUser, hostIP, psodCmd) framework.Logf(op) if err != nil { return fmt.Errorf("failed to inject PSOD: %w", err) diff --git a/tests/e2e/statefulsets.go b/tests/e2e/statefulsets.go index 12036bb9e9..7614a45ac6 100644 --- a/tests/e2e/statefulsets.go +++ b/tests/e2e/statefulsets.go @@ -976,7 +976,7 @@ var _ = ginkgo.Describe("statefulset", func() { ginkgo.By("Creating statfulset and deployment from storageclass") statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + false, 3, "", 1, "") replicas := *(statefulset.Spec.Replicas) csiNs := GetAndExpectStringEnvVar(envCSINamespace) csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) diff --git a/tests/e2e/util.go b/tests/e2e/util.go index 4a16536ca0..606512774d 100644 --- a/tests/e2e/util.go +++ b/tests/e2e/util.go @@ -3618,18 +3618,8 @@ func psodHostWithPv(ctx context.Context, vs *vSphere, pvName string) string { framework.Logf("hostIP %v", hostIP) gomega.Expect(hostIP).NotTo(gomega.BeEmpty()) - ginkgo.By("PSOD") - sshCmd := fmt.Sprintf("vsish -e set /config/Misc/intOpts/BlueScreenTimeout %s", psodTime) - op, err := runCommandOnESX("root", hostIP, sshCmd) - framework.Logf(op) + err := psodHost(hostIP, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Injecting PSOD ") - psodCmd := "vsish -e set /reliability/crashMe/Panic 1" - op, err = runCommandOnESX("root", hostIP, psodCmd) - framework.Logf(op) - gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) - return hostIP } diff --git a/tests/e2e/vm_service_vsan_stretch_cluster.go b/tests/e2e/vm_service_vsan_stretch_cluster.go new file mode 100644 index 0000000000..78b0de18df --- /dev/null +++ b/tests/e2e/vm_service_vsan_stretch_cluster.go @@ -0,0 +1,1190 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package e2e + +import ( + "context" + "fmt" + "os" + "strings" + "sync" + "time" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + + vmopv1 "github.com/vmware-tanzu/vm-operator/api/v1alpha1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" + fnodes "k8s.io/kubernetes/test/e2e/framework/node" + fpod "k8s.io/kubernetes/test/e2e/framework/pod" + fpv "k8s.io/kubernetes/test/e2e/framework/pv" + admissionapi "k8s.io/pod-security-admission/api" + ctlrclient "sigs.k8s.io/controller-runtime/pkg/client" + + cnsop "sigs.k8s.io/vsphere-csi-driver/v3/pkg/apis/cnsoperator" +) + +var _ bool = ginkgo.Describe("[vmsvc] vm service with csi vol tests", func() { + + f := framework.NewDefaultFramework("vmsvc") + f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged + f.SkipNamespaceCreation = true // tests will create their own namespaces + var ( + client clientset.Interface + namespace string + datastoreURL string + storagePolicyName string + storageClassName string + storageProfileId string + vcRestSessionId string + vmi string + vmClass string + csiNs string + vmopC ctlrclient.Client + cnsopC ctlrclient.Client + isVsanHealthServiceStopped bool + isSPSserviceStopped bool + vcAddress string + nodeList *v1.NodeList + ) + + ginkgo.BeforeEach(func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + client = f.ClientSet + var err error + + nodeList, err = fnodes.GetReadySchedulableNodes(ctx, f.ClientSet) + framework.ExpectNoError(err, "Unable to find ready and schedulable Node") + if !(len(nodeList.Items) > 0) { + framework.Failf("Unable to find ready and schedulable Node") + } + storagePolicyName = GetAndExpectStringEnvVar(envStoragePolicyNameForSharedDatastores) + + bootstrap() + + readVcEsxIpsViaTestbedInfoJson(GetAndExpectStringEnvVar(envTestbedInfoJsonPath)) + initialiseFdsVar(ctx) + + vcAddress = e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort + vcRestSessionId = createVcSession4RestApis(ctx) + csiNs = GetAndExpectStringEnvVar(envCSINamespace) + + storageClassName = strings.ReplaceAll(storagePolicyName, " ", "-") // since this is a wcp setup + storageClassName = strings.ToLower(storageClassName) + framework.Logf("storageClassName: %s", storageClassName) + + datastoreURL = GetAndExpectStringEnvVar(envSharedDatastoreURL) + dsRef := getDsMoRefFromURL(ctx, datastoreURL) + framework.Logf("dsmoId: %v", dsRef.Value) + + storageProfileId = e2eVSphere.GetSpbmPolicyID(storagePolicyName) + contentLibId := createAndOrGetContentlibId4Url(vcRestSessionId, GetAndExpectStringEnvVar(envContentLibraryUrl), + dsRef.Value, GetAndExpectStringEnvVar(envContentLibraryUrlSslThumbprint)) + + framework.Logf("Create a WCP namespace for the test") + vmClass = os.Getenv(envVMClass) + if vmClass == "" { + vmClass = vmClassBestEffortSmall + } + namespace = createTestWcpNs( + vcRestSessionId, storageProfileId, vmClass, contentLibId, getSvcId(vcRestSessionId)) + + vmopScheme := runtime.NewScheme() + gomega.Expect(vmopv1.AddToScheme(vmopScheme)).Should(gomega.Succeed()) + vmopC, err = ctlrclient.New(f.ClientConfig(), ctlrclient.Options{Scheme: vmopScheme}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + cnsOpScheme := runtime.NewScheme() + gomega.Expect(cnsop.AddToScheme(cnsOpScheme)).Should(gomega.Succeed()) + cnsopC, err = ctlrclient.New(f.ClientConfig(), ctlrclient.Options{Scheme: cnsOpScheme}) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + vmImageName := GetAndExpectStringEnvVar(envVmsvcVmImageName) + framework.Logf("Waiting for virtual machine image list to be available in namespace '%s' for image '%s'", + namespace, vmImageName) + vmi = waitNGetVmiForImageName(ctx, vmopC, namespace, vmImageName) + gomega.Expect(vmi).NotTo(gomega.BeEmpty()) + }) + + ginkgo.AfterEach(func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + if isVsanHealthServiceStopped { + ginkgo.By(fmt.Sprintf("Starting %v on the vCenter host", vsanhealthServiceName)) + startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isVsanHealthServiceStopped) + } + + if isSPSserviceStopped { + ginkgo.By(fmt.Sprintf("Starting %v on the vCenter host", spsServiceName)) + startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isSPSserviceStopped) + } + dumpSvcNsEventsOnTestFailure(client, namespace) + delTestWcpNs(vcRestSessionId, namespace) + gomega.Expect(waitForNamespaceToGetDeleted(ctx, client, namespace, poll, pollTimeout)).To(gomega.Succeed()) + }) + + /* + Primary site down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create a statefulset, deployment with volumes from the stretched datastore + 3. Bring down the primary site + 4. Verify that the VMs hosted by esx servers are brought up on the other site + 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume + and application lifecycle actions work fine + 6. Bring primary site up and wait for testbed to be back to normal + 7. Delete all objects created in step 2 and 5 + */ + ginkgo.It("VMService - primary site down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, 
&vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the primary site") + siteFailover(ctx, true) + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, vm := range vms { + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + Secondary site down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create a statefulset, deployment with volumes from the stretched datastore + 3. Bring down the primary site + 4. Verify that the VMs hosted by esx servers are brought up on the other site + 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume + and application lifecycle actions work fine + 6. 
Bring primary site up and wait for testbed to be back to normal + 7. Delete all objects created in step 2 and 5 + */ + ginkgo.It("VMService - secondary site down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + + ginkgo.By("Get StorageClass for volume creation") + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + 
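The csipods list captured just above is how these tests remember the pre-failure CSI pod count so they can wait for the same number of pods once the fault is over. A condensed, illustrative sketch of that before/after pattern (not part of the patch; it counts the listed items directly and reuses the framework helpers already imported in this file):

// Record how many CSI pods are running before injecting the fault.
csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
preFailureCount := int32(len(csipods.Items))

// ... fault injection happens here (siteFailover, PSOD, network isolation, ...) ...

// After the fault, wait until the same number of CSI pods is running and ready again.
err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, preFailureCount, 0, pollTimeout*2)
gomega.Expect(err).NotTo(gomega.HaveOccurred())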
ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, vm := range vms { + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService VM creation while primary site goes down¯ + Steps: + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify that it goes to bound state. + 2. Create VMService VM with each PVC created in step1. + 3. While VMService VM creation is going on, bring down the primary site by powering off the hosts in primary site in parallel. + 4. Verify that the supervisor cluster should be in running and ready state after site failover. + 5. Verify that all the PVCs created in step 2 are running fine. + 6. Perform volume lifecycle actions which should work fine. + 7. Bring primary site up and wait for testbed to be back to normal. + 8. Delete all objects created in the test. 
+ */ + ginkgo.It("VMService VM creation while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var vmCount = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + var vms []*vmopv1.VirtualMachine + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ch := make(chan *vmopv1.VirtualMachine) + var wg sync.WaitGroup + var lock sync.Mutex + ginkgo.By("Creating VM in parallel to site failure") + wg.Add(vmCount) + go createVMServiceVmInParallel(ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName, vmCount, ch, &wg, &lock) + go func() { + for v := range ch { + vms = append(vms, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: 
namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService VM deletion while secondary site goes down + Steps: + + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify they go into bound state. + 2. Create VMService VM with each PVC created in step1. + 3. Verify all PVC's metadata on CNS. + 4. Once the VMs are up verify that the volume is accessible inside the VM. + 5. Delete all the VMs created in step2. + 6. While VMService VM deletion is going on, + bring down the secondary site by powering off the hosts in secondary site in parallel. + 7. Verify that the supervisor cluster should be in running and ready state after site failover. + 8. Verify all the VMservice vms created in step2 are deleted successfully. + 9. Perform volume lifecycle actions which should work fine. + 10.Bring secondary site up and wait for testbed to be back to normal. + 11.Delete all objects created in this test. 
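Step 8 above is verified later in this test by expecting getVmsvcVM to return an error once the VMs are gone. A small illustrative helper (not part of the patch) showing how that check could be made explicit about NotFound, reusing the controller-runtime client and the poll/pollTimeout constants this suite already uses:

// waitForVmsvcVmDeleted is a hypothetical helper: poll until the VM Service VM
// object is gone, treating only a NotFound error as successful deletion.
func waitForVmsvcVmDeleted(ctx context.Context, c ctlrclient.Client, namespace, name string) error {
	return wait.PollUntilContextTimeout(ctx, poll, pollTimeout, true,
		func(ctx context.Context) (bool, error) {
			vm := &vmopv1.VirtualMachine{}
			err := c.Get(ctx, ctlrclient.ObjectKey{Namespace: namespace, Name: name}, vm)
			if err == nil {
				return false, nil // VM still exists, keep polling
			}
			if apierrors.IsNotFound(err) {
				return true, nil // deletion is complete
			}
			return false, err // unexpected error, stop polling
		})
}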
+ */ + ginkgo.It("VMService VM deletion while secondary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs 
are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + var wg sync.WaitGroup + ginkgo.By("Creating VM in parallel to secondary site failure") + wg.Add(2) + go deleteVMServiceVmInParallel(ctx, vmopC, vms, namespace, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify all the VMservice vms created before secondary site failure are deleted successfully") + for _, vm := range vms { + _, err := getVmsvcVM(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).To(gomega.HaveOccurred()) + } + + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + PSOD hosts on secondary site + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create two statefulset with replica count 1(sts1) and 5(sts2) respectively using a thick provision policy + and wait for all replicas to be running + 3. Change replica count of sts1 and sts2 to 3 + 4. Bring down primary site + 5. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 6. Verify there were no issue with replica scale up/down and verify pod entry in CNS volumemetadata for the + volumes associated with the PVC used by statefulsets are updated + 7. Change replica count of sts1 to 5 a sts2 to 1 and verify they are successful + 8. Delete statefulsets and its pvcs created in step 2 + 9. 
Bring primary site up and wait for testbed to be back to normal + */ + ginkgo.It("VMService - psod hosts on secondary site", func() { + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var vms []*vmopv1.VirtualMachine + var vmlbsvcs []*vmopv1.VirtualMachineService + var svcCsipods, csipods *v1.PodList + + ginkgo.By("Creating StorageClass") + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, 10, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ch := make(chan *vmopv1.VirtualMachine) + var wg sync.WaitGroup + var lock sync.Mutex + ginkgo.By("Creating VM in parallel to site failure") + wg.Add(2) + go createVMServiceVmInParallel(ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName, 10, ch, &wg, &lock) + go func() { + for v := range ch { + vms = append(vms, v) + } + }() + go psodHostsInParallel(true, "600", &wg) + wg.Wait() + close(ch) + + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running fine after site recovery") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, 
&vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService - witness failure + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create a statefulset, deployment with volumes from the stretched datastore + 3. Bring down the primary site + 4. Verify that the VMs hosted by esx servers are brought up on the other site + 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume + and application lifecycle actions work fine + 6. Bring primary site up and wait for testbed to be back to normal + 7. 
Delete all objects created in step 2 and 5 + */ + ginkgo.It("VMService - witness failure", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down witness host") + toggleWitnessPowerState(ctx, true) + defer func() { + ginkgo.By("Bring up the witness host before terminating the test") + if fds.witnessDown != "" { + toggleWitnessPowerState(ctx, false) + } + }() + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, 
namespace, vm.Name)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+			ginkgo.By("Wait and verify PVCs are attached to the VM")
+			gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm,
+				[]*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred())
+
+			ginkgo.By("Verify PVCs are accessible to the VM")
+			ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity")
+			vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+			for i, vol := range vm.Status.Volumes {
+				volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp)
+				verifyDataIntegrityOnVmDisk(vmIp, volFolder)
+			}
+		}
+
+		ginkgo.By("Check storage compliance")
+		comp := checkVmStorageCompliance(client, storagePolicyName)
+		if comp { // compliance is expected to be violated while the witness is down
+			framework.Failf("Expected VM and storage compliance to be false but found true")
+		}
+
+		ginkgo.By("Bring up witness host")
+		if fds.witnessDown != "" {
+			toggleWitnessPowerState(ctx, false)
+		}
+
+		ginkgo.By("Check storage compliance")
+		comp = checkVmStorageCompliance(client, storagePolicyName)
+		if !comp {
+			framework.Failf("Expected VM and storage compliance to be true but found false")
+		}
+
+	})
+
+	/*
+		Primary site network isolation
+		Steps:
+		1. Configure a vanilla multi-master K8s cluster with inter and intra site replication
+		2. Create a statefulset, deployment with volumes from the stretched datastore
+		3. Bring down the primary site
+		4. Verify that the VMs hosted by esx servers are brought up on the other site
+		5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume
+		   and application lifecycle actions work fine
+		6. Bring primary site up and wait for testbed to be back to normal
+		7. 
Delete all objects created in step 2 and 5 + */ + ginkgo.It("VMService - Primary site network isolation", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Cause a 
network failure on primary site + ginkgo.By("Isolate secondary site from witness and primary site") + siteNetworkFailure(false, false) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + siteNetworkFailure(false, true) + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, vm := range vms { + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + siteNetworkFailure(false, true) + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + +}) diff --git a/tests/e2e/vmservice_utils.go b/tests/e2e/vmservice_utils.go index 8427b9b6ca..4daca33f89 100644 --- a/tests/e2e/vmservice_utils.go +++ b/tests/e2e/vmservice_utils.go @@ -28,19 +28,23 @@ import ( "reflect" "strconv" "strings" + "sync" "time" + "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" "github.com/pkg/sftp" "golang.org/x/crypto/ssh" vmopv1 "github.com/vmware-tanzu/vm-operator/api/v1alpha1" v1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/kubernetes/test/e2e/framework" + fpv "k8s.io/kubernetes/test/e2e/framework/pv" fssh "k8s.io/kubernetes/test/e2e/framework/ssh" ctlrclient "sigs.k8s.io/controller-runtime/pkg/client" cnsnodevmattachmentv1alpha1 "sigs.k8s.io/vsphere-csi-driver/v3/pkg/apis/cnsoperator/cnsnodevmattachment/v1alpha1" @@ -802,3 +806,141 @@ func wait4Pvc2Detach( }) gomega.Expect(waitErr).NotTo(gomega.HaveOccurred()) } + +// createVMServiceVmWithMultiplePvcs +func createVMServiceVmWithMultiplePvcs(ctx context.Context, c ctlrclient.Client, namespace string, vmClass string, + pvcs []*v1.PersistentVolumeClaim, vmi string, storageClassName string, secretName string) []*vmopv1.VirtualMachine { + var vms []*vmopv1.VirtualMachine + for _, pvc := range pvcs { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + vols := []vmopv1.VirtualMachineVolume{} + vmName := fmt.Sprintf("csi-test-vm-%d", r.Intn(10000)) + + vols = append(vols, vmopv1.VirtualMachineVolume{ + Name: pvc.Name, + PersistentVolumeClaim: &vmopv1.PersistentVolumeClaimVolumeSource{ + PersistentVolumeClaimVolumeSource: v1.PersistentVolumeClaimVolumeSource{ClaimName: pvc.Name}, + }, + }) + + vm := vmopv1.VirtualMachine{ + ObjectMeta: metav1.ObjectMeta{Name: vmName, Namespace: namespace}, + Spec: vmopv1.VirtualMachineSpec{ + PowerState: vmopv1.VirtualMachinePoweredOn, + ImageName: vmi, + ClassName: vmClass, + StorageClass: storageClassName, + 
Volumes: vols, + VmMetadata: &vmopv1.VirtualMachineMetadata{Transport: cloudInitLabel, SecretName: secretName}, + }, + } + err := c.Create(ctx, &vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + vms = append(vms, waitNgetVmsvcVM(ctx, c, namespace, vmName)) + } + return vms +} + +// createVMServiceVmInParallel +func createVMServiceVmInParallel(ctx context.Context, c ctlrclient.Client, namespace string, vmClass string, + pvcs []*v1.PersistentVolumeClaim, vmi string, storageClassName string, secretName string, + vmCount int, ch chan *vmopv1.VirtualMachine, wg *sync.WaitGroup, lock *sync.Mutex) { + defer wg.Done() + for i := 0; i < vmCount; i++ { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + vols := []vmopv1.VirtualMachineVolume{} + vmName := fmt.Sprintf("csi-test-vm-%d", r.Intn(10000)) + + vols = append(vols, vmopv1.VirtualMachineVolume{ + Name: pvcs[i].Name, + PersistentVolumeClaim: &vmopv1.PersistentVolumeClaimVolumeSource{ + PersistentVolumeClaimVolumeSource: v1.PersistentVolumeClaimVolumeSource{ClaimName: pvcs[i].Name}, + }, + }) + + vm := vmopv1.VirtualMachine{ + ObjectMeta: metav1.ObjectMeta{Name: vmName, Namespace: namespace}, + Spec: vmopv1.VirtualMachineSpec{ + PowerState: vmopv1.VirtualMachinePoweredOn, + ImageName: vmi, + ClassName: vmClass, + StorageClass: storageClassName, + Volumes: vols, + VmMetadata: &vmopv1.VirtualMachineMetadata{Transport: cloudInitLabel, SecretName: secretName}, + }, + } + err := c.Create(ctx, &vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + lock.Lock() + ch <- &vm + lock.Unlock() + + } +} + +func deleteVMServiceVmInParallel(ctx context.Context, c ctlrclient.Client, vms []*vmopv1.VirtualMachine, namespace string, + wg *sync.WaitGroup) { + + defer wg.Done() + for _, vm := range vms { + err := c.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } +} + +// performVolumeLifecycleActionForVmServiceVM +func performVolumeLifecycleActionForVmServiceVM(ctx context.Context, client clientset.Interface, + vmopC ctlrclient.Client, cnsopC ctlrclient.Client, vmClass string, namespace string, vmi string, + sc *storagev1.StorageClass, secretName string) { + ginkgo.By("Create a PVC") + pvc, err := createPVC(ctx, client, namespace, nil, "", sc, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, []*v1.PersistentVolumeClaim{pvc}, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pv := pvs[0] + volHandle := pv.Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + defer func() { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for CNS volumes to be deleted") + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }() + + ginkgo.By("Creating VM") + vm := createVmServiceVmWithPvcs( + ctx, vmopC, namespace, vmClass, []*v1.PersistentVolumeClaim{pvc}, vmi, sc.Name, secretName) + defer func() { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + vmlbsvc := createService4Vm(ctx, vmopC, 
namespace, vm.Name) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvc})).NotTo(gomega.HaveOccurred()) +} diff --git a/tests/e2e/vsan_stretched_cluster.go b/tests/e2e/vsan_stretched_cluster.go index d74cc24f2f..99481b6056 100644 --- a/tests/e2e/vsan_stretched_cluster.go +++ b/tests/e2e/vsan_stretched_cluster.go @@ -35,6 +35,7 @@ import ( "golang.org/x/crypto/ssh" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" @@ -56,8 +57,10 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f const operationStormScale = 50 var ( client clientset.Interface + svcClient clientset.Interface namespace string nodeList *v1.NodeList + svcNodeList *v1.NodeList storagePolicyName string scParameters map[string]string storageClassName string @@ -73,6 +76,9 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f defaultDatastore *object.Datastore isVsanHealthServiceStopped bool nimbusGeneratedK8sVmPwd string + sc *storagev1.StorageClass + err error + accessmode v1.PersistentVolumeAccessMode ) ginkgo.BeforeEach(func() { @@ -94,6 +100,17 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f framework.ExpectNoError(err, "cluster not completely healthy") // TODO: verify csi pods are up + if guestCluster { + if k8senv := GetAndExpectStringEnvVar("SUPERVISOR_CLUSTER_KUBE_CONFIG"); k8senv != "" { + svcClient, err = createKubernetesClientFromConfig(k8senv) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + svcNodeList, err = fnodes.GetReadySchedulableNodes(ctx, svcClient) + framework.ExpectNoError(err, "Unable to find ready and schedulable Node") + if !(len(svcNodeList.Items) > 0) { + framework.Failf("Unable to find ready and schedulable Node") + } + } nodeList, err = fnodes.GetReadySchedulableNodes(ctx, f.ClientSet) framework.ExpectNoError(err, "Unable to find ready and schedulable Node") @@ -110,12 +127,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f framework.ExpectNoError(client.StorageV1().StorageClasses().Delete(ctx, defaultNginxStorageClassName, *metav1.NewDeleteOptions(0)), "Unable to delete storage class "+defaultNginxStorageClassName) } - scParameters = make(map[string]string) - nodeList, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - framework.ExpectNoError(err, "Unable to list k8s nodes") - if !(len(nodeList.Items) > 0) { - framework.Failf("Unable to find k8s nodes") - } + if os.Getenv("VOLUME_OPS_SCALE") != "" { volumeOpsScale, err = strconv.Atoi(os.Getenv(envVolumeOperationsScale)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -162,6 +174,11 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) } + if rwxAccessMode { + accessmode = v1.ReadWriteMany + } else { + accessmode = v1.ReadWriteOnce + } }) ginkgo.AfterEach(func() { @@ -193,6 +210,14 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 
} } + if supervisorCluster { + dumpSvcNsEventsOnTestFailure(client, namespace) + } + if guestCluster { + svcClient, svNamespace := getSvcClientAndNamespace() + dumpSvcNsEventsOnTestFailure(svcClient, svNamespace) + } + }) ginkgo.JustAfterEach(func() { @@ -221,23 +246,30 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. Bring primary site up and wait for testbed to be back to normal 7. Delete all objects created in step 2 and 5 */ - ginkgo.It("[primary-centric] Primary site down", func() { + ginkgo.It("[primary-centric][csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Primary site down", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var err error + var svcCsipods *v1.PodList - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.By("Creating StorageClass for Statefulset") + if vanillaCluster { + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } ginkgo.By("Creating service") service := CreateService(namespace, client) @@ -245,13 +277,24 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f deleteService(namespace, client, service) }() ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + + if rwxAccessMode { + depReplicaCount = 3 + } else { + depReplicaCount = 1 + } + stsReplicas = 3 + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, stsReplicas, "", depReplicaCount, accessmode) ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + defer func() { scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) @@ -284,22 +327,43 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }() 
ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster || vanillaCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) + volumeLifecycleActions(ctx, client, namespace, sc, "") + // Scale down replicas of statefulset and verify CNS entries for volumes scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) + ssPodsBeforeScaleDown, stsReplicas-1, true, true) // Scale up replicas of statefulset and verify CNS entries for volumes scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + stsReplicas, true, true) + + if rwxAccessMode { + depReplicaCount += 3 + updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil, + nil, "") + } ginkgo.By("Bring up the primary site") if len(fds.hostsDown) > 0 && fds.hostsDown != nil { @@ -307,6 +371,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fds.hostsDown = nil } + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) + err = waitForAllNodes2BeReady(ctx, svcClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Wait for k8s cluster to be healthy") // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) @@ -331,24 +402,33 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 8. Delete statefulsets and its pvcs created in step 2 9. 
Bring primary site up and wait for testbed to be back to normal */ - ginkgo.It("[primary-centric][control-plane-on-primary] Statefulset scale up/down while primary"+ + ginkgo.It("[primary-centric][control-plane-on-primary] [csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Statefulset scale up/down while primary"+ " site goes down", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + var statefulset1, statefulset2 *appsv1.StatefulSet + var err error + var svcCsipods *v1.PodList + + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + + } ginkgo.By("Creating service") service := CreateService(namespace, client) @@ -358,11 +438,29 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", "") - replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", "") + + if rwxAccessMode { + dep1ReplicaCount = 3 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 + } + sts1Replicas = 1 + sts2Replicas = 5 + statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, sts1Replicas, "web", dep1ReplicaCount, accessmode) + statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, + true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessmode) ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - replicas2 := *(statefulset2.Spec.Replicas) + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } defer func() { scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) @@ -384,17 +482,21 @@ var 
_ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if rwxAccessMode { + dep1ReplicaCount += 3 + dep2ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) - replicas1 += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) - fss.UpdateReplicas(ctx, client, statefulset1, replicas1) + } else { + sts1Replicas += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) - replicas2 -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) - fss.UpdateReplicas(ctx, client, statefulset2, replicas2) + sts2Replicas -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) + } ginkgo.By("Bring down the primary site") siteFailover(ctx, true) @@ -408,31 +510,55 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }() ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster && guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, false, true) + if rwxAccessMode { + dep1ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + dep2ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } else { + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + 
scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) - // Scaling up statefulset sts1 - replicas1 += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, true, false) + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) - // Scaling down statefulset sts2 - replicas2 -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, true, false) + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + } scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) @@ -458,6 +584,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fds.hostsDown = nil } + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) + err = waitForAllNodes2BeReady(ctx, svcClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Wait for k8s cluster to be healthy") // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) @@ -628,24 +761,32 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.It("[primary-centric] PVC creation while primary site goes down", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - var pvclaims []*v1.PersistentVolumeClaim - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + var svcCsipods, csipods *v1.PodList + + if vanillaCluster { + scParameters = map[string]string{} + storageClassName = "nginx-sc-thick" + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = 
svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Bring down the primary site while creating pvcs") @@ -683,17 +824,30 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) volumeHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volumeHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volumeHandle).NotTo(gomega.BeEmpty()) + } err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster && guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - // Check if csi pods are running fine after site failure + if guestCluster { + ginkgo.By("Check for csi pods to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running fine after site failure") err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -702,8 +856,20 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f for i := 0; i < volumeOpsScale; i++ { volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } + ginkgo.By("Delete all PVCs created in this test") for _, pvclaim := range pvclaims { err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -725,8 +891,6 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fds.hostsDown = nil } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -747,32 +911,47 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.It("[primary-centric][control-plane-on-primary][distributed] Primary site network isolation", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName 
= "nginx-sc-default" - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList + + if rwxAccessMode { + depReplicaCount = 3 + stsReplicas = 3 + } else { + depReplicaCount = 1 + stsReplicas = 4 + } + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } ginkgo.By("Creating service") service := CreateService(namespace, client) defer func() { deleteService(namespace, client, service) }() + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + false, stsReplicas, "", depReplicaCount, accessmode) ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) - defer func() { pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -782,6 +961,10 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") volumeHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volumeHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volumeHandle).NotTo(gomega.BeEmpty()) + } err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, pollTimeout) errMsg := "The object or item referred to could not be found" @@ -798,25 +981,39 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) // Cause a network failure on primary site ginkgo.By("Isolate primary site from witness and secondary site") siteNetworkFailure(true, false) - defer func() { ginkgo.By("Bring up the primary site before terminating the test") siteNetworkFailure(true, true) }() - ginkgo.By("Wait for k8s cluster 
to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - // Check if csi pods are running fine after network failure + ginkgo.By("Check for nodes to be in Ready state") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster || vanillaCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running fine after site failure") err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -827,50 +1024,30 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - fss.WaitForStatusReadyReplicas(ctx, client, statefulset, replicas) + fss.WaitForStatusReadyReplicas(ctx, client, statefulset, stsReplicas) gomega.Expect(fss.CheckMount(ctx, client, statefulset, mountPath)).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) - // Scale down replicas of statefulset and verify CNS entries for volumes + volumeLifecycleActions(ctx, client, namespace, sc, "") + + ginkgo.By("Performing scaledown operation on statefulset when site is down") + stsReplicas = 2 scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) - // Scale up replicas of statefulset and verify CNS entries for volumes + ssPodsBeforeScaleDown, stsReplicas, true, true) + + ginkgo.By("Performing scaleup operation on statefulset when site is down") + stsReplicas = 6 scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + stsReplicas, true, true) ginkgo.By("Bring up the primary site") siteNetworkFailure(true, true) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Scale down statefulset and deployment after site recovery") scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - volumeHandle := pv.Spec.CSI.VolumeHandle - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - } }) /* @@ -889,20 +1066,25 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f defer cancel() ginkgo.By("Creating StorageClass") // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) for i := 0; i < volumeOpsScale; i++ { framework.Logf("Creating pvc %v", i) pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") @@ -914,6 +1096,17 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f for i := 0; i < volumeOpsScale; i++ { volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } // Get the list of csi pods running in CSI namespace @@ -936,9 +1129,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }() ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) @@ -1122,15 +1319,10 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. Bring primary site up and wait for testbed to be back to normal */ ginkgo.It("[primary-centric] Label updates to PV, PVC, pod while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim + if os.Getenv(envFullSyncWaitTime) != "" { fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) @@ -1139,13 +1331,26 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fullSyncWaitTime = defaultFullSyncWaitTime } - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + var pvclaims []*v1.PersistentVolumeClaim + var svcCsipods, csipods *v1.PodList + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } for i := 0; i < volumeOpsScale; i++ { framework.Logf("Creating pvc") @@ -1159,6 +1364,17 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f for i := 0; i < volumeOpsScale; i++ { volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } defer func() { for _, claim := range pvclaims { @@ -1178,8 +1394,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } }() - // Get the list of csi pods running in CSI 
namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Bring down the primary site while adding labels to PVCs and PVs") @@ -1192,17 +1413,33 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f go siteFailureInParallel(ctx, true, &wg) wg.Wait() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) + ginkgo.By("Check if csi pods are running fine after site failure") + if guestCluster { + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) framework.Logf("Sleeping full-sync interval for volumes to be updated " + "with labels in CNS") time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + ginkgo.By("Check if csi pods are running fine after site failure") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) @@ -1246,11 +1483,6 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Bring up the primary site") siteRestore(true) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) /* PVC creation while secondary site goes down and csi provisioner leader is in secondary site @@ -1770,66 +2002,80 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 7. 
Delete all objects created in step 2 and 5 */ ginkgo.It("[control-plane-on-primary] Secondary site down", func() { + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList + + if rwxAccessMode { + depReplicaCount = 3 + stsReplicas = 3 + } else { + depReplicaCount = 1 + stsReplicas = 4 + } + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - framework.Logf("Creating service") + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Creating service") service := CreateService(namespace, client) defer func() { deleteService(namespace, client, service) }() ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, stsReplicas, "", depReplicaCount, accessmode) ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) - defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) for _, claim := range pvcs.Items { pv := getPvFromClaim(client, namespace, claim.Name) err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) } }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Bring down the secondary site") siteFailover(ctx, false) - defer func() { ginkgo.By("Bring up the secondary site before terminating the test") if len(fds.hostsDown) > 0 && fds.hostsDown != nil { @@ -1838,23 +2084,48 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - // Check if csi pods are running fine after site failure err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) - // Scale down replicas of statefulset and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) - // Scale up replicas of statefulset and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + volumeLifecycleActions(ctx, client, namespace, sc, "") + + // Statefulset and deployments in PodVM might go to Terminating state as + // the nodes attached to these pods might become inaccessible during site failure. + // Hence these steps are validated once the site is restored. 
+ if !supervisorCluster { + ginkgo.By("Performing scaledown operation on statefulset when site is down") + stsReplicas = 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, stsReplicas, true, true) + + ginkgo.By("Performing scaleup operation on statefulset when site is down") + stsReplicas = 6 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + stsReplicas, true, true) + + if rwxAccessMode { + ginkgo.By("Performing scaleup operation on deployment when site is down") + depReplicaCount = 4 + updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil, + nil, "") + } + } ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 && fds.hostsDown != nil { @@ -1862,11 +2133,30 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fds.hostsDown = nil } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if supervisorCluster { + ginkgo.By("Performing scaledown operation on statefulset when site is down") + stsReplicas = 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, stsReplicas, true, true) + + ginkgo.By("Performing scaleup operation on statefulset when site is down") + stsReplicas = 6 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + stsReplicas, true, true) + + if rwxAccessMode { + ginkgo.By("Performing scaleup operation on deployment when site is down") + depReplicaCount = 4 + updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil, + nil, "") + } + } + + ginkgo.By("Scale down statefulset and deployment after site recovery") scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) }) @@ -1896,28 +2186,54 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f // decide which test setup is available to run ginkgo.By("CNS_TEST: Running for vanilla k8s setup") scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName + scParameters[scParamStoragePolicyName] = storagePolicyName storageClassName = "nginx-sc-default" + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + var statefulset1, statefulset2 *appsv1.StatefulSet + var deployment1, deployment2 *appsv1.Deployment + var err error + var svcCsipods *v1.PodList - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if 
guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + framework.Logf("Creating service") service := CreateService(namespace, client) defer func() { deleteService(namespace, client, service) }() + if rwxAccessMode { + dep1ReplicaCount = 1 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 + } + sts1Replicas = 1 + sts2Replicas = 2 + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", "") - replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", - "") + + statefulset1, deployment1, _ = createStsDeployment(ctx, client, namespace, sc, true, true, sts1Replicas, "web", dep1ReplicaCount, accessmode) + + statefulset2, deployment2, _ = createStsDeployment(ctx, client, namespace, sc, true, true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessmode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) replicas2 := *(statefulset2.Spec.Replicas) @@ -1925,13 +2241,18 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - replicas1 += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) - fss.UpdateReplicas(ctx, client, statefulset1, replicas1) - - replicas2 -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) - fss.UpdateReplicas(ctx, client, statefulset2, replicas2) + sts1Replicas += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + + sts2Replicas -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } ginkgo.By("Isolate secondary site from witness and primary site") siteNetworkFailure(false, false) @@ -1945,9 +2266,19 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }() ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) @@ -1956,23 +2287,45 @@ var _ = 
ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") // Scale up replicas of statefulset1 and verify CNS entries for volumes scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, false, true) + sts1Replicas, false, true) // Scale down replicas of statefulset2 and verify CNS entries for volumes scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, ss2PodsBeforeScaleDown, replicas2, false, true) + if rwxAccessMode { + dep1ReplicaCount += 2 + updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount -= 2 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } + ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) + volumeLifecycleActions(ctx, client, namespace, sc, accessmode) // Scaling up statefulset sts1 - replicas1 += 2 + sts1Replicas += 2 scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, true, false) + sts1Replicas, true, false) // Scaling down statefulset sts2 - replicas2 -= 2 + sts2Replicas -= 2 scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, true, false) + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + + if rwxAccessMode { + dep1ReplicaCount += 2 + updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount -= 2 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } ginkgo.By("Bring up the secondary site by removing network failure") if len(fds.hostsPartitioned) > 0 && fds.hostsPartitioned != nil { @@ -1986,10 +2339,9 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) + volumeLifecycleActions(ctx, client, namespace, sc, accessmode) scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - }) /* @@ -2006,23 +2358,36 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.It("[distributed] Witness failure", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" + + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList + + if rwxAccessMode { + depReplicaCount = 3 + stsReplicas = 3 + } else { + depReplicaCount = 1 + stsReplicas = 4 + } + + ginkgo.By("Creating service") service := CreateService(namespace, client) defer func() { deleteService(namespace, client, service) }() - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + 
ginkgo.By("Get csi pods list before bringing down witness host") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Bring down witness host") toggleWitnessPowerState(ctx, true) - defer func() { ginkgo.By("Bring up the witness host before terminating the test") if fds.witnessDown != "" { @@ -2030,30 +2395,67 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Check if csi pods are running fine after witness failure") + if guestCluster { + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Check if csi pods are running fine after witness failure") err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, false, 0, "", - "") - replicas := *(statefulset.Spec.Replicas) + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, stsReplicas, "", depReplicaCount, accessmode) ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - // Scale down replicas of statefulset and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + ginkgo.By("Check storage compliance") comp := checkVmStorageCompliance(client, storagePolicyName) if !comp { framework.Failf("Expected VM and storage compliance to be false but found true") @@ -2064,20 +2466,37 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f toggleWitnessPowerState(ctx, false) } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) + volumeLifecycleActions(ctx, client, namespace, sc, "") - ginkgo.By(fmt.Sprintf("Scaling up statefulset: %v to number of Replica: %v", - statefulset.Name, replicas)) - // Scale up replicas of statefulset and verify CNS entries for volumes + ginkgo.By("Performing scaledown operation on statefulset") + stsReplicas = 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, stsReplicas, true, true) + + ginkgo.By("Performing scaleup operation on statefulset") + stsReplicas = 6 scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + stsReplicas, true, true) + + if rwxAccessMode { + ginkgo.By("Performing scaleup operation on deployment") + depReplicaCount = 4 + updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil, + nil, "") + } + + ginkgo.By("Check storage compliance") + comp = checkVmStorageCompliance(client, storagePolicyName) + if !comp { + framework.Failf("Expected VM and storage compliance to be true but found false") + } + ginkgo.By("Scale down statefulset and deployment after site recovery") scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) }) @@ -2133,11 +2552,11 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", - "") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessmode) replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", - "") + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessmode) ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) replicas2 := *(statefulset2.Spec.Replicas) @@ -2488,11 +2907,11 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan 
stretched cluster tests", f ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", - "") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessmode) replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", - "") + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessmode) ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) replicas2 := *(statefulset2.Spec.Replicas) @@ -2640,11 +3059,11 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", - "") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessmode) replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", - "") + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessmode) ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) replicas2 := *(statefulset2.Spec.Replicas) @@ -2692,9 +3111,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }() ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) @@ -2746,7 +3169,6 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) /* @@ -2761,8 +3183,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 7. Create 50 new PVCs 8. Wait for secondary site VMs to come up and k8s to be healthy 9. Verify all stateful sets have scaled up/down successfully - 10. Scale down first 50 sts to 2 replicas - 11. Scale up second 50 statefulsets to 1 replica + 10. Scale down first 50 sts to 2 replicas. + 11. Scale up second 50 statefulsets to 1 replica. 12. Verify all stateful sets have scaled up/down successfully 13. Delete all stateful sets 14. 
Delete all PVCs @@ -2802,8 +3224,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f for i := 0; i < operationStormScale; i++ { statefulsetName := prefix1 + strconv.Itoa(i) framework.Logf("Creating statefulset: %s", statefulsetName) - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, statefulsetName, - "") + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, statefulsetName, 1, + accessmode) replicas1 = *(statefulset.Spec.Replicas) stsList = append(stsList, statefulset) } @@ -2812,8 +3234,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f for i := 0; i < operationStormScale; i++ { statefulsetName := prefix2 + strconv.Itoa(i) framework.Logf("Creating statefulset: %s", statefulsetName) - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 2, statefulsetName, - "") + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 2, statefulsetName, 1, + accessmode) replicas2 = *(statefulset.Spec.Replicas) stsList = append(stsList, statefulset) } @@ -2879,7 +3301,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f pvcList = append(pvcList, v) } }() - framework.Logf("ch is %v", ch) + go siteFailureInParallel(ctx, false, &wg) wg.Wait() close(ch) @@ -2967,8 +3389,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f // wait for the VMs to move back err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + /* Partial failure of secondary site Steps: @@ -2976,7 +3398,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 2. Create a statefulset, deployment with volumes from the stretched datastore 3. Bring down a esx server in the secondary site 4. Verify that the VMs on the esx server which was brought down are started up on the - other esx servers in the secondary site + other esx servers in the secondary site. 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume and application lifecycle actions work fine 6. 
Restore secondary site back up and wait for testbed to be back to normal @@ -3008,7 +3430,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 0, "", "") + false, 3, "", 1, accessmode) + ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) replicas := *(statefulset.Spec.Replicas) csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) @@ -3065,7 +3488,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc) + volumeLifecycleActions(ctx, client, namespace, sc, "") // Scale down replicas of statefulset and verify CNS entries for volumes scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, ssPodsBeforeScaleDown, replicas-1, true, true) @@ -3085,7 +3508,6 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - }) /* @@ -3369,29 +3791,29 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f }) /* - Site failover during full sync - Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication - 2. Create 6 PVCs with reclaim policy Delete and 8 with reclaim policy Retain - and wait for them to be bound - 3. Delete four PVCs with reclaim policy Retain - 4. Delete two PVs reclaim policy Retain related to PVC used in step 3 - 5. Create two pods using PVCs with reclaim policy Delete - 6. Bring vsan-health service down - 7. Create two pods with two PVCs each - 8. Create two static PVs with disk left after step 4 - 9. Create two PVCs to bind to PVs with reclaim policy Retain - 10. Delete four PVCs with reclaim policy Retain different from the ones used in step 3 and 9 - 11. Delete two PVs reclaim policy Retain related to PVC used in step 10 - 12. Add labels to all PVs, PVCs - 13. Bring vsan-health service up when full sync is triggered - 14. Bring down primary site - 15. Verify that the VMs on the primary site are started up on the other esx servers - in the secondary site - 16. Wait for full sync - 17. Verify CNS entries - 18. Delete all pods, PVCs and PVs - 19. Bring primary site up and wait for testbed to be back to normal + Site failover during full sync + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create 6 PVCs with reclaim policy Delete and 8 with reclaim policy Retain + and wait for them to be bound + 3. Delete four PVCs with reclaim policy Retain + 4. Delete two PVs reclaim policy Retain related to PVC used in step 3 + 5. Create two pods using PVCs with reclaim policy Delete + 6. Bring vsan-health service down + 7. Create two pods with two PVCs each + 8. Create two static PVs with disk left after step 4 + 9. Create two PVCs to bind to PVs with reclaim policy Retain + 10. Delete four PVCs with reclaim policy Retain different from the ones used in step 3 and 9 + 11. Delete two PVs reclaim policy Retain related to PVC used in step 10 + 12. Add labels to all PVs, PVCs + 13. Bring vsan-health service up when full sync is triggered + 14. Bring down primary site + 15. 
Verify that the VMs on the primary site are started up on the other esx servers + in the secondary site + 16. Wait for full sync + 17. Verify CNS entries + 18. Delete all pods, PVCs and PVs + 19. Bring primary site up and wait for testbed to be back to normal */ ginkgo.It("[primary-centric][distributed] Primary site failover during full sync when syncer"+ @@ -3401,29 +3823,41 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Creating StorageClass") // decide which test setup is available to run ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-delete" + var pods []*v1.Pod var pvclaimsWithDelete, pvclaimsWithRetain []*v1.PersistentVolumeClaim var volHandles []string + var scRetain, scDelete *storagev1.StorageClass - framework.Logf("Ensuring %s leader is in primary site", syncerContainerName) - err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, true) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - scRetain, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + framework.Logf("Ensuring %s leader is in primary site", syncerContainerName) + err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, true) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - scDelete, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scRetain, err = createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) + scSpec := getVSphereStorageClassSpec("nginx-sc-delete", scParameters, nil, "", "", false) + scDelete, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scRetain, err = createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + scSpec := getVSphereStorageClassSpec("nginx-sc-delete", scParameters, nil, "", "", false) + scDelete, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } for i := 0; i < 6; i++ { framework.Logf("Creating pvc %v with reclaim policy Delete", i) @@ -3651,9 +4085,13 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan 
stretched cluster tests", f }() ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } // Check if csi pods are running fine after site failure err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) @@ -4047,4 +4485,228 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) + + /* + PSOD hosts on secondary site + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create two statefulset with replica count 1(sts1) and 5(sts2) respectively using a thick provision policy + and wait for all replicas to be running + 3. Change replica count of sts1 and sts2 to 3 + 4. Bring down primary site + 5. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 6. Verify there were no issue with replica scale up/down and verify pod entry in CNS volumemetadata for the + volumes associated with the PVC used by statefulsets are updated + 7. Change replica count of sts1 to 5 a sts2 to 1 and verify they are successful + 8. Delete statefulsets and its pvcs created in step 2 + 9. Bring primary site up and wait for testbed to be back to normal + */ + ginkgo.It("PSOD hosts on secondary site", func() { + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var svcCsipods, csipods *v1.PodList + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() + + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") + + if rwxAccessMode { + dep1ReplicaCount = 3 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 + } + sts1Replicas = 1 + sts2Replicas = 5 + statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, sts1Replicas, "web", dep1ReplicaCount, accessmode) + statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, + true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessmode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + + defer 
func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if rwxAccessMode { + dep1ReplicaCount += 3 + dep2ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } else { + sts1Replicas += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + + sts2Replicas -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) + } + + ginkgo.By("Bring down the secondary site") + psodHostsOnSite(false, "600") + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running fine after site recovery") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Statefulset and deployments in PodVM might got to Terminating state as + // the nodes attached to these pods might become inaccessible during site failure. + // Hence validating these steps once site is restored back. 
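The comment above relies on the workloads settling again once the failed site is restored, before any further scale operations are attempted. A minimal sketch of the kind of wait this implies, assuming it lives in this e2e package (the helper name waitForDeploymentReady is illustrative and not part of this patch; it only uses client-go calls and imports the package already has):

// waitForDeploymentReady blocks until all desired replicas of a deployment
// exist and report ready, or the timeout expires. Intended to be called after
// the failed site is restored, before issuing further scale operations.
func waitForDeploymentReady(ctx context.Context, client clientset.Interface,
	namespace, name string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		dep, err := client.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		// Consider the deployment settled only when the desired number of
		// replicas exist and all of them report ready.
		if dep.Status.Replicas == *dep.Spec.Replicas &&
			dep.Status.ReadyReplicas == *dep.Spec.Replicas {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("deployment %s/%s not ready within %v", namespace, name, timeout)
		}
		time.Sleep(10 * time.Second)
	}
}
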
+ if !supervisorCluster { + + if rwxAccessMode { + dep1ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } else { + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) + + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) + + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + } + + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if supervisorCluster { + + if rwxAccessMode { + dep1ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount += 3 + updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } else { + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) + + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) + + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + } + + } + + }) + }) diff --git a/tests/e2e/vsan_stretched_cluster_utils.go b/tests/e2e/vsan_stretched_cluster_utils.go index 5470859eeb..11791b459f 100644 --- a/tests/e2e/vsan_stretched_cluster_utils.go +++ b/tests/e2e/vsan_stretched_cluster_utils.go @@ -30,6 +30,7 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" "github.com/vmware/govmomi/find" + "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/vsan" vsantypes "github.com/vmware/govmomi/vsan/types" "golang.org/x/crypto/ssh" @@ -175,6 +176,21 @@ func createFaultDomainMap(ctx context.Context, vs *vSphere) map[string]string { finder.SetDatacenter(dc) hosts, err := finder.HostSystemList(ctx, "*") gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if !vanillaCluster { + 
hostsInVsanStretchCluster := []*object.HostSystem{} + for _, host := range hosts { + hostInfo := host.Common.InventoryPath + hostIpInfo := strings.Split(hostInfo, "/") + hostCluster := hostIpInfo[len(hostIpInfo)-2] + if !strings.Contains(hostCluster, "EdgeMgmtCluster") { + hostsInVsanStretchCluster = append(hostsInVsanStretchCluster, host) + } + + } + hosts = hostsInVsanStretchCluster + } + for _, host := range hosts { vsanSystem, _ := host.ConfigManager().VsanSystem(ctx) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -663,7 +679,7 @@ func checkVmStorageCompliance(client clientset.Interface, storagePolicy string) // statefulset, deployment and volumes of statfulset created func createStsDeployment(ctx context.Context, client clientset.Interface, namespace string, sc *storagev1.StorageClass, isDeploymentRequired bool, modifyStsSpec bool, - replicaCount int32, stsName string, + stsReplica int32, stsName string, depReplicaCount int32, accessMode v1.PersistentVolumeAccessMode) (*appsv1.StatefulSet, *appsv1.Deployment, []string) { var pvclaims []*v1.PersistentVolumeClaim if accessMode == "" { @@ -680,7 +696,7 @@ func createStsDeployment(ctx context.Context, client clientset.Interface, namesp statefulset.Name = stsName statefulset.Spec.Template.Labels["app"] = statefulset.Name statefulset.Spec.Selector.MatchLabels["app"] = statefulset.Name - *(statefulset.Spec.Replicas) = replicaCount + *(statefulset.Spec.Replicas) = stsReplica } CreateStatefulSet(namespace, statefulset, client) replicas := *(statefulset.Spec.Replicas) @@ -705,9 +721,24 @@ func createStsDeployment(ctx context.Context, client clientset.Interface, namesp volumesBeforeScaleDown = append(volumesBeforeScaleDown, pv.Spec.CSI.VolumeHandle) // Verify the attached volume match the one in CNS cache if !multivc { - err := verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, - volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + volHandle := getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := pv.Spec.CSI.VolumeHandle + pvcName := volumespec.PersistentVolumeClaim.ClaimName + + pvclaim, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, + pvcName, metav1.GetOptions{}) + gomega.Expect(pvclaim).NotTo(gomega.BeNil()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaim, + pv, &sspod) + } else { + err := verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, + volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } else { err := verifyVolumeMetadataInCNSForMultiVC(&multiVCe2eVSphere, pv.Spec.CSI.VolumeHandle, volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) @@ -730,7 +761,7 @@ func createStsDeployment(ctx context.Context, client clientset.Interface, namesp labelsMap := make(map[string]string) labelsMap["app"] = "test" deployment, err := createDeployment( - ctx, client, 1, labelsMap, nil, namespace, pvclaims, "", false, busyBoxImageOnGcr) + ctx, client, depReplicaCount, labelsMap, nil, namespace, pvclaims, "", false, busyBoxImageOnGcr) gomega.Expect(err).NotTo(gomega.HaveOccurred()) deployment, err = client.AppsV1().Deployments(namespace).Get(ctx, deployment.Name, metav1.GetOptions{}) @@ -749,7 +780,15 @@ func createStsDeployment(ctx context.Context, client 
clientset.Interface, namesp // volumeLifecycleActions creates pvc and pod and waits for them to be in healthy state and then deletes them func volumeLifecycleActions(ctx context.Context, client clientset.Interface, namespace string, - sc *storagev1.StorageClass) { + sc *storagev1.StorageClass, accessMode v1.PersistentVolumeAccessMode) { + + var vmUUID string + var exists bool + + if accessMode == "" { + // If accessMode is not specified, set the default accessMode. + accessMode = v1.ReadWriteOnce + } if rwxAccessMode { pvc1, err := createPVC(ctx, client, namespace, nil, "", sc, v1.ReadWriteMany) @@ -796,10 +835,27 @@ func volumeLifecycleActions(ctx context.Context, client clientset.Interface, nam gomega.Expect(err).NotTo(gomega.HaveOccurred()) volHandle := pvs[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(pvs[0].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + pod1, err := createPod(ctx, client, namespace, nil, []*v1.PersistentVolumeClaim{pvc1}, false, execCommand) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - vmUUID := getNodeUUID(ctx, client, pod1.Spec.NodeName) + if vanillaCluster { + vmUUID = getNodeUUID(ctx, client, pod1.Spec.NodeName) + framework.Logf("VMUUID : %s", vmUUID) + } else if guestCluster { + vmUUID, err = getVMUUIDFromNodeName(pod1.Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("VMUUID : %s", vmUUID) + } else if supervisorCluster { + annotations := pod1.Annotations + vmUUID, exists = annotations[vmUUIDLabel] + gomega.Expect(exists).To(gomega.BeTrue(), fmt.Sprintf("Pod doesn't have %s annotation", vmUUIDLabel)) + framework.Logf("VMUUID : %s", vmUUID) + } framework.Logf("VMUUID : %s", vmUUID) isDiskAttached, err := e2eVSphere.isVolumeAttachedToVM(client, volHandle, vmUUID) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -811,6 +867,13 @@ func volumeLifecycleActions(ctx context.Context, client clientset.Interface, nam []string{"/bin/cat", "/mnt/volume1/fstype"}, "", time.Minute) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster && rwxAccessMode { + ginkgo.By("Verifying whether the CnsFileAccessConfig CRD is created or not for Pod1") + //verifyCNSFileAccessConfigCRDInSupervisor(ctx, f, pod1.Spec.NodeName+"-"+volHandle, + // crdCNSFileAccessConfig, crdVersion, crdGroup, true) + deletePodAndWaitForVolsToDetach(ctx, client, pod1) + } + deletePodAndWaitForVolsToDetach(ctx, client, pod1) err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc1.Name, namespace) @@ -870,7 +933,7 @@ func scaleDownStsAndVerifyPodMetadata(ctx context.Context, client clientset.Inte gomega.Expect(isDiskDetached).To(gomega.BeTrue(), fmt.Sprintf("Volume %q is not detached from the node %q", pv.Spec.CSI.VolumeHandle, sspod.Spec.NodeName)) - } else { + } else if supervisorCluster { annotations := sspod.Annotations vmUUID, exists := annotations[vmUUIDLabel] gomega.Expect(exists).To(gomega.BeTrue(), @@ -885,6 +948,15 @@ func scaleDownStsAndVerifyPodMetadata(ctx context.Context, client clientset.Inte fmt.Sprintf( "PodVM with vmUUID: %s still exists. 
So volume: %s is not detached from the PodVM", vmUUID, sspod.Spec.NodeName)) + } else { + + ginkgo.By("Verify volume is detached from the node") + isDiskDetached, err := e2eVSphere.waitForVolumeDetachedFromNode(client, + pv.Spec.CSI.VolumeHandle, sspod.Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(isDiskDetached).To(gomega.BeTrue(), + fmt.Sprintf("Volume %q is not detached from the node %q", pv.Spec.CSI.VolumeHandle, sspod.Spec.NodeName)) + } } } @@ -898,9 +970,24 @@ func scaleDownStsAndVerifyPodMetadata(ctx context.Context, client clientset.Inte for _, volumespec := range sspod.Spec.Volumes { if volumespec.PersistentVolumeClaim != nil { pv := getPvFromClaim(client, statefulset.Namespace, volumespec.PersistentVolumeClaim.ClaimName) - err := verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, - volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + volHandle := getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := pv.Spec.CSI.VolumeHandle + pvcName := volumespec.PersistentVolumeClaim.ClaimName + + pvclaim, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, + pvcName, metav1.GetOptions{}) + gomega.Expect(pvclaim).NotTo(gomega.BeNil()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaim, + pv, &sspod) + } else { + + err := verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, + volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } } } @@ -942,29 +1029,50 @@ func scaleUpStsAndVerifyPodMetadata(ctx context.Context, client clientset.Interf pv := getPvFromClaim(client, statefulset.Namespace, volumespec.PersistentVolumeClaim.ClaimName) framework.Logf(fmt.Sprintf("Verify volume: %s is attached to the node: %s", pv.Spec.CSI.VolumeHandle, sspod.Spec.NodeName)) - var vmUUID string + var vmUUID, volHandle string var exists bool ctx, cancel := context.WithCancel(context.Background()) defer cancel() + volHandle = pv.Spec.CSI.VolumeHandle if vanillaCluster { vmUUID = getNodeUUID(ctx, client, sspod.Spec.NodeName) - } else { + } else if supervisorCluster { annotations := pod.Annotations vmUUID, exists = annotations[vmUUIDLabel] gomega.Expect(exists).To( gomega.BeTrue(), fmt.Sprintf("Pod doesn't have %s annotation", vmUUIDLabel)) _, err := e2eVSphere.getVMByUUID(ctx, vmUUID) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + vmUUID, err = getVMUUIDFromNodeName(sspod.Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("VMUUID : %s", vmUUID) + volHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) } if !rwxAccessMode { - isDiskAttached, err := e2eVSphere.isVolumeAttachedToVM(client, pv.Spec.CSI.VolumeHandle, vmUUID) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Disk is not attached to the node") + isDiskAttached, err := e2eVSphere.isVolumeAttachedToVM(client, volHandle, vmUUID) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(isDiskAttached).To(gomega.BeTrue(), "Disk is not attached") } framework.Logf("After scale up, verify the attached volumes match those in CNS Cache") - err = verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, - volumespec.PersistentVolumeClaim.ClaimName, 
pv.ObjectMeta.Name, sspod.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + volHandle := getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := pv.Spec.CSI.VolumeHandle + pvcName := volumespec.PersistentVolumeClaim.ClaimName + + pvclaim, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, + pvcName, metav1.GetOptions{}) + gomega.Expect(pvclaim).NotTo(gomega.BeNil()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaim, + pv, &sspod) + } else { + err = verifyVolumeMetadataInCNS(&e2eVSphere, pv.Spec.CSI.VolumeHandle, + volumespec.PersistentVolumeClaim.ClaimName, pv.ObjectMeta.Name, sspod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } } } @@ -1172,3 +1280,21 @@ func checkForEventWithMessage(client clientset.Interface, namespace string, } return eventFound } + +// psodHostsOnSite executes PSOD operation on the hosts of the given site +func psodHostsOnSite(primarySite bool, psodTimeout string) { + hosts := fds.secondarySiteHosts + if primarySite { + hosts = fds.primarySiteHosts + } + + for _, host := range hosts { + err := psodHost(host, psodTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } +} + +// psodHostsInParallel executes PSOD operation on the hosts of the given site as part of a +// waitgroup, so that it can run as a goroutine alongside other test operations +func psodHostsInParallel(primarySite bool, psodTimeout string, wg *sync.WaitGroup) { + defer wg.Done() + psodHostsOnSite(primarySite, psodTimeout) +}
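
For reference, psodHostsInParallel is meant to be launched as a goroutine, mirroring how siteFailureInParallel is driven with a WaitGroup in the tests above. A minimal usage sketch, assuming the surrounding e2e test context of this patch (ctx, client, statefulset1, sts1Replicas and the wait helpers come from the test body, not from this snippet):

// Sketch: PSOD the secondary site in the background while the test keeps
// driving workload churn, then wait for the PSOD goroutine and the cluster
// to settle before verifying CNS metadata.
var wg sync.WaitGroup
wg.Add(1)

// false selects the secondary site; "600" matches the PSOD timeout used by
// the "PSOD hosts on secondary site" test above.
go psodHostsInParallel(false, "600", &wg)

// Continue workload operations while the hosts are going down.
sts1Replicas += 2
fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas)

wg.Wait()

// Give the cluster time to recover before any CNS metadata verification.
err = waitForAllNodes2BeReady(ctx, client)
gomega.Expect(err).NotTo(gomega.HaveOccurred())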