From 3b1eca9fb32adcf2e4dd8f5da342ae514e010c80 Mon Sep 17 00:00:00 2001 From: Aishwarya-Hebbar Date: Sun, 15 Sep 2024 23:40:09 +0530 Subject: [PATCH] vsan stretch with CSI snapshot testcases --- tests/e2e/util.go | 26 + tests/e2e/vm_service_vsan_stretch_cluster.go | 679 +++++++++++++++ tests/e2e/vsan_stretched_cluster.go | 851 +++++++++++++++++++ tests/e2e/vsan_stretched_cluster_utils.go | 75 ++ 4 files changed, 1631 insertions(+) diff --git a/tests/e2e/util.go b/tests/e2e/util.go index 2456b8deb9..5088f98627 100644 --- a/tests/e2e/util.go +++ b/tests/e2e/util.go @@ -7026,3 +7026,29 @@ func removeStoragePolicyQuota(ctx context.Context, restClientConfig *rest.Config framework.Logf("Quota after removing: %s", spq.Spec.Limit) } + +func createMultipleDeployments(ctx context.Context, client clientset.Interface, namespace string, + depCount int, pvcList []*v1.PersistentVolumeClaim) []*appsv1.Deployment { + + framework.Logf("Creating Deployment") + var deploymentList []*appsv1.Deployment + for i := 0; i < depCount; i++ { + labelsMap := make(map[string]string) + labelsMap["app"] = "test" + deployment, err := createDeployment( + ctx, client, 1, labelsMap, nil, namespace, []*v1.PersistentVolumeClaim{pvcList[i]}, "", false, busyBoxImageOnGcr) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + deployment, err = client.AppsV1().Deployments(namespace).Get(ctx, deployment.Name, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + pods, err := fdep.GetPodsForDeployment(ctx, client, deployment) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pod := pods.Items[0] + err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + deploymentList = append(deploymentList, deployment) + + } + return deploymentList +} diff --git a/tests/e2e/vm_service_vsan_stretch_cluster.go b/tests/e2e/vm_service_vsan_stretch_cluster.go index 928a41978f..0d977f5439 100644 --- a/tests/e2e/vm_service_vsan_stretch_cluster.go +++ b/tests/e2e/vm_service_vsan_stretch_cluster.go @@ -20,7 +20,9 @@ import ( "context" "fmt" "os" + "strconv" "strings" + "sync" "time" "github.com/onsi/ginkgo/v2" @@ -28,10 +30,13 @@ import ( vmopv1 "github.com/vmware-tanzu/vm-operator/api/v1alpha1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" + e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl" fnodes "k8s.io/kubernetes/test/e2e/framework/node" fpod "k8s.io/kubernetes/test/e2e/framework/pod" fpv "k8s.io/kubernetes/test/e2e/framework/pv" @@ -39,6 +44,9 @@ import ( ctlrclient "sigs.k8s.io/controller-runtime/pkg/client" cnsop "sigs.k8s.io/vsphere-csi-driver/v3/pkg/apis/cnsoperator" + + snapV1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1" + snapclient "github.com/kubernetes-csi/external-snapshotter/client/v6/clientset/versioned" ) var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests", func() { @@ -63,6 +71,9 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests isSPSserviceStopped bool vcAddress string nodeList *v1.NodeList + snapc *snapclient.Clientset + guestClusterRestConfig *rest.Config + pandoraSyncWaitTime int ) ginkgo.BeforeEach(func() { @@ -122,6 +133,23 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests namespace, vmImageName) vmi = waitNGetVmiForImageName(ctx, vmopC, namespace, vmImageName) gomega.Expect(vmi).NotTo(gomega.BeEmpty()) + + if !guestCluster { + restConfig = getRestConfigClient() + snapc, err = snapclient.NewForConfig(restConfig) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + guestClusterRestConfig = getRestConfigClientForGuestCluster(guestClusterRestConfig) + snapc, err = snapclient.NewForConfig(guestClusterRestConfig) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if os.Getenv(envPandoraSyncWaitTime) != "" { + pandoraSyncWaitTime, err = strconv.Atoi(os.Getenv(envPandoraSyncWaitTime)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + pandoraSyncWaitTime = defaultPandoraSyncWaitTime + } }) ginkgo.AfterEach(func() { @@ -296,4 +324,655 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) + + /* + VMService - Volume snapshot creation while secondary site goes down + Steps: + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify that it goes to bound state. + 2. Create VMService VM with each PVC created in step1. + 3. While VMService VM creation is going on, bring down the primary site by powering off the hosts in primary site in parallel. + 4. Verify that the supervisor cluster should be in running and ready state after site failover. + 5. Verify that all the PVCs created in step 2 are running fine. + 6. Perform volume lifecycle actions which should work fine. + 7. Bring primary site up and wait for testbed to be back to normal. + 8. Delete all objects created in the test. + */ + ginkgo.It("VMService - Volume snapshot creation while secondary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var vmCount = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + var pvclaimsList []*v1.PersistentVolumeClaim + var volHandles []string + var volumeSnapshotList []*snapV1.VolumeSnapshot + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + for i := 0; i < pvcCount; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + _, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + /*for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + }*/ + } + + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ch := make(chan *snapV1.VolumeSnapshot) + var wg sync.WaitGroup + var lock *sync.Mutex + ginkgo.By("Creating volumeSnapshot in parallel to secondary site failure") + wg.Add(vmCount) + go createDynamicSnapshotInParallel(ctx, namespace, snapc, + pvclaimsList, volumeSnapshotClass.Name, ch, lock, &wg) + go func() { + for v := range ch { + volumeSnapshotList = append(volumeSnapshotList, v) + } + }() + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, + vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Verify volume snapshot is created") + for i, volumeSnapshot := range volumeSnapshotList { + volumeSnapshot, err = waitForVolumeSnapshotReadyToUse(*snapc, ctx, namespace, volumeSnapshot.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if volumeSnapshot.Status.RestoreSize.Cmp(resource.MustParse(diskSize)) != 0 { + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "unexpected restore size") + } + + ginkgo.By("Verify volume snapshot content is created") + snapshotContent, err := snapc.SnapshotV1().VolumeSnapshotContents().Get(ctx, + *volumeSnapshot.Status.BoundVolumeSnapshotContentName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + snapshotContent, err = waitForVolumeSnapshotContentReadyToUse(*snapc, ctx, snapshotContent.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Get volume snapshot ID from snapshot handle") + snapshotId, err := getVolumeSnapshotIdFromSnapshotHandle(ctx, snapshotContent) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Query CNS and check the volume snapshot entry") + err = waitForCNSSnapshotToBeCreated(volHandles[i], snapshotId) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify restore volume from snapshot is successfull") + verifyVolumeRestoreOperation(ctx, client, namespace, sc, volumeSnapshot, diskSize, true) + + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotId) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService - Restore volume from snapshot while secondary site goes down + Steps: + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify that it goes to bound state. + 2. Create VMService VM with each PVC created in step1. + 3. While VMService VM creation is going on, bring down the primary site by powering off the hosts in primary site in parallel. + 4. Verify that the supervisor cluster should be in running and ready state after site failover. + 5. Verify that all the PVCs created in step 2 are running fine. + 6. Perform volume lifecycle actions which should work fine. + 7. Bring primary site up and wait for testbed to be back to normal. + 8. Delete all objects created in the test. + */ + ginkgo.It("VMService - Restore volume from snapshot while secondary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + var pvclaimsList, restoreVolList []*v1.PersistentVolumeClaim + var volHandles []string + var volumeSnapshotList []*snapV1.VolumeSnapshot + var snapshotIds []string + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + for i := 0; i < pvcCount; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Create a dynamic volume snapshot") + for i, pvclaim := range pvclaimsList { + volumeSnapshot, _, _, + _, snapshotId, err := createDynamicVolumeSnapshot(ctx, namespace, snapc, volumeSnapshotClass, + pvclaim, volHandles[i], diskSize, true) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeSnapshotList = append(volumeSnapshotList, volumeSnapshot) + //snapshotContents = append(snapshotContents, snapshotContent) + snapshotIds = append(snapshotIds, snapshotId) + + } + defer func() { + for i, volumeSnapshot := range volumeSnapshotList { + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while creating pvcs") + var ch chan *v1.PersistentVolumeClaim + var wg sync.WaitGroup + var lock *sync.Mutex + wg.Add(2) + go restoreVolumeFromSnapshotInParallel(ctx, client, namespace, sc, volumeSnapshotList, ch, lock, &wg) + go func() { + for v := range ch { + restoreVolList = append(restoreVolList, v) + } + }() + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, + vmClass, namespace, vmi, sc, secretName) + + for i, _ := range volumeSnapshotList { + + persistentvolumes2, err := fpv.WaitForPVClaimBoundPhase(ctx, client, + []*v1.PersistentVolumeClaim{restoreVolList[i]}, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volHandle2 := persistentvolumes2[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle2 = getVolumeIDFromSupervisorCluster(volHandle2) + } + gomega.Expect(volHandle2).NotTo(gomega.BeEmpty()) + + // Create a Pod to use this PVC, and verify volume has been attached + ginkgo.By("Creating pod to attach PV to the node") + pod, err := createPod(ctx, client, namespace, nil, + []*v1.PersistentVolumeClaim{restoreVolList[i]}, false, execRWXCommandPod1) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + var vmUUID string + nodeName := pod.Spec.NodeName + + if vanillaCluster { + vmUUID = getNodeUUID(ctx, client, pod.Spec.NodeName) + } else if guestCluster { + vmUUID, err = getVMUUIDFromNodeName(pod.Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By(fmt.Sprintf("Verify volume: %s is attached to the node: %s", volHandle2, nodeName)) + isDiskAttached, err := e2eVSphere.isVolumeAttachedToVM(client, volHandle2, vmUUID) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(isDiskAttached).To(gomega.BeTrue(), "Volume is not attached to the node") + + ginkgo.By("Verify the volume is accessible and Read/write is possible") + cmd := []string{"exec", pod.Name, "--namespace=" + namespace, "--", "/bin/sh", "-c", + "cat /mnt/volume1/Pod1.html "} + output := e2ekubectl.RunKubectlOrDie(namespace, cmd...) + gomega.Expect(strings.Contains(output, "Hello message from Pod1")).NotTo(gomega.BeFalse()) + + wrtiecmd := []string{"exec", pod.Name, "--namespace=" + namespace, "--", "/bin/sh", "-c", + "echo 'Hello message from test into Pod1' > /mnt/volume1/Pod1.html"} + e2ekubectl.RunKubectlOrDie(namespace, wrtiecmd...) + output = e2ekubectl.RunKubectlOrDie(namespace, cmd...) + gomega.Expect(strings.Contains(output, "Hello message from test into Pod1")).NotTo(gomega.BeFalse()) + } + + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService VM snapshot deletion while primary site goes down¯ + Steps: + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify that it goes to bound state. + 2. Create VMService VM with each PVC created in step1. + 3. While VMService VM creation is going on, bring down the primary site by powering off the hosts in primary site in parallel. + 4. Verify that the supervisor cluster should be in running and ready state after site failover. + 5. Verify that all the PVCs created in step 2 are running fine. + 6. Perform volume lifecycle actions which should work fine. + 7. Bring primary site up and wait for testbed to be back to normal. + 8. Delete all objects created in the test. + */ + ginkgo.It("VMService VM snapshot deletion while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + var vmlbsvcs []*vmopv1.VirtualMachineService + var pvclaimsList []*v1.PersistentVolumeClaim + var volHandles, snapshotIds []string + var volumeSnapshotList []*snapV1.VolumeSnapshot + + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + for i := 0; i < pvcCount; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + vmlbsvcs = append(vmlbsvcs, vmlbsvc) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Create a dynamic volume snapshot") + for i, pvclaim := range pvclaimsList { + volumeSnapshot, _, _, + _, snapshotId, err := createDynamicVolumeSnapshot(ctx, namespace, snapc, volumeSnapshotClass, + pvclaim, volHandles[i], diskSize, true) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeSnapshotList = append(volumeSnapshotList, volumeSnapshot) + //snapshotContents = append(snapshotContents, snapshotContent) + snapshotIds = append(snapshotIds, snapshotId) + + } + defer func() { + for i, volumeSnapshot := range volumeSnapshotList { + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + var wg sync.WaitGroup + ginkgo.By("Deleting volumeSnapshot in parallel to primary site failure") + wg.Add(2) + go deleteVolumeSnapshotInParallel(ctx, namespace, snapc, volumeSnapshotList, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + defer func() { + for _, volumeSnapshot := range volumeSnapshotList { + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, + volumeSnapshot.Name, pandoraSyncWaitTime) + } + }() + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, + vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Verify all volume snapshots are deleted") + for i, volumeSnapshot := range volumeSnapshotList { + framework.Logf("Wait until the volume snapshot content is deleted") + err = waitForVolumeSnapshotContentToBeDeleted(*snapc, ctx, *volumeSnapshot.Status.BoundVolumeSnapshotContentName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Verify snapshot entry %v is deleted from CNS for volume %v", snapshotIds[i], volHandles[i]) + err = waitForCNSSnapshotToBeDeleted(volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Verify snapshot entry is deleted from CNS") + err = verifySnapshotIsDeletedInCNS(volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Deleting volume snapshot again to check 'Not found' error") + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, volumeSnapshot.Name, pandoraSyncWaitTime) + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) }) diff --git a/tests/e2e/vsan_stretched_cluster.go b/tests/e2e/vsan_stretched_cluster.go index 7db6da129d..774474c7c0 100644 --- a/tests/e2e/vsan_stretched_cluster.go +++ b/tests/e2e/vsan_stretched_cluster.go @@ -37,10 +37,13 @@ import ( v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" fdep "k8s.io/kubernetes/test/e2e/framework/deployment" + e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl" fnodes "k8s.io/kubernetes/test/e2e/framework/node" fpod "k8s.io/kubernetes/test/e2e/framework/pod" fpv "k8s.io/kubernetes/test/e2e/framework/pv" @@ -48,6 +51,9 @@ import ( admissionapi "k8s.io/pod-security-admission/api" cnsoperatorv1alpha1 "sigs.k8s.io/vsphere-csi-driver/v3/pkg/apis/cnsoperator" k8s "sigs.k8s.io/vsphere-csi-driver/v3/pkg/kubernetes" + + snapV1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1" + snapclient "github.com/kubernetes-csi/external-snapshotter/client/v6/clientset/versioned" ) var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", func() { @@ -76,6 +82,8 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f defaultDatastore *object.Datastore isVsanHealthServiceStopped bool nimbusGeneratedK8sVmPwd string + snapc *snapclient.Clientset + guestClusterRestConfig *rest.Config sc *storagev1.StorageClass accessMode v1.PersistentVolumeAccessMode ) @@ -182,6 +190,16 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f accessMode = v1.ReadWriteOnce } + if !guestCluster { + restConfig = getRestConfigClient() + snapc, err = snapclient.NewForConfig(restConfig) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + guestClusterRestConfig = getRestConfigClientForGuestCluster(guestClusterRestConfig) + snapc, err = snapclient.NewForConfig(guestClusterRestConfig) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }) ginkgo.AfterEach(func() { @@ -4313,4 +4331,837 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) + + /* + Dynamic volume snapshot creation while primary site goes down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through + 3. Bring down primary site + 4. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 5. Verify that the PVCs created in step 2 is bound successfully + 6. Bring primary site up and wait for testbed to be back to normal + 7. Delete PVCs created in step 2 + */ + ginkgo.It("[primary-centric] Dynamic volume snapshot creation while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + var sc *storagev1.StorageClass + var err error + var volHandles []string + var pvclaimsList []*v1.PersistentVolumeClaim + var svcCsipods *v1.PodList + + if vanillaCluster { + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Create multiple PVCs") + + for i := 0; i < volumeOpsScale; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + + framework.Logf("pvclaimList: %v", pvclaimsList) + + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + deploymentList := createMultipleDeployments(ctx, client, namespace, len(pvclaimsList)/2, pvclaimsList) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the primary site while creating volume snapshots") + var wg sync.WaitGroup + ch := make(chan *snapV1.VolumeSnapshot) + var volumeSnapshotList []*snapV1.VolumeSnapshot + lock := &sync.Mutex{} + wg.Add(2) + go createDynamicSnapshotInParallel(ctx, namespace, snapc, pvclaimsList, + volumeSnapshotClass.Name, ch, lock, &wg) + go func() { + for v := range ch { + volumeSnapshotList = append(volumeSnapshotList, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + defer func() { + for _, volumeSnapshot := range volumeSnapshotList { + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, volumeSnapshot.Name, pandoraSyncWaitTime) + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, dep := range deploymentList { + err = fdep.WaitForDeploymentComplete(client, dep) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Verify volume snapshot is created") + for i, volumeSnapshot := range volumeSnapshotList { + volumeSnapshot, err = waitForVolumeSnapshotReadyToUse(*snapc, ctx, namespace, volumeSnapshot.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if volumeSnapshot.Status.RestoreSize.Cmp(resource.MustParse(diskSize)) != 0 { + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "unexpected restore size") + } + + ginkgo.By("Verify volume snapshot content is created") + snapshotContent, err := snapc.SnapshotV1().VolumeSnapshotContents().Get(ctx, + *volumeSnapshot.Status.BoundVolumeSnapshotContentName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + snapshotContent, err = waitForVolumeSnapshotContentReadyToUse(*snapc, ctx, snapshotContent.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Get volume snapshot ID from snapshot handle") + snapshotId, err := getVolumeSnapshotIdFromSnapshotHandle(ctx, snapshotContent) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Query CNS and check the volume snapshot entry") + err = waitForCNSSnapshotToBeCreated(volHandles[i], snapshotId) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify restore volume from snapshot is successfull") + verifyVolumeRestoreOperation(ctx, client, namespace, sc, volumeSnapshot, diskSize, true) + + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotId) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + Delete volume snapshot creation while primary site goes down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through + 3. Bring down primary site + 4. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 5. Verify that the PVCs created in step 2 is bound successfully + 6. Bring primary site up and wait for testbed to be back to normal + 7. Delete PVCs created in step 2 + */ + ginkgo.It("[primary-centric] Delete volume snapshot creation while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + var sc *storagev1.StorageClass + var err error + var volHandles []string + var volumeSnapshotList []*snapV1.VolumeSnapshot + var snapshotContents []*snapV1.VolumeSnapshotContent + var snapshotIds []string + var pvclaimsList []*v1.PersistentVolumeClaim + var svcCsipods *v1.PodList + + if vanillaCluster { + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Create multiple PVCs") + for i := 0; i < volumeOpsScale; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + deploymentList := createMultipleDeployments(ctx, client, namespace, len(pvclaimsList)/2, pvclaimsList) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + ginkgo.By("Create a dynamic volume snapshot") + for i, pvclaim := range pvclaimsList { + volumeSnapshot, snapshotContent, _, + _, snapshotId, err := createDynamicVolumeSnapshot(ctx, namespace, snapc, volumeSnapshotClass, + pvclaim, volHandles[i], diskSize, true) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeSnapshotList = append(volumeSnapshotList, volumeSnapshot) + snapshotContents = append(snapshotContents, snapshotContent) + snapshotIds = append(snapshotIds, snapshotId) + + } + defer func() { + for i, volumeSnapshot := range volumeSnapshotList { + + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + }() + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while creating pvcs") + var wg sync.WaitGroup + wg.Add(2) + go deleteVolumeSnapshotInParallel(ctx, namespace, snapc, volumeSnapshotList, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + defer func() { + for _, volumeSnapshot := range volumeSnapshotList { + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, volumeSnapshot.Name, pandoraSyncWaitTime) + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, dep := range deploymentList { + err = fdep.WaitForDeploymentComplete(client, dep) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Verify volume snapshot is delete") + for i, volumeSnapshot := range volumeSnapshotList { + framework.Logf("Wait until the volume snapshot content is deleted") + err = waitForVolumeSnapshotContentToBeDeleted(*snapc, ctx, *volumeSnapshot.Status.BoundVolumeSnapshotContentName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Verify snapshot entry %v is deleted from CNS for volume %v", snapshotIds[i], volHandles[i]) + err = waitForCNSSnapshotToBeDeleted(volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Verify snapshot entry is deleted from CNS") + err = verifySnapshotIsDeletedInCNS(volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + framework.Logf("Deleting volume snapshot again to check 'Not found' error") + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, volumeSnapshot.Name, pandoraSyncWaitTime) + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + Restore volume from snapshot while primary site goes down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through + 3. Bring down primary site + 4. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 5. Verify that the PVCs created in step 2 is bound successfully + 6. Bring primary site up and wait for testbed to be back to normal + 7. Delete PVCs created in step 2 + */ + ginkgo.It("[primary-centric] Restore volume from snapshot while primary site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + var sc *storagev1.StorageClass + var err error + var volHandles []string + var volumeSnapshotList []*snapV1.VolumeSnapshot + var snapshotIds []string + var pvclaimsList, restoreVolList []*v1.PersistentVolumeClaim + var svcCsipods *v1.PodList + + if vanillaCluster { + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Create multiple PVCs") + for i := 0; i < volumeOpsScale; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + ginkgo.By("Create volume snapshot class") + volumeSnapshotClass, err := createVolumeSnapshotClass(ctx, snapc, deletionPolicy) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + if vanillaCluster { + err = snapc.SnapshotV1().VolumeSnapshotClasses().Delete(ctx, volumeSnapshotClass.Name, + metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + deploymentList := createMultipleDeployments(ctx, client, namespace, len(pvclaimsList)/2, pvclaimsList) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + ginkgo.By("Create a dynamic volume snapshot") + for i, pvclaim := range pvclaimsList { + volumeSnapshot, _, _, + _, snapshotId, err := createDynamicVolumeSnapshot(ctx, namespace, snapc, volumeSnapshotClass, + pvclaim, volHandles[i], diskSize, true) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeSnapshotList = append(volumeSnapshotList, volumeSnapshot) + //snapshotContents = append(snapshotContents, snapshotContent) + snapshotIds = append(snapshotIds, snapshotId) + + } + defer func() { + for i, volumeSnapshot := range volumeSnapshotList { + ginkgo.By("Delete dynamic volume snapshot") + _, _, err = deleteVolumeSnapshot(ctx, snapc, namespace, + volumeSnapshot, pandoraSyncWaitTime, volHandles[i], snapshotIds[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + }() + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the primary site while creating pvcs") + var ch chan *v1.PersistentVolumeClaim + var wg sync.WaitGroup + lock := &sync.Mutex{} + wg.Add(2) + go restoreVolumeFromSnapshotInParallel(ctx, client, namespace, sc, volumeSnapshotList, ch, lock, &wg) + go func() { + for v := range ch { + restoreVolList = append(restoreVolList, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + defer func() { + for _, volumeSnapshot := range volumeSnapshotList { + deleteVolumeSnapshotWithPandoraWait(ctx, snapc, namespace, volumeSnapshot.Name, pandoraSyncWaitTime) + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + _, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, dep := range deploymentList { + err = fdep.WaitForDeploymentComplete(client, dep) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Verify volume snapshot is created") + for i, _ := range volumeSnapshotList { + + persistentvolumes2, err := fpv.WaitForPVClaimBoundPhase(ctx, client, + []*v1.PersistentVolumeClaim{restoreVolList[i]}, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volHandle2 := persistentvolumes2[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle2 = getVolumeIDFromSupervisorCluster(volHandle2) + } + gomega.Expect(volHandle2).NotTo(gomega.BeEmpty()) + + // Create a Pod to use this PVC, and verify volume has been attached + ginkgo.By("Creating pod to attach PV to the node") + pod, err := createPod(ctx, client, namespace, nil, + []*v1.PersistentVolumeClaim{restoreVolList[i]}, false, execRWXCommandPod1) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + var vmUUID string + nodeName := pod.Spec.NodeName + + if vanillaCluster { + vmUUID = getNodeUUID(ctx, client, pod.Spec.NodeName) + } else if guestCluster { + vmUUID, err = getVMUUIDFromNodeName(pod.Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By(fmt.Sprintf("Verify volume: %s is attached to the node: %s", volHandle2, nodeName)) + isDiskAttached, err := e2eVSphere.isVolumeAttachedToVM(client, volHandle2, vmUUID) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(isDiskAttached).To(gomega.BeTrue(), "Volume is not attached to the node") + + ginkgo.By("Verify the volume is accessible and Read/write is possible") + cmd := []string{"exec", pod.Name, "--namespace=" + namespace, "--", "/bin/sh", "-c", + "cat /mnt/volume1/Pod1.html "} + output := e2ekubectl.RunKubectlOrDie(namespace, cmd...) + gomega.Expect(strings.Contains(output, "Hello message from Pod1")).NotTo(gomega.BeFalse()) + + wrtiecmd := []string{"exec", pod.Name, "--namespace=" + namespace, "--", "/bin/sh", "-c", + "echo 'Hello message from test into Pod1' > /mnt/volume1/Pod1.html"} + e2ekubectl.RunKubectlOrDie(namespace, wrtiecmd...) + output = e2ekubectl.RunKubectlOrDie(namespace, cmd...) + gomega.Expect(strings.Contains(output, "Hello message from test into Pod1")).NotTo(gomega.BeFalse()) + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + Statefulset scale up/down while primary site goes down + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create two statefulset with replica count 1(sts1) and 5(sts2) respectively using a thick provision policy + and wait for all replicas to be running + 3. Change replica count of sts1 and sts2 to 3 + 4. Bring down primary site + 5. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site + 6. Verify there were no issue with replica scale up/down and verify pod entry in CNS volumemetadata for the + volumes associated with the PVC used by statefulsets are updated + 7. Change replica count of sts1 to 5 a sts2 to 1 and verify they are successful + 8. Delete statefulsets and its pvcs created in step 2 + 9. Bring primary site up and wait for testbed to be back to normal + + ginkgo.It("[primary-centric][control-plane-on-primary]"+ + "[csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Statefulset scale up/down while primary"+ + " site goes down", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + var err error + var svcCsipods *v1.PodList + var standalonePvcs []*v1.PersistentVolumeClaim + + + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } + + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() + + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") + + sts1Replicas = 3 + sts2Replicas = 3 + statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, sts1Replicas, "web", dep1ReplicaCount, accessMode) + statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, + true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + + ginkgo.By("Create multiple PVCs") + for i := 0; i < pvcCount; i++ { + pvclaim, persistentVolumes := createPVCAndQueryVolumeInCNS(ctx, client, namespace, nil, "", + diskSize, sc, true) + pvclaimsList = append(pvclaimsList, pvclaim) + volHandle := persistentVolumes[0].Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + } + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + volHandles = append(volHandles, volHandle) + + defer func() { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + standalonePvcs=append(standalonePvcs, pvclaim) + } + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + ginkgo.By("Bring down the primary site") + var wg sync.WaitGroup + wg.Add(2) + go expandVolumeInParallel(client, allPVCs, "10Gi",&wg) + go siteFailover(ctx, true) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster && guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, "") + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) + err = waitForAllNodes2BeReady(ctx, svcClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) + + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) + + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + + } + })*/ }) diff --git a/tests/e2e/vsan_stretched_cluster_utils.go b/tests/e2e/vsan_stretched_cluster_utils.go index 7a41ac698f..07aa2d7fd3 100644 --- a/tests/e2e/vsan_stretched_cluster_utils.go +++ b/tests/e2e/vsan_stretched_cluster_utils.go @@ -38,6 +38,7 @@ import ( v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgtypes "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" @@ -52,6 +53,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" triggercsifullsyncv1alpha1 "sigs.k8s.io/vsphere-csi-driver/v3/pkg/internalapis/cnsoperator/triggercsifullsync/v1alpha1" + + snapV1 "github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1" + snapclient "github.com/kubernetes-csi/external-snapshotter/client/v6/clientset/versioned" ) const ( @@ -1283,3 +1287,74 @@ func checkForEventWithMessage(client clientset.Interface, namespace string, } return eventFound } + +// createDynamicSnapshotInParallel +func createDynamicSnapshotInParallel(ctx context.Context, namespace string, + snapc *snapclient.Clientset, pvcList []*v1.PersistentVolumeClaim, volumeSnapshotClassName string, + ch chan *snapV1.VolumeSnapshot, lock *sync.Mutex, wg *sync.WaitGroup) { + defer wg.Done() + ginkgo.By("Create a volume snapshot") + for _, pvc := range pvcList { + snapshot, err := snapc.SnapshotV1().VolumeSnapshots(namespace).Create(ctx, + getVolumeSnapshotSpec(namespace, volumeSnapshotClassName, pvc.Name), metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + lock.Lock() + ch <- snapshot + lock.Unlock() + framework.Logf("Volume snapshot name is : %s", snapshot.Name) + } + +} + +// deleteDynamicSnapshotInParallel +func deleteVolumeSnapshotInParallel(ctx context.Context, namespace string, + snapc *snapclient.Clientset, volumeSnapshotList []*snapV1.VolumeSnapshot, wg *sync.WaitGroup) { + defer wg.Done() + ginkgo.By("Delete a volume snapshot") + for _, volumeSnapshot := range volumeSnapshotList { + err := snapc.SnapshotV1().VolumeSnapshots(namespace).Delete(ctx, + volumeSnapshot.Name, metav1.DeleteOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("%s is successfully deleted", volumeSnapshot.Name) + } +} + +// restoreVolumeFromSnapshotInParallel +func restoreVolumeFromSnapshotInParallel(ctx context.Context, client clientset.Interface, namespace string, + sc *storagev1.StorageClass, volumeSnapshotList []*snapV1.VolumeSnapshot, + ch chan *v1.PersistentVolumeClaim, lock *sync.Mutex, wg *sync.WaitGroup) { + defer wg.Done() + ginkgo.By("Create PVC from snapshot") + for _, volumeSnapshot := range volumeSnapshotList { + pvcSpec := getPersistentVolumeClaimSpecWithDatasource(namespace, diskSize, sc, nil, + v1.ReadWriteOnce, volumeSnapshot.Name, snapshotapigroup) + + pvc, err := fpv.CreatePVC(ctx, client, namespace, pvcSpec) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + lock.Lock() + ch <- pvc + lock.Unlock() + framework.Logf("PVC created name is : %s", pvc.Name) + } + +} + +func expandVolumeInParallel(client clientset.Interface, pvclaims []*v1.PersistentVolumeClaim, wg *sync.WaitGroup, resizeValue string) { + + defer wg.Done() + for _, pvclaim := range pvclaims { + ginkgo.By("Expanding current pvc") + currentPvcSize := pvclaim.Spec.Resources.Requests[v1.ResourceStorage] + newSize := currentPvcSize.DeepCopy() + newSize.Add(resource.MustParse(resizeValue)) + framework.Logf("currentPvcSize %v, newSize %v", currentPvcSize, newSize) + pvclaim, err := expandPVCSize(pvclaim, newSize, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvclaim).NotTo(gomega.BeNil()) + + pvcSize := pvclaim.Spec.Resources.Requests[v1.ResourceStorage] + if pvcSize.Cmp(newSize) != 0 { + framework.Failf("error updating pvc size %q", pvclaim.Name) + } + } +}