Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Stretch cluster] Minor tweaks and correction in netsplit test #10580

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 42 additions & 32 deletions ocs_ci/ocs/resources/stretchcluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,28 +202,31 @@ def check_for_read_pause(self, label, start_time, end_time):

"""
paused = 0
max_fail_expected = len(self.workload_map[label][0]) - 2
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
try:
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
continue
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
except CommandFailed:
if failed <= max_fail_expected:
failed += 1
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
raise
return paused

@retry(CommandFailed, tries=6, delay=10)
Expand All @@ -241,13 +244,14 @@ def check_for_write_pause(self, label, start_time, end_time):

"""
paused = 0
max_fail_expected = (
len(self.workload_map[label][0]) - 2
if label == constants.LOGWRITER_CEPHFS_LABEL
else 1
)
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
)
continue
excepted = 0
no_such_file_expected = 1
for file_name in self.logfile_map[label][0]:
pause_count = 0
try:
Expand All @@ -269,13 +273,16 @@ def check_for_write_pause(self, label, start_time, end_time):
"No such file or directory" in err.args[0]
and label == constants.LOGWRITER_RBD_LABEL
):
if excepted == 0:
if no_such_file_expected == 1:
logger.info(
f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
)
excepted += 1
no_such_file_expected += 1
else:
raise UnexpectedBehaviour
failed += 1
elif failed <= max_fail_expected:
failed += 1
else:
raise

Expand Down Expand Up @@ -437,7 +444,7 @@ def check_for_data_loss(self, label):
return True

@retry(CommandFailed, tries=15, delay=5)
def check_ceph_accessibility(self, timeout, delay=5, grace=120):
def check_ceph_accessibility(self, timeout, delay=60, grace=180):
"""
Check for ceph access for the 'timeout' seconds

Expand Down Expand Up @@ -469,7 +476,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
if "TimeoutExpired" in err.args[0]:
logger.error("Ceph status check got timed out. maybe ceph is hung.")
return False
elif "connect: no route to host" in err.args[0]:
elif (
"connect: no route to host" in err.args[0]
or "error dialing backend" in err.args[0]
):
ceph_tools_pod.delete(wait=False)
raise

Expand All @@ -484,7 +494,7 @@ def get_out_of_quorum_nodes(self):
# find out the mons in quorum
ceph_tools_pod = pod.get_ceph_tools_pod()

@retry(CommandFailed, tries=10, delay=10)
@retry(CommandFailed, tries=8, delay=5)
def _get_non_quorum_mons():
"""
Get non quorum mon pods
Expand Down Expand Up @@ -634,7 +644,7 @@ def cephfs_failure_checks(
self.check_for_read_pause(
constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
)
== 0
<= 2
), "Read operations are paused for CephFS workloads even for the ones in available zones"
logger.info("All read operations are successful for CephFs workload")

Expand All @@ -653,7 +663,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
start_time,
end_time,
)
== 0
<= 1
), "Write operations paused for RBD workloads even for the ones in available zone"
logger.info("all write operations are successful for RBD workloads")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,47 @@ def finalizer():
request.addfinalizer(finalizer)


@pytest.fixture(scope="class")
def setup_cnv_workload(request, cnv_workload, setup_cnv):
    """
    Set up a CNV (virtual machine) workload and seed it with test data.

    Creates a VM backed by a PVC volume, writes ~100 MiB of zeros to
    /file_1.txt inside the VM and records its md5sum. The finalizer then
    verifies post-failure data integrity (checksum unchanged), confirms new
    data can still be written, copies the original file back to the local
    machine, and finally stops the VM.

    Args:
        request: pytest fixture-request object used to register the finalizer
        cnv_workload: factory fixture that provisions the CNV VM workload
        setup_cnv: fixture ensuring CNV is installed/configured

    """
    logger.info("Setting up CNV workload and creating some data")
    vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)[0]
    # write ~100 MiB of data and capture its checksum for later comparison
    vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
    md5sum_before = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")

    def finalizer():

        # check vm data written before the failure for integrity
        logger.info("Waiting for VM SSH connectivity!")
        vm_obj.wait_for_ssh_connectivity()
        md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
        assert (
            md5sum_before == md5sum_after
        ), "Data integrity of the file inside VM is not maintained during the failure"
        logger.info(
            "Data integrity of the file inside VM is maintained during the failure"
        )

        # check if new data can be created
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
        )
        logger.info("Successfully created new data inside VM")

        # check if the data can be copied back to local machine
        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
        logger.info("VM data is successfully copied back to local machine")

        # stop the VM
        vm_obj.stop()
        # fixed typo in the log message: "Stoped" -> "Stopped"
        logger.info("Stopped the VM successfully")

    request.addfinalizer(finalizer)


@turquoise_squad
@stretchcluster_required
@pytest.mark.usefixtures("setup_cnv_workload")
@pytest.mark.usefixtures("setup_logwriter_workloads")
class TestMonAndOSDFailures:
"""
Expand Down
21 changes: 8 additions & 13 deletions tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import time
import ocpnetsplit

from ocs_ci.utility.retry import retry
from ocs_ci.framework.pytest_customization.marks import (
turquoise_squad,
tier1,
Expand All @@ -13,7 +12,7 @@
recover_workload_pods_post_recovery,
recover_from_ceph_stuck,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour, CommandFailed
from ocs_ci.ocs.exceptions import UnexpectedBehaviour

from ocs_ci.ocs.resources.stretchcluster import StretchCluster
from ocs_ci.ocs.exceptions import CephHealthException
Expand Down Expand Up @@ -87,10 +86,11 @@ def finalizer():
argvalues=[
pytest.param(
constants.NETSPLIT_DATA_1_DATA_2,
15,
30,
marks=[
pytest.mark.polarion_id("OCS-5069"),
pytest.mark.polarion_id("OCS-5071"),
pytest.mark.bugzilla("2265992"),
],
),
pytest.param(
Expand Down Expand Up @@ -196,16 +196,9 @@ def test_netsplit(
)
logger.info(f"Netsplit induced at {start_time} for zones {zones}")

# get the nodes which are present in the
# out of quorum zone
if (
zones != constants.NETSPLIT_ARBITER_DATA_1
or zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
):
retry(CommandFailed, tries=5, delay=10)(sc_obj.get_out_of_quorum_nodes)()

# note the end time (UTC)
if not sc_obj.check_ceph_accessibility(timeout=(duration * 60)):
# check for ceph accessibility and note the end time (UTC)
timeout = (end_time - datetime.now(timezone.utc)).total_seconds()
if not sc_obj.check_ceph_accessibility(timeout=int(timeout)):
assert recover_from_ceph_stuck(
sc_obj
), "Something went wrong. not expected. please check rook-ceph logs"
Expand All @@ -216,6 +209,8 @@ def test_netsplit(
logger.info(f"Ended netsplit at {end_time}")

# check vm data written before the failure for integrity
logger.info("Waiting for VM SSH connectivity!")
vm_obj.wait_for_ssh_connectivity()
md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
assert (
md5sum_before == md5sum_after
Expand Down
Loading