test_failover_and_relocate[primary_down-rbd] failed during lastgroupsynctime #10868

Open
Shilpi-Das1 opened this issue Nov 15, 2024 · 0 comments · May be fixed by #10874
@Shilpi-Das1

tests/functional/disaster-recovery/regional-dr/test_failover_and_relocate.py::TestFailoverAndRelocate::test_failover_and_relocate[primary_down-rbd] is failing with the error below.
This test case failed on the ppc64le architecture.
Version details:
OCP: 4.17.3
ODF: 4.17.0-126
ACM: 2.12
Gitops: 1.14.1
OADP: 1.4.1
submariner: 0.19.0
volsync: 0.11
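
For manual verification, the lastGroupSyncTime reported in the log can be read straight from the DRPC resource on the hub cluster. Below is a minimal sketch using the Kubernetes Python client; the `minutes_since_last_group_sync` helper is hypothetical (not part of ocs-ci), and the Ramen group/version/plural names are assumptions:

```python
# Hypothetical helper, not part of ocs-ci: reads status.lastGroupSyncTime from a
# DRPC resource on the hub and returns the minutes elapsed since that sync.
# Group/version/plural are assumed Ramen defaults and may differ in your setup.
from datetime import datetime, timezone

from kubernetes import client as k8s_client, config as k8s_config


def minutes_since_last_group_sync(drpc_name, namespace):
    k8s_config.load_kube_config()  # kubeconfig pointing at the hub cluster
    api = k8s_client.CustomObjectsApi()
    drpc = api.get_namespaced_custom_object(
        group="ramendr.openshift.io",
        version="v1alpha1",
        namespace=namespace,
        plural="drplacementcontrols",
        name=drpc_name,
    )
    last_sync = drpc["status"]["lastGroupSyncTime"]  # e.g. "2024-11-15T04:42:05Z"
    ts = datetime.strptime(last_sync, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    return (datetime.now(timezone.utc) - ts).total_seconds() / 60
```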

23:54:19 - MainThread - ocs_ci.ocs.resources.drpc - INFO - C[rdr-hub1-417] - Current lastGroupSyncTime is 2024-11-15T04:42:05Z.
23:54:19 - MainThread - ocs_ci.helpers.dr_helpers - INFO - C[rdr-hub1-417] - Verified: Current lastGroupSyncTime 2024-11-15T04:42:05Z is different from previous value 2024-11-15T04:28:02Z
23:54:19 - MainThread - ocs_ci.helpers.dr_helpers - INFO - C[rdr-hub1-417] - Time in minutes since the last sync 12.233333333333333
23:54:19 - MainThread - ocs_ci.framework.pytest_customization.reports - INFO - C[rdr-hub1-417] - duration reported by tests/functional/disaster-recovery/regional-dr/test_failover_and_relocate.py::TestFailoverAndRelocate::test_failover_and_relocate[primary_down-rbd] immediately after test execution: 1797.95
FAILED
_____ TestFailoverAndRelocate.test_failover_and_relocate[primary_down-rbd] _____

self = <test_failover_and_relocate.TestFailoverAndRelocate object at 0x7ffe8e85acd0>
primary_cluster_down = True, pvc_interface = 'CephBlockPool'
setup_acm_ui = None
dr_workload = <function dr_workload.<locals>.factory at 0x7ffe8f0c1ca0>
nodes_multicluster = [<ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8f156ee0>, <ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8f156760>, <ocs_ci.ocs.platform_nodes.IBMPowerNodes object at 0x7ffe8d916fa0>]
node_restart_teardown = None

    @pytest.mark.parametrize(
        argnames=["primary_cluster_down", "pvc_interface"],
        argvalues=[
            pytest.param(
                False,
                constants.CEPHBLOCKPOOL,
                marks=pytest.mark.polarion_id(polarion_id_primary_up),
                id="primary_up-rbd",
            ),
            pytest.param(
                True,
                constants.CEPHBLOCKPOOL,
                marks=pytest.mark.polarion_id(polarion_id_primary_down),
                id="primary_down-rbd",
            ),
            pytest.param(
                False,
                constants.CEPHFILESYSTEM,
                marks=pytest.mark.polarion_id(polarion_id_primary_up_cephfs),
                id="primary_up-cephfs",
            ),
            pytest.param(
                True,
                constants.CEPHFILESYSTEM,
                marks=pytest.mark.polarion_id(polarion_id_primary_down_cephfs),
                id="primary_down-cephfs",
            ),
        ],
    )
    def test_failover_and_relocate(
        self,
        primary_cluster_down,
        pvc_interface,
        setup_acm_ui,
        dr_workload,
        nodes_multicluster,
        node_restart_teardown,
    ):
        """
        Tests to verify application failover when the primary cluster is either UP or DOWN and relocate between managed
        clusters.
        This test is also compatible to be run from ACM UI,
        pass the yaml conf/ocsci/dr_ui.yaml to trigger it.
    
        """
        if config.RUN.get("rdr_failover_via_ui"):
            acm_obj = AcmAddClusters()
    
        workloads = dr_workload(
            num_of_subscription=1, num_of_appset=1, pvc_interface=pvc_interface
        )
        drpc_subscription = DRPC(namespace=workloads[0].workload_namespace)
        drpc_appset = DRPC(
            namespace=constants.GITOPS_CLUSTER_NAMESPACE,
            resource_name=f"{workloads[1].appset_placement_name}-drpc",
        )
        drpc_objs = [drpc_subscription, drpc_appset]
    
        primary_cluster_name = dr_helpers.get_current_primary_cluster_name(
            workloads[0].workload_namespace
        )
        config.switch_to_cluster_by_name(primary_cluster_name)
        primary_cluster_index = config.cur_index
        primary_cluster_nodes = get_node_objs()
        secondary_cluster_name = dr_helpers.get_current_secondary_cluster_name(
            workloads[0].workload_namespace
        )
    
        if pvc_interface == constants.CEPHFILESYSTEM:
            # Verify the creation of ReplicationDestination resources on secondary cluster
            config.switch_to_cluster_by_name(secondary_cluster_name)
            for wl in workloads:
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )
    
        scheduling_interval = dr_helpers.get_scheduling_interval(
            workloads[0].workload_namespace
        )
        wait_time = 2 * scheduling_interval  # Time in minutes
        logger.info(f"Waiting for {wait_time} minutes to run IOs")
        sleep(wait_time * 60)
    
        for obj in drpc_objs:
            before_failover_last_group_sync_time = (
                dr_helpers.verify_last_group_sync_time(obj, scheduling_interval)
            )
        logger.info("Verified lastGroupSyncTime before failover.")
    
        if config.RUN.get("rdr_failover_via_ui"):
            logger.info("Start the process of Failover from ACM UI")
            config.switch_acm_ctx()
            dr_submariner_validation_from_ui(acm_obj)
    
        # Stop primary cluster nodes
        if primary_cluster_down:
            config.switch_to_cluster_by_name(primary_cluster_name)
            logger.info(f"Stopping nodes of primary cluster: {primary_cluster_name}")
            nodes_multicluster[primary_cluster_index].stop_nodes(primary_cluster_nodes)
    
            # Verify if cluster is marked unavailable on ACM console
            if config.RUN.get("rdr_failover_via_ui"):
                config.switch_acm_ctx()
                check_cluster_status_on_acm_console(
                    acm_obj,
                    down_cluster_name=primary_cluster_name,
                    expected_text="Unknown",
                )
        elif config.RUN.get("rdr_failover_via_ui"):
            check_cluster_status_on_acm_console(acm_obj)
    
        for wl in workloads:
            if config.RUN.get("rdr_failover_via_ui"):
                # Failover via ACM UI
                failover_relocate_ui(
                    acm_obj,
                    scheduling_interval=scheduling_interval,
                    workload_to_move=f"{wl.workload_name}-1",
                    policy_name=wl.dr_policy_name,
                    failover_or_preferred_cluster=secondary_cluster_name,
                )
            else:
                # Failover action via CLI
                dr_helpers.failover(
                    secondary_cluster_name,
                    wl.workload_namespace,
                    wl.workload_type,
                    wl.appset_placement_name
                    if wl.workload_type == constants.APPLICATION_SET
                    else None,
                )
    
        # Verify resources creation on secondary cluster (failoverCluster)
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in workloads:
            dr_helpers.wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )
    
        # Verify resources deletion from primary cluster
        config.switch_to_cluster_by_name(primary_cluster_name)
    
        # Start nodes if cluster is down
        if primary_cluster_down:
            logger.info(
                f"Waiting for {wait_time} minutes before starting nodes of primary cluster: {primary_cluster_name}"
            )
            sleep(wait_time * 60)
            nodes_multicluster[primary_cluster_index].start_nodes(primary_cluster_nodes)
            wait_for_nodes_status([node.name for node in primary_cluster_nodes])
            logger.info("Wait for 180 seconds for pods to stabilize")
            sleep(180)
            logger.info(
                "Wait for all the pods in openshift-storage to be in running state"
            )
            assert wait_for_pods_to_be_running(
                timeout=720
            ), "Not all the pods reached running state"
            logger.info("Checking for Ceph Health OK")
            ceph_health_check()
    
        for wl in workloads:
            dr_helpers.wait_for_all_resources_deletion(wl.workload_namespace)
    
        if pvc_interface == constants.CEPHFILESYSTEM:
            for wl in workloads:
                # Verify the deletion of ReplicationDestination resources on secondary cluster
                config.switch_to_cluster_by_name(secondary_cluster_name)
                dr_helpers.wait_for_replication_destinations_deletion(
                    wl.workload_namespace
                )
                # Verify the creation of ReplicationDestination resources on primary cluster
                config.switch_to_cluster_by_name(primary_cluster_name)
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )
    
        if pvc_interface == constants.CEPHBLOCKPOOL:
            dr_helpers.wait_for_mirroring_status_ok(
                replaying_images=sum([wl.workload_pvc_count for wl in workloads])
            )
    
        after_failover_last_group_sync_time = []
        for obj in drpc_objs:
            after_failover_last_group_sync_time.append(
>               dr_helpers.verify_last_group_sync_time(
                    obj, scheduling_interval, before_failover_last_group_sync_time
                )
            )

test_failover_and_relocate[primary_down-rbd].log
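
Based on the log messages above and the call at the failing line, dr_helpers.verify_last_group_sync_time appears to check that the DRPC's lastGroupSyncTime has moved past the pre-failover value and that the time since that sync stays within a multiple of the scheduling interval. A minimal sketch of that check, assuming a 3x-interval threshold (the exact multiplier and return value in ocs-ci may differ):

```python
# Rough sketch of the check that appears to fail here, inferred from the log
# output and the call signature; not the actual ocs-ci implementation.
from datetime import datetime, timezone


def check_last_group_sync_time(current_value, scheduling_interval, previous_value=None):
    if previous_value is not None:
        # After failover, lastGroupSyncTime must differ from the pre-failover value
        assert current_value != previous_value, (
            f"lastGroupSyncTime {current_value} did not change from {previous_value}"
        )
    ts = datetime.strptime(current_value, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    minutes_since_sync = (datetime.now(timezone.utc) - ts).total_seconds() / 60
    # Sync lag is expected to stay within a small multiple of the DRPolicy
    # scheduling interval (3x is an assumed threshold)
    assert minutes_since_sync < 3 * scheduling_interval, (
        f"Time since last sync ({minutes_since_sync:.2f} min) exceeds the allowed window"
    )
    return current_value
```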

@prsurve prsurve added the DR Metro and Regional DR related PRs label Nov 15, 2024