Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Stretch cluster] Minor tweaks and correction in netsplit test #10580

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 42 additions & 32 deletions ocs_ci/ocs/resources/stretchcluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,28 +202,31 @@ def check_for_read_pause(self, label, start_time, end_time):

"""
paused = 0
max_fail_expected = len(self.workload_map[label][0]) - 2
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
try:
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
continue
pause_count = 0
time_var = start_time
pod_log = get_pod_logs(
pod_name=pod_obj.name, namespace=constants.STRETCH_CLUSTER_NAMESPACE
)
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
logger.info(f"Current pod: {pod_obj.name}")
while time_var <= (end_time + timedelta(minutes=1)):
t_time = time_var.strftime("%H:%M")
if f" {t_time}" not in pod_log:
pause_count += 1
logger.info(f"Read pause: {t_time}")
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
except CommandFailed:
if failed <= max_fail_expected:
failed += 1
else:
logger.info(f"Read success: {t_time}")
time_var = time_var + timedelta(minutes=1)
if pause_count > 5:
paused += 1
raise
return paused

@retry(CommandFailed, tries=6, delay=10)
Expand All @@ -241,13 +244,14 @@ def check_for_write_pause(self, label, start_time, end_time):

"""
paused = 0
max_fail_expected = (
len(self.workload_map[label][0]) - 2
if label == constants.LOGWRITER_CEPHFS_LABEL
else 1
)
failed = 0
for pod_obj in self.workload_map[label][0]:
if get_pod_node(pod_obj).name in self.non_quorum_nodes:
logger.info(
f"Not checking the logs from {pod_obj.name} as it belongs to non-quorum zone"
)
continue
excepted = 0
no_such_file_expected = 1
for file_name in self.logfile_map[label][0]:
pause_count = 0
try:
Expand All @@ -269,13 +273,16 @@ def check_for_write_pause(self, label, start_time, end_time):
"No such file or directory" in err.args[0]
and label == constants.LOGWRITER_RBD_LABEL
):
if excepted == 0:
if no_such_file_expected == 1:
logger.info(
f"Seems like file {file_name} is not in RBD pod {pod_obj.name}"
)
excepted += 1
no_such_file_expected += 1
else:
raise UnexpectedBehaviour
failed += 1
elif failed <= max_fail_expected:
failed += 1
else:
raise

Expand Down Expand Up @@ -437,7 +444,7 @@ def check_for_data_loss(self, label):
return True

@retry(CommandFailed, tries=15, delay=5)
def check_ceph_accessibility(self, timeout, delay=5, grace=120):
def check_ceph_accessibility(self, timeout, delay=60, grace=180):
"""
Check for ceph access for the 'timeout' seconds

Expand Down Expand Up @@ -469,7 +476,10 @@ def check_ceph_accessibility(self, timeout, delay=5, grace=120):
if "TimeoutExpired" in err.args[0]:
logger.error("Ceph status check got timed out. maybe ceph is hung.")
return False
elif "connect: no route to host" in err.args[0]:
elif (
"connect: no route to host" in err.args[0]
or "error dialing backend" in err.args[0]
):
ceph_tools_pod.delete(wait=False)
raise

Expand All @@ -484,7 +494,7 @@ def get_out_of_quorum_nodes(self):
# find out the mons in quorum
ceph_tools_pod = pod.get_ceph_tools_pod()

@retry(CommandFailed, tries=10, delay=10)
@retry(CommandFailed, tries=8, delay=5)
def _get_non_quorum_mons():
"""
Get non quorum mon pods
Expand Down Expand Up @@ -634,7 +644,7 @@ def cephfs_failure_checks(
self.check_for_read_pause(
constants.LOGREADER_CEPHFS_LABEL, start_time, end_time
)
== 0
<= 2
), "Read operations are paused for CephFS workloads even for the ones in available zones"
logger.info("All read operations are successful for CephFs workload")

Expand All @@ -653,7 +663,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
start_time,
end_time,
)
== 0
<= 1
), "Write operations paused for RBD workloads even for the ones in available zone"
logger.info("all write operations are successful for RBD workloads")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,47 @@ def finalizer():
request.addfinalizer(finalizer)


@pytest.fixture(scope="class")
def setup_cnv_workload(request, cnv_workload, setup_cnv):
    """
    Set up a CNV (virtual machine) workload and seed it with test data.

    Creates a VM backed by a PVC volume, writes ~100 MiB of zeros to
    /file_1.txt inside the VM and records its md5sum. The finalizer then
    verifies post-failure data integrity (checksum unchanged), confirms new
    data can still be written, copies the original file back to the local
    machine, and finally stops the VM.

    Args:
        request: pytest fixture-request object used to register the finalizer
        cnv_workload: factory fixture that provisions the CNV VM workload
        setup_cnv: fixture ensuring CNV is installed/configured

    """
    logger.info("Setting up CNV workload and creating some data")
    vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)[0]
    # write ~100 MiB of data and capture its checksum for later comparison
    vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
    md5sum_before = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")

    def finalizer():

        # check vm data written before the failure for integrity
        logger.info("Waiting for VM SSH connectivity!")
        vm_obj.wait_for_ssh_connectivity()
        md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
        assert (
            md5sum_before == md5sum_after
        ), "Data integrity of the file inside VM is not maintained during the failure"
        logger.info(
            "Data integrity of the file inside VM is maintained during the failure"
        )

        # check if new data can be created
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
        )
        logger.info("Successfully created new data inside VM")

        # check if the data can be copied back to local machine
        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
        logger.info("VM data is successfully copied back to local machine")

        # stop the VM
        vm_obj.stop()
        # fixed typo in the log message: "Stoped" -> "Stopped"
        logger.info("Stopped the VM successfully")

    request.addfinalizer(finalizer)


@turquoise_squad
@stretchcluster_required
@pytest.mark.usefixtures("setup_cnv_workload")
@pytest.mark.usefixtures("setup_logwriter_workloads")
class TestMonAndOSDFailures:
"""
Expand Down
21 changes: 8 additions & 13 deletions tests/functional/disaster-recovery/sc_arbiter/test_netsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import time
import ocpnetsplit

from ocs_ci.utility.retry import retry
from ocs_ci.framework.pytest_customization.marks import (
turquoise_squad,
tier1,
Expand All @@ -13,7 +12,7 @@
recover_workload_pods_post_recovery,
recover_from_ceph_stuck,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour, CommandFailed
from ocs_ci.ocs.exceptions import UnexpectedBehaviour

from ocs_ci.ocs.resources.stretchcluster import StretchCluster
from ocs_ci.ocs.exceptions import CephHealthException
Expand Down Expand Up @@ -87,10 +86,11 @@ def finalizer():
argvalues=[
pytest.param(
constants.NETSPLIT_DATA_1_DATA_2,
15,
30,
marks=[
pytest.mark.polarion_id("OCS-5069"),
pytest.mark.polarion_id("OCS-5071"),
pytest.mark.bugzilla("2265992"),
],
),
pytest.param(
Expand Down Expand Up @@ -196,16 +196,9 @@ def test_netsplit(
)
logger.info(f"Netsplit induced at {start_time} for zones {zones}")

# get the nodes which are present in the
# out of quorum zone
if (
zones != constants.NETSPLIT_ARBITER_DATA_1
or zones != constants.NETSPLIT_ARBITER_DATA_1_AND_ARBITER_DATA_2
):
retry(CommandFailed, tries=5, delay=10)(sc_obj.get_out_of_quorum_nodes)()

# note the end time (UTC)
if not sc_obj.check_ceph_accessibility(timeout=(duration * 60)):
# check for ceph accessibility and note the end time (UTC)
timeout = (end_time - datetime.now(timezone.utc)).total_seconds()
if not sc_obj.check_ceph_accessibility(timeout=int(timeout)):
assert recover_from_ceph_stuck(
sc_obj
), "Something went wrong. not expected. please check rook-ceph logs"
Expand All @@ -216,6 +209,8 @@ def test_netsplit(
logger.info(f"Ended netsplit at {end_time}")

# check vm data written before the failure for integrity
logger.info("Waiting for VM SSH connectivity!")
vm_obj.wait_for_ssh_connectivity()
md5sum_after = vm_obj.run_ssh_cmd(command="md5sum /file_1.txt")
assert (
md5sum_before == md5sum_after
Expand Down
Loading