diff --git a/e2e/libs/replica/rest.py b/e2e/libs/replica/rest.py index c832670f33..8c492c0b56 100644 --- a/e2e/libs/replica/rest.py +++ b/e2e/libs/replica/rest.py @@ -3,6 +3,7 @@ from replica.base import Base from utils import common_utils +from utility.utility import logging from replica.constant import RETRY_COUNTS from replica.constant import RETRY_INTERVAL @@ -22,45 +23,54 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None, f'failed to get rebuilding replica name' started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for status in v.rebuildStatus: - if status.replica == rebuilding_replica_name and\ - status.state == "in_progress": - started = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + for status in v.rebuildStatus: + if status.replica == rebuilding_replica_name and\ + status.state == "in_progress": + started = True + break + if started: break - if started: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f'replica {rebuilding_replica_name} rebuilding starting failed' def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - completed = True - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + completed = True + break + except Exception as e: + logging(f"Failed to get volume {e}") if completed: break time.sleep(RETRY_INTERVAL) diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 521ccc28f3..888b8a48e5 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -81,9 +81,9 @@ def wait_for_all_instance_manager_running(): retry_count, retry_interval = get_retry_count_and_interval() for _ in range(retry_count): logging(f"Waiting for all instance manager running ({_}) ...") - instance_managers = longhorn_client.list_instance_manager() - instance_manager_map = {} try: + instance_managers = longhorn_client.list_instance_manager() + instance_manager_map = {} for im in instance_managers: if im.currentState == "running": instance_manager_map[im.nodeID] = im diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py index c374fc7c95..f626714cc3 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -20,7 +20,12 @@ def __init__(self, node_exec): self.node_exec = node_exec def get(self, volume_name): - return self.longhorn_client.by_id_volume(volume_name) + for i in range(RETRY_COUNTS): + try: + return self.longhorn_client.by_id_volume(volume_name) + except Exception as e: + logging(f"Failed to get volume {e}") + time.sleep(RETRY_INTERVAL) def create(self, volume_name, size, replica_count): return NotImplemented @@ -36,7 +41,7 @@ def wait_for_volume_state(self, volume_name, desired_state): def get_endpoint(self, volume_name): endpoint = "" - v = self.longhorn_client.by_id_volume(volume_name) + v = self.get(volume_name) if v.disableFrontend: assert endpoint == "" return endpoint @@ -44,12 +49,15 @@ def get_endpoint(self, volume_name): assert v.frontend == VOLUME_FRONTEND_BLOCKDEV or\ v.frontend == VOLUME_FRONTEND_ISCSI for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - engines = v.controllers - assert len(engines) != 0 - endpoint = engines[0].endpoint - if endpoint != "": - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + engines = v.controllers + assert len(engines) != 0 + endpoint = engines[0].endpoint + if endpoint != "": + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) logging(f"Got volume {volume_name} endpoint = {endpoint}") @@ -72,55 +80,64 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None logging(f"Got rebuilding replica = {rebuilding_replica_name}") started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume rebuild status = {v.rebuildStatus}") - for status in v.rebuildStatus: - for replica in v.replicas: - if status.replica == replica.name and \ - replica.hostId == node_name and \ - status.state == "in_progress": - logging(f"Started {node_name}'s replica {replica.name} rebuilding") - started = True - break - if started: - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume rebuild status = {v.rebuildStatus}") + for status in v.rebuildStatus: + for replica in v.replicas: + if status.replica == replica.name and \ + replica.hostId == node_name and \ + status.state == "in_progress": + logging(f"Started {node_name}'s replica {replica.name} rebuilding") + started = True + break + if started: + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f"wait for replica on node {node_name} rebuilding timeout: {v}" def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - logging(f"Completed {node_name}'s replica {replica.name} rebuilding") - completed = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + logging(f"Completed {node_name}'s replica {replica.name} rebuilding") + completed = True + break + if completed: break - if completed: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert completed