From 9ccd5a5eab0f1fbb7c8795bb90b729c934cdcba3 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 3 Apr 2024 16:51:24 +0200 Subject: [PATCH] added documentation. Changed timeout to 2**(2+attempts) to decrease number of unlikely to work attempts --- bibigrid/core/actions/create.py | 2 +- bibigrid/core/utility/ansible_configurator.py | 9 ++++----- bibigrid/core/utility/handler/ssh_handler.py | 6 ++++-- documentation/markdown/features/configuration.md | 13 ++++++++++++- .../roles/bibigrid/files/slurm/create_server.py | 6 +++--- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index b3cd2c57..adeeb588 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -81,7 +81,7 @@ def __init__(self, providers, configurations, config_path, log, debug=False, clu self.ssh_user = configurations[0].get("sshUser") or "ubuntu" self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands( configurations[0].get("sshPublicKeyFiles")) - self.ssh_timeout = configurations[0].get("sshTimeout", 5) + self.ssh_timeout = configurations[0].get("sshTimeout", 4) self.config_path = config_path self.master_ip = None self.log.debug("Cluster-ID: %s", self.cluster_id) diff --git a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index 6e24e4f2..af6facbe 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -6,10 +6,10 @@ import mergedeep import yaml -from bibigrid.core.actions.version import __version__ from bibigrid.core.actions import create from bibigrid.core.actions import ide +from bibigrid.core.actions.version import __version__ from bibigrid.core.utility import id_generation from bibigrid.core.utility import yaml_dumper from bibigrid.core.utility.handler import configuration_handler @@ -30,7 +30,7 @@ SLURM_CONF = {"db": "slurm", "db_user": "slurm", 
"db_password": "changeme", "munge_key": id_generation.generate_munge_key(), "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}} -CLOUD_SCHEDULING = {"timeout": 5} +CLOUD_SCHEDULING = {"sshTimeout": 4} def delete_old_vars(log): @@ -185,9 +185,8 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us master_configuration.get("slurmConf", {}), strategy=mergedeep.Strategy.TYPESAFE_REPLACE), "cloud_scheduling": mergedeep.merge({}, CLOUD_SCHEDULING, - master_configuration.get( - "cloudScheduling", {}), - strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} + master_configuration.get("cloudScheduling", {}), + strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} if master_configuration.get("nfs"): nfs_shares = master_configuration.get("nfsShares", []) nfs_shares = nfs_shares + DEFAULT_NFS_SHARES diff --git a/bibigrid/core/utility/handler/ssh_handler.py b/bibigrid/core/utility/handler/ssh_handler.py index 0baf0f20..0a742318 100644 --- a/bibigrid/core/utility/handler/ssh_handler.py +++ b/bibigrid/core/utility/handler/ssh_handler.py @@ -113,10 +113,12 @@ def is_active(client, paramiko_key, ssh_data, log): username=ssh_data['username'], pkey=paramiko_key, timeout=7, auth_timeout=ssh_data['timeout'], port=port) establishing_connection = False - log.info(f"Successfully connected to {ssh_data['floating_ip']}") + log.info(f"Successfully connected to {ssh_data['floating_ip']}.") except paramiko.ssh_exception.NoValidConnectionsError as exc: if attempts < ssh_data['timeout']: - time.sleep(2 ** attempts) + sleep_time = 2 ** (attempts+2) + time.sleep(sleep_time) + log.info(f"Waiting {sleep_time} before attempting to reconnect.") attempts += 1 else: log.error(f"Attempt to connect to {ssh_data['floating_ip']} failed.") diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index b327f2ba..0f86da90 100644 --- a/documentation/markdown/features/configuration.md +++ 
b/documentation/markdown/features/configuration.md @@ -44,6 +44,17 @@ sshPublicKeyFiles: - /home/user/.ssh/id_ecdsa_colleague.pub ``` +#### sshTimeout (optional) +Defines the number of attempts BiBiGrid makes to connect to the master instance via ssh. +Attempts have a pause of `2^(attempts+2)` seconds in between. Default value is 4. + +#### cloudScheduling (optional) +This key allows you to influence cloud scheduling. Currently, only a single key `sshTimeout` can be set here. + +##### sshTimeout (optional) +Defines the number of attempts the master makes to connect to worker instances created on demand via ssh. +Attempts have a pause of `2^(attempts+2)` seconds in between. Default value is 4. + #### autoMount (optional) > **Warning:** If a volume has an obscure filesystem, this might overwrite your data! @@ -149,7 +160,7 @@ This is required if your provider has any post-launch services interfering with seemingly random errors can occur when the service interrupts ansible's execution. Services are listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific` (not yet). -#### +#### gateway (optional) In order to save valuable floating ips, BiBiGrid can also make use of a gateway to create the cluster. For more information on how to set up a gateway, how gateways work and why they save floating ips please continue reading [here](https://cloud.denbi.de/wiki/Tutorials/SaveFloatingIPs/). diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index 55872254..19e9b828 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -138,8 +138,8 @@ def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", usernam establishing_connection = False except paramiko.ssh_exception.NoValidConnectionsError as exc: logging.info("Attempting to connect to %s... 
This might take a while", private_ip) - if attempts < common_config["cloud_scheduling"]["timeout"]: - time.sleep(2 ** attempts) + if attempts < common_config["cloud_scheduling"]["sshTimeout"]: + time.sleep(2 ** (2+attempts)) attempts += 1 else: logging.warning("Attempt to connect to %s failed.", private_ip) @@ -222,7 +222,7 @@ def _run_playbook(cmdline_args): # read common configuration with open("/opt/playbook/vars/common_configuration.yml", mode="r", encoding="utf-8") as common_configuration_file: common_config = yaml.safe_load(common_configuration_file) -logging.warning(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['timeout']}") +logging.info(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['sshTimeout']}") # read clouds.yaml with open("/etc/openstack/clouds.yaml", mode="r", encoding="utf-8") as clouds_file: clouds = yaml.safe_load(clouds_file)["clouds"]