Skip to content

Commit

Permalink
added documentation. Changed timeout to 2**(2+attempts) to decrease n…
Browse files Browse the repository at this point in the history
…umber of unlikely to work attempts
  • Loading branch information
XaverStiensmeier committed Apr 3, 2024
1 parent ba79542 commit 9ccd5a5
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 12 deletions.
2 changes: 1 addition & 1 deletion bibigrid/core/actions/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def __init__(self, providers, configurations, config_path, log, debug=False, clu
self.ssh_user = configurations[0].get("sshUser") or "ubuntu"
self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands(
configurations[0].get("sshPublicKeyFiles"))
self.ssh_timeout = configurations[0].get("sshTimeout", 5)
self.ssh_timeout = configurations[0].get("sshTimeout", 4)
self.config_path = config_path
self.master_ip = None
self.log.debug("Cluster-ID: %s", self.cluster_id)
Expand Down
9 changes: 4 additions & 5 deletions bibigrid/core/utility/ansible_configurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

import mergedeep
import yaml
from bibigrid.core.actions.version import __version__

from bibigrid.core.actions import create
from bibigrid.core.actions import ide
from bibigrid.core.actions.version import __version__
from bibigrid.core.utility import id_generation
from bibigrid.core.utility import yaml_dumper
from bibigrid.core.utility.handler import configuration_handler
Expand All @@ -30,7 +30,7 @@
SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme",
"munge_key": id_generation.generate_munge_key(),
"elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}}
CLOUD_SCHEDULING = {"timeout": 5}
CLOUD_SCHEDULING = {"sshTimeout": 4}


def delete_old_vars(log):
Expand Down Expand Up @@ -185,9 +185,8 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us
master_configuration.get("slurmConf", {}),
strategy=mergedeep.Strategy.TYPESAFE_REPLACE),
"cloud_scheduling": mergedeep.merge({}, CLOUD_SCHEDULING,
master_configuration.get(
"cloudScheduling", {}),
strategy=mergedeep.Strategy.TYPESAFE_REPLACE)}
master_configuration.get("cloudScheduling", {}),
strategy=mergedeep.Strategy.TYPESAFE_REPLACE)}
if master_configuration.get("nfs"):
nfs_shares = master_configuration.get("nfsShares", [])
nfs_shares = nfs_shares + DEFAULT_NFS_SHARES
Expand Down
6 changes: 4 additions & 2 deletions bibigrid/core/utility/handler/ssh_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,12 @@ def is_active(client, paramiko_key, ssh_data, log):
username=ssh_data['username'], pkey=paramiko_key, timeout=7,
auth_timeout=ssh_data['timeout'], port=port)
establishing_connection = False
log.info(f"Successfully connected to {ssh_data['floating_ip']}")
log.info(f"Successfully connected to {ssh_data['floating_ip']}.")
except paramiko.ssh_exception.NoValidConnectionsError as exc:
if attempts < ssh_data['timeout']:
time.sleep(2 ** attempts)
sleep_time = 2 ** (attempts+2)
time.sleep(sleep_time)
log.info(f"Waiting {sleep_time} before attempting to reconnect.")
attempts += 1
else:
log.error(f"Attempt to connect to {ssh_data['floating_ip']} failed.")
Expand Down
13 changes: 12 additions & 1 deletion documentation/markdown/features/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ sshPublicKeyFiles:
- /home/user/.ssh/id_ecdsa_colleague.pub
```
#### sshTimeout (optional)
Defines the number of attempts BiBiGrid makes to connect to the master instance via SSH.
Between attempts, BiBiGrid pauses for `2^(attempts+2)` seconds. The default value is 4.

#### cloudScheduling (optional)
This key allows you to influence cloud scheduling. Currently, only a single key `sshTimeout` can be set here.

##### sshTimeout (optional)
Defines the number of attempts the master makes to connect via SSH to worker instances that were created on demand.
Between attempts, the master pauses for `2^(attempts+2)` seconds. The default value is 4.

#### autoMount (optional)
> **Warning:** If a volume has an obscure filesystem, this might overwrite your data!

Expand Down Expand Up @@ -149,7 +160,7 @@ This is required if your provider has any post-launch services interfering with
seemingly random errors can occur when the service interrupts ansible's execution. Services are
listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific` (not yet).

####
#### gateway (optional)
In order to save valuable floating ips, BiBiGrid can also make use of a gateway to create the cluster.
For more information on how to set up a gateway, how gateways work, and why they save floating IPs, please continue reading [here](https://cloud.denbi.de/wiki/Tutorials/SaveFloatingIPs/).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", usernam
establishing_connection = False
except paramiko.ssh_exception.NoValidConnectionsError as exc:
logging.info("Attempting to connect to %s... This might take a while", private_ip)
if attempts < common_config["cloud_scheduling"]["timeout"]:
time.sleep(2 ** attempts)
if attempts < common_config["cloud_scheduling"]["sshTimeout"]:
time.sleep(2 ** (2+attempts))
attempts += 1
else:
logging.warning("Attempt to connect to %s failed.", private_ip)
Expand Down Expand Up @@ -222,7 +222,7 @@ def _run_playbook(cmdline_args):
# read common configuration
with open("/opt/playbook/vars/common_configuration.yml", mode="r", encoding="utf-8") as common_configuration_file:
common_config = yaml.safe_load(common_configuration_file)
logging.warning(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['timeout']}")
logging.info(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['sshTimeout']}")
# read clouds.yaml
with open("/etc/openstack/clouds.yaml", mode="r", encoding="utf-8") as clouds_file:
clouds = yaml.safe_load(clouds_file)["clouds"]
Expand Down

0 comments on commit 9ccd5a5

Please sign in to comment.