From a9eea47cd198d1b783c1e6070636508eb3c9d6bc Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Mon, 26 Feb 2024 17:42:18 +0100 Subject: [PATCH 1/7] added keyword for ssh_timeout and improved argument passing for ssh. --- bibigrid/core/actions/create.py | 37 +++-- bibigrid/core/startup.py | 2 +- bibigrid/core/utility/handler/ssh_handler.py | 140 ++++++++----------- 3 files changed, 75 insertions(+), 104 deletions(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 6f43920a..c70f5e3f 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -81,6 +81,7 @@ def __init__(self, providers, configurations, config_path, log, debug=False, clu self.ssh_user = configurations[0].get("sshUser") or "ubuntu" self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands( configurations[0].get("sshPublicKeyFiles")) + self.ssh_timeout = configurations[0].get("sshTimeout", 5) self.config_path = config_path self.master_ip = None self.log.debug("Cluster-ID: %s", self.cluster_id) @@ -129,7 +130,7 @@ def generate_security_groups(self): """ Generate a security groups: - default with basic rules for the cluster - - wireguard when more than one provider is used (= multicloud) + - wireguard when more than one provider is used (= multi-cloud) """ self.log.info("Generating Security Groups") for provider, configuration in zip(self.providers, self.configurations): @@ -151,8 +152,7 @@ def generate_security_groups(self): for cidr in tmp_configuration['subnet_cidrs']: rules.append( {"direction": "ingress", "ethertype": "IPv4", "protocol": "tcp", "port_range_min": None, - "port_range_max": None, "remote_ip_prefix": cidr, - "remote_group_id": None}) + "port_range_max": None, "remote_ip_prefix": cidr, "remote_group_id": None}) provider.append_rules_to_security_group(default_security_group_id, rules) configuration["security_groups"] = [self.default_security_group_name] # store in configuration # when running a 
multi-cloud setup create an additional wireguard group @@ -232,17 +232,17 @@ def initialize_instances(self): Setup all servers """ for configuration in self.configurations: + ssh_data = {"floating_ip": configuration["floating_ip"], "private_key": KEY_FOLDER + self.key_name, + "username": self.ssh_user, "commands": None, "filepaths": None, + "gateway": configuration.get("gateway", {}), "timeout": self.ssh_timeout} if configuration.get("masterInstance"): self.master_ip = configuration["floating_ip"] - ssh_handler.ansible_preparation(floating_ip=configuration["floating_ip"], - private_key=KEY_FOLDER + self.key_name, username=self.ssh_user, - commands=self.ssh_add_public_key_commands, log=self.log, - gateway=configuration.get("gateway", {})) + ssh_data["commands"] = self.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP + ssh_data["filepaths"] = [(ssh_data["private_key"], ssh_handler.PRIVATE_KEY_FILE)] + ssh_handler.execute_ssh(ssh_data, self.log) elif configuration.get("vpnInstance"): - ssh_handler.execute_ssh(floating_ip=configuration["floating_ip"], - private_key=KEY_FOLDER + self.key_name, username=self.ssh_user, - commands=ssh_handler.VPN_SETUP, log=self.log, - gateway=configuration.get("gateway", {})) + ssh_data["commands"] = ssh_handler.VPN_SETUP + ssh_handler.execute_ssh(ssh_data, self.log) def prepare_volumes(self, provider, mounts): """ @@ -316,9 +316,10 @@ def upload_data(self): else: commands = [ssh_handler.get_ac_command(self.providers, AC_NAME.format( cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START - ssh_handler.execute_ssh(floating_ip=self.master_ip, private_key=KEY_FOLDER + self.key_name, - username=self.ssh_user, filepaths=FILEPATHS, commands=commands, log=self.log, - gateway=self.configurations[0].get("gateway", {})) + ssh_data = {"floating_ip": self.master_ip, "private_key": KEY_FOLDER + self.key_name, + "username": self.ssh_user, "commands": commands, "filepaths": FILEPATHS, + "gateway": self.configurations[0].get("gateway", {}), 
"timeout": self.ssh_timeout} + ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log) def start_start_instance_threads(self): """ @@ -354,8 +355,7 @@ def extended_network_configuration(self): f"{configuration_b['subnet_cidrs']})") # add provider_b network as allowed network for cidr in configuration_b["subnet_cidrs"]: - allowed_addresses.append( - {'ip_address': cidr, 'mac_address': configuration_a["mac_addr"]}) + allowed_addresses.append({'ip_address': cidr, 'mac_address': configuration_a["mac_addr"]}) # configure security group rules provider_a.append_rules_to_security_group(self.wireguard_security_group_name, [ {"direction": "ingress", "ethertype": "IPv4", "protocol": "udp", "port_range_min": 51820, @@ -443,9 +443,8 @@ def log_cluster_start_info(self): port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets))) ssh_ip = gateway["ip"] self.log.log(42, f"Cluster {self.cluster_id} with master {self.master_ip} up and running!") - self.log.log(42, - f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{ssh_ip}" - f"{f' -p {port}' if gateway else ''}") + self.log.log(42, f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{ssh_ip}" + f"{f' -p {port}' if gateway else ''}") self.log.log(42, f"Terminate cluster: ./bibigrid.sh -i '{self.config_path}' -t -cid {self.cluster_id}") self.log.log(42, f"Detailed cluster info: ./bibigrid.sh -i '{self.config_path}' -l -cid {self.cluster_id}") if self.configurations[0].get("ide"): diff --git a/bibigrid/core/startup.py b/bibigrid/core/startup.py index 3a073f27..25001ab7 100755 --- a/bibigrid/core/startup.py +++ b/bibigrid/core/startup.py @@ -85,7 +85,7 @@ def run_action(args, configurations, config_path): debug=args.debug, config_path=config_path) LOG.log(42, "Creating a new cluster takes about 10 or more minutes depending on your cloud provider " - "and your configuration. Be patient.") + "and your configuration. 
Please be patient.") exit_state = creator.create() else: if not args.cluster_id: diff --git a/bibigrid/core/utility/handler/ssh_handler.py b/bibigrid/core/utility/handler/ssh_handler.py index f5c71ac3..54b874b0 100644 --- a/bibigrid/core/utility/handler/ssh_handler.py +++ b/bibigrid/core/utility/handler/ssh_handler.py @@ -1,6 +1,6 @@ """ This module handles ssh and sftp connections to master and vpngtw. It also holds general execution routines used to -setup the Cluster. +set up the Cluster. """ import os import socket @@ -10,15 +10,15 @@ import sympy import yaml -from bibigrid.core.utility import ansible_commands as aC +from bibigrid.core.utility import ansible_commands as a_c from bibigrid.models.exceptions import ConnectionException, ExecutionException PRIVATE_KEY_FILE = ".ssh/id_ecdsa" # to name bibigrid-temp keys identically on remote -ANSIBLE_SETUP = [aC.NO_UPDATE, aC.UPDATE, aC.PYTHON3_PIP, aC.ANSIBLE_PASSLIB, - (f"chmod 600 {PRIVATE_KEY_FILE}", "Adjust private key permissions."), aC.PLAYBOOK_HOME, - aC.PLAYBOOK_HOME_RIGHTS, aC.ADD_PLAYBOOK_TO_LINUX_HOME] +ANSIBLE_SETUP = [a_c.NO_UPDATE, a_c.UPDATE, a_c.PYTHON3_PIP, a_c.ANSIBLE_PASSLIB, + (f"chmod 600 {PRIVATE_KEY_FILE}", "Adjust private key permissions."), a_c.PLAYBOOK_HOME, + a_c.PLAYBOOK_HOME_RIGHTS, a_c.ADD_PLAYBOOK_TO_LINUX_HOME] # ANSIBLE_START = [aC.WAIT_READY, aC.UPDATE, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] # another UPDATE seems to not necessary. -ANSIBLE_START = [aC.WAIT_READY, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] +ANSIBLE_START = [a_c.WAIT_READY, a_c.MV_ANSIBLE_CONFIG, a_c.EXECUTE] VPN_SETUP = [("echo Example", "Echos an Example")] @@ -53,7 +53,7 @@ def get_ac_command(providers, name): def get_add_ssh_public_key_commands(ssh_public_key_files): """ Builds and returns the necessary commands to add given public keys to remote for additional access. 
- :param ssh_public_key_files: public keys to add + @param ssh_public_key_files: public keys to add :return: list of public key add commands """ commands = [] @@ -69,10 +69,10 @@ def copy_to_server(sftp, local_path, remote_path, log): """ Recursively copies files and folders to server. If a folder is given as local_path, the structure within will be kept. - :param sftp: sftp connection - :param local_path: file or folder locally - :param remote_path: file or folder locally - :param log: + @param sftp: sftp connection + @param local_path: file or folder locally + @param remote_path: file or folder locally + @param log: :return: """ log.debug("Copy %s to %s...", local_path, remote_path) @@ -87,17 +87,14 @@ def copy_to_server(sftp, local_path, remote_path, log): copy_to_server(sftp, os.path.join(local_path, filename), os.path.join(remote_path, filename), log) -def is_active(client, floating_ip_address, private_key, username, log, gateway, timeout=5): +def is_active(client, paramiko_key, ssh_data, log): """ Checks if connection is possible and therefore if server is active. 
Raises paramiko.ssh_exception.NoValidConnectionsError if timeout is reached - :param client: created client - :param floating_ip_address: ip to connect to - :param private_key: SSH-private_key - :param username: SSH-username - :param log: - :param timeout: how long to wait between ping - :param gateway: if node should be reached over a gateway port is set to 30000 + subnet * 256 + host + @param client: created client + @param paramiko_key: SSH-private_key + @param log: + @param ssh_data: dict containing among other things gateway, floating_ip, username (waiting grows quadratically till 2**timeout before accepting failure) """ attempts = 0 @@ -105,33 +102,34 @@ def is_active(client, floating_ip_address, private_key, username, log, gateway, while establishing_connection: try: port = 22 - if gateway: - log.info(f"Using SSH Gateway {gateway.get('ip')}") - octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))} - port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets))) - log.info(f"Port {port} will be used (see {gateway['portFunction']} and octets {octets}).") - client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username, - pkey=private_key, timeout=7, auth_timeout=5, port=port) + if ssh_data.get('gateway'): + log.info(f"Using SSH Gateway {ssh_data['gateway'].get('ip')}") + octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(ssh_data['floating_ip'].split("."))} + port = int(sympy.sympify(ssh_data['gateway']["portFunction"]).subs(dict(octets))) + log.info(f"Port {port} will be used (see {ssh_data['gateway']['portFunction']} and octets {octets}).") + client.connect(hostname=ssh_data['gateway'].get("ip") or ssh_data['floating_ip'], + username=ssh_data['username'], pkey=paramiko_key, timeout=7, + auth_timeout=ssh_data['timeout'], port=port) establishing_connection = False - log.info(f"Successfully connected to {floating_ip_address}") + log.info(f"Successfully connected to 
{ssh_data['floating_ip']}") except paramiko.ssh_exception.NoValidConnectionsError as exc: - log.info(f"Attempting to connect to {floating_ip_address}... This might take a while", ) - if attempts < timeout: + log.info(f"Attempting to connect to {ssh_data['floating_ip']}... This might take a while", ) + if attempts < ssh_data['timeout']: time.sleep(2 ** attempts) attempts += 1 else: - log.error(f"Attempt to connect to {floating_ip_address} failed.") + log.error(f"Attempt to connect to {ssh_data['floating_ip']} failed.") raise ConnectionException(exc) from exc except socket.timeout as exc: log.warning("Socket timeout exception occurred. Try again ...") - if attempts < timeout: + if attempts < ssh_data['timeout']: attempts += 1 else: - log.error(f"Attempt to connect to {floating_ip_address} failed, due to a socket timeout.") + log.error(f"Attempt to connect to {ssh_data['floating_ip']} failed, due to a socket timeout.") raise ConnectionException(exc) from exc except TimeoutError as exc: # pylint: disable=duplicate-except log.error("The attempt to connect to %s failed. Possible known reasons:" - "\n\t-Your network's security group doesn't allow SSH.", floating_ip_address) + "\n\t-Your network's security group doesn't allow SSH.", ssh_data['floating_ip']) raise ConnectionException(exc) from exc @@ -139,7 +137,7 @@ def line_buffered(f): """ https://stackoverflow.com/questions/25260088/paramiko-with-continuous-stdout temporary hangs? - :param f: + @param f: :return: """ line_buf = b"" @@ -154,9 +152,9 @@ def line_buffered(f): def execute_ssh_cml_commands(client, commands, log): """ Executes commands and logs exit_status accordingly. 
- :param client: Client with connection to remote - :param commands: Commands to execute on remote - :param log: + @param client: Client with connection to remote + @param commands: Commands to execute on remote + @param log: """ for command in commands: _, ssh_stdout, _ = client.exec_command(command[0]) @@ -183,61 +181,35 @@ def execute_ssh_cml_commands(client, commands, log): raise ExecutionException(msg) -def ansible_preparation(floating_ip, private_key, username, log, gateway, commands=None, filepaths=None): - """ - Installs python and pip. Then installs ansible over pip. - Copies private key to instance so cluster-nodes are reachable and sets permission as necessary. - Copies additional files and executes additional commands if given. - The playbook is copied later, because it needs all servers setup and is not time intensive. - See: create.update_playbooks - :param floating_ip: public ip of server to ansible-prepare - :param private_key: generated private key of all cluster-server - :param username: username of all server - :param log: - :param commands: additional commands to execute - :param filepaths: additional files to copy: (localpath, remotepath) - :param gateway - """ - if filepaths is None: - filepaths = [] - if commands is None: - commands = [] - log.info("Ansible preparation...") - commands = ANSIBLE_SETUP + commands - filepaths.append((private_key, PRIVATE_KEY_FILE)) - execute_ssh(floating_ip, private_key, username, log, gateway, commands, filepaths) - - -def execute_ssh(floating_ip, private_key, username, log, gateway, commands=None, filepaths=None): +def execute_ssh(ssh_data, log): """ Executes commands on remote and copies files given in filepaths - :param floating_ip: public ip of remote - :param private_key: key of remote - :param username: username of remote - :param commands: commands - :param log: - :param filepaths: filepaths (localpath, remotepath) - :param gateway: gateway if used + + @param ssh_data: Dict containing floating_ip, 
private_key, username, commands, filepaths, gateway, timeout + @param log: """ - if commands is None: - commands = [] - paramiko_key = paramiko.ECDSAKey.from_private_key_file(private_key) + log.debug(f"Running execute_sshc with ssh_data: {ssh_data}.") + if ssh_data.get("filepaths") is None: + ssh_data["filepaths"] = [] + if ssh_data.get("commands") is None: + ssh_data["commands"] = [] + paramiko_key = paramiko.ECDSAKey.from_private_key_file(ssh_data["private_key"]) with paramiko.SSHClient() as client: client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) try: - is_active(client=client, floating_ip_address=floating_ip, username=username, private_key=paramiko_key, - log=log, gateway=gateway) + is_active(client=client, paramiko_key=paramiko_key, ssh_data=ssh_data, log=log) except ConnectionException as exc: - log.error(f"Couldn't connect to ip {gateway or floating_ip} using private key {private_key}.") + log.error(f"Couldn't connect to ip {ssh_data['gateway'] or ssh_data['floating_ip']} using private key " + f"{ssh_data['private_key']}.") raise exc else: - log.debug(f"Setting up {floating_ip}") - if filepaths: - log.debug(f"Setting up filepaths for {floating_ip}") + log.debug(f"Setting up {ssh_data['floating_ip']}") + if ssh_data['filepaths']: + log.debug(f"Setting up filepaths for {ssh_data['floating_ip']}") sftp = client.open_sftp() - for local_path, remote_path in filepaths: + for local_path, remote_path in ssh_data['filepaths']: copy_to_server(sftp=sftp, local_path=local_path, remote_path=remote_path, log=log) - log.debug("SFTP: Files %s copied.", filepaths) - if commands: - log.debug(f"Setting up commands for {floating_ip}") - execute_ssh_cml_commands(client=client, commands=commands, log=log) + log.debug("SFTP: Files %s copied.", ssh_data['filepaths']) + if ssh_data["floating_ip"]: + log.debug(f"Setting up commands for {ssh_data['floating_ip']}") + execute_ssh_cml_commands(client=client, commands=ssh_data["commands"], log=log) From 
b571279ba12e92825c29c755b860e87351f32e51 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Fri, 22 Mar 2024 12:31:18 +0100 Subject: [PATCH 2/7] added comment about sshTimeout key --- bibigrid.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bibigrid.yml b/bibigrid.yml index 27481c9a..584c4bb7 100644 --- a/bibigrid.yml +++ b/bibigrid.yml @@ -7,6 +7,8 @@ cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds) # -- BEGIN: GENERAL CLUSTER INFORMATION -- + # sshTimeout: 5 # Number of ssh connection attempts with 2^attempt seconds in between (2^sshTimeout-1 is the max time before returning with an error) + ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself. #sshPublicKeyFiles: # - [public key one] From 3c3e35b7c941c8eecb8287798a0e2e93e687e390 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Tue, 26 Mar 2024 16:13:37 +0100 Subject: [PATCH 3/7] added timeout to common_configuration --- bibigrid/core/utility/ansible_commands.py | 2 +- bibigrid/core/utility/ansible_configurator.py | 10 +++++++--- .../roles/bibigrid/files/slurm/create_server.py | 14 +++++++------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/bibigrid/core/utility/ansible_commands.py b/bibigrid/core/utility/ansible_commands.py index fc6c2815..66fcb5ba 100644 --- a/bibigrid/core/utility/ansible_commands.py +++ b/bibigrid/core/utility/ansible_commands.py @@ -50,7 +50,7 @@ MV_ANSIBLE_CONFIG = ( "sudo install -D /opt/playbook/ansible.cfg /etc/ansible/ansible.cfg", "Move ansible configuration.") EXECUTE = (f"ansible-playbook {os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.SITE_YML)} -i " - f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l vpn", + f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l vpn -vvvv", "Execute ansible playbook. 
Be patient.") # ansible setup diff --git a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index 642e9ae4..6dad0ac5 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -29,6 +29,7 @@ SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme", "munge_key": id_generation.generate_munge_key(), "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}} +CLOUD_SCHEDULING = {"timeout": 5} def delete_old_vars(log): @@ -180,7 +181,11 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us "slurm": master_configuration.get("slurm", True), "ssh_user": ssh_user, "slurm_conf": mergedeep.merge({}, SLURM_CONF, master_configuration.get("slurmConf", {}), - strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} + strategy=mergedeep.Strategy.TYPESAFE_REPLACE), + "cloud_scheduling": mergedeep.merge({}, CLOUD_SCHEDULING, + master_configuration.get( + "cloudScheduling", {}), + strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} if master_configuration.get("nfs"): nfs_shares = master_configuration.get("nfsShares", []) nfs_shares = nfs_shares + DEFAULT_NFS_SHARES @@ -197,8 +202,7 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us master_configuration.get("zabbixConf", {}), strategy=mergedeep.Strategy.TYPESAFE_REPLACE) - for from_key, to_key in [("ansibleRoles", "ansible_roles"), - ("ansibleGalaxyRoles", "ansible_galaxy_roles")]: + for from_key, to_key in [("ansibleRoles", "ansible_roles"), ("ansibleGalaxyRoles", "ansible_galaxy_roles")]: pass_through(master_configuration, common_configuration_yaml, from_key, to_key) if len(configurations) > 1: diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index ab9d2c12..00452b97 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ 
b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -117,7 +117,7 @@ def start_server(worker, start_worker_group, start_data): server_start_data["other_openstack_exception"].append(worker) -def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", username="ubuntu", timeout=7): +def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", username="ubuntu"): """ Waits until SSH connects successful. This guarantees that the node can be reached via Ansible. @param private_ip: ip of node @@ -138,7 +138,7 @@ def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", usernam establishing_connection = False except paramiko.ssh_exception.NoValidConnectionsError as exc: logging.info("Attempting to connect to %s... This might take a while", private_ip) - if attempts < timeout: + if attempts < common_config["cloud_scheduling"]["timeout"]: time.sleep(2 ** attempts) attempts += 1 else: @@ -213,16 +213,16 @@ def _run_playbook(cmdline_args): worker_groups = [] for filename in os.listdir(GROUP_VARS_PATH): if filename != "master.yml": - f = os.path.join(GROUP_VARS_PATH, filename) + worker_group_yaml_file = os.path.join(GROUP_VARS_PATH, filename) # checking if it is a file - if os.path.isfile(f): - with open(f, mode="r", encoding="utf-8") as worker_group: - worker_groups.append(yaml.safe_load(worker_group)) + if os.path.isfile(worker_group_yaml_file): + with open(worker_group_yaml_file, mode="r", encoding="utf-8") as worker_group_yaml: + worker_groups.append(yaml.safe_load(worker_group_yaml)) # read common configuration with open("/opt/playbook/vars/common_configuration.yml", mode="r", encoding="utf-8") as common_configuration_file: common_config = yaml.safe_load(common_configuration_file) - +logging.warning(f"ThisGrep {common_config['cloud_scheduling']['timeout']}") # read clouds.yaml with open("/etc/openstack/clouds.yaml", mode="r", encoding="utf-8") as clouds_file: clouds = yaml.safe_load(clouds_file)["clouds"] From 
ae30617b0ef737f871b5ab08e3148425f51f879d Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 27 Mar 2024 13:12:32 +0100 Subject: [PATCH 4/7] removed debug verbosity and improved log message wording --- bibigrid/core/utility/ansible_commands.py | 2 +- resources/playbook/roles/bibigrid/files/slurm/create_server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bibigrid/core/utility/ansible_commands.py b/bibigrid/core/utility/ansible_commands.py index 66fcb5ba..fc6c2815 100644 --- a/bibigrid/core/utility/ansible_commands.py +++ b/bibigrid/core/utility/ansible_commands.py @@ -50,7 +50,7 @@ MV_ANSIBLE_CONFIG = ( "sudo install -D /opt/playbook/ansible.cfg /etc/ansible/ansible.cfg", "Move ansible configuration.") EXECUTE = (f"ansible-playbook {os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.SITE_YML)} -i " - f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l vpn -vvvv", + f"{os.path.join(aRP.PLAYBOOK_PATH_REMOTE, aRP.ANSIBLE_HOSTS)} -l vpn", "Execute ansible playbook. 
Be patient.") # ansible setup diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index 00452b97..55872254 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -222,7 +222,7 @@ def _run_playbook(cmdline_args): # read common configuration with open("/opt/playbook/vars/common_configuration.yml", mode="r", encoding="utf-8") as common_configuration_file: common_config = yaml.safe_load(common_configuration_file) -logging.warning(f"ThisGrep {common_config['cloud_scheduling']['timeout']}") +logging.warning(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['timeout']}") # read clouds.yaml with open("/etc/openstack/clouds.yaml", mode="r", encoding="utf-8") as clouds_file: clouds = yaml.safe_load(clouds_file)["clouds"] From 4f2ca97bf72dad4e22f89a42d8d6dc73f65e6626 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 27 Mar 2024 16:05:45 +0100 Subject: [PATCH 5/7] fixed is_active structure --- bibigrid/core/utility/handler/ssh_handler.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bibigrid/core/utility/handler/ssh_handler.py b/bibigrid/core/utility/handler/ssh_handler.py index 54b874b0..27105a68 100644 --- a/bibigrid/core/utility/handler/ssh_handler.py +++ b/bibigrid/core/utility/handler/ssh_handler.py @@ -99,21 +99,22 @@ def is_active(client, paramiko_key, ssh_data, log): """ attempts = 0 establishing_connection = True + log.info(f"Attempting to connect to {ssh_data['floating_ip']}... 
This might take a while") + port = 22 + if ssh_data.get('gateway'): + log.info(f"Using SSH Gateway {ssh_data['gateway'].get('ip')}") + octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(ssh_data['floating_ip'].split("."))} + port = int(sympy.sympify(ssh_data['gateway']["portFunction"]).subs(dict(octets))) + log.info(f"Port {port} will be used (see {ssh_data['gateway']['portFunction']} and octets {octets}).") while establishing_connection: try: - port = 22 - if ssh_data.get('gateway'): - log.info(f"Using SSH Gateway {ssh_data['gateway'].get('ip')}") - octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(ssh_data['floating_ip'].split("."))} - port = int(sympy.sympify(ssh_data['gateway']["portFunction"]).subs(dict(octets))) - log.info(f"Port {port} will be used (see {ssh_data['gateway']['portFunction']} and octets {octets}).") + log.info(f"Attempt {attempts}/{ssh_data['timeout']}. Connecting to {ssh_data['floating_ip']}") client.connect(hostname=ssh_data['gateway'].get("ip") or ssh_data['floating_ip'], username=ssh_data['username'], pkey=paramiko_key, timeout=7, auth_timeout=ssh_data['timeout'], port=port) establishing_connection = False log.info(f"Successfully connected to {ssh_data['floating_ip']}") except paramiko.ssh_exception.NoValidConnectionsError as exc: - log.info(f"Attempting to connect to {ssh_data['floating_ip']}... 
This might take a while", ) if attempts < ssh_data['timeout']: time.sleep(2 ** attempts) attempts += 1 From ba7954265261b776b3bf40c2856325924aa73597 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Thu, 28 Mar 2024 14:40:32 +0100 Subject: [PATCH 6/7] fixed pip dependabot.yml --- .github/dependabot.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e644f86c..4da4d6da 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,6 +10,4 @@ updates: schedule: interval: "daily" open-pull-requests-limit: 10 - versioning-strategy: "widen" - target: - versions: [">=3.0.0"] + versioning-strategy: "auto" From 9ccd5a5eab0f1fbb7c8795bb90b729c934cdcba3 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 3 Apr 2024 16:51:24 +0200 Subject: [PATCH 7/7] added documentation. Changed timeout to 2**(2+attempts) to decrease number of unlikely to work attempts --- bibigrid/core/actions/create.py | 2 +- bibigrid/core/utility/ansible_configurator.py | 9 ++++----- bibigrid/core/utility/handler/ssh_handler.py | 6 ++++-- documentation/markdown/features/configuration.md | 13 ++++++++++++- .../roles/bibigrid/files/slurm/create_server.py | 6 +++--- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index b3cd2c57..adeeb588 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -81,7 +81,7 @@ def __init__(self, providers, configurations, config_path, log, debug=False, clu self.ssh_user = configurations[0].get("sshUser") or "ubuntu" self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands( configurations[0].get("sshPublicKeyFiles")) - self.ssh_timeout = configurations[0].get("sshTimeout", 5) + self.ssh_timeout = configurations[0].get("sshTimeout", 4) self.config_path = config_path self.master_ip = None self.log.debug("Cluster-ID: %s", self.cluster_id) diff --git 
a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index 6e24e4f2..af6facbe 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -6,10 +6,10 @@ import mergedeep import yaml -from bibigrid.core.actions.version import __version__ from bibigrid.core.actions import create from bibigrid.core.actions import ide +from bibigrid.core.actions.version import __version__ from bibigrid.core.utility import id_generation from bibigrid.core.utility import yaml_dumper from bibigrid.core.utility.handler import configuration_handler @@ -30,7 +30,7 @@ SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme", "munge_key": id_generation.generate_munge_key(), "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}} -CLOUD_SCHEDULING = {"timeout": 5} +CLOUD_SCHEDULING = {"sshTimeout": 4} def delete_old_vars(log): @@ -185,9 +185,8 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us master_configuration.get("slurmConf", {}), strategy=mergedeep.Strategy.TYPESAFE_REPLACE), "cloud_scheduling": mergedeep.merge({}, CLOUD_SCHEDULING, - master_configuration.get( - "cloudScheduling", {}), - strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} + master_configuration.get("cloudScheduling", {}), + strategy=mergedeep.Strategy.TYPESAFE_REPLACE)} if master_configuration.get("nfs"): nfs_shares = master_configuration.get("nfsShares", []) nfs_shares = nfs_shares + DEFAULT_NFS_SHARES diff --git a/bibigrid/core/utility/handler/ssh_handler.py b/bibigrid/core/utility/handler/ssh_handler.py index 0baf0f20..0a742318 100644 --- a/bibigrid/core/utility/handler/ssh_handler.py +++ b/bibigrid/core/utility/handler/ssh_handler.py @@ -113,10 +113,12 @@ def is_active(client, paramiko_key, ssh_data, log): username=ssh_data['username'], pkey=paramiko_key, timeout=7, auth_timeout=ssh_data['timeout'], port=port) establishing_connection = False - 
log.info(f"Successfully connected to {ssh_data['floating_ip']}") + log.info(f"Successfully connected to {ssh_data['floating_ip']}.") except paramiko.ssh_exception.NoValidConnectionsError as exc: if attempts < ssh_data['timeout']: - time.sleep(2 ** attempts) + sleep_time = 2 ** (attempts+2) + time.sleep(sleep_time) + log.info(f"Waiting {sleep_time} before attempting to reconnect.") attempts += 1 else: log.error(f"Attempt to connect to {ssh_data['floating_ip']} failed.") diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index b327f2ba..0f86da90 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -44,6 +44,17 @@ sshPublicKeyFiles: - /home/user/.ssh/id_ecdsa_colleague.pub ``` +#### sshTimeout (optional) +Defines the number of attempts BiBiGrid makes to connect to the master instance via SSH. +Attempts have a pause of `2^(attempts+2)` seconds in between. Default value is 4. + +#### cloudScheduling (optional) +This key allows you to influence cloud scheduling. Currently, only a single key `sshTimeout` can be set here. + +##### sshTimeout (optional) +Defines the number of attempts the master makes to connect via SSH to worker instances created on demand. +Attempts have a pause of `2^(attempts+2)` seconds in between. Default value is 4. + #### autoMount (optional) > **Warning:** If a volume has an obscure filesystem, this might overwrite your data! @@ -149,7 +160,7 @@ This is required if your provider has any post-launch services interfering with seemingly random errors can occur when the service interrupts ansible's execution. Services are listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific` (not yet). -#### +#### gateway (optional) In order to save valuable floating ips, BiBiGrid can also make use of a gateway to create the cluster. 
For more information on how to set up a gateway, how gateways work and why they save floating ips please continue reading [here](https://cloud.denbi.de/wiki/Tutorials/SaveFloatingIPs/). diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index 55872254..19e9b828 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -138,8 +138,8 @@ def check_ssh_active(private_ip, private_key="/opt/slurm/.ssh/id_ecdsa", usernam establishing_connection = False except paramiko.ssh_exception.NoValidConnectionsError as exc: logging.info("Attempting to connect to %s... This might take a while", private_ip) - if attempts < common_config["cloud_scheduling"]["timeout"]: - time.sleep(2 ** attempts) + if attempts < common_config["cloud_scheduling"]["sshTimeout"]: + time.sleep(2 ** (2+attempts)) attempts += 1 else: logging.warning("Attempt to connect to %s failed.", private_ip) @@ -222,7 +222,7 @@ def _run_playbook(cmdline_args): # read common configuration with open("/opt/playbook/vars/common_configuration.yml", mode="r", encoding="utf-8") as common_configuration_file: common_config = yaml.safe_load(common_configuration_file) -logging.warning(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['timeout']}") +logging.info(f"Maximum 'is active' attempts: {common_config['cloud_scheduling']['sshTimeout']}") # read clouds.yaml with open("/etc/openstack/clouds.yaml", mode="r", encoding="utf-8") as clouds_file: clouds = yaml.safe_load(clouds_file)["clouds"]