diff --git a/bibigrid.yaml b/bibigrid.yaml index 8982a810..ea5d2d73 100644 --- a/bibigrid.yaml +++ b/bibigrid.yaml @@ -9,7 +9,7 @@ # -- BEGIN: GENERAL CLUSTER INFORMATION -- # sshTimeout: 5 # number of attempts to connect to instances during startup with delay in between # cloudScheduling: - # sshTimeout: 42 # like sshTimeout but during the on demand scheduling on the running cluster + # sshTimeout: 5 # like sshTimeout but during the on demand scheduling on the running cluster ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself. #sshPublicKeyFiles: @@ -72,24 +72,6 @@ # Depends on cloud image sshUser: # for example ubuntu - # Depends on cloud site: - # Berlin : regionOne - # Bielefeld : bielefeld - # DKFZ : regionOne - # Giessen : RegionOne - # Heidelberg : RegionOne - # Tuebingen : RegionOne - region: Bielefeld - - # Depends on cloud site: - # Berlin : nova - # Bielefeld : default - # DKFZ : nova - # Giessen : nova - # Heidelberg : nova - # Tuebingen : nova - availabilityZone: default - # Depends on cloud site and project subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/ # or network: diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index bffb9835..7c15144f 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -281,7 +281,6 @@ def initialize_instances(self): wait_for_services_commands = [ (wait_for_service_command.format(service=service), wait_for_service_message.format(service=service)) for service in configuration.get("waitForServices", [])] - print(wait_for_services_commands) ssh_data["commands"] = ( wait_for_services_commands + self.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP) ssh_data["filepaths"] = [(ssh_data["private_key"], ssh_handler.PRIVATE_KEY_FILE)] @@ -340,12 +339,12 @@ def prepare_configurations(self): configuration["subnet"]] configuration["sshUser"] = self.ssh_user # is used in ansibleConfigurator - def upload_data(self): + def upload_data(self, private_key, clean_playbook=False): """ Configures ansible and then uploads the modified files and all necessary data to the master @return: """ - self.log.debug("Uploading ansible Data") + self.log.debug("Running upload_data") if not os.path.isfile(a_rp.HOSTS_FILE): with open(a_rp.HOSTS_FILE, 'a', encoding='utf-8') as hosts_file: hosts_file.write("# placeholder file for worker DNS entries (see 003-dns)") @@ -360,7 +359,14 @@ def upload_data(self): self.log.debug(f"Starting playbook with {ansible_start}.") commands = [ssh_handler.get_ac_command(self.providers, AC_NAME.format( cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START - ssh_data = {"floating_ip": self.master_ip, "private_key": KEY_FOLDER + self.key_name, "username": self.ssh_user, + if clean_playbook: + self.log.info("Cleaning Playbook") + ssh_data = {"floating_ip": self.master_ip, "private_key": private_key, "username": self.ssh_user, + "commands": [("rm -rf ~/playbook/*", "Remove Playbook")], "filepaths": [], + "gateway": self.configurations[0].get("gateway", {}), "timeout": self.ssh_timeout} + ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log) + self.log.info("Uploading Data") + ssh_data = {"floating_ip": self.master_ip, "private_key": private_key, "username": self.ssh_user, "commands": commands, "filepaths": FILEPATHS, "gateway": self.configurations[0].get("gateway", {}), "timeout": self.ssh_timeout} ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log) @@ -370,6 +376,7 @@ def start_start_server_threads(self): Starts for each provider a start_instances thread and joins them. @return: """ + self.log.debug("Running start_start_server_threads") start_server_threads = [] worker_count = 0 ansible_configurator.write_yaml(a_rp.HOSTS_FILE, {"host_entries": {}}, self.log) @@ -397,6 +404,7 @@ def extended_network_configuration(self): Configure master/vpn-worker network for a multi/hybrid cloud @return: """ + self.log.debug("Running extended_network_configuration") if len(self.providers) == 1: return @@ -439,7 +447,7 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements self.start_start_server_threads() self.extended_network_configuration() self.initialize_instances() - self.upload_data() + self.upload_data(os.path.join(KEY_FOLDER, self.key_name)) self.log_cluster_start_info() if self.configurations[0].get("deleteTmpKeypairAfter"): for provider in self.providers: diff --git a/bibigrid/core/actions/list_clusters.py b/bibigrid/core/actions/list_clusters.py index e6e3cf03..1f07f95d 100644 --- a/bibigrid/core/actions/list_clusters.py +++ b/bibigrid/core/actions/list_clusters.py @@ -146,12 +146,12 @@ def get_master_access_ip(cluster_id, master_provider, log): @param log: @return: public ip of master """ + # TODO: maybe move the method from list_clusters as it is now independent of list_clusters log.info("Finding master ip for cluster %s...", cluster_id) - servers = master_provider.list_servers() - for server in servers: - master = create.MASTER_IDENTIFIER(cluster_id=cluster_id) - if server["name"].startswith(master): - return server.get("public_v4") or server.get("public_v6") or server.get("private_v4") + master = create.MASTER_IDENTIFIER(cluster_id=cluster_id) + server = master_provider.get_server(master) + if server: + return server.get("public_v4") or server.get("public_v6") or server.get("private_v4") log.warning("Cluster %s not found on master_provider %s.", cluster_id, master_provider.cloud_specification["identifier"]) return None diff --git a/bibigrid/core/actions/terminate.py b/bibigrid/core/actions/terminate.py index 1452cb99..9a56bd3c 100644 --- a/bibigrid/core/actions/terminate.py +++ b/bibigrid/core/actions/terminate.py @@ -148,7 +148,6 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5 tmp_success = False while not tmp_success: try: - # TODO: Check if security group exists at all not_found = not provider.get_security_group(security_group_name) tmp_success = provider.delete_security_group(security_group_name) except ConflictException: diff --git a/bibigrid/core/actions/update.py b/bibigrid/core/actions/update.py index efe9aaf7..dc9f5e42 100644 --- a/bibigrid/core/actions/update.py +++ b/bibigrid/core/actions/update.py @@ -2,24 +2,34 @@ Module that contains methods to update the master playbook """ -from bibigrid.core.utility import ansible_commands as a_c -from bibigrid.core.utility.handler import ssh_handler -from bibigrid.core.utility.paths import ansible_resources_path as a_rp -from bibigrid.core.utility.paths import bin_path +from bibigrid.core.actions import create +from bibigrid.core.actions.list_clusters import dict_clusters from bibigrid.core.utility.handler import cluster_ssh_handler -def update(cluster_id, master_provider, master_configuration, log): - log.info("Starting update...") - master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider, - master_configuration, log) +def update(creator, log): + log.info(f"Starting update for cluster {creator.cluster_id}...") + master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(creator.cluster_id, + creator.providers[0], + creator.configurations[0], log) + log.info(f"Trying to update {master_ip}@{ssh_user} with key {used_private_key}") + cluster_dict = dict_clusters(creator.providers, log) + if cluster_dict[creator.cluster_id]["workers"]: + workers = [worker['name'] for worker in cluster_dict[creator.cluster_id]["workers"]] + log.warning(f"There are still workers up! {workers}") + return 1 if master_ip and ssh_user and used_private_key: - log.info("Trying to update %s@%s", master_ip, ssh_user) - ssh_handler.execute_ssh(floating_ip=master_ip, private_key=used_private_key, username=ssh_user, - log=log, - gateway=master_configuration.get("gateway", {}), - commands=[a_c.EXECUTE], - filepaths=[(a_rp.PLAYBOOK_PATH, a_rp.PLAYBOOK_PATH_REMOTE), - (bin_path.BIN_PATH, bin_path.BIN_PATH_REMOTE)]) + master = create.MASTER_IDENTIFIER(cluster_id=creator.cluster_id) + server = creator.providers[0].get_server(master) + creator.master_ip = master_ip + creator.configurations[0]["private_v4"] = server["private_v4"] + creator.configurations[0]["floating_ip"] = master_ip + # TODO Test Volumes + creator.configurations[0]["volumes"] = server["volumes"] + creator.prepare_configurations() + log.log(42, f"Uploading data and executing BiBiGrid's Ansible playbook to {creator.cluster_id}") + creator.upload_data(used_private_key, clean_playbook=True) + log.log(42, f"Successfully updated cluster {creator.cluster_id}") return 0 + log.warning("One or more among master_ip, ssh_user and used_private_key are none. Aborting...") return 1 diff --git a/bibigrid/core/provider.py b/bibigrid/core/provider.py index 9e06dbd0..fb359e98 100644 --- a/bibigrid/core/provider.py +++ b/bibigrid/core/provider.py @@ -88,8 +88,8 @@ def list_servers(self): """ @abstractmethod - def create_server(self, name, flavor, image, network, key_name=None, wait=True, - volumes=None, security_groups=None): # pylint: disable=too-many-arguments + def create_server(self, name, flavor, image, network, key_name=None, wait=True, volumes=None, + security_groups=None): # pylint: disable=too-many-arguments """ Creates a new server and waits for it to be accessible if wait=True. If volumes are given, they are attached. Returns said server (dict) @@ -223,8 +223,8 @@ def get_active_images(self): return [image["name"] for image in self.get_images() if image["status"].lower() == "active"] def get_active_flavors(self): - return [flavor["name"] for flavor in self.get_flavors() - if "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()] + return [flavor["name"] for flavor in self.get_flavors() if + "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()] @abstractmethod def set_allowed_addresses(self, id_or_ip, allowed_address_pairs): @@ -273,6 +273,13 @@ def get_security_group(self, name_or_id): @return: """ + def get_server(self, name_or_id): + """ + Returns server if found else None. + @param name_or_id: + @return: + """ # TODO Test + def get_mount_info_from_server(self, server): volumes = [] for server_volume in server["volumes"]: diff --git a/bibigrid/core/startup.py b/bibigrid/core/startup.py index 9caac7cb..8c726cb6 100755 --- a/bibigrid/core/startup.py +++ b/bibigrid/core/startup.py @@ -82,7 +82,7 @@ def run_action(args, configurations, config_path): creator = create.Create(providers=providers, configurations=configurations, log=LOG, debug=args.debug, config_path=config_path) LOG.log(42, "Creating a new cluster takes about 10 or more minutes depending on your cloud provider " - "and your configuration. Please be patient.") + "and your configuration. Please be patient.") exit_state = creator.create() else: if not args.cluster_id: @@ -99,7 +99,10 @@ def run_action(args, configurations, config_path): exit_state = ide.ide(args.cluster_id, providers[0], configurations[0], LOG) elif args.update: LOG.info("Action update selected") - exit_state = update.update(args.cluster_id, providers[0], configurations[0], LOG) + creator = create.Create(providers=providers, configurations=configurations, log=LOG, + debug=args.debug, + config_path=config_path, cluster_id=args.cluster_id) + exit_state = update.update(creator, LOG) for provider in providers: provider.close() else: diff --git a/bibigrid/core/utility/command_line_interpreter.py b/bibigrid/core/utility/command_line_interpreter.py index 98ed1a35..44275eb5 100644 --- a/bibigrid/core/utility/command_line_interpreter.py +++ b/bibigrid/core/utility/command_line_interpreter.py @@ -57,7 +57,7 @@ def interpret_command_line(): help="Establishes a secure connection to ide. Needs cluster-id set") actions.add_argument("-u", "--update", action='store_true', help="Updates master's playbook. " "Needs cluster-id set, no jobs running " - "and no workers up") + "and all workers down (experimental)") args = parser.parse_args() needs_config = args.terminate or args.create or args.list or args.check or args.ide if needs_config and not args.config_input: diff --git a/bibigrid/openstack/openstack_provider.py b/bibigrid/openstack/openstack_provider.py index 75de1c2c..5fb3bacd 100644 --- a/bibigrid/openstack/openstack_provider.py +++ b/bibigrid/openstack/openstack_provider.py @@ -328,3 +328,11 @@ def get_security_group(self, name_or_id): @return: """ return self.conn.get_security_group(name_or_id) + + def get_server(self, name_or_id): + """ + Returns server if found else None. + @param name_or_id: + @return: + """ + return self.conn.get_server(name_or_id) diff --git a/documentation/markdown/features/update.md b/documentation/markdown/features/update.md index 40ea97dd..c1033854 100644 --- a/documentation/markdown/features/update.md +++ b/documentation/markdown/features/update.md @@ -1,5 +1,25 @@ # Update +This feature is experimental -Updates ansible-playbook and nothing else. You cannot declare new instances or anything. -Only relevant if a fix or a new feature is added to the ansible-playbook. -In the future we will try to further enhance this feature. \ No newline at end of file +Update re-uploads the playbook, updates the configuration data and executes the playbook again. + +Updating the configuration data does not allow for all kinds of updates, because some changes - +like attaching volumes, would need an undo process which is not implemented. That might come in a future version. +Therefore, some keys mentioned below in [updatable](#updatable) have "(activate)" behind them. +Those keys should not be deactivated, but only activated in updates. + +**Configuration keys not listed below are considered not updatable.** + +## Updatable +- Ansible playbook + + +- workerInstances +- useMasterAsCompute +- userRoles +- cloudScheduling +- waitForServices +- features +- ide (activate) +- nfsShares (activate) +- zabbix (activate) \ No newline at end of file diff --git a/resources/defaults/slurm/slurm.j2 b/resources/defaults/slurm/slurm.j2 index abd8078b..037daf80 100644 --- a/resources/defaults/slurm/slurm.j2 +++ b/resources/defaults/slurm/slurm.j2 @@ -76,7 +76,6 @@ SlurmdLogFile=/var/log/slurm/slurmd.log {% endif %} {% set _ = node_groups.append(node.name) %} {% set mem = (node.flavor.ram // 1024) * 1000 %} -# {{ node }} NodeName={{ node.name }} SocketsPerBoard={{ node.flavor.vcpus }} CoresPerSocket=1 RealMemory={{ mem - [mem // 2, 16000] | min }} State={{node.state }} {{"Features=" + (node.features | join(",")) if node.features is defined }}# {{ node.cloud_identifier }} {% for partition in node.partitions %} {% if partition not in partitions %} diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yaml b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yaml index 7ee24efc..e3a28ac6 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yaml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yaml @@ -19,9 +19,6 @@ owner: slurm group: root mode: "0600" - notify: - - slurmdbd - - slurmctld - name: Generate random JWT Secret command: @@ -42,8 +39,6 @@ owner: root group: root mode: "0644" - notify: - - slurmrestd - name: Create system overrides directories (slurmdbdm slurmrestd) file: @@ -66,9 +61,6 @@ with_items: - slurmdbd - slurmrestd - notify: - - slurmdbd - - slurmrestd - name: Register Slurm users home dir shell: "set -o pipefail && grep slurm /etc/passwd | cut -d ':' -f 6" @@ -220,3 +212,13 @@ - slurmd - slurmdbd - slurmrestd + +- name: Restart Slurm services + systemd: + name: "{{ item }}" + state: restarted + loop: + - slurmdbd + - slurmrestd + - slurmctld + - slurmd diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm.yaml b/resources/playbook/roles/bibigrid/tasks/042-slurm.yaml index a4d47c59..d80fb67c 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm.yaml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm.yaml @@ -71,9 +71,6 @@ with_items: - slurmd - slurmctld - notify: - - slurmd - - slurmctld - name: Enable slurmctld and slurmd services systemd: @@ -93,9 +90,6 @@ owner: slurm group: root mode: 0444 - notify: - - slurmctld - - slurmd - name: Create Job Container configuration template: @@ -104,9 +98,6 @@ owner: slurm group: root mode: 0444 - notify: - - slurmctld - - slurmd - name: Slurm cgroup configuration copy: @@ -115,6 +106,8 @@ owner: slurm group: root mode: 0444 - notify: - - slurmctld - - slurmd + +- name: Restart slurmd + systemd: + name: slurmd + state: restarted diff --git a/tests/provider/test_provider.py b/tests/provider/test_provider.py index ffeec438..d6ba89c2 100644 --- a/tests/provider/test_provider.py +++ b/tests/provider/test_provider.py @@ -6,9 +6,9 @@ import os import unittest +import bibigrid.core.utility.paths.basic_path as bP from bibigrid.core import startup from bibigrid.core.utility import image_selection -import bibigrid.core.utility.paths.basic_path as bP from bibigrid.core.utility.handler import configuration_handler from bibigrid.core.utility.handler import provider_handler from bibigrid.models.exceptions import ExecutionException @@ -70,9 +70,8 @@ "MFbUTTukAiDf4jAgvJkg7ayE0MPapGpI/OhSK2gyN45VAzs2m7uykun87B491JagZ57qr16vt8vxGYpFCEe8QqAcrUszUPqyPrb0auA8bz" \ "jO8S41Kx8FfG+7eTu4dQ0= user" -CONFIGURATIONS = configuration_handler.read_configuration(logging, - os.path.join(bP.ROOT_PATH, - "resources/tests/bibigrid_test.yaml")) +CONFIGURATIONS = configuration_handler.read_configuration(logging, os.path.join(bP.ROOT_PATH, + "resources/tests/bibigrid_test.yaml")) PROVIDERS = provider_handler.get_providers(CONFIGURATIONS, logging) @@ -160,12 +159,15 @@ def test_active_server_methods(self): floating_ip = provider.attach_available_floating_ip( provider.get_external_network(configuration["network"]), provider_server) server_list = provider.list_servers() + get_server = provider.get_server("bibigrid_test_server") self.assertEqual(SERVER_KEYS, set(provider_server.keys())) self.assertEqual("bibigrid_test_keypair", provider_server["key_name"]) self.assertEqual(FLOATING_IP_KEYS, set(floating_ip.keys())) - self.assertTrue([server for server in server_list if - server["name"] == "bibigrid_test_server" and server[ - "public_v4"] == floating_ip.floating_ip_address]) + list_server = next(server for server in server_list if + server["name"] == "bibigrid_test_server" and server[ + "public_v4"] == floating_ip.floating_ip_address) + self.assertEqual("bibigrid_test_server", get_server["name"]) + self.assertEqual(get_server, list_server) provider.delete_keypair("bibigrid_test_keypair") def test_get_external_network(self): @@ -226,6 +228,15 @@ def test_get_image_mismatch(self): with self.subTest(provider.NAME): self.assertIsNone(provider.get_image_by_id_or_name("NONE")) + # TODO test_get_images + # TODO test_get_flavors + # TODO test_set_allowed_addresses + # TODO test_get_server + # TODO test_get_security_group + # TODO test_create_security_group + # TODO append_rules_to_security_group + # TODO test_delete_security_group + if CONFIGURATIONS[0].get("snapshotImage"): def test_get_snapshot(self): for provider, configuration in zip(PROVIDERS, CONFIGURATIONS): diff --git a/tests/test_create.py b/tests/test_create.py index 63dd119d..5eb60901 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -1,6 +1,7 @@ """ Module to test create """ +import os from unittest import TestCase from unittest.mock import patch, MagicMock, mock_open @@ -122,8 +123,7 @@ def test_initialize_master(self, mock_execute_ssh): ssh_data = {'floating_ip': floating_ip, 'private_key': create.KEY_FOLDER + creator.key_name, 'username': creator.ssh_user, 'commands': creator.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP, - 'filepaths': [(create.KEY_FOLDER + creator.key_name, '.ssh/id_ecdsa')], - 'gateway': {}, 'timeout': 5} + 'filepaths': [(create.KEY_FOLDER + creator.key_name, '.ssh/id_ecdsa')], 'gateway': {}, 'timeout': 5} mock_execute_ssh.assert_called_with(ssh_data, startup.LOG) def test_prepare_volumes_none(self): @@ -204,14 +204,12 @@ def test_upload_playbooks(self, mock_execute_ssh, mock_ac_ssh, mock_configure_an configuration = {} creator = create.Create([provider], [configuration], "", startup.LOG) creator.master_ip = 42 - creator.upload_data() + creator.upload_data(os.path.join(create.KEY_FOLDER, creator.key_name)) mock_configure_ansible.assert_called_with(providers=creator.providers, configurations=creator.configurations, cluster_id=creator.cluster_id, log=startup.LOG) ssh_data = {'floating_ip': creator.master_ip, 'private_key': create.KEY_FOLDER + creator.key_name, - 'username': creator.ssh_user, - 'commands': [mock_ac_ssh()] + ssh_handler.ANSIBLE_START, - 'filepaths': create.FILEPATHS, - 'gateway': {}, 'timeout': 5} + 'username': creator.ssh_user, 'commands': [mock_ac_ssh()] + ssh_handler.ANSIBLE_START, + 'filepaths': create.FILEPATHS, 'gateway': {}, 'timeout': 5} mock_execute_ssh.assert_called_with(ssh_data=ssh_data, log=startup.LOG) @patch.object(create.Create, "generate_keypair")