From d989d73a2b0c394a51ce43b16880849074135197 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Fri, 18 Oct 2024 16:37:16 +0200 Subject: [PATCH] improved readability greatly. Fixed overwriting host vars bug --- bibigrid/core/actions/create.py | 71 +++++++++---------- bibigrid/core/utility/ansible_configurator.py | 11 +-- bibigrid/core/utility/validate_schema.py | 40 +++++++---- .../markdown/features/configuration.md | 45 +++--------- .../bibigrid/files/slurm/create_server.py | 53 ++++++++------ .../bibigrid/files/slurm/delete_server.py | 4 +- 6 files changed, 108 insertions(+), 116 deletions(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 7e0e4320..98f6c84c 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -196,20 +196,17 @@ def start_vpn_or_master(self, configuration, provider): # pylint: disable=too-m image = image_selection.select_image(provider, instance["image"], self.log, configuration.get("fallbackOnOtherImage")) - volumes = self.attach_volumes(provider=provider, instance=instance, name=name) + volumes = self.create_server_volumes(provider=provider, instance=instance, name=name) # create a server and block until it is up and running + boot_volume = instance.get("bootVolume", configuration.get("bootVolume", {})) server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network, volumes=volumes, security_groups=configuration["security_groups"], wait=True, - boot_from_volume=instance.get("bootFromVolume", - configuration.get("bootFromVolume", False)), - boot_volume=instance.get("bootVolume", configuration.get("bootVolume")), - terminate_boot_volume=instance.get("terminateBootVolume", - configuration.get("terminateBootVolume", - True)), - volume_size=instance.get("bootVolumeSize", - configuration.get("bootVolumeSize", 50))) - self.attached_volumes_ansible_preparation(provider, server, instance, name) + boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_volume=bool(boot_volume), + terminate_boot_volume=boot_volume.get("terminate", True), + volume_size=boot_volume.get("size", 50)) + self.add_volume_device_info_to_instance(provider, server, instance, name) configuration["private_v4"] = server["private_v4"] self.log.debug(f"Created Server {name}: {server['private_v4']}.") @@ -239,18 +236,17 @@ def start_workers(self, worker, worker_count, configuration, provider): # pylin image = image_selection.select_image(provider, worker["image"], self.log, configuration.get("fallbackOnOtherImage")) - volumes = self.attach_volumes(provider=provider, instance=worker, name=name) + volumes = self.create_server_volumes(provider=provider, instance=worker, name=name) # create a server and block until it is up and running + boot_volume = worker.get("bootVolume", configuration.get("bootVolume", {})) server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network, volumes=volumes, security_groups=configuration["security_groups"], wait=True, - boot_from_volume=worker.get("bootFromVolume", - configuration.get("bootFromVolume", False)), - boot_volume=worker.get("bootVolume", configuration.get("bootVolume")), - terminate_boot_volume=worker.get("terminateBootVolume", - configuration.get("terminateBootVolume", - True))) - self.attached_volumes_ansible_preparation(provider, server, worker, name) + boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_volume=bool(boot_volume), + terminate_boot_volume=boot_volume.get("terminateBoot", True), + volume_size=boot_volume.get("size", 50)) + self.add_volume_device_info_to_instance(provider, server, worker, name) self.log.info(f"Worker {name} started on {provider.cloud_specification['identifier']}.") with self.worker_thread_lock: @@ -264,30 +260,31 @@ def start_workers(self, worker, worker_count, configuration, provider): # pylin ansible_configurator.write_yaml(a_rp.HOSTS_FILE, hosts, self.log) self.log.debug(f"Added worker {name} to hosts file {a_rp.HOSTS_FILE}.") - def attach_volumes(self, provider, instance, name): + def create_server_volumes(self, provider, instance, name): self.log.info("Creating volumes ...") - volumes = [] - for i, attach_volume in enumerate(instance.get("attachVolumes", [])): + return_volumes = [] + for i, volume in enumerate(instance.get("volumes", [])): volume_name = f"{name}-{i}" self.log.debug(f"Created volume {volume_name}") - volume = provider.create_volume(size=attach_volume.get("size", 50), name=volume_name) - attach_volume["name"] = volume_name - volumes.append(volume) - return volumes - - def attached_volumes_ansible_preparation(self, provider, server, instance, name): - server_volumes = provider.get_mount_info_from_server(server) # list of attached volumes - attach_volumes = instance.get("attachVolumes", []) - if attach_volumes: - for attach_volume in attach_volumes: + volume["name"] = volume_name + return_volume = provider.create_volume(size=volume.get("size", 50), name=volume_name) + return_volumes.append(return_volume) + return return_volumes + + def add_volume_device_info_to_instance(self, provider, server, instance, name): + server_volumes = provider.get_mount_info_from_server(server) # list of volumes attachments + volumes = instance.get("volumes") + if volumes: + for volume in volumes: server_volume = next((server_volume for server_volume in server_volumes if - server_volume["name"] == attach_volume["name"]), None) - attach_volume["device"] = server_volume.get("device") - self.log.debug(f"Added Configuration: Instance {name} has volume {attach_volume['name']} " - f"as device {attach_volume['device']} that is going to be mounted to " - f"{attach_volume['mountPoint']}") + server_volume["name"] == volume["name"]), None) + volume["device"] = server_volume.get("device") + + self.log.debug(f"Added Configuration: Instance {name} has volume {volume['name']} " + f"as device {volume['device']} that is going to be mounted to " + f"{volume['mountPoint']}") else: - instance["attachVolumes"] = [] + instance["volumes"] = [] def prepare_vpn_or_master_args(self, configuration, provider): """ diff --git a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index abf8a59a..607cd354 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -91,7 +91,6 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py name = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=f"[{worker_count}-{worker_count + worker.get('count', 1) - 1}]") group_name = name.replace("[", "").replace("]", "").replace(":", "_").replace("-", "_") - worker_count += worker.get('count', 1) regexp = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=r"\d+") worker_dict = {"name": name, "regexp": regexp, "image": worker["image"], "network": configuration["network"], "flavor": flavor_dict, @@ -99,6 +98,7 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py "cloud_identifier": configuration["cloud_identifier"], "on_demand": worker.get("onDemand", True), "state": "CLOUD", "partitions": worker.get("partitions", []) + ["all", configuration["cloud_identifier"]], + "boot_volume": worker.get("bootVolume", configuration.get("bootVolume", {})) } worker_features = worker.get("features", []) @@ -111,8 +111,11 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py pass_through(configuration, worker_dict, "waitForServices", "wait_for_services") write_yaml(os.path.join(aRP.GROUP_VARS_FOLDER, f"{group_name}.yaml"), worker_dict, log) for worker_number in range(worker.get('count', 1)): - name = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=worker_number) - write_yaml(os.path.join(aRP.HOST_VARS_FOLDER, f"{name}.yaml"), {"volumes": worker.get("attachVolumes", [])}, log) + name = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=worker_count+worker_number) + write_yaml(os.path.join(aRP.HOST_VARS_FOLDER, f"{name}.yaml"), {"volumes": worker.get("volumes", [])}, + log) + worker_count += worker.get('count', 1) + vpngtw = configuration.get("vpnInstance") if vpngtw: name = create.VPN_WORKER_IDENTIFIER(cluster_id=cluster_id, additional=f"{vpn_count}") @@ -141,7 +144,7 @@ def write_host_and_group_vars(configurations, providers, cluster_id, log): # py "network_cidrs": configuration["subnet_cidrs"], "floating_ip": configuration["floating_ip"], "flavor": flavor_dict, "private_v4": configuration["private_v4"], "cloud_identifier": configuration["cloud_identifier"], - "volumes": configuration["masterInstance"]["attachVolumes"], + "volumes": configuration["masterInstance"]["volumes"], "fallback_on_other_image": configuration.get("fallbackOnOtherImage", False), "state": "UNKNOWN" if configuration.get("useMasterAsCompute", True) else "DRAINED", "on_demand": False, diff --git a/bibigrid/core/utility/validate_schema.py b/bibigrid/core/utility/validate_schema.py index 6e67a08a..559cce8c 100644 --- a/bibigrid/core/utility/validate_schema.py +++ b/bibigrid/core/utility/validate_schema.py @@ -5,15 +5,21 @@ from schema import Schema, Optional, Or, SchemaError WORKER = {'type': str, 'image': str, Optional('count'): int, Optional('onDemand'): bool, Optional('partitions'): [str], - Optional('features'): [str], - Optional('bootVolume'): str, - Optional('bootFromVolume'): bool, Optional('terminateBootVolume'): bool, Optional('bootVolumeSize'): int, - } + Optional('features'): [str], + Optional('bootVolume'): { + Optional('name'): str, + Optional('terminate'): bool, + Optional('size'): int + }, + } MASTER = VPN = {'type': str, 'image': str, Optional('onDemand'): bool, Optional('partitions'): [str], - Optional('features'): [str], - Optional('bootVolume'): str, - Optional('bootFromVolume'): bool, Optional('terminateBootVolume'): bool, Optional('bootVolumeSize'): int, - } + Optional('features'): [str], + Optional('bootVolume'): { + Optional('name'): str, + Optional('terminate'): bool, + Optional('size'): int + }, + } # Define the schema for the configuration file master_schema = Schema( @@ -31,22 +37,30 @@ 'ResumeTimeout'): int, Optional('TreeWidth'): int}}, Optional('zabbix'): bool, Optional('nfs'): bool, Optional('ide'): bool, Optional('useMasterAsCompute'): bool, - Optional('useMasterWithPublicIp'): bool, Optional('waitForServices'): [str], Optional('bootVolume'): str, - Optional('bootFromVolume'): bool, Optional('terminateBootVolume'): bool, Optional('bootVolumeSize'): int, + Optional('useMasterWithPublicIp'): bool, Optional('waitForServices'): [str], Optional('gateway'): {'ip': str, 'portFunction': str}, Optional('dontUploadCredentials'): bool, Optional('fallbackOnOtherImage'): bool, Optional('localDNSLookup'): bool, Optional('features'): [str], 'workerInstances': [ WORKER], 'masterInstance': MASTER, Optional('vpngtw'): {'type': str, 'image': str}, - Optional('bootVolume'): str, - Optional('bootFromVolume'): bool, Optional('terminateBootVolume'): bool, Optional('bootVolumeSize'): int + Optional('bootVolume'): { + Optional('name'): str, + Optional('terminate'): bool, + Optional('size'): int + }, }) other_schema = Schema( {'infrastructure': str, 'cloud': str, 'sshUser': str, Or('subnet', 'network'): str, 'cloud_identifier': str, Optional('waitForServices'): [str], Optional('features'): [str], 'workerInstances': [ - WORKER], 'vpnInstance': VPN}) + WORKER], 'vpnInstance': VPN, + Optional('bootVolume'): { + Optional('name'): str, + Optional('terminate'): bool, + Optional('size'): int + }, + }) def validate_configurations(configurations, log): diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index 664087c8..480960d5 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -70,38 +70,6 @@ cloudScheduling: sshTimeout: 5 ``` -#### masterMounts (optional:False) - -`masterMounts` expects a list of volumes and snapshots. Those will be attached to the master. If any snapshots are -given, volumes are first created from them. Volumes are not deleted after Cluster termination. - -```yaml -masterMounts: - - name: test # name of the volume to be attached - mountPoint: /vol/spool2 # where attached volume is to be mount to (optional) -``` - -`masterMounts` can be combined with [nfsshares](#nfsshares-optional). -The following example attaches volume test to our master instance and mounts it to `/vol/spool2`. -Then it creates an nfsshare on `/vol/spool2` allowing workers to access the volume test. - -```yaml -masterMounts: - - name: test # name of the volume to be attached - mountPoint: /vol/spool2 # where attached volume is to be mount to (optional) - -nfsshares: - - /vol/spool2 -``` - -
- - What is mounting? - - -[Mounting](https://man7.org/linux/man-pages/man8/mount.8.html) adds a new filesystem to the file tree allowing access. -
- #### nfsShares (optional) `nfsShares` expects a list of folder paths to share over the network using nfs. @@ -263,10 +231,14 @@ workerInstance: features: # optional - hasdatabase - holdsinformation - bootVolume: False - bootFromVolume: True - terminateBootVolume: True - bootVolumeSize: 50 + volumes: + - mountPoint: /vol/test + size: 50 + fstype: ext4 + bootVolume: + name: False + terminate: True + size: 50 ``` - `type` sets the instance's hardware configuration. @@ -279,7 +251,6 @@ workerInstance: - `bootFromVolume` (optional:False) if True, the instance will boot from a volume created for this purpose. - `terminateBootVolume` (optional:True) if True, the boot volume will be terminated when the server is terminated. - `bootVolumeSize` (optional:50) if a boot volume is created, this sets its size. - ##### Find your active `images` ```commandline diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index dddcac69..3ffd0fb9 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -85,27 +85,27 @@ def get_server_vars(name): return server_vars -def attach_volumes(provider, host_vars, name): +def create_server_volumes(provider, host_vars, name): logging.info("Creating volumes ...") - attach_volumes = host_vars.get('volumes', []) - volumes = [] + volumes = host_vars.get('volumes', []) + return_volumes = [] host_vars_path = f"/opt/playbook/host_vars/{name}.yaml" with FileLock(f"{host_vars_path}.lock"): - logging.info(f"Instance Volumes {attach_volumes}") - for i, attach_volume in enumerate(attach_volumes): - logging.info(f"{i}: {attach_volume}") + logging.info(f"Instance Volumes {volumes}") + for i, volume in enumerate(volumes): + logging.info(f"{i}: {volume}") volume_name = f"{name}-{i}" logging.info(f"Creating volume {volume_name}") - volume = provider.create_volume(size=attach_volume.get("size", 50), name=volume_name) - attach_volume["name"] = volume_name - volumes.append(volume) + volume["name"] = volume_name + return_volume = provider.create_volume(size=volume.get("size", 50), name=volume_name) + return_volumes.append(return_volume) with open(host_vars_path, mode="w+", encoding="utf-8") as host_vars_file: yaml.dump(host_vars, host_vars_file) - return volumes + return return_volumes -def attached_volumes_host_vars_update(connection, server, host_vars): +def volumes_host_vars_update(connection, server, host_vars): logging.info("Updating host vars volume info") host_vars_path = f"/opt/playbook/host_vars/{server['name']}.yaml" @@ -120,16 +120,17 @@ def attached_volumes_host_vars_update(connection, server, host_vars): server_attachment.append({"name": volume["name"], "device": attachment["device"]}) break # add device info - attach_volumes = host_vars.get("volumes", []) - if attach_volumes: - for attach_volume in attach_volumes: - logging.info(f"Finding device for {attach_volume['name']}.") + volumes = host_vars.get("volumes", []) + if volumes: + for volume in volumes: + logging.info(f"Finding device for {volume['name']}.") server_volume = next((server_volume for server_volume in server_attachment if - server_volume["name"] == attach_volume["name"]), None) - attach_volume["device"] = server_volume.get("device") - logging.debug(f"Added Configuration: Instance {server['name']} has volume {attach_volume['name']} " - f"as device {attach_volume['device']} that is going to be mounted to " - f"{attach_volume['mountPoint']}") + server_volume["name"] == volume["name"]), None) + volume["device"] = server_volume.get("device") + + logging.debug(f"Added Configuration: Instance {server['name']} has volume {volume['name']} " + f"as device {volume['device']} that is going to be mounted to " + f"{volume['mountPoint']}") with open(host_vars_path, mode="w+", encoding="utf-8") as host_vars_file: yaml.dump(host_vars, host_vars_file) logging.info(f"{host_vars_path}.lock released") @@ -183,12 +184,18 @@ def start_server(name, start_worker_group, start_data): # create server and ... image = select_image(start_worker_group, connection) host_vars = get_server_vars(name) - volumes = attach_volumes(connection, host_vars, name) + volumes = create_server_volumes(connection, host_vars, name) + boot_volume = start_worker_group.get("bootVolume", {}) server = connection.create_server(name=name, flavor=start_worker_group["flavor"]["name"], image=image, network=start_worker_group["network"], key_name=f"tempKey_bibi-{common_config['cluster_id']}", security_groups=[f"default-{common_config['cluster_id']}"], userdata=userdata, - volumes=volumes, wait=False) + volumes=volumes, wait=False, + boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_volume=bool(boot_volume), + terminate_volume=boot_volume.get("terminate", True), + volume_size=boot_volume.get("size", 50) + ) # ... add it to server start_data["started_servers"].append(server) try: @@ -207,7 +214,7 @@ def start_server(name, start_worker_group, start_data): logging.warning(f"{exc}: Couldn't connect to {server.name}.") server_start_data["connection_exceptions"].append(server.name) logging.info("Update hosts.yaml") - attached_volumes_host_vars_update(connection, server, host_vars) + volumes_host_vars_update(connection, server, host_vars) update_hosts(server.name, server.private_v4) except OpenStackCloudException as exc: diff --git a/resources/playbook/roles/bibigrid/files/slurm/delete_server.py b/resources/playbook/roles/bibigrid/files/slurm/delete_server.py index fc1c0049..541edb06 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/delete_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/delete_server.py @@ -67,13 +67,13 @@ possible_workers = result.stdout.decode("utf-8").strip().split("\n") if terminate_worker in possible_workers: connection = connections[worker_group["cloud_identifier"]] - result = connection.delete_server(terminate_worker) + result = connection.delete_server(terminate_worker, wait=True) logging.info(f"Deleting Volumes") volume_list = connection.list_volumes() volume_regex = re.compile(fr"^{terminate_worker}-(\d+)$") for volume in volume_list: if volume_regex.match(volume["name"]): - logging.info(f"Trying to delete volume {volume['name']}: {0}") #connection.delete_volume(volume)}") + logging.info(f"Trying to delete volume {volume['name']}: {connection.delete_volume(volume)}") if not result: logging.warning(f"Couldn't delete worker {terminate_worker}: Server doesn't exist") else: