From 7a25d0bf3c6937ad2c5b0076e34b30c1388d3322 Mon Sep 17 00:00:00 2001
From: XaverStiensmeier
Date: Thu, 31 Oct 2024 14:18:10 +0100
Subject: [PATCH] updated bibigrid.yaml and aligned naming of bootVolume and volume

---
 bibigrid.yaml                             | 142 +++++++++---------
 bibigrid/core/actions/create.py           |   4 +-
 .../bibigrid/files/slurm/create_server.py |   2 +-
 3 files changed, 77 insertions(+), 71 deletions(-)

diff --git a/bibigrid.yaml b/bibigrid.yaml
index 46b5c218..2c7bec28 100644
--- a/bibigrid.yaml
+++ b/bibigrid.yaml
@@ -1,29 +1,21 @@
 # For an easy introduction see https://github.com/deNBI/bibigrid_clum
 # For more detailed information see https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md
 
-# First configuration also holds general cluster information and must include the master.
-# All other configurations (only needed for hybrid cloud setups) must not include another master, but exactly one vpngtw instead.
-- infrastructure: openstack # former mode. Describes what cloud provider is used (others are not implemented yet)
-  cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
+- # -- BEGIN: GENERAL CLUSTER INFORMATION --
+  # The following options configure cluster-wide keys
+  # Modify these according to your requirements
 
-  # -- BEGIN: GENERAL CLUSTER INFORMATION --
   # sshTimeout: 5 # number of attempts to connect to instances during startup with delay in between
-  # cloudScheduling:
-  #   sshTimeout: 5 # like sshTimeout but during the on demand scheduling on the running cluster
 
-  ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
-  #sshPublicKeyFiles:
-  #  - [public key one]
+  ## sshPublicKeyFiles listed here will be added to the master's authorized_keys. A temporary key is stored at ~/.config/bibigrid/keys
+  # sshPublicKeyFiles:
+  #   - [public key one]
 
-  ## Volumes and snapshots that will be mounted to master
-  #masterMounts: (optional) # WARNING: will overwrite unidentified filesystems
-  #  - name: [volume name]
-  #    mountPoint: [where to mount to] # (optional)
+  # nfsShares: # list of nfs shares. /vol/spool/ is automatically created as an nfs if nfs is true
+  #   - [nfsShare one]
 
-  #nfsShares: /vol/spool/ is automatically created as a nfs
-  #  - [nfsShare one]
-
-  # userRoles: # see ansible_hosts for all options
+  ## Ansible Related
+  # userRoles: # see ansible_hosts for all 'hosts' options
   #  - hosts:
   #    - "master"
   #    roles: # roles placed in resources/playbook/roles_user
@@ -31,80 +23,94 @@
   #      - [role name]
   #    varsFiles: # (optional)
   #      - [...]
 
-  ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen).
+  ## If you use a gateway or start the cluster from within the cloud, your master does not need a public ip.
   # useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated
   # gateway: # if you want to use a gateway for create.
   #   ip: # IP of gateway to use
   #   portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4
 
-  # deleteTmpKeypairAfter: False
-  # dontUploadCredentials: False
+  ## Only relevant for specific projects (e.g. SimpleVM)
+  # deleteTmpKeypairAfter: False # warning: if you don't pass a key via sshPublicKeyFiles you lose access!
+  # dontUploadCredentials: False # warning: enabling this prevents you from scheduling on demand!
+
+  ## Additional Software
+  # zabbix: False
+  # nfs: False
+  # ide: False # installs a web IDE on the master node. A nice way to view your cluster (like Visual Studio Code)
+
+  ### Slurm Related
+  # elastic_scheduling: # for large or slow clusters increasing these timeouts might be necessary to avoid failures
+  #   SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows to power up the node again
+  #   ResumeTimeout: 1200 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed.
+
+  # cloudScheduling:
+  #   sshTimeout: 5 # like sshTimeout but during the on demand scheduling on the running cluster
 
-  # Other keys - these are default False
-  # Usually Ignored
-  ##localFS: True
-  ##localDNSlookup: True
+  # useMasterAsCompute: True
 
-  #zabbix: True
-  #nfs: True
-  #ide: True # A nice way to view your cluster as if you were using Visual Studio Code
+  # -- END: GENERAL CLUSTER INFORMATION --
 
-  useMasterAsCompute: True
+  # -- BEGIN: MASTER CLOUD INFORMATION --
+  infrastructure: openstack # former mode. Describes what cloud provider is used (others are not implemented yet)
+  cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
 
-  # bootFromVolume: False
-  # terminateBootVolume: True
-  # bootVolumeSize: 50
-
-  # waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
+  # waitForServices: # list of existing service names that affect apt. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
   #  - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld
 
-  # master configuration
+  ## master configuration
   masterInstance:
-    type: # existing type/flavor on your cloud. See launch instance>flavor for options
-    image: # existing active image on your cloud. Consider using regex to prevent image updates from breaking your running cluster
+    type: # existing type/flavor from your cloud. See launch instance>flavor for options
+    image: # existing active image from your cloud. Consider using regex to prevent image updates from breaking your running cluster
     # features: # list
+    #   - feature1
     # partitions: # list
-    # bootVolume: None
-    # bootFromVolume: True
-    # terminateBootVolume: True
-    # bootVolumeSize: 50
-
-  # -- END: GENERAL CLUSTER INFORMATION --
+    #   - partition1
+    # bootVolume: # optional
+    #   name: # optional; if you want to boot from a specific volume
+    #   terminate: True # whether the volume is terminated on server termination
+    #   size: 50
+    # volumes: # optional
+    #   - name: volumeName # empty for temporary volumes
+    #     snapshot: snapshotName # optional; to create volume from a snapshot
+    #     mountPoint: /vol/mountPath
+    #     size: 50
+    #     fstype: ext4 # must support chown
+    #     semiPermanent: False # if True, the volume is only deleted during cluster termination
     # fallbackOnOtherImage: False # if True, most similar image by name will be picked. A regex can also be given instead.
 
-  # worker configuration
+  ## worker configuration
   # workerInstances:
-  #   - type: # existing type/flavor on your cloud. See launch instance>flavor for options
+  #   - type: # existing type/flavor from your cloud. See launch instance>flavor for options
   #     image: # same as master. Consider using regex to prevent image updates from breaking your running cluster
   #     count: # number of workers you would like to create with set type, image combination
   #     # features: # list
-  #     # partitions: # list
-  #     # bootVolume: None
-  #     # bootFromVolume: True
-  #     # terminateBootVolume: True
-  #     # bootVolumeSize: 50
-  #     # volumes: # see documentation
-  #     #   - mountPoint:
+  #     # partitions: # list of slurm partitions that all nodes of this group are in
+  #     # bootVolume: # optional
+  #     #   name: # optional; if you want to boot from a specific volume
+  #     #   terminate: True # whether the volume is terminated on server termination
   #     #     size: 50
-  #     #     fstype: ext4
-
-
-  # Depends on cloud image
-  sshUser: # for example ubuntu
+  #     # volumes: # optional
+  #     #   - name: volumeName # empty for temporary volumes
+  #     #     snapshot: snapshotName # optional; to create volume from a snapshot
+  #     #     mountPoint: /vol/mountPath
+  #     #     size: 50
+  #     #     fstype: ext4 # must support chown
+  #     #     semiPermanent: False # if True, the volume is only deleted during cluster termination
+
+  # Depends on image
+  sshUser: # for example 'ubuntu'
 
-  # Depends on cloud site and project
-  subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
-  # or network:
+  # Depends on project
+  subnet: # existing subnet from your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
+  # network: # only if no subnet is given
 
-  # Uncomment if no full DNS service for started instances is available.
-  # Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen.
-  #localDNSLookup: True
-
-  #features: # list
+  # features: # list of slurm features that all nodes of this cloud have
+  #   - feature1
 
-  # elastic_scheduling: # for large or slow clusters increasing these timeouts might be necessary to avoid failures
-  #   SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows to power up the node again
-  #   ResumeTimeout: 1200 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed.
+ # bootVolume: # optional (cloud wide) + # name: # optional; if you want to boot from a specific volume + # terminate: True # whether the volume is terminated on server termination + # size: 50 #- [next configurations] diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 1258aab8..effc8ca6 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -202,7 +202,7 @@ def start_vpn_or_master(self, configuration, provider): # pylint: disable=too-m boot_volume = instance.get("bootVolume", configuration.get("bootVolume", {})) server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network, volumes=volumes, security_groups=configuration["security_groups"], wait=True, - boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_from_volume=boot_volume.get("name", False), boot_volume=bool(boot_volume), terminate_boot_volume=boot_volume.get("terminate", True), volume_size=boot_volume.get("size", 50)) @@ -242,7 +242,7 @@ def start_workers(self, worker, worker_count, configuration, provider): # pylin boot_volume = worker.get("bootVolume", configuration.get("bootVolume", {})) server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network, volumes=volumes, security_groups=configuration["security_groups"], wait=True, - boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_from_volume=boot_volume.get("name", False), boot_volume=bool(boot_volume), terminate_boot_volume=boot_volume.get("terminateBoot", True), volume_size=boot_volume.get("size", 50)) diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index efa71d2a..0134785a 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -240,7 +240,7 @@ def start_server(name, start_worker_group, start_data): key_name=f"tempKey_bibi-{common_config['cluster_id']}", security_groups=[f"default-{common_config['cluster_id']}"], userdata=userdata, volumes=volumes, wait=False, - boot_from_volume=boot_volume.get("bootFromVolume", False), + boot_from_volume=boot_volume.get("name", False), boot_volume=bool(boot_volume), terminate_volume=boot_volume.get("terminate", True), volume_size=boot_volume.get("size", 50)
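
To make the renamed keys concrete, here is a minimal sketch (not part of the patch) of how a bootVolume block from the new bibigrid.yaml ends up in the keyword arguments of the patched create_server calls. The flavor, image, and volume names are illustrative placeholders, and resolve_boot_volume_kwargs is a hypothetical helper written only for this example, not a BiBiGrid function; the .get() keys and defaults are taken directly from the diff above.

# Illustrative sketch, assuming the configuration has already been parsed from YAML into dicts.
instance = {
    "type": "de.NBI small",          # placeholder flavor
    "image": "Ubuntu 22.04 LTS",     # placeholder image
    "bootVolume": {                  # per-instance bootVolume overrides the cloud-wide one
        "name": "my-boot-volume",    # hypothetical existing volume to boot from
        "terminate": False,          # keep the volume when the server is terminated
        "size": 100,
    },
}
configuration = {"bootVolume": {}}   # cloud-wide default: no boot volume settings


def resolve_boot_volume_kwargs(instance, configuration):
    """Mirror the .get() defaults used by the patched create_server calls."""
    boot_volume = instance.get("bootVolume", configuration.get("bootVolume", {}))
    return {
        "boot_from_volume": boot_volume.get("name", False),           # volume name, or False
        "boot_volume": bool(boot_volume),                             # any bootVolume settings given?
        "terminate_boot_volume": boot_volume.get("terminate", True),
        "volume_size": boot_volume.get("size", 50),
    }


if __name__ == "__main__":
    # In BiBiGrid these values are passed on to provider.create_server(...) as shown in the diff.
    print(resolve_boot_volume_kwargs(instance, configuration))

With an empty or absent bootVolume mapping the defaults fall back to boot_from_volume=False, terminate on server termination, and a 50 GB volume size, matching the defaults encoded in the .get() calls of the patch.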