updated bibigrid.yaml and aligned naming of bootVolume and volume
XaverStiensmeier committed Oct 31, 2024
1 parent 62a64b8 commit 7a25d0b
Showing 3 changed files with 77 additions and 71 deletions.
142 changes: 74 additions & 68 deletions bibigrid.yaml
@@ -1,110 +1,116 @@
# For an easy introduction see https://github.com/deNBI/bibigrid_clum
# For more detailed information see https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md
# First configuration also holds general cluster information and must include the master.
# All other configurations (only needed for hybrid cloud setups) must not include another master, but exactly one vpngtw instead.

- infrastructure: openstack # former mode. Describes what cloud provider is used (others are not implemented yet)
cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
- # -- BEGIN: GENERAL CLUSTER INFORMATION --
# The following options configure cluster wide keys
# Modify these according to your requirements

# -- BEGIN: GENERAL CLUSTER INFORMATION --
# sshTimeout: 5 # number of attempts to connect to instances during startup, with a delay in between
# cloudScheduling:
#   sshTimeout: 5 # like sshTimeout, but for on-demand scheduling on the running cluster

## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
#sshPublicKeyFiles:
# - [public key one]
## sshPublicKeyFiles listed here will be added to the master's authorized_keys. A temporary key is stored at ~/.config/bibigrid/keys
# sshPublicKeyFiles:
# - [public key one]
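## A minimal filled-in sketch (the path is a placeholder, not a default):
# sshPublicKeyFiles:
#   - ~/.ssh/id_ed25519.pub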

## Volumes and snapshots that will be mounted to master
#masterMounts: (optional) # WARNING: will overwrite unidentified filesystems
# - name: [volume name]
# mountPoint: [where to mount to] # (optional)
# nfsShares: # list of nfs shares. /vol/spool/ is automatically created as an nfs if nfs is true
# - [nfsShare one]

#nfsShares: # /vol/spool/ is automatically created as an nfs
# - [nfsShare one]
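## Hypothetical filled-in example (volume and share names are placeholders):
# masterMounts:
#   - name: my-data-volume
#     mountPoint: /vol/data
# nfsShares:
#   - /vol/data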

# userRoles: # see ansible_hosts for all options
## Ansible Related
# userRoles: # see ansible_hosts for all 'hosts' options
# - hosts:
# - "master"
# roles: # roles placed in resources/playbook/roles_user
# - name: "resistance_nextflow"
# varsFiles: # (optional)
# - [...]
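## Hypothetical filled-in example (role and file names are placeholders; varsFiles
## is assumed to belong to the role entry):
# userRoles:
#   - hosts:
#       - "master"
#     roles:
#       - name: "my_analysis_role"
#         varsFiles:
#           - vars/my_analysis_vars.yaml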

## Uncomment if you don't want to assign a public ip to the master; for internal clusters (Tuebingen).
## If you use a gateway or start a cluster from within the cloud, your master does not need a public ip.
# useMasterWithPublicIp: False # defaults to True; if False, no public ip (floating ip) will be allocated
# gateway: # if you want to use a gateway for cluster creation.
# ip: # IP of gateway to use
# portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4
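## Worked example (assuming oct1-oct4 are the octets of the instance's private ip):
## for a master at 10.0.0.7, portFunction 30000 + oct4 yields gateway port 30007.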

# deleteTmpKeypairAfter: False
# dontUploadCredentials: False
## Only relevant for specific projects (e.g. SimpleVM)
# deleteTmpKeypairAfter: False # warning: if you don't pass a key via sshPublicKeyFiles you lose access!
# dontUploadCredentials: False # warning: enabling this prevents you from scheduling on demand!

## Additional Software
# zabbix: False
# nfs: False
# ide: False # installs a web ide on the master node. A nice way to view your cluster (like Visual Studio Code)

### Slurm Related
# elastic_scheduling: # for large or slow clusters, increasing these timeouts might be necessary to avoid failures
#   SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows the node to be powered up again
#   ResumeTimeout: 1200 # if a node doesn't start within ResumeTimeout seconds, the start is considered failed.
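## Illustrative values for a large or slow cloud site (not defaults; tune to your site):
# elastic_scheduling:
#   SuspendTimeout: 120
#   ResumeTimeout: 1800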

# cloudScheduling:
# sshTimeout: 5 # like sshTimeout but during the on demand scheduling on the running cluster

# Other keys - these are default False
# Usually Ignored
##localFS: True
##localDNSlookup: True
# useMasterAsCompute: True

#zabbix: True
#nfs: True
#ide: True # A nice way to view your cluster as if you were using Visual Studio Code
# -- END: GENERAL CLUSTER INFORMATION --

useMasterAsCompute: True
# -- BEGIN: MASTER CLOUD INFORMATION --
infrastructure: openstack # formerly 'mode'; describes which cloud provider is used (others are not implemented yet)
cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)

# bootFromVolume: False
# terminateBootVolume: True
# bootVolumeSize: 50

# waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
# waitForServices: # list of existing service names that affect apt. BiBiGrid's playbook will wait until each such service is "stopped" to avoid issues
# - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld

# master configuration
## master configuration
masterInstance:
type: # existing type/flavor on your cloud. See launch instance>flavor for options
image: # existing active image on your cloud. Consider using regex to prevent image updates from breaking your running cluster
type: # existing type/flavor from your cloud. See launch instance>flavor for options
image: # existing active image from your cloud. Consider using regex to prevent image updates from breaking your running cluster
# features: # list
# - feature1
# partitions: # list
# bootVolume: None
# bootFromVolume: True
# terminateBootVolume: True
# bootVolumeSize: 50

# -- END: GENERAL CLUSTER INFORMATION --
# - partition1
# bootVolume: # optional
# name: # optional; if you want to boot from a specific volume
# terminate: True # whether the volume is terminated on server termination
# size: 50
# volumes: # optional
# - name: volumeName # empty for temporary volumes
# snapshot: snapshotName # optional; to create volume from a snapshot
# mountPoint: /vol/mountPath
# size: 50
# fstype: ext4 # must support chown
# semiPermanent: False # if True, the volume is only deleted during cluster termination
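## Hypothetical filled-in master example (flavor, image regex and volume names are
## placeholders for what your cloud actually offers):
# masterInstance:
#   type: de.NBI default
#   image: ^Ubuntu 22\.04 LTS \(.*\)$
#   bootVolume:
#     name: my-boot-volume
#     terminate: False
#     size: 50
#   volumes:
#     - name: my-data-volume
#       mountPoint: /vol/data
#       size: 100
#       fstype: ext4
#       semiPermanent: True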

# fallbackOnOtherImage: False # if True, the most similar image by name will be picked. A regex can also be given instead.

# worker configuration
## worker configuration
# workerInstances:
# - type: # existing type/flavor on your cloud. See launch instance>flavor for options
# - type: # existing type/flavor from your cloud. See launch instance>flavor for options
# image: # same as master. Consider using regex to prevent image updates from breaking your running cluster
# count: # any number of workers you would like to create with set type, image combination
# count: # number of workers to create with the given type/image combination
# # features: # list
# # partitions: # list
# # bootVolume: None
# # bootFromVolume: True
# # terminateBootVolume: True
# # bootVolumeSize: 50
# # volumes: # see documentation
# # - mountPoint:
# # partitions: # list of slurm partitions that all nodes of this group belong to
# # bootVolume: # optional
# # name: # optional; if you want to boot from a specific volume
# # terminate: True # whether the volume is terminated on server termination
# # size: 50
# # fstype: ext4


# Depends on cloud image
sshUser: # for example ubuntu
# # volumes: # optional
# # - name: volumeName # empty for temporary volumes
# # snapshot: snapshotName # optional; to create volume from a snapshot
# # mountPoint: /vol/mountPath
# # size: 50
# # fstype: ext4 # must support chown
# # semiPermanent: False # if True, the volume is only deleted during cluster termination
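## Hypothetical filled-in worker example (type, image regex and sizes are placeholders):
# workerInstances:
#   - type: de.NBI default
#     image: ^Ubuntu 22\.04 LTS \(.*\)$
#     count: 2
#     volumes:
#       - mountPoint: /vol/scratch
#         size: 100
#         fstype: ext4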

# Depends on cloud site and project
subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
# or network:
# Depends on image
sshUser: # for example 'ubuntu'

# Uncomment if no full DNS service for started instances is available.
# Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen.
#localDNSLookup: True
# Depends on project
subnet: # existing subnet from your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
# network: # only if no subnet is given

#features: # list
# features: # list of slurm features that all nodes of this cloud have
# - feature1

# elastic_scheduling: # for large or slow clusters increasing these timeouts might be necessary to avoid failures
# SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows to power up the node again
# ResumeTimeout: 1200 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed.
# bootVolume: # optional (cloud wide)
# name: # optional; if you want to boot from a specific volume
# terminate: True # whether the volume is terminated on server termination
# size: 50
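## Precedence note: in create.py (below), the boot volume is resolved via
## instance.get("bootVolume", configuration.get("bootVolume", {})), so an
## instance-level bootVolume completely replaces this cloud-wide default.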

#- [next configurations]
4 changes: 2 additions & 2 deletions bibigrid/core/actions/create.py
@@ -202,7 +202,7 @@ def start_vpn_or_master(self, configuration, provider): # pylint: disable=too-m
boot_volume = instance.get("bootVolume", configuration.get("bootVolume", {}))
server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network,
volumes=volumes, security_groups=configuration["security_groups"], wait=True,
boot_from_volume=boot_volume.get("bootFromVolume", False),
boot_from_volume=boot_volume.get("name", False),
boot_volume=bool(boot_volume),
terminate_boot_volume=boot_volume.get("terminate", True),
volume_size=boot_volume.get("size", 50))
@@ -242,7 +242,7 @@ def start_workers(self, worker, worker_count, configuration, provider): # pylin
boot_volume = worker.get("bootVolume", configuration.get("bootVolume", {}))
server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network,
volumes=volumes, security_groups=configuration["security_groups"], wait=True,
- boot_from_volume=boot_volume.get("bootFromVolume", False),
+ boot_from_volume=boot_volume.get("name", False),
boot_volume=bool(boot_volume),
terminate_boot_volume=boot_volume.get("terminateBoot", True),
volume_size=boot_volume.get("size", 50))
@@ -240,7 +240,7 @@ def start_server(name, start_worker_group, start_data):
key_name=f"tempKey_bibi-{common_config['cluster_id']}",
security_groups=[f"default-{common_config['cluster_id']}"], userdata=userdata,
volumes=volumes, wait=False,
- boot_from_volume=boot_volume.get("bootFromVolume", False),
+ boot_from_volume=boot_volume.get("name", False),
boot_volume=bool(boot_volume),
terminate_volume=boot_volume.get("terminate", True),
volume_size=boot_volume.get("size", 50)
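
All three changed call sites resolve the boot volume from the new-style bootVolume mapping, with an instance-level entry taking precedence over the configuration-wide one. Note that start_workers reads the key "terminateBoot" where the other two call sites read "terminate". A minimal standalone sketch of the shared lookup semantics (the helper name is hypothetical, not part of BiBiGrid's API):

def boot_volume_kwargs(instance, configuration):
    # An instance-level bootVolume overrides the cloud-wide default.
    boot_volume = instance.get("bootVolume", configuration.get("bootVolume", {}))
    return {
        # Truthy exactly when a specific volume name is configured.
        "boot_from_volume": boot_volume.get("name", False),
        # Any non-empty bootVolume section requests booting from a volume.
        "boot_volume": bool(boot_volume),
        "terminate_boot_volume": boot_volume.get("terminate", True),
        "volume_size": boot_volume.get("size", 50),
    }

# Example: {"bootVolume": {"name": "my-boot-volume", "size": 100}} yields
# boot_from_volume="my-boot-volume", boot_volume=True,
# terminate_boot_volume=True, volume_size=100.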
