From 32a3d13e8226e2cf9ae8c55e7bdc8d3183660072 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 15:38:35 +0200 Subject: [PATCH 01/11] added cloudScheduling and userRoles in bibigrid.yml --- bibigrid.yml | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/bibigrid.yml b/bibigrid.yml index 584c4bb7..2fcfcbc3 100644 --- a/bibigrid.yml +++ b/bibigrid.yml @@ -8,6 +8,8 @@ # -- BEGIN: GENERAL CLUSTER INFORMATION -- # sshTimeout: 5 # Number of ssh connection attempts with 2^attempt seconds in between (2^sshTimeout-1 is the max time before returning with an error) + # cloudScheduling: + # sshTimeout: 42 # like the sshTimeout during startup but during the on demand scheduling ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself. #sshPublicKeyFiles: @@ -21,22 +23,11 @@ #nfsShares: /vol/spool/ is automatically created as a nfs # - [nfsShare one] - ## Ansible (Galaxy) roles can be added for execution # KEY NOT IMPLEMENTED YET - #ansibleRoles: - # - file: SomeFile - # hosts: SomeHosts - # name: SomeName - # vars: SomeVars - # vars_file: SomeVarsFile - - #ansibleGalaxyRoles: # KEY NOT IMPLEMENTED YET - # - hosts: SomeHost - # name: SomeName - # galaxy: SomeGalaxy - # git: SomeGit - # url: SomeURL - # vars: SomeVars - # vars_file: SomeVarsFile + # userRoles: + # - hosts: + # - "master" + # roles: + # - name: "resistance_nextflow" # roles in ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen). 
#useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated From 927cbd77a79bce30e6f8fefb99e00ca683a8cc7a Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 17:13:44 +0200 Subject: [PATCH 02/11] added userRoles in documentation --- .../markdown/features/configuration.md | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index 5c4f77aa..1b59ce44 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -92,30 +92,16 @@ What is NFS? NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network. -#### ansibleRoles (optional) +#### userRoles (optional) -Yet to be explained and implemented. +`userRoles` takes a list of elements containing the keys `hosts`, `roles` and ```yaml - - file: SomeFile - hosts: SomeHosts - name: SomeName - vars: SomeVars - vars_file: SomeVarsFile -``` - -#### ansibleGalaxyRoles (optional) - -Yet to be explained and implemented. 
- -```yaml - - hosts: SomeHost - name: SomeName - galaxy: SomeGalaxy - git: SomeGit - url: SomeURL - vars: SomeVars - vars_file: SomeVarsFile +userRoles: + - hosts: + - "master" + roles: + - name: "resistance_nextflow" ``` #### localFS (optional) From fb17204ce774d6e8233ac7c10c27f69dccf7cb6f Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 17:43:40 +0200 Subject: [PATCH 03/11] added varsFiles and comments --- documentation/markdown/features/configuration.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index 1b59ce44..c31227cc 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -97,11 +97,13 @@ NFS (Network File System) is a stable and well-functioning network protocol for `userRoles` takes a list of elements containing the keys `hosts`, `roles` and ```yaml -userRoles: +userRoles: # see ansible_hosts for all options - hosts: - - "master" + - "master" roles: - - name: "resistance_nextflow" + - name: "resistance_nextflow" # role placed in + # varsFiles: # vars placed in + # - file1 ``` #### localFS (optional) From 0812164a44051c73157c5707a6a1cd2793ec4a04 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 17:54:38 +0200 Subject: [PATCH 04/11] added folder path in documentation --- documentation/markdown/features/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index c31227cc..bf6e42d0 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -101,8 +101,8 @@ userRoles: # see ansible_hosts for all options - hosts: - "master" roles: - - name: "resistance_nextflow" # role placed in - # varsFiles: # vars placed in + - name: 
"resistance_nextflow" # role placed in resources/playbook/user_roles + # varsFiles: # - file1 ``` From 323698dd2354fbd1ee045474475bdc914901a1b7 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 17:55:31 +0200 Subject: [PATCH 05/11] fixed naming --- documentation/markdown/features/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index bf6e42d0..ecc619cd 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -101,7 +101,7 @@ userRoles: # see ansible_hosts for all options - hosts: - "master" roles: - - name: "resistance_nextflow" # role placed in resources/playbook/user_roles + - name: "resistance_nextflow" # role placed in resources/playbook/roles_user # varsFiles: # - file1 ``` From 8b7d185b6a63234cb632f06e84172c7d34c3c22b Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 17:58:30 +0200 Subject: [PATCH 06/11] added that vars are optional --- documentation/markdown/features/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index ecc619cd..cd7be47d 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -102,7 +102,7 @@ userRoles: # see ansible_hosts for all options - "master" roles: - name: "resistance_nextflow" # role placed in resources/playbook/roles_user - # varsFiles: + # varsFiles: # (optional) # - file1 ``` From 53a574d9cea158f56e1b64ab6f85bfd6194a23dd Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Wed, 24 Apr 2024 18:00:46 +0200 Subject: [PATCH 07/11] polished userRoles documentation --- bibigrid.yml | 8 +++++--- documentation/markdown/features/configuration.md | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff 
--git a/bibigrid.yml b/bibigrid.yml index 2fcfcbc3..bf073108 100644 --- a/bibigrid.yml +++ b/bibigrid.yml @@ -23,11 +23,13 @@ #nfsShares: /vol/spool/ is automatically created as a nfs # - [nfsShare one] - # userRoles: + # userRoles: # see ansible_hosts for all options # - hosts: # - "master" - # roles: - # - name: "resistance_nextflow" # roles in + # roles: # roles placed in resources/playbook/roles_user + # - name: "resistance_nextflow" + # varsFiles: # (optional) + # - [...] ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen). #useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index cd7be47d..f154efb1 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -100,8 +100,8 @@ NFS (Network File System) is a stable and well-functioning network protocol for userRoles: # see ansible_hosts for all options - hosts: - "master" - roles: - - name: "resistance_nextflow" # role placed in resources/playbook/roles_user + roles: # roles placed in resources/playbook/roles_user + - name: "resistance_nextflow" # varsFiles: # (optional) # - file1 ``` From c77ef62962fc65c6f5e3bd9ceaffed44e3667e5a Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Thu, 25 Apr 2024 15:09:34 +0200 Subject: [PATCH 08/11] added documentation for other configurations --- .../markdown/bibigrid_feature_list.md | 5 +++-- .../markdown/features/other_configurations.md | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 documentation/markdown/features/other_configurations.md diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md index 0ddecab2..96d828d1 100644 --- a/documentation/markdown/bibigrid_feature_list.md +++ 
b/documentation/markdown/bibigrid_feature_list.md @@ -5,14 +5,15 @@ | [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like | | [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | | [Create](features/create.md) | Creates the cluster specified by the configuration. | - | [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | +| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | | [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. | | [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. | | [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. | | [Cloud Specification Data](features/cloud_specification_data.md) | Contains necessary data to establish a general connection to the provider. | - | [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. | +| [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. | | [Command Line Interface](features/CLI.md) | What command line arguments can be passed into BiBiGrid. 
| | [Multi Cloud](features/multi_cloud.md) | Explanation how BiBiGrid's multi-cloud approach works | | [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster | +| [Other Configurations](features/other_configurations.md) | Info about custom `ansible.cfg` and `slurm.conf` | ![](../images/actions.jpg) \ No newline at end of file diff --git a/documentation/markdown/features/other_configurations.md b/documentation/markdown/features/other_configurations.md new file mode 100644 index 00000000..0a92faa9 --- /dev/null +++ b/documentation/markdown/features/other_configurations.md @@ -0,0 +1,22 @@ +# Other Configurations +Besides the general BiBiGrid configuration there is also an `ansible.cfg` and a `slurm.conf`. +For 99% of all users those never need to be touched. However, some use cases require changes to those configurations. +For that purpose we store defaults in `resources/defaults` and on the first run copy them to the actual locations +`resources/playbook/ansible.cfg` and `resources/playbook/roles/bibigrid/templates/slurm/slurm.j2`. +That way you can make changes and if something doesn't work, you can just delete the configuration to go back to our +default one. + +## slurm.conf +The `slurm.j2` is not a static configuration file, but instead a [jinja](https://jinja.palletsprojects.com/en/3.1.x/) template for the actual configuration that is +generated during runtime. That is necessary because it contains the actual instance names that are only known at runtime. +The jinja template is converted to the actual configuration by ansible in the `042-slurm.yml` task. + +The `slurm.j2` also takes certain information from your BiBiGrid configuration (see [slurmConf](configuration.md#slurmconf-optional)). + +Read more about the `slurm.conf` [here](https://slurm.schedmd.com/slurm.conf.html). + +## ansible.cfg +The `ansible.cfg` defines how ansible behaves during runtime. 
A key that sometimes needs to be adapted is `timeout` which +is the timeout for the connection plugin. If your host answers very slowly, a low timeout might cause issues. + +Read more about the `ansible.cfg` [here](https://docs.ansible.com/ansible/latest/reference_appendices/config.html). \ No newline at end of file From 2a64d33d99ae6c2170812a4dc804b9748ba414c7 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Thu, 25 Apr 2024 15:09:46 +0200 Subject: [PATCH 09/11] added new feature keys --- .../markdown/features/configuration.md | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index f154efb1..006fded0 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -120,6 +120,31 @@ If `True`, master will store DNS information for his workers. Default is `False` If `False`, the cluster will start without the job scheduling system slurm. This is relevant to the fewest. Default is `True`. +##### slurmConf (optional) +`slurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime` +and the `ResumeTimeout` like: + +```yaml +elastic_scheduling: + SuspendTime: 1800 + ResumeTimeout: 1800 +``` + +Please only use if necessary. On Demand Scheduling improves resource availability for all users. + +###### Defaults +```yaml +slurmConf: + db: slurm # see task 042-slurm-server.yml + db_user: slurm + db_password: changeme + munge_key: # automatically generated via id_generation.generate_munge_key + elastic_scheduling: + SuspendTime: 900 # if a node is not used for SuspendTime seconds, it will shut down
+ ResumeTimeout: 900 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed. See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram + TreeWidth: 128 # https://slurm.schedmd.com/slurm.conf.html#OPT_TreeWidth +``` + #### zabbix (optional) If `True`, the monitoring solution [zabbix](https://www.zabbix.com/) will be installed on the master. Default is `False`. @@ -194,7 +219,7 @@ workerInstance: - `type` sets the instance's hardware configuration. - `image` sets the bootable operating system to be installed on the instance. - `count` sets how many workers of that `type` `image` combination are in this work group -- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). This option only works on the master cloud for now. +- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). Please only disable on demand scheduling if necessary: it improves resource availability for all users. This option only works on the master cloud for now. 
##### Find your active `images` From 0e5a35093f7de77a4f37e385a86348707b2b04ab Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Thu, 25 Apr 2024 17:15:57 +0200 Subject: [PATCH 10/11] fixed template files not being j2 --- .gitignore | 2 +- bibigrid/core/utility/paths/ansible_resources_path.py | 5 +++-- resources/defaults/slurm/{slurm.conf => slurm.j2} | 0 resources/playbook/roles/bibigrid/tasks/020-disk-server.yml | 5 +++-- resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml | 2 +- resources/playbook/roles/bibigrid/tasks/042-slurm.yml | 4 ++-- .../templates/slurm/{job_container.conf => job_container.j2} | 0 .../bibigrid/templates/slurm/{slurmdbd.conf => slurmdbd.j2} | 0 8 files changed, 10 insertions(+), 8 deletions(-) rename resources/defaults/slurm/{slurm.conf => slurm.j2} (100%) rename resources/playbook/roles/bibigrid/templates/slurm/{job_container.conf => job_container.j2} (100%) rename resources/playbook/roles/bibigrid/templates/slurm/{slurmdbd.conf => slurmdbd.j2} (100%) diff --git a/.gitignore b/.gitignore index 0f7fbc2f..79833f52 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ # variable resources resources/playbook/ansible.cfg -resources/playbook/roles/bibigrid/templates/slurm/slurm.conf +resources/playbook/roles/bibigrid/templates/slurm/slurm.j2 resources/playbook/site.yml resources/playbook/ansible_hosts resources/playbook/vars/ diff --git a/bibigrid/core/utility/paths/ansible_resources_path.py b/bibigrid/core/utility/paths/ansible_resources_path.py index 5c0e01b3..1f29dc25 100644 --- a/bibigrid/core/utility/paths/ansible_resources_path.py +++ b/bibigrid/core/utility/paths/ansible_resources_path.py @@ -24,6 +24,7 @@ DEFAULT_IP_FILE = VARS_PATH + "{{ ansible_default_ipv4.address }}.yml" ANSIBLE_CFG = "ansible.cfg" SLURM_CONF = "slurm.conf" +SLURM_J2 = "slurm.j2" # LOCAL PLAYBOOK = "playbook/" @@ -41,12 +42,12 @@ VARS_FOLDER = os.path.join(PLAYBOOK_PATH, VARS_PATH) GROUP_VARS_FOLDER = os.path.join(PLAYBOOK_PATH, GROUP_VARS_PATH) 
HOST_VARS_FOLDER = os.path.join(PLAYBOOK_PATH, HOST_VARS_PATH) -SLURM_CONF_TEMPLATE_PATH = os.path.join(PLAYBOOK_PATH, "roles", "bibigrid", "templates", "slurm", SLURM_CONF) +SLURM_CONF_TEMPLATE_PATH = os.path.join(PLAYBOOK_PATH, "roles", "bibigrid", "templates", "slurm", SLURM_J2) # DEFAULTS DEFAULTS = os.path.join(b_p.RESOURCES_PATH, "defaults") ANSIBLE_CFG_DEFAULT_PATH = os.path.join(DEFAULTS, "ansible", ANSIBLE_CFG) -SLURM_CONF_TEMPLATE_DEFAULT_PATH = os.path.join(DEFAULTS, "slurm", SLURM_CONF) +SLURM_CONF_TEMPLATE_DEFAULT_PATH = os.path.join(DEFAULTS, "slurm", SLURM_J2) # REMOTE diff --git a/resources/defaults/slurm/slurm.conf b/resources/defaults/slurm/slurm.j2 similarity index 100% rename from resources/defaults/slurm/slurm.conf rename to resources/defaults/slurm/slurm.j2 diff --git a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml index 6691225e..de4f0049 100644 --- a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml @@ -18,9 +18,9 @@ when: master.disks is defined - when: volumes is defined and auto_mount - failed_when: false block: - name: Make sure disks are available + failed_when: false filesystem: fstype: ext4 dev: "{{ item.device }}" @@ -29,6 +29,7 @@ with_items: "{{ volumes }}" - name: Create mount folders if they don't exist + failed_when: false file: path: "/{{ item.name }}" state: directory @@ -38,7 +39,7 @@ with_items: "{{ volumes }}" - name: Mount disks - + failed_when: false mount: path: "{{ item.name }}" src: "{{ item.device }}" diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml index 580aabc4..8abd5d61 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml @@ -14,7 +14,7 @@ - name: Create slurmdb configuration file template: - 
src: slurm/slurmdbd.conf + src: slurm/slurmdbd.j2 dest: /etc/slurm/slurmdbd.conf owner: slurm group: root diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml index e134dbe2..a4d47c59 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml @@ -88,7 +88,7 @@ - name: Create Slurm configuration template: - src: slurm/slurm.conf + src: slurm/slurm.j2 dest: /etc/slurm/slurm.conf owner: slurm group: root @@ -99,7 +99,7 @@ - name: Create Job Container configuration template: - src: slurm/job_container.conf + src: slurm/job_container.j2 dest: /etc/slurm/job_container.conf owner: slurm group: root diff --git a/resources/playbook/roles/bibigrid/templates/slurm/job_container.conf b/resources/playbook/roles/bibigrid/templates/slurm/job_container.j2 similarity index 100% rename from resources/playbook/roles/bibigrid/templates/slurm/job_container.conf rename to resources/playbook/roles/bibigrid/templates/slurm/job_container.j2 diff --git a/resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.conf b/resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.j2 similarity index 100% rename from resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.conf rename to resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.j2 From 66b4996de46407024fff96ebe3638c850f3e7dcc Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Fri, 26 Apr 2024 12:22:25 +0200 Subject: [PATCH 11/11] added helpful comments and removed no longer used roles/additional/ --- resources/playbook/roles/additional/tasks/main.yml | 4 ---- resources/tests/bibigrid_test_example.yml | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) delete mode 100644 resources/playbook/roles/additional/tasks/main.yml diff --git a/resources/playbook/roles/additional/tasks/main.yml b/resources/playbook/roles/additional/tasks/main.yml deleted file mode 100644 index 
e949ee7f..00000000 --- a/resources/playbook/roles/additional/tasks/main.yml +++ /dev/null @@ -1,4 +0,0 @@ -- debug: - msg: - - "Hello {{ ansible_user }}!" - diff --git a/resources/tests/bibigrid_test_example.yml b/resources/tests/bibigrid_test_example.yml index 9e3a9b1d..b8b19bfa 100644 --- a/resources/tests/bibigrid_test_example.yml +++ b/resources/tests/bibigrid_test_example.yml @@ -1,3 +1,4 @@ +# remove _example in order to actually use this file - infrastructure: # former mode. cloud: #credentials # name of clouds.yaml entry @@ -6,4 +7,4 @@ sshUser: ubuntu network: # network - snapshotImage: # name of a snapshot + snapshotImage: # name of a snapshot to create volume from. Volume needs to be deleted manually afterwards (optional)