diff --git a/.gitignore b/.gitignore index e2dacc6a..742d8be5 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ # variable resources resources/playbook/ansible.cfg -resources/playbook/roles/bibigrid/templates/slurm/slurm.conf +resources/playbook/roles/bibigrid/templates/slurm/slurm.j2 resources/playbook/site.yml resources/playbook/ansible_hosts resources/playbook/vars/ diff --git a/bibigrid.yml b/bibigrid.yml index 584c4bb7..bf073108 100644 --- a/bibigrid.yml +++ b/bibigrid.yml @@ -8,6 +8,8 @@ # -- BEGIN: GENERAL CLUSTER INFORMATION -- # sshTimeout: 5 # Number of ssh connection attempts with 2^attempt seconds in between (2^sshTimeout-1 is the max time before returning with an error) + # cloudScheduling: + # sshTimeout: 42 # like the sshTimeout during startup but during the on demand scheduling ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself. #sshPublicKeyFiles: @@ -21,22 +23,13 @@ #nfsShares: /vol/spool/ is automatically created as a nfs # - [nfsShare one] - ## Ansible (Galaxy) roles can be added for execution # KEY NOT IMPLEMENTED YET - #ansibleRoles: - # - file: SomeFile - # hosts: SomeHosts - # name: SomeName - # vars: SomeVars - # vars_file: SomeVarsFile - - #ansibleGalaxyRoles: # KEY NOT IMPLEMENTED YET - # - hosts: SomeHost - # name: SomeName - # galaxy: SomeGalaxy - # git: SomeGit - # url: SomeURL - # vars: SomeVars - # vars_file: SomeVarsFile + # userRoles: # see ansible_hosts for all options + # - hosts: + # - "master" + # roles: # roles placed in resources/playbook/roles_user + # - name: "resistance_nextflow" + # varsFiles: # (optional) + # - [...] ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen). 
#useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated diff --git a/bibigrid/core/utility/paths/ansible_resources_path.py b/bibigrid/core/utility/paths/ansible_resources_path.py index 5c0e01b3..1f29dc25 100644 --- a/bibigrid/core/utility/paths/ansible_resources_path.py +++ b/bibigrid/core/utility/paths/ansible_resources_path.py @@ -24,6 +24,7 @@ DEFAULT_IP_FILE = VARS_PATH + "{{ ansible_default_ipv4.address }}.yml" ANSIBLE_CFG = "ansible.cfg" SLURM_CONF = "slurm.conf" +SLURM_J2 = "slurm.j2" # LOCAL PLAYBOOK = "playbook/" @@ -41,12 +42,12 @@ VARS_FOLDER = os.path.join(PLAYBOOK_PATH, VARS_PATH) GROUP_VARS_FOLDER = os.path.join(PLAYBOOK_PATH, GROUP_VARS_PATH) HOST_VARS_FOLDER = os.path.join(PLAYBOOK_PATH, HOST_VARS_PATH) -SLURM_CONF_TEMPLATE_PATH = os.path.join(PLAYBOOK_PATH, "roles", "bibigrid", "templates", "slurm", SLURM_CONF) +SLURM_CONF_TEMPLATE_PATH = os.path.join(PLAYBOOK_PATH, "roles", "bibigrid", "templates", "slurm", SLURM_J2) # DEFAULTS DEFAULTS = os.path.join(b_p.RESOURCES_PATH, "defaults") ANSIBLE_CFG_DEFAULT_PATH = os.path.join(DEFAULTS, "ansible", ANSIBLE_CFG) -SLURM_CONF_TEMPLATE_DEFAULT_PATH = os.path.join(DEFAULTS, "slurm", SLURM_CONF) +SLURM_CONF_TEMPLATE_DEFAULT_PATH = os.path.join(DEFAULTS, "slurm", SLURM_J2) # REMOTE diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md index 0ddecab2..96d828d1 100644 --- a/documentation/markdown/bibigrid_feature_list.md +++ b/documentation/markdown/bibigrid_feature_list.md @@ -5,14 +5,15 @@ | [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like | | [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | | [Create](features/create.md) | Creates the cluster specified by the configuration. 
| - | [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | +| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | | [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. | | [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. | | [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. | | [Cloud Specification Data](features/cloud_specification_data.md) | Contains necessary data to establish a general connection to the provider. | - | [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. | +| [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. | | [Command Line Interface](features/CLI.md) | What command line arguments can be passed into BiBiGrid. | | [Multi Cloud](features/multi_cloud.md) | Explanation how BiBiGrid's multi-cloud approach works | | [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster | +| [Other Configurations](features/other_configurations.md) | Info about custom `ansible.cfg` and `slurm.conf` | ![](../images/actions.jpg) \ No newline at end of file diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index 5c4f77aa..006fded0 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -92,30 +92,18 @@ What is NFS? 
NFS (Network File System) is a stable and well-functioning network protocol for exchanging files over the local network. -#### ansibleRoles (optional) +#### userRoles (optional) -Yet to be explained and implemented. +`userRoles` takes a list of elements containing the keys `hosts`, `roles` and (optionally) `varsFiles`: ```yaml - - file: SomeFile - hosts: SomeHosts - name: SomeName - vars: SomeVars - vars_file: SomeVarsFile -``` - -#### ansibleGalaxyRoles (optional) - -Yet to be explained and implemented. - -```yaml - - hosts: SomeHost - name: SomeName - galaxy: SomeGalaxy - git: SomeGit - url: SomeURL - vars: SomeVars - vars_file: SomeVarsFile +userRoles: # see ansible_hosts for all options + - hosts: + - "master" + roles: # roles placed in resources/playbook/roles_user + - name: "resistance_nextflow" + # varsFiles: # (optional) + # - file1 ``` #### localFS (optional) @@ -132,6 +120,31 @@ If `True`, master will store DNS information for his workers. Default is `False` If `False`, the cluster will start without the job scheduling system slurm. This is relevant to the fewest. Default is `True`. +##### slurmConf (optional) +`slurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime` +and the `ResumeTimeout` like: + +```yaml +elastic_scheduling: + SuspendTime: 1800 + ResumeTimeout: 1800 +``` + +Please only increase these values if necessary. On Demand Scheduling improves resource availability for all users. + +###### Defaults +```yaml +slurmConf: + db: slurm # see task 042-slurm-server.yml + db_user: slurm + db_password: changeme + munge_key: # automatically generated via id_generation.generate_munge_key + elastic_scheduling: + SuspendTime: 900 # if a node is not used for SuspendTime seconds, it will shut down + ResumeTimeout: 900 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed. 
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram + TreeWidth: 128 # https://slurm.schedmd.com/slurm.conf.html#OPT_TreeWidth +``` + #### zabbix (optional) If `True`, the monitoring solution [zabbix](https://www.zabbix.com/) will be installed on the master. Default is `False`. @@ -206,7 +219,7 @@ workerInstance: - `type` sets the instance's hardware configuration. - `image` sets the bootable operating system to be installed on the instance. - `count` sets how many workers of that `type` `image` combination are in this work group -- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). This option only works on the master cloud for now. +- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). Please only set this to False if necessary. On Demand Scheduling improves resource availability for all users. This option only works on the master cloud for now. ##### Find your active `images` diff --git a/documentation/markdown/features/other_configurations.md b/documentation/markdown/features/other_configurations.md new file mode 100644 index 00000000..0a92faa9 --- /dev/null +++ b/documentation/markdown/features/other_configurations.md @@ -0,0 +1,22 @@ +# Other Configurations +Besides the general BiBiGrid configuration there is also an `ansible.cfg` and a `slurm.conf`. +For 99% of all users those never need to be touched. However, some use cases require changes to those configurations. +For that purpose we store defaults in `resources/defaults` and on the first run copy them to the actual locations +`resources/playbook/ansible.cfg` and `resources/playbook/roles/bibigrid/templates/slurm/slurm.j2`. +That way you can make changes and if something doesn't work, you can just delete the configuration to go back to our +default one. 
+ +## slurm.conf +The `slurm.j2` is not a static configuration file, but instead a [jinja](https://jinja.palletsprojects.com/en/3.1.x/) template for the actual configuration that is +generated during runtime. That is necessary because it contains the actual instance names that are only known at runtime. +The jinja template is converted to the actual configuration by ansible in the `042-slurm.yml` task. + +The `slurm.j2` also takes certain information from your BiBiGrid configuration (see [slurmConf](configuration.md#slurmconf-optional)). + +Read more about the `slurm.conf` [here](https://slurm.schedmd.com/slurm.conf.html). + +## ansible.cfg +The `ansible.cfg` defines how ansible behaves during runtime. A key that sometimes needs to be adapted is `timeout` which +is the timeout for the connection plugin. If your host answers very slowly, a low timeout might cause issues. + +Read more about the `ansible.cfg` [here](https://docs.ansible.com/ansible/latest/reference_appendices/config.html). \ No newline at end of file diff --git a/resources/defaults/slurm/slurm.conf b/resources/defaults/slurm/slurm.j2 similarity index 100% rename from resources/defaults/slurm/slurm.conf rename to resources/defaults/slurm/slurm.j2 diff --git a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml index 6691225e..de4f0049 100644 --- a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml @@ -18,9 +18,9 @@ when: master.disks is defined - when: volumes is defined and auto_mount - failed_when: false block: - name: Make sure disks are available + failed_when: false filesystem: fstype: ext4 dev: "{{ item.device }}" @@ -29,6 +29,7 @@ with_items: "{{ volumes }}" - name: Create mount folders if they don't exist + failed_when: false file: path: "/{{ item.name }}" state: directory @@ -38,7 +39,7 @@ with_items: "{{ volumes }}" - name: Mount disks - + failed_when: 
false mount: path: "{{ item.name }}" src: "{{ item.device }}" diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml index 580aabc4..8abd5d61 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml @@ -14,7 +14,7 @@ - name: Create slurmdb configuration file template: - src: slurm/slurmdbd.conf + src: slurm/slurmdbd.j2 dest: /etc/slurm/slurmdbd.conf owner: slurm group: root diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml index e134dbe2..a4d47c59 100644 --- a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml +++ b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml @@ -88,7 +88,7 @@ - name: Create Slurm configuration template: - src: slurm/slurm.conf + src: slurm/slurm.j2 dest: /etc/slurm/slurm.conf owner: slurm group: root @@ -99,7 +99,7 @@ - name: Create Job Container configuration template: - src: slurm/job_container.conf + src: slurm/job_container.j2 dest: /etc/slurm/job_container.conf owner: slurm group: root diff --git a/resources/playbook/roles/bibigrid/templates/slurm/job_container.conf b/resources/playbook/roles/bibigrid/templates/slurm/job_container.j2 similarity index 100% rename from resources/playbook/roles/bibigrid/templates/slurm/job_container.conf rename to resources/playbook/roles/bibigrid/templates/slurm/job_container.j2 diff --git a/resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.conf b/resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.j2 similarity index 100% rename from resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.conf rename to resources/playbook/roles/bibigrid/templates/slurm/slurmdbd.j2 diff --git a/resources/tests/bibigrid_test_example.yml b/resources/tests/bibigrid_test_example.yml index 9e3a9b1d..b8b19bfa 100644 --- a/resources/tests/bibigrid_test_example.yml +++ 
b/resources/tests/bibigrid_test_example.yml @@ -1,3 +1,4 @@ +# remove _example in order to actually use this file - infrastructure: # former mode. cloud: #credentials # name of clouds.yaml entry @@ -6,4 +7,4 @@ sshUser: ubuntu network: # network - snapshotImage: # name of a snapshot + snapshotImage: # name of a snapshot to create volume from. Volume needs to be deleted manually afterwards (optional)