diff --git a/bibigrid/core/actions/terminate.py b/bibigrid/core/actions/terminate.py
index 44c480ae..1452cb99 100644
--- a/bibigrid/core/actions/terminate.py
+++ b/bibigrid/core/actions/terminate.py
@@ -148,10 +148,12 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5
         tmp_success = False
         while not tmp_success:
             try:
+                # TODO: Check if security group exists at all
+                not_found = not provider.get_security_group(security_group_name)
                 tmp_success = provider.delete_security_group(security_group_name)
             except ConflictException:
                 tmp_success = False
-            if tmp_success:
+            if tmp_success or not_found:
                 break
             if attempts < timeout:
                 attempts += 1
@@ -162,7 +164,8 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5
                 log.error(f"Attempt to delete security group {security_group_name} on "
                           f"{provider.cloud_specification['identifier']} failed.")
                 break
-        log.info(f"Delete security_group {security_group_name} -> {tmp_success}")
+        log.info(f"Delete security_group {security_group_name} -> {tmp_success or not_found} on "
+                 f"{provider.cloud_specification['identifier']}.")
         success = success and tmp_success
     return success
 
diff --git a/bibigrid/core/provider.py b/bibigrid/core/provider.py
index 61cc5012..9e06dbd0 100644
--- a/bibigrid/core/provider.py
+++ b/bibigrid/core/provider.py
@@ -265,6 +265,14 @@ def append_rules_to_security_group(self, name_or_id, rules):
         @return:
         """
 
+    @abstractmethod
+    def get_security_group(self, name_or_id):
+        """
+        Returns security group if found else None.
+        @param name_or_id:
+        @return:
+        """
+
     def get_mount_info_from_server(self, server):
         volumes = []
         for server_volume in server["volumes"]:
diff --git a/bibigrid/openstack/openstack_provider.py b/bibigrid/openstack/openstack_provider.py
index 5efabf02..75de1c2c 100644
--- a/bibigrid/openstack/openstack_provider.py
+++ b/bibigrid/openstack/openstack_provider.py
@@ -320,3 +320,11 @@ def append_rules_to_security_group(self, name_or_id, rules):
                                                 port_range_max=rule["port_range_max"],
                                                 remote_ip_prefix=rule["remote_ip_prefix"],
                                                 remote_group_id=rule["remote_group_id"])
+
+    def get_security_group(self, name_or_id):
+        """
+        Returns security group if found else None.
+        @param name_or_id:
+        @return:
+        """
+        return self.conn.get_security_group(name_or_id)
diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md
index 5e4b9a82..0ddecab2 100644
--- a/documentation/markdown/bibigrid_feature_list.md
+++ b/documentation/markdown/bibigrid_feature_list.md
@@ -13,5 +13,6 @@
 | [Configuration](features/configuration.md) | Contains all data regarding cluster setup for all providers. |
 | [Command Line Interface](features/CLI.md) | What command line arguments can be passed into BiBiGrid. |
 | [Multi Cloud](features/multi_cloud.md) | Explanation how BiBiGrid's multi-cloud approach works |
+| [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster |
 
 ![](../images/actions.jpg)
\ No newline at end of file
diff --git a/documentation/markdown/features/cluster_commands.md b/documentation/markdown/features/cluster_commands.md
new file mode 100644
index 00000000..be16a42c
--- /dev/null
+++ b/documentation/markdown/features/cluster_commands.md
@@ -0,0 +1,54 @@
+# BiBiGrid Cluster Commands
+
+## [bibiinfo](../../../resources/bin/bibiinfo)
+Similar to `sinfo` but shows detailed information regarding node features.
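+A minimal, hypothetical example (the partition, node names and feature labels below are placeholders; actual output depends on your cluster):
+```sh
+bibiinfo
+# PARTITION  AVAIL  TIMELIMIT  NODES  STATE  NODELIST                            AVAIL_FEATURES
+# openstack  up     infinite       2  idle~  bibigrid-worker-<cluster-id>-[0-1]  <node features>
+```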
+
+## [bibilog](../../../resources/bin/bibilog)
+`bibilog` executes `tail -f` on the most recent worker creation output log.
+It thereby helps you understand worker startup issues.
+
+## [bibiplay](../../../resources/bin/bibiplay)
+`bibiplay` is mainly a shortcut for `ansible-playbook /opt/playbook/site.yml -i /opt/playbook/ansible_hosts`
+which allows you to execute the Ansible playbook more easily.
+
+### Examples
+You have changed something in the common configuration and want to propagate this change to the master.
+```sh
+bibiplay -l master
+# executes the playbook only for the master
+```
+
+You have changed something in the Slurm configuration and want to propagate this change to the master.
+```sh
+bibiplay -l master -t slurm
+```
+
+## [bibiname](../../../resources/playbook/roles/bibigrid/templates/bin/bibiname.j2)[m|v|default: w] [number]
+
+This command generates node names for you so that you do not need to copy the cluster-id.
+It takes two arguments. The first defines whether a master, vpngtw or worker name is meant; worker is the default.
+The second argument - only needed for vpngtw and worker - defines which vpngtw or worker is meant.
+
+### Examples
+Assume the cluster-id `20ozebsutekrjj4`.
+
+```sh
+bibiname m
+# bibigrid-master-20ozebsutekrjj4
+```
+
+```sh
+bibiname v 0
+# bibigrid-vpngtw-20ozebsutekrjj4-0
+```
+
+```sh
+bibiname 0 # or bibiname w 0
+# bibigrid-worker-20ozebsutekrjj4-0
+```
+
+A more advanced use is to pass the generated name to `ssh` to log in to a worker:
+```sh
+ssh $(bibiname 0) # or bibiname w 0
+# ssh bibigrid-worker-20ozebsutekrjj4-0
+```
\ No newline at end of file
diff --git a/documentation/pdfs/ELIXIR Compute 2023 -- Multi-Cloud - BiBiGrid.pdf b/documentation/pdfs/ELIXIR Compute 2023 -- Multi-Cloud - BiBiGrid.pdf
deleted file mode 100644
index 672aa426..00000000
Binary files a/documentation/pdfs/ELIXIR Compute 2023 -- Multi-Cloud - BiBiGrid.pdf and /dev/null differ
diff --git a/resources/bin/binfo b/resources/bin/bibiinfo
similarity index 100%
rename from resources/bin/binfo
rename to resources/bin/bibiinfo
diff --git a/resources/bin/bibilog b/resources/bin/bibilog
new file mode 100644
index 00000000..22456e4e
--- /dev/null
+++ b/resources/bin/bibilog
@@ -0,0 +1,16 @@
+#!/bin/bash
+if [ "$1" == "err" ]; then
+  err_out="err"
+else
+  err_out="out"
+fi
+
+if [ "$2" == "fail" ]; then
+  fail_create="fail"
+else
+  fail_create="create"
+fi
+
+LOG="/var/log/slurm/worker_logs/$fail_create/$err_out"
+RECENT=$(ls -1rt $LOG | tail -n1)
+tail -f "$LOG/$RECENT"
\ No newline at end of file
diff --git a/resources/defaults/slurm/slurm.conf b/resources/defaults/slurm/slurm.conf
index 0f622ea9..19645fd3 100644
--- a/resources/defaults/slurm/slurm.conf
+++ b/resources/defaults/slurm/slurm.conf
@@ -9,17 +9,15 @@ AuthAltParameters=jwt_key=/etc/slurm/jwt-secret.key
 ClusterName=bibigrid
 MpiDefault=none
-ProctrackType=proctrack/linuxproc
+ProctrackType=proctrack/cgroup # linuxproc # changed for 23.11.0
 ReturnToService=2
 SwitchType=switch/none
 TaskPlugin=task/none
 #TaskPlugin=task/cgroup
 JobAcctGatherType=jobacct_gather/linux
 
-# see https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns:~:text=for%20additional%20details.-,cloud_dns,-By%20default%2C%20Slurm
 # SlurmctldParameters=cloud_dns
-# Funktioniert nicht wie vermutet. slurmctld versucht mit diesem Parameter schon beim Start alle Clients aufzulösen,
-# was natürlich nicht funktioniert.
+# Did not work as expected: slurmctld tries to resolve all clients already at startup, which obviously fails.
 
 # PRIORITY
 PriorityType=priority/multifactor
@@ -37,7 +35,6 @@ SlurmctldPort=6817
 SlurmdPort=6818
 
 # DIRECTORIES
-#JobCheckpointDir=/var/lib/slurm/job_checkpoint
 SlurmdSpoolDir=/var/lib/slurm/slurmd
 StateSaveLocation=/var/lib/slurm/state_checkpoint
 
@@ -61,7 +58,7 @@ AccountingStorageHost={{ hostvars[groups.master.0].name | lower }}
 AccountingStorageUser={{ slurm_conf.db_user }}
 
 # LOGGING
-SlurmctldDebug=info
+SlurmctldDebug=debug # info
 SlurmctldLogFile=/var/log/slurm/slurmctld.log
 SlurmdDebug=info
 SlurmdLogFile=/var/log/slurm/slurmd.log
@@ -102,7 +99,7 @@ SuspendExcNodes={{ hostvars[groups.master.0].name }} # Maximum number of nodes
 TreeWidth= {{ slurm_conf.elastic_scheduling.TreeWidth }}
 # Do not cache dns names
-CommunicationParameters=NoAddrCache
+# CommunicationParameters=NoAddrCache # REMOVED for 23.11.0
 # Mark node status idle on suspend so DOWN is removed
 SlurmctldParameters=idle_on_node_suspend
 # Show slurm nodes all the time
 
@@ -113,4 +110,4 @@ ResumeFailProgram=/opt/slurm/fail.sh
 # job container
 # TO BE TESTED
 JobContainerType=job_container/tmpfs
-PrologFlags=Contain
\ No newline at end of file
+PrologFlags=Contain
diff --git a/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf b/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
index 2705699f..5ab6361d 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
+++ b/resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
@@ -1,6 +1,6 @@
 # maybe this causes errors when using 23.11 https://slurm.schedmd.com/faq.html#cgroupv2
 CgroupMountpoint="/sys/fs/cgroup"
-CgroupAutomount=yes
+# CgroupAutomount=yes # REMOVED 23.11.0
 ConstrainCores=no
 ConstrainRAMSpace=yes
 ConstrainSwapSpace=no
diff --git a/resources/playbook/roles/bibigrid/files/slurm/create.sh b/resources/playbook/roles/bibigrid/files/slurm/create.sh
index e48d9954..d5bbc4c0 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/create.sh
+++ b/resources/playbook/roles/bibigrid/files/slurm/create.sh
@@ -10,7 +10,7 @@ process_string() {
   fifth=${elements[4]}
 
   # Replace undesired characters in the second element
-  second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpnwkr-/vpnwkr_/')
+  second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpngtw-/vpngtw_/')
 
   # Check if the fifth element is not empty
   if [[ ! -z $fifth ]]; then
diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py
index 19e9b828..0e8cd0da 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py
+++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py
@@ -273,13 +273,6 @@ def _run_playbook(cmdline_args):
         sys.exit(1)
     else:
         logging.info(ansible_execution_data)
-server_start_data = {"started_servers": [], "other_openstack_exceptions": [], "connection_exceptions": [],
-                     "available_servers": [], "openstack_wait_exceptions": []}
-if [key for key in server_start_data if "exception" in key]:
-    logging.warning(server_start_data)
-    sys.exit(1)
-else:
-    logging.info(server_start_data)
 
 logging.info("Successful create_server.py execution!")
 time_in_s = time.time() - start_time
diff --git a/resources/playbook/roles/bibigrid/files/slurm/fail.sh b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
index 436f8b59..38d723b4 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/fail.sh
+++ b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
@@ -10,7 +10,7 @@ process_string() {
   fifth=${elements[4]}
 
   # Replace undesired characters in the second element
-  second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpnwkr-/vpnwkr_/')
+  second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpngtw-/vpngtw_/')
 
   # Check if the fifth element is not empty
   if [[ ! -z $fifth ]]; then
diff --git a/resources/playbook/roles/bibigrid/handlers/main.yml b/resources/playbook/roles/bibigrid/handlers/main.yml
index b6b62321..d4888f5a 100644
--- a/resources/playbook/roles/bibigrid/handlers/main.yml
+++ b/resources/playbook/roles/bibigrid/handlers/main.yml
@@ -28,6 +28,7 @@
   systemd:
     name: slurmctld
     state: restarted
+  when: "'master' in group_names"
 
 - name: slurmd
   systemd:
diff --git a/resources/playbook/roles/bibigrid/tasks/001-apt.yml b/resources/playbook/roles/bibigrid/tasks/001-apt.yml
index 407175db..0ca1c17b 100644
--- a/resources/playbook/roles/bibigrid/tasks/001-apt.yml
+++ b/resources/playbook/roles/bibigrid/tasks/001-apt.yml
@@ -10,6 +10,15 @@
     group: root
     mode: 0644
 
+- name: Wait for cloud-init / user-data to finish
+  command: cloud-init status --wait
+  changed_when: false
+
+- name: Wait for /var/lib/dpkg/lock-frontend to be released
+  shell: while lsof /var/lib/dpkg/lock-frontend ; do sleep 10; done;
+  tags:
+    - skip_ansible_lint
+
 - name: Wait for post-launch services to stop
   service_facts:
   register: result
diff --git a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml
index 3bc06ebf..6691225e 100644
--- a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml
+++ b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml
@@ -17,7 +17,9 @@
     - "{{ master.disks }}"
   when: master.disks is defined
 
-- block:
+- when: volumes is defined and auto_mount
+  failed_when: false
+  block:
     - name: Make sure disks are available
       filesystem:
         fstype: ext4
@@ -36,10 +38,9 @@
     with_items: "{{ volumes }}"
 
   - name: Mount disks
+    mount:
       path: "{{ item.name }}"
       src: "{{ item.device }}"
       state: mounted
     with_items: "{{ volumes }}"
-  when: volumes is defined and auto_mount
-  ignore_errors: true
 
diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml
index 69f0098f..580aabc4 100644
--- a/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml
+++ b/resources/playbook/roles/bibigrid/tasks/042-slurm-server.yml
@@ -70,22 +70,6 @@
     - slurmdbd
     - slurmrestd
 
-- name: Enable slurmdbd and slurmrestd services
-  systemd:
-    name: "{{ item }}"
-    enabled: true
-    masked: false
-    state: started
-    daemon_reload: true
-  with_items:
-    - slurmdbd
-    - slurmrestd
-
-- name: Start slurm explicit after all dependencies are configured
-  systemd:
-    name: slurmctld
-    state: started
-
 - name: Register Slurm users home dir
   shell: "set -o pipefail && grep slurm /etc/passwd | cut -d ':' -f 6"
   register: slurm_home
@@ -180,6 +164,31 @@
       groups:
         - ansible
 
+- name: Generate location specific worker userdata
+  template:
+    src: slurm/worker_userdata.j2
+    dest: "/opt/slurm/userdata_{{ hostvars[item].cloud_identifier }}.txt"
+    owner: slurm
+    group: ansible
+    mode: "0640"
+  with_items: "{{ groups.vpngtw + groups.master }}"
+
+- name: Enable slurmdbd and slurmrestd services
+  systemd:
+    name: "{{ item }}"
+    enabled: true
+    masked: false
+    state: started
+    daemon_reload: true
+  with_items:
+    - slurmdbd
+    - slurmrestd
+
+- name: Start slurmctld explicitly after all dependencies are configured
+  systemd:
+    name: slurmctld
+    state: started
+
 - when: slurm_home.stdout != '/opt/slurm'
   block:
@@ -210,12 +219,3 @@
     - slurmd
     - slurmdbd
     - slurmrestd
-
-- name: Generate location specific worker userdata
-  template:
-    src: slurm/worker_userdata.j2
-    dest: "/opt/slurm/userdata_{{ hostvars[item].cloud_identifier }}.txt"
-    owner: slurm
-    group: ansible
-    mode: "0640"
-  with_items: "{{ groups.vpngtw + groups.master }}"
diff --git a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml
index 49253ca0..e134dbe2 100644
--- a/resources/playbook/roles/bibigrid/tasks/042-slurm.yml
+++ b/resources/playbook/roles/bibigrid/tasks/042-slurm.yml
@@ -9,21 +9,25 @@
     uid: 64030
     group: slurm
 
-- name: Install Slurm package (and dependencies)
+- name: Create pinning configuration for slurm-bibigrid version 23.11.*
+  copy:
+    content: |
+      Package: slurm-bibigrid
+      Pin: version 23.11.*
+      Pin-Priority: 1001
+    dest: /etc/apt/preferences.d/slurm-bibigrid
+    mode: '0311'
+
+- name: Install slurm-bibigrid package
+  apt:
+    name: slurm-bibigrid
+    state: present
+
+- name: Install Slurm package dependencies
   apt:
     name:
-      - slurm-full
       - munge
 
-# - name: Download Slurm (TEMPORARY)
-#   get_url:
-#     url: "https://docs.cebitec.uni-bielefeld.de/s/FjCP3xQPPnBwSy9/download?path=%2F&files=slurm-full_23.11.0-0_amd64.deb" # Replace with your package link
-#     dest: "/tmp/package.deb" # Destination where the package will be saved
-# - name: Install Slurm package
-#   apt:
-#     deb: "/tmp/package.deb"
-#     state: present # Install the package if not already installed
-
 - name: Create new secret (Munge)
   copy:
     content: '{{ slurm_conf.munge_key }}'
diff --git a/resources/playbook/roles/bibigrid/tasks/main.yml b/resources/playbook/roles/bibigrid/tasks/main.yml
index b81dfdbc..1a6713f7 100644
--- a/resources/playbook/roles/bibigrid/tasks/main.yml
+++ b/resources/playbook/roles/bibigrid/tasks/main.yml
@@ -136,6 +136,7 @@
 - debug:
     msg: "[BIBIGRID] Setup Slurm"
 - import_tasks: 042-slurm.yml
+  when: "'vpngtw' not in group_names"
 - import_tasks: 042-slurm-server.yml
   when: "'master' in group_names"