Introduced yaml lock (#464)
* removed unnecessary close

* simplified update_hosts

* updated logging to separate folders and files based on creation date

* many small changes and introduced locks

* restructured log files again. Removed outdated key warnings from bibigrid.yml

* added a few logs

* further improved logging hierarchy

* Added dedicated folders for temporary job storage. This might solve the "SlurmSpoolDir full" bug.

* Improved logging

* Tried to fix temporary job storage and attempted the update to Slurm 23.11, but it produced errors, so that part is commented out

* added initial space
XaverStiensmeier authored Feb 6, 2024
1 parent 0c048b7 commit 26c6738
Showing 13 changed files with 117 additions and 22 deletions.
8 changes: 4 additions & 4 deletions bibigrid.yml
@@ -13,10 +13,10 @@

## Volumes and snapshots that will be mounted to master
# autoMount: False # WARNING: will overwrite unidentified filesystems
#masterMounts: # KEY NOT FULLY IMPLEMENTED YET
#masterMounts:
# - [mount one]

#nfsShares: # KEY NOT FULLY IMPLEMENTED YET; /vol/spool/ is automatically created as a nfs
#nfsShares: /vol/spool/ is automatically created as a nfs
# - [nfsShare one]

## Ansible (Galaxy) roles can be added for execution # KEY NOT IMPLEMENTED YET
@@ -48,7 +48,7 @@
#nfs: True
#ide: True # A nice way to view your cluster as if you were using Visual Studio Code

useMasterAsCompute: True # Currently ignored by slurm
useMasterAsCompute: True

#waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
# - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld
@@ -104,4 +104,4 @@

#features: # list

#- [next configurations] # KEY NOT IMPLEMENTED YET
#- [next configurations]
3 changes: 2 additions & 1 deletion bibigrid/core/actions/create.py
@@ -186,7 +186,7 @@ def start_vpn_or_master_instance(self, configuration, provider):
server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network,
volumes=volumes, security_groups=configuration["security_groups"], wait=True)
configuration["private_v4"] = server["private_v4"]

self.log.debug(f"Created Server {name}: {server['private_v4']}.")
# get mac address for given private address
# Attention: The following source code works with Openstack and IPV4 only
configuration["mac_addr"] = None
@@ -201,6 +201,7 @@ def start_vpn_or_master_instance(self, configuration, provider):
if identifier == VPN_WORKER_IDENTIFIER or (identifier == MASTER_IDENTIFIER and self.use_master_with_public_ip):
configuration["floating_ip"] = \
provider.attach_available_floating_ip(network=external_network, server=server)["floating_ip_address"]
self.log.debug(f"Added floating ip {configuration['floating_ip']} to {name}.")
elif identifier == MASTER_IDENTIFIER:
configuration["floating_ip"] = server["private_v4"] # pylint: enable=comparison-with-callable
configuration["volumes"] = provider.get_mount_info_from_server(server)
1 change: 1 addition & 0 deletions bibigrid/core/actions/ide.py
@@ -81,6 +81,7 @@ def ide(cluster_id, master_provider, master_configuration, log):
ssh_pkey=used_private_key,
local_bind_address=(LOCALHOST, used_local_bind_address),
remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)) as server:
log.debug(f"Used {used_local_bind_address} as the local binding address")
log.log(42, "CTRL+C to close port forwarding when you are done.")
with server:
# opens in existing window if any default program exists
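For orientation, the new debug line sits inside an sshtunnel.SSHTunnelForwarder context that forwards a free local port to the IDE port on the master. A minimal sketch of that construct, assuming illustrative values for the host, user, key path, and ports (none of these are taken from this commit):

from sshtunnel import SSHTunnelForwarder

LOCALHOST = "127.0.0.1"
REMOTE_BIND_ADDRESS = 8181      # illustrative: port the IDE listens on remotely
used_local_bind_address = 9191  # illustrative: a free local port

with SSHTunnelForwarder(ssh_address_or_host="203.0.113.10",    # example floating ip
                        ssh_username="ubuntu",                 # example user
                        ssh_pkey="/path/to/private_key",       # example key path
                        local_bind_address=(LOCALHOST, used_local_bind_address),
                        remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)) as server:
    # while the tunnel is open, http://localhost:9191 reaches the remote IDE
    input("Press Enter to close port forwarding when you are done.")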
5 changes: 3 additions & 2 deletions bibigrid/core/utility/handler/ssh_handler.py
@@ -109,8 +109,9 @@ def is_active(client, floating_ip_address, private_key, username, log, gateway,
log.info(f"Using SSH Gateway {gateway.get('ip')}")
octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))}
port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets)))
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username, pkey=private_key,
timeout=7, auth_timeout=5, port=port)
log.info(f"Port {port} will be used (see {gateway['portFunction']} and octets {octets}).")
client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username,
pkey=private_key, timeout=7, auth_timeout=5, port=port)
establishing_connection = False
log.info(f"Successfully connected to {floating_ip_address}")
except paramiko.ssh_exception.NoValidConnectionsError as exc:
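The gateway branch above derives the SSH port from the worker's IP: each octet becomes a symbol oct1..oct4 and the user-configured portFunction is evaluated with sympy. A small sketch of that computation, with a made-up gateway entry (the expression "30000 + oct4" is illustrative, not from this commit):

import sympy

gateway = {"ip": "203.0.113.5", "portFunction": "30000 + oct4"}  # hypothetical values
floating_ip_address = "192.168.0.17"

# map each octet of the IP to a symbol name oct1..oct4, as in is_active()
octets = {f"oct{enum + 1}": int(elem)
          for enum, elem in enumerate(floating_ip_address.split("."))}

# substitute the octet values into the symbolic expression
port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets)))
print(port)  # 30017 -> the port the gateway maps to this machine's sshd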
1 change: 1 addition & 0 deletions resources/playbook/roles/bibigrid/files/slurm/cgroup.conf
@@ -1,3 +1,4 @@
# maybe this causes errors when using 23.11 https://slurm.schedmd.com/faq.html#cgroupv2
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=yes
ConstrainCores=no
30 changes: 28 additions & 2 deletions resources/playbook/roles/bibigrid/files/slurm/create.sh
@@ -1,7 +1,33 @@
#!/bin/bash

process_string() {
# Split the input string by "-"
IFS='-' read -ra elements <<< "$1"

# Extract the second, fourth, and fifth elements
second=${elements[1]}
fourth=${elements[3]}
fifth=${elements[4]}

# Replace undesired characters in the second element
second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpnwkr-/vpnwkr_/')

# Check if the fifth element is not empty
if [[ ! -z $fifth ]]; then
echo "${second}_${fourth}-${fifth}"
else
echo "${second}_${fourth}"
fi
}

mkdir -p worker_logs
mkdir -p worker_logs/create
mkdir -p worker_logs/create/out
mkdir -p worker_logs/create/err

# redirect stderr and stdout
exec >> /var/log/slurm/create.out.log
exec 2>> /var/log/slurm/create.err.log
exec >> "/var/log/slurm/worker_logs/create/out/$(process_string "$1")_$(date '+%Y-%m-%d_%H:%M:%S').log"
exec 2>> "/var/log/slurm/worker_logs/create/err/$(process_string "$1")_$(date '+%Y-%m-%d_%H:%M:%S').log"

function log {
echo "$(date) $*"
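create.sh now writes one log pair per started worker instead of appending to a shared file; process_string turns the node name argument into a filename-friendly token. A rough Python rendering of what the bash helper computes (the sed substitution on the second element is left out of this sketch, and the node name below is made up):

def process_string(node_name: str) -> str:
    """Keep the role and per-group index of a bibigrid node name."""
    elements = node_name.split("-")        # mirrors IFS='-' read -ra elements
    second, fourth = elements[1], elements[3]
    fifth = elements[4] if len(elements) > 4 else ""
    return f"{second}_{fourth}-{fifth}" if fifth else f"{second}_{fourth}"

# hypothetical name following the bibigrid-<role>-<cluster-id>-<index> scheme
print(process_string("bibigrid-worker-3k1eeysgetmg4vb-0"))  # -> worker_0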
22 changes: 12 additions & 10 deletions resources/playbook/roles/bibigrid/files/slurm/create_server.py
@@ -12,6 +12,7 @@
import sys
import threading
import time
from filelock import FileLock

import ansible_runner
import os_client_config
@@ -142,19 +143,20 @@ def update_hosts(name, ip): # pylint: disable=invalid-name
@param name: bibigrid-worker0-3k1eeysgetmg4vb-3
@param ip: ip address
@return:
"""
hosts = {"host_entries": {}}
if os.path.isfile(HOSTS_FILE_PATH):
logging.info("Updating hosts.yml")
with FileLock("hosts.yml.lock"):
logging.info("Lock acquired")
with open(HOSTS_FILE_PATH, mode="r", encoding="utf-8") as hosts_file:
hosts = yaml.safe_load(hosts_file)
hosts_file.close()
if hosts is None or "host_entries" not in hosts.keys():
logging.info(f"Existing hosts {hosts}")
if not hosts or "host_entries" not in hosts:
logging.info(f"Resetting host entries because {'first run' if hosts else 'broken'}.")
hosts = {"host_entries": {}}

hosts["host_entries"][name] = ip

with open(HOSTS_FILE_PATH, mode="w", encoding="utf-8") as hosts_file:
yaml.dump(hosts, hosts_file)
hosts_file.close()
hosts["host_entries"][name] = ip
logging.info(f"Added host {name} with ip {hosts['host_entries'][name]}")
with open(HOSTS_FILE_PATH, mode="w", encoding="utf-8") as hosts_file:
yaml.dump(hosts, hosts_file)
logging.info("Wrote hosts file. Released hosts.yml.lock.")


def configure_dns():
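This hunk is the commit's namesake: concurrently resuming workers all rewrite hosts.yml, so the whole read-modify-write cycle now runs under a filelock.FileLock, and the host entries are only reset when the file is missing or malformed. A condensed sketch of the pattern, assuming the filelock package and an illustrative path:

import yaml
from filelock import FileLock

HOSTS_FILE_PATH = "hosts.yml"  # illustrative; the real path is a module constant

def update_hosts(name, ip):
    # the lock file is separate from the data file, so acquiring the lock
    # never touches hosts.yml itself
    with FileLock("hosts.yml.lock"):
        try:
            with open(HOSTS_FILE_PATH, encoding="utf-8") as hosts_file:
                hosts = yaml.safe_load(hosts_file)
        except FileNotFoundError:
            hosts = None
        if not hosts or "host_entries" not in hosts:
            hosts = {"host_entries": {}}  # first run or broken file
        hosts["host_entries"][name] = ip
        # the write also happens before the lock is released, so a concurrent
        # reader can never observe a half-written file
        with open(HOSTS_FILE_PATH, mode="w", encoding="utf-8") as hosts_file:
            yaml.dump(hosts, hosts_file)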
resources/playbook/roles/bibigrid/files/slurm/delete_server.py
@@ -20,10 +20,13 @@
logging.info("delete_server.py started")
start_time = time.time()

logging.info(f"Terminate parameter: {sys.argv[1]}")

if len(sys.argv) < 2:
logging.warning("usage: $0 instance1_name[,instance2_name,...]")
logging.info("Your input %s with length %s", sys.argv, len(sys.argv))
sys.exit(1)

terminate_workers = sys.argv[1].split("\n")
logging.info("Deleting instances %s", terminate_workers)

@@ -61,7 +64,8 @@
logging.warning(f"Couldn't delete worker {terminate_worker}")
else:
logging.info(f"Deleted {terminate_worker}")
logging.info("Successful delete_server.py execution!")

logging.info(f"Successful delete_server.py execution ({sys.argv[1]})!")
time_in_s = time.time() - start_time
logging.info("--- %s minutes and %s seconds ---", math.floor(time_in_s / 60), time_in_s % 60)
logging.info("Exit Code 0")
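For context on the new argv logging: fail.sh expands the Slurm hostlist with scontrol show hostnames and passes the result as a single argument, so delete_server.py receives one newline-separated string. A tiny sketch of that handoff, with a made-up host list:

# fail.sh runs: python3 delete_server.py "$(scontrol show hostnames "$1")"
argv1 = "bibigrid-worker-3k1eeysgetmg4vb-0\nbibigrid-worker-3k1eeysgetmg4vb-1"  # made up

terminate_workers = argv1.split("\n")  # same split as in delete_server.py
print(terminate_workers)
# ['bibigrid-worker-3k1eeysgetmg4vb-0', 'bibigrid-worker-3k1eeysgetmg4vb-1']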
32 changes: 30 additions & 2 deletions resources/playbook/roles/bibigrid/files/slurm/fail.sh
@@ -1,8 +1,32 @@
#!/bin/bash

process_string() {
# Split the input string by "-"
IFS='-' read -ra elements <<< "$1"

# Extract the second, fourth, and fifth elements
second=${elements[1]}
fourth=${elements[3]}
fifth=${elements[4]}

# Replace undesired characters in the second element
second=$(echo "$second" | sed -E 's/worker-/worker_/; s/vpnwkr-/vpnwkr_/')

# Check if the fifth element is not empty
if [[ ! -z $fifth ]]; then
echo "${second}_${fourth}-${fifth}"
else
echo "${second}_${fourth}"
fi
}

mkdir -p worker_logs
mkdir -p worker_logs/fail/out
mkdir -p worker_logs/fail/err

# redirect stderr and stdout
exec >> /var/log/slurm/fail.out.log
exec 2>> /var/log/slurm/fail.err.log
exec >> "/var/log/slurm/worker_logs/fail/out/$(process_string "$1")_$(date '+%Y-%m-%d_%H:%M:%S').log"
exec 2>> "/var/log/slurm/worker_logs/fail/err/$(process_string "$1")_$(date '+%Y-%m-%d_%H:%M:%S').log"

function log {
echo "$(date) $*"
@@ -15,7 +39,11 @@ scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed

hosts=$(scontrol show hostnames "$1")

echo "Hosts $hosts used"

# delete servers
python3 /usr/local/bin/delete_server.py "${hosts}"

echo "Finished delete_server.py execution."

exit $?
@@ -169,6 +169,7 @@
- python-openstackclient==6.0.0
- openstacksdk==0.62.0
- os_client_config
- filelock
- paramiko
- ansible-runner

20 changes: 20 additions & 0 deletions resources/playbook/roles/bibigrid/tasks/042-slurm.yml
@@ -15,6 +15,15 @@
- slurm-full
- munge

# - name: Download Slurm (TEMPORARY)
# get_url:
# url: "https://docs.cebitec.uni-bielefeld.de/s/FjCP3xQPPnBwSy9/download?path=%2F&files=slurm-full_23.11.0-0_amd64.deb" # Replace with your package link
# dest: "/tmp/package.deb" # Destination where the package will be saved
# - name: Install Slurm package
# apt:
# deb: "/tmp/package.deb"
# state: present # Install the package if not already installed

- name: Create new secret (Munge)
copy:
content: '{{ slurm_conf.munge_key }}'
@@ -84,6 +93,17 @@
- slurmctld
- slurmd

- name: Create Job Container configuration
template:
src: slurm/job_container.conf
dest: /etc/slurm/job_container.conf
owner: slurm
group: root
mode: 0444
notify:
- slurmctld
- slurmd

- name: Slurm cgroup configuration
copy:
src: slurm/cgroup.conf
resources/playbook/roles/bibigrid/templates/slurm/job_container.conf
@@ -0,0 +1,5 @@
NodeName={{ name }} AutoBasePath=true BasePath={{ '/vol/scratch/storage' if flavor.ephemeral else '/var/local/storage' }}
{% for worker_group in groups | select('match', '^bibigrid_worker_*') %}
{% set first_worker = groups[worker_group] | first %}
NodeName={{ hostvars[first_worker].name }} AutoBasePath=true BasePath={{ '/vol/scratch/storage' if hostvars[first_worker].flavor.ephemeral else '/var/local/storage' }}
{% endfor %}
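The new template emits one NodeName line for the current host plus one per worker group, pointing Slurm's tmpfs job container at ephemeral scratch where the flavor provides it. Assuming a master without ephemeral storage and two worker groups, one ephemeral and one not (all names made up), the rendered /etc/slurm/job_container.conf would look roughly like:

NodeName=bibigrid-master-3k1eeysgetmg4vb AutoBasePath=true BasePath=/var/local/storage
NodeName=bibigrid-worker-3k1eeysgetmg4vb-0 AutoBasePath=true BasePath=/vol/scratch/storage
NodeName=bibigrid-worker-3k1eeysgetmg4vb-1 AutoBasePath=true BasePath=/var/local/storage

Together with JobContainerType=job_container/tmpfs and PrologFlags=Contain in slurm.conf below, each job gets a private /tmp under BasePath instead of filling the shared spool, which is what the "SlurmSpoolDir full" bullet in the commit message refers to.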
5 changes: 5 additions & 0 deletions resources/playbook/roles/bibigrid/templates/slurm/slurm.conf
@@ -109,3 +109,8 @@ SlurmctldParameters=idle_on_node_suspend
PrivateData=cloud
# return node to idle when startup fails
ResumeFailProgram=/opt/slurm/fail.sh

# job container
# TO BE TESTED
JobContainerType=job_container/tmpfs
PrologFlags=Contain
