diff --git a/cluster-setup/Dockerfile b/cluster-setup/Dockerfile
new file mode 100644
index 000000000..a22e82b0c
--- /dev/null
+++ b/cluster-setup/Dockerfile
@@ -0,0 +1,11 @@
+ARG PYTHON_VERSION="bookworm"
+
+FROM python:${PYTHON_VERSION} AS base
+
+RUN apt update -y && apt upgrade -y
+
+RUN pip install --upgrade pip
+
+RUN pip install pyyaml
+
+CMD ["/bin/bash", "-l"]
diff --git a/cluster-setup/README.md b/cluster-setup/README.md
index 38c85a845..9d2068212 100644
--- a/cluster-setup/README.md
+++ b/cluster-setup/README.md
@@ -1,9 +1,447 @@
 # CfE Cluster Setup
-This repository contains code and instructions for setting up a multi-host compute cluster.
+This directory contains code and instructions for setting up a multi-host compute cluster.
+
+## Deployment to Octomore
+
+This procedure, as of December 12, 2023, looks like the following.
+
+### Before you wipe the old machine
+
+If you're planning to restore the data from the old machine after the deployment,
+make sure your backups are in order. System backups are typically kept using `rsnapshot`,
+and a backup of the Kive PostgreSQL database is kept using `barman`. For example,
+on our production server, these are kept on a NAS mounted at `/media/dragonite`.
+
+Optionally, if your backups are on a physical drive connected to the machine, to avoid
+accidentally damaging or altering the backups, you could physically remove them until the
+setup is complete and you're ready to restore data from them.
+
+There are a few files that are worth preserving in particular and having available to you
+during the deployment process:
+
+* Preserve copies of your system's `/etc/passwd`, `/etc/group`, and `/etc/shadow`. This
+  information will be used to populate the new system with the same users and groups
+  from the old system.
+* Create a dump of the Kive PostgreSQL database using `pg_dumpall`. As the upgrade may
+  involve moving to a newer version of PostgreSQL, we likely can't use the Barman
+  backups to migrate from; thus we must do it the "old-fashioned" way.
+* Preserve a copy of `/etc/kive/kive_apache.conf` and/or `/etc/kive/kive_purge.conf`.
+  These files contain the database password used by Kive (via `apache2`) to access PostgreSQL.
+  You can also just preserve this password and discard the files, as the files will be
+  recreated by Ansible.
+* Preserve a copy of the `barman` user's `.pgpass` file. This contains the passwords
+  used by the `barman` and `streaming_barman` users when connecting to PostgreSQL,
+  and keeping these makes it easier to get the database set back up after importing
+  the database from the old system. Likewise here you can also just preserve the passwords
+  and discard the file. (Note that this file will typically *not* be present in the `rsnapshot`
+  backups, as the Barman user's home directory is in `/var`, which is not backed up.)
+
+### Install Ubuntu and do basic network setup on the head node
+
+First, manually install Ubuntu Jammy on the head node using an Ubuntu live USB drive.
+At most points, follow the defaults. Some places where you need to fill in some details:
+
+- Create a user with username `ubuntu` when prompted during installation. This will be
+  our "bootstrap" user.
+- Choose an appropriate system name for the computer, e.g. "octomore".
+- Choose a root drive. As of the time of writing, there is a 120GB SSD on the system; this
+  is an appropriate choice for the root drive.
+- Manually set up the LAN-facing interface (probably `eno0`) with IP address 192.168.69.86,
+  subnet 192.168.68.0/23, gateway 192.168.68.1, and DHCP server 192.168.168.101.
+- Enable SSH when prompted. You don't need to import any identity at this point.
+
+Note that the completion screen isn't super obvious, so keep an eye out for a completion message
+at the top left of the screen at the end of the process. Once this is done, you can interact
+with the head node via SSH.
+
+Next, upload the contents of [initialization/head] to the server and run `head_configuration.bash`
+using `sudo`.
+This sets up the root user's SSH key and `/etc/hosts`, and installs Ansible on the head node.
+Accept the defaults whenever it asks which services should be restarted.
+Now that Ansible is available on the head node, most of the rest of the procedure will be done
+using Ansible playbooks defined in the [deployment] directory. Copy the `cluster-setup` directory
+to the head node using the `deploy_cluster_setup.bash` script, placing it in a sensible location
+with the appropriate permissions. If you make changes, you can also use the same script
+to keep them synchronized between your workstation and the head node.
+
+#### Prepare Ansible configuration
+
+Go to the `deployment/group_vars` directory and create an `all.yaml` file from the
+`octomore_template.yaml` file by copying and filling in some details.
+
+For the passwords, you can use a password generator to generate new passwords and secret keys;
+however, it makes sense to use the same PostgreSQL passwords as on the old system.
+These passwords are:
+
+* `kive_db_password`: this is the one preserved from `/etc/kive/kive_apache.conf`
+or `/etc/kive/kive_purge.conf`.
+* `barman_password`: this is in the `barman` user's `.pgpass` file.
+* `streaming_barman_password`: this is also in the `barman` user's `.pgpass` file.
+
+Some other notable settings that you may need to adjust:
+
+* `kive_allowed_hosts`: this is a JSON-formatted list of IP addresses/URLs that the
+web server will respond to requests on.
+* `kive_subject_prefix`: this will be prepended to the emails sent by the Kive system.
+It's a good idea to include some details on this system, e.g. "Kive server on Octomore",
+or "Kive server on developer workstation".
+* `kive_purge_start`: sets the threshold for the Kive purge task to perform file cleanup.
+* `kive_purge_stop`: sets the stopping threshold for this Kive purge task; that is, a
+purge will stop when the remaining files' total size is under this threshold.
+* `kive_log_level`: the logging level, as understood by [Django's logging utilities][DjangoLogging],
+used by the purge task.
+
+Then go to `deployment/` and create an `ansible.cfg` from one of the provided templates,
+probably `ansible_octomore.cfg`. These files will be necessary for Ansible to work.
+
+> Note: all playbooks should be run using `sudo`!
+
+[DjangoLogging]: https://docs.djangoproject.com/en/4.0/topics/logging/
+
+#### General preliminary setup
+
+The first playbook we will run sets up the `/data` partition, so the first thing we do
+is find the `/dev/disk/by-id/` entry that corresponds to the drive you want to use as `/data`
+and put the *basename* (i.e. the name of the soft link in the directory without the
+`/dev/disk/by-id/` part of the path) into `group_vars/all.yaml` as the lone entry in the
+`data_physical_volumes` list. (Or, if you wish to use several volumes combined into
+one logical volume, put all their names in this list.)
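+
+As a quick sketch of what this step looks like (the device name below is purely a
+hypothetical example; use whatever your own system reports), you can list the stable
+`by-id` names and then record the one you chose:
+
+    # Show the by-id soft links; ignore the "-partN" entries.
+    ls -l /dev/disk/by-id/
+
+Then, in `deployment/group_vars/all.yaml`:
+
+    # Hypothetical serial number -- substitute your own drive's by-id basename.
+    data_physical_volumes:
+      - ata-EXAMPLEDISK_Z1234567
+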
+> If any drives are already recognized by LVM from a previous system, you should
+> delete the logical volumes, volume groups, and physical volumes associated with them.
+> Details of how to do so may be found in [the LVM documentation][UbuntuLVMDocs].
+> If there are any [mdadm](https://raid.wiki.kernel.org/index.php/A_guide_to_mdadm)
+> RAID arrays on these drives, you may also need to shut those down first using
+> `mdadm --stop [array device]`.
+
+[UbuntuLVMDocs]: https://manpages.ubuntu.com/manpages/jammy/en/man8/lvm.8.html
+
+Now we can run the playbook `octomore_preliminary_setup.yaml`. This sets up the `/data` partition,
+prepares some other system stuff on the head node, and configures the internal-facing networking.
+With this in place, the playbook should set up an `ext4` volume at `/data` on the drive
+you specified.
+
+#### Set up your backup drive
+
+Next, set up a backup drive for your system. A sample of how this was done for Octomore
+is detailed in `create_backup_filesystem.yaml`. On another server you might use a
+NAS-based backup solution instead. The goal in the end is to have a backup drive mounted
+at the path specified in your `group_vars` as `kive_backup_path`; by default this would
+be `/media/backup`.
+
+### Install Ubuntu on the compute nodes
+
+At this point, go back into the server room and install Ubuntu Jammy on the compute nodes.
+These machines only have one hard drive, and their ethernet should automatically be set up
+by default (the head node provides NAT and DHCP), so this should be a very straightforward
+installation. Again, create a user with username `ubuntu` to be the bootstrap user.
+Fetch the SSH public key generated by the root user on the head node during the running of
+`head_configuration.bash` and place it in the [initialization/worker] directory on the
+head node as `head_node_root_id_ed25519.pub` (don't commit this file to source control;
+it isn't a security risk, but it isn't needed and might cause confusion later). Make an
+appropriate `/etc/hosts` file for the worker nodes and place it in [initialization/worker]
+as `cluster_hosts`; appropriate templates for both Octomore and Bulbasaur are in that
+directory as `cluster_hosts_octomore` and `cluster_hosts_bulbasaur` respectively, so you can
+copy one of those to `cluster_hosts` if you don't need anything customized.
-# Test Environment
+Copy the contents of the [initialization/worker] directory to each compute node,
+including the aforementioned SSH public key. Then, run `worker_configuration.bash` using
+`sudo`, which will install the necessary packages and set up the necessary SSH access for
+the node to be used with Ansible.
+
+### Annoying detour: reassign the bootstrap user's UID and GID
+
+At this point, your `ubuntu` user on all the machines likely has a UID and GID of 1000.
+This may conflict with one of the user accounts that will later be
+imported into this machine. If this is the case, you can run `reassign_bootstrap_user_uid.yaml`.
+You may need to create a *second* bootstrap user to do this, as running the playbook as `ubuntu`
+may fail because the user is currently being used (even if you use `sudo`). This second bootstrap
+user can be removed right after this playbook is done, and you can proceed again as the `ubuntu`
+user.
+
+### Import users and groups from the old system
+
+The next playbook to run imports users from the old system.
First, a YAML file must be prepared +using `export_users_and_groups.py` from the old system's `/etc/shadow`, `/etc/passwd`, and +`/etc/group`. (A Dockerfile and docker compose file are provided in this directory if you +need a simple environment with Python 3 to run the script.) Next, run + + sudo ansible-playbook --extra-vars "@[name of the produced YAML file]" import_users.yaml + +This will import user accounts into the head node. (These will later be synchronized to the +compute node as part of a subsequent playbook.) + +From here, you can lock and expire the `ubuntu` user and start using one of the just-imported accounts, +if you have one. Make sure that your uploaded `cluster-setup` directory is accessible by +the account you're using if you do so. The `lock_bootstrap_user.yaml` playbook can do this; +modify the `user_name` variable if necessary. + +### Get SSL credentials for the webserver + +Before you install Kive in the next step, you must get the SSL credentials for the server. +These must be acquired securely from IT or within the software group, and placed into the +[deployment] directory. *DO NOT* commit these files to source! + +The files needed are: + +* `DigiCertCA.crt`: the DigiCert certificate authority (CA) key, which specifies that DigiCert + issued the key. +* `star_cfe.crt`: the wildcard certificate issued by DigiCert, which certifies that this server + belongs to the `cfenet.ubc.ca` or `bccfe.ca` domain. +* `star_cfe.key`: our private signing key, used to issue a public key for HTTPS connections. + +These will then be used in the next step to configure Apache. + +### Set up network drives + +Our compute server also requires two network mounts, for `macdatafile` and `RAW_DATA`, in +order for MiCall to run. The playbook `mount_network_drives.yaml` sets these up; fill +in the required variables in `group_vars/all.yaml`; their names and dummy values are in +`group_vars/octomore_template.yaml`. + +*TO DO*: this playbook can be updated to use the `mount_network_drives` role. + +#### Changing network drive permissions + +Originally Octomore's network drives were mounted read-write on all nodes, and +Bulbasaur's network drives were mounted read-only. However, after discussion, +we decided it was likely best to mount them read-write only on Octomore's head node +and read-only on its compute nodes, as we believe only the head node needs to write to +these drives when it runs MiCall Watcher. + +It also makes sense to restrict all of Bulbasaur's nodes to read-only access, but if +Bulbasaur ever needs to be used for clinical work, we would need to change the permissions +(and install MiCall Watcher). + +If it turns out that we were incorrect, we would need to restore all of Octomore's +nodes to having read-write permissions on both network drives. + +The playbooks `network_drives_standard.yaml`, `network_drives_read_only.yaml`, +and `network_drives_read_write.yaml` were written to make these configuration changes +whenever necessary. + +### Install Kive + +With all of that table-setting in place, the main playbook to run is `kive_setup.yml`. This is +the "main" playbook, and will take longer to run. + +At this point, you should have a fresh, "empty" server, with Kive running. Several +`systemd`-based background tasks that perform Kive cleanup and backups should also be +in place. If that's your goal, then you can stop here. + +### Install FastTree + +Our Phylowatch service requires [FastTree] 2.1.9 to be installed on the cluster (at the time +of writing). 
This is an older version so the binaries are not directly available on the +FastTree website; rather, we must compile it from [the source code][FastTreeSourceCode]. +At the time of writing, the source code is available on their website, but if this ever +disappears, we maintain a vendored copy on macdatafile in the `Phylowatch` directory +as `FastTree-2.1.9.c`. + +[FastTree]: https://microbesonline.org/fasttree/ +[FastTreeSourceCode]: https://microbesonline.org/fasttree/FastTree-2.1.9.c + +Put this file into the `deployment` directory on the head node, and run the +`install_fasttree.yaml` playbook to compile and install FastTree. + +### Optional (but recommended): install smartmontools + +To install the `smartmontools` package, which provides `smartctl`, use the +`install_smartmontools.yaml` playbook (or simply install it using `apt`). + +## Restore from an old system + +If you are restoring an old system, make the backups available somewhere on +your system; e.g. at `/media/old_data` or a similar mount point. + +### Shut down Kive and backup services + +First, shut down the Kive purge tasks created in the previous step: + + sudo systemctl stop kive_purge.timer + sudo systemctl stop kive_purge_synch.timer + +Next, shut down the backup tasks that were created in the previous step: + + sudo systemctl stop barman_backup.timer + sudo systemctl stop rsnapshot_alpha.timer + sudo systemctl stop rsnapshot_beta.timer + sudo systemctl stop rsnapshot_gamma.timer + +Barman installs a cron job by default at the system level. For now, disable this +by commenting out the entry in `/etc/cron.d/barman`. + +Finally, shut down Kive itself by shutting down the PostgreSQL database and +webserver: + + sudo systemctl stop apache2 + sudo systemctl stop postgresql@14-main + +### Annoying detour 2: set the system locale to "Canada English" + +At this point in the Octomore migration, it was discovered that the old database +contents would not properly restore to the new database due to problems with the +database locale. The old database had as its locale `en_CA.UTF-8`, which was not +available on the newly-upgraded Octomore. + +To this end, the `set_locale_to_canada.yml` playbook was used to enable this +locale on all nodes, and the database then restored without issue. If this comes +up again, use this same playbook to correct the issue. + +### Restoring the database + +Now, restore the Kive data folders from the old backups. On our prod and dev +clusters this folder was `/data/kive`; use `rsync -avz` to copy this information +into place on your new server at wherever you set `kive_media_root` to in your +`group-vars` (by default, `/data/kive/media_root`). Assuming all has gone correctly +with importing users and groups, the ownership of the files should be as they were +on the old system. + +Next, move the just-created PostgreSQL "cluster" to a backup location (or simply +delete it if you're very confident). On a fresh install, the cluster is at +`/var/lib/postgresql/14/main`. Move this to, for example, `/var/lib/postgresql/14/main_backup`. +Create a fresh empty cluster in the original location using `initdb`: + + sudo -u postgres /usr/lib/postgresql/14/bin/initdb /var/lib/postgresql/14/main + +At the same time, we should also move (or delete) the Barman backups created to this point, +as they are inconsistent with the database that we are about to restore. Move the Barman +backup folder to a backup location, and create a fresh backup folder in the same location. 
+For example, if the backup folder was at `/media/backup/BarmanDBBackup`: + + sudo mv /media/backup/BarmanDBBackup /media/backup/BarmanDBBackup_original + sudo mkdir /media/backup/BarmanDBBackup + sudo chown barman:barman /media/backup/BarmanDBBackup + +Next you can restore the database using `psql` as the `postgres` user. Bring up the database +again (this time with the fresh empty cluster) and use `psql` to load the data: + +``` +sudo systemctl start postgresql@14-main +sudo -u postgres psql -f [dumped file from the old system] postgres +``` + +Note that in the `psql` command, we specified the database `postgres`. This must be +specified (it's a mandatory parameter to `psql`) but will actually be ignored. + +At this point, the database will have been restored to the old settings. If you didn't +use it before in your Ansible configuration (i.e. in `group_vars/all.yaml`), you should +now either specify the PostgreSQL passwords preserved from the old system in +`/etc/kive/kive_apache.conf`, `/etc/kive/kive_purge.conf`, and the `barman` user's +`.pgpass`, or reset the passwords using `psql` as the `postgres` user to the ones you +used in your Ansible settings. + +With the database running and restored, bring Apache back up with `sudo systemctl start apache2`. +If the test Kive website doesn't work, check the PostgreSQL logs for clues, and make sure +that Apache is able to reach the database. Make sure that the password in `/etc/kive/kive_apache.conf` +and `/etc/kive/kive_purge.conf` is correct and working. + +### Restore other old user data + +This can be done at the leisure of each user, so long as the old backups are mounted. +Use `rsync -avz` to move whatever user data back into place you like. + +### Finish setting up Barman + +At this point we can manually verify the last details that Barman needs to +run correctly. First, reactivate the Barman cron job by uncommenting +the entry you commented out before in `/etc/cron.d/barman`. Then check on the +`barman` configuration by running, as the `barman` user, + + barman check kive + +There may be problems with the configuration still. If so, the Barman log at +`/var/log/barman/barman.log` and the PostgreSQL logs at `/var/log/postgresql` +may be helpful in diagnosing the problems. Some that I experienced +while I was going through the process: + +* The `barman` and `streaming_barman` PostgreSQL user passwords may be incorrect, + resulting in the check showing failures for "PostgreSQL", "pg_basebackup compatible", + and "pg_receivexlog compatible". This happened because I didn't preserve these + passwords from before I wiped out the database, so I couldn't use the same passwords + for `barman` and `streaming_barman` in my Ansible configuration. + This can easily be remedied by changing these users' PostgreSQL passwords + in `psql` (as the `postgres` system user) with the command `\password [username]`; + use the passwords in the `barman` system user's `.pgpass` file. +* The "replication slot" entry in the `barman check kive` output may report a failure. + One possible reason for this is that `barman cron` has not run successfully yet, + as in the previous steps we had disabled the system-level cron job that runs this + every minute. This task is what invokes `barman receive-wal`. If this appears to + be the problem, you can manually invoke `barman cron` as the `barman` user. Or, + you can wait one minute for the cron job to run and see if this error clears up. 
+* The output will also indicate that there are not enough backups in place, which is
+  normal and expected at this point. These backups will be created by the
+  `barman_backup` systemd service.
+* The check may still report a failure for "WAL archive". This is normal, as the WAL
+  archiving must be verified for a fresh install, and will be handled below.
+
+Next, verify the WAL archiving. To do this, as the `barman` user, run
+
+    barman switch-wal --force --archive kive
+
+This may fail at first due to a timeout, but try again if so; it's likely to succeed
+eventually if all is configured well. Check the configuration again to confirm
+that things are ready to go. (Ignore the error caused by there not being enough
+backups in place.)
+
+### Restart Kive and backup services
+
+With everything in place, restart the regularly-scheduled backup `systemd` tasks
+and Kive purge tasks using `systemctl start` as the root user:
+
+* `barman_backup`
+* `rsnapshot_alpha`
+* `rsnapshot_beta`
+* `rsnapshot_gamma`
+* `kive_purge`
+* `kive_purge_synch`
+
+For example, run `sudo systemctl start barman_backup.timer` to start `barman_backup`, and
+similarly for the others.
+
+Lastly, bring Kive itself back up by bringing up:
+
+* `postgresql@14-main`
+* `apache2`
+
+[initialization/head]: initialization/head
+[initialization/worker]: initialization/worker
+[initialization]: initialization
+[deployment]: ./deployment
+
+## Test Environment
+
+We can use either Multipass or Vagrant to bring up a test environment for
+development purposes.
+
+### Multipass
+
+The [initialization] directory contains templates and scripts for generating cloud-init
+files to use when setting up a "head" VM and a "worker" VM.
+
+For the head configuration, you must supply a YAML file containing the names and IPs of
+the compute nodes in the same format as they appear in the Ansible `group_vars`; for example,
+simply copy `deployment/group_vars/default_template.yml` (these values are not hugely useful
+for this test deployment anyway). Specify this as a parameter to the `create_head_user_data.py`
+script and it will generate a `user_data` file suitable for use with Multipass:
+
+    multipass launch --name TestHead --cloud-init [user data file you generated] --mount [path to the cluster-setup directory]:/app
+
+For the worker configuration, you must put the SSH public key generated for the root user
+on the "head node" somewhere accessible to whoever will run `create_worker_user_data.py`,
+and specify it as the parameter. This creates a `user_data` file suitable for use with
+Multipass; similarly to the above,
+
+    multipass launch --name TestWorker --cloud-init [user data file you generated] --mount [path to the cluster-setup directory]:/app
+
+These commands launch the machines and also mount the `cluster-setup` directory at `/app`
+on both nodes. Now that both machines are online and have IP addresses, you can run
+`configure_hosts_file.bash` on the head node to configure its `/etc/hosts` file so that
+Ansible will know how to reach the worker node.
+
+### Vagrant
 This directory contains a Vagrantfile that describes two VMs (a head node
 and a worker node) that can be used to test Ansible playbooks or practice performing
@@ -11,56 +449,78 @@ cluster management tasks. Ansible is installed on the `head` node, and this dire
 is mounted at `/vagrant`. Playbooks can be edited from the host machine, but
 should be run from the `head` node.
-
-# Quickstart
-
-This will guide you through setting up your test environment and running your
-first Ansible commands. You'll need to have [Vagrant] and [VirtualBox] installed.
-
+You'll need to have [Vagrant] and [VirtualBox] or VMware installed.
 To begin, bring up the Vagrant VMs. This will create two VMs (`head` and `worker`)
 and install Ansible on `head`.
 
     vagrant up
 
-Next, log in to `head` and move into the test environment directory. This is where
-we'll do most of our testing and practice.
+On the head node, run (as root) `setup_ssh_keys.bash` and `setup_ssh_access.bash`; this will
+install some dummy keys to enable passwordless SSH from the root user to itself,
+which is necessary for Ansible.
-    vagrant ssh head
-    cd /vagrant/testenv
+On the compute node, run (as root) `setup_ssh_access.bash`, which will allow the head
+node's root user to SSH into the compute node without a password. This is also needed
+for Ansible.
-`ansible.cfg` contains the configuration for the test environment. Most
-importantly, it directs ansible to load its inventory from
-`testenv/inventory.ini` instead of from the default location under `/etc`.
+With both nodes running, you can use `configure_hosts_file.bash` on the head node,
+also as root, to fill in the head node's `/etc/hosts` file so that Ansible will know
+how to reach the compute node.
-From `./testenv`, you can run Ansible commands against the inventoried
-hosts (including the head node).
+At this point, you can log into the head node and work with the code in this directory
+at `/vagrant`. In particular, the Ansible scripts are located in `/vagrant/deployment`.
-This command runs the Ansible's `ping` module against all hosts, which checks that
-they can be accessed.
+To confirm that your Ansible configuration is correct, you can run this command:
 
     ansible -m ping all
 
+This command runs Ansible's `ping` module against all hosts, which checks that
+they can be accessed.
 [Vagrant]: https://www.vagrantup.com/downloads.html
 [VirtualBox]: https://www.virtualbox.org/wiki/Downloads
+## Using Ansible
+
+`ansible.cfg` contains the configuration for the test environment. Most
+importantly, it directs ansible to load its inventory from
+`deployment/inventory.ini` instead of from the default location under `/etc`.
+
+From `./deployment`, you can run Ansible commands against the inventoried
+hosts (including the head node).
-# Architecture (for lack of a better name)
+### Architecture (for lack of a better name)
 Ansible executes *tasks* against one or more managed machines. Tasks may also
-depend on *variables*, *files*, or *templates*. These can be grouped into *roles*.
+depend on *variables*, *files*, or *templates*. These can also be grouped into *roles*,
+which we make use of in this project to help organize our code.
+
+### Running playbooks
+
+Run playbooks using `ansible-playbook`, e.g.
-This project uses roles to configure servers (e.g. Slurm worker, Kive server).
+    ansible-playbook kive_setup.yml
+All of our playbooks are intended to be run with `sudo` as well.
-# Ansible Docs
+#### Debugging a single role
-Essential:
+Per [this](https://stackoverflow.com/questions/38350674/ansible-can-i-execute-role-from-command-line)
+stack overflow answer, a single role can be run with the following command:
+
+    ansible -m include_role -a name=
+
+This has more verbose output and can be run in isolation, making it suitable
+for development and debugging.
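+
+For example, a sketch of running only the `kive_node` role (one of the roles used by
+`kive_setup.yml`) against the hosts in the inventory's `workers` group might look like
+the following; substitute whichever role and host pattern you actually need:
+
+    sudo ansible workers -m include_role -a name=kive_node
+
+As with the playbooks, running it under `sudo` keeps the connection and privileges
+consistent with how the rest of this setup is driven.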
+ +### Ansible documentation + +#### Essential - [Concepts](https://docs.ansible.com/ansible/latest/user_guide/basic_concepts.html) - [Quickstart](https://docs.ansible.com/ansible/latest/user_guide/quickstart.html) -Thorough: +#### Thorough - [Playbooks](https://docs.ansible.com/ansible/2.3/playbooks.html) - [How to build your inventory](https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#intro-inventory) @@ -69,7 +529,7 @@ Thorough: - [Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html#playbooks-best-practices) - [Interpreter Discovery](https://docs.ansible.com/ansible/latest/reference_appendices/interpreter_discovery.html#interpreter-discovery) -Extended: +#### Extended - [Installation](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html#installation-guide) - [Become (privesc)](https://docs.ansible.com/ansible/2.3/become.html) @@ -77,8 +537,7 @@ Extended: - [Asynchronous Actions and Polling](https://docs.ansible.com/ansible/2.3/playbooks_async.html) - [Vault](https://docs.ansible.com/ansible/2.3/playbooks_vault.html) - -# Useful modules +#### Useful modules - [copy](https://docs.ansible.com/ansible/latest/modules/copy_module.html#copy-module) - [user](https://docs.ansible.com/ansible/latest/modules/user_module.html#user-module) @@ -95,18 +554,4 @@ Extended: - [lineinfile](https://docs.ansible.com/ansible/latest/modules/lineinfile_module.html) - [blockinfile](https://docs.ansible.com/ansible/latest/modules/blockinfile_module.html#blockinfile-module) - [git](https://docs.ansible.com/ansible/latest/modules/git_module.html#git-module) -- [unarchive](https://docs.ansible.com/ansible/latest/modules/unarchive_module.html) - -# Applying a single role - -Per [this](https://stackoverflow.com/questions/38350674/ansible-can-i-execute-role-from-command-line) -stack overflow answer, a single role can be run with the following command: - - ansible -m include_role -a name= - -This has more verbose output and can be run in isolation, making it suitable -for development and debugging. - - - - +- [unarchive](https://docs.ansible.com/ansible/latest/modules/unarchive_module.html) \ No newline at end of file diff --git a/cluster-setup/Vagrantfile b/cluster-setup/Vagrantfile index 88484642b..b854872e0 100644 --- a/cluster-setup/Vagrantfile +++ b/cluster-setup/Vagrantfile @@ -1,69 +1,93 @@ -HEAD_IP = "192.168.45.10" -WORKER_IP = "192.168.45.11" +HEAD_IP = "192.168.56.10" +WORKER_IP = "192.168.56.11" # Copy the test SSH keys into `/home/vagrant/.ssh/`. Keys are copied manually # to allow easy SSH traffic between VMs. 
-def add_keys(vm) +def add_keys(vm, home_dir="/home/vagrant") vm.provision "file", source: "./setupfiles/vagrant_testkey", - destination: "/home/vagrant/.ssh/id_ed25519" + destination: "#{home_dir}/.ssh/id_ed25519" vm.provision "file", source: "./setupfiles/vagrant_testkey.pub", - destination: "/home/vagrant/.ssh/id_ed25519.pub" + destination: "#{home_dir}/.ssh/id_ed25519.pub" vm.provision "shell", inline: <<-EOS - chmod 600 /home/vagrant/.ssh/id_ed25519 - chmod 644 /home/vagrant/.ssh/id_ed25519.pub + chmod 600 "#{home_dir}/.ssh/id_ed25519" + chmod 644 "#{home_dir}/.ssh/id_ed25519.pub" EOS end # Enable SSH access by copying the test SSH public key into # `/home/vagrant/.ssh/authorized_keys` -def add_key_access(vm) +def add_key_access(vm, home_dir="/home/vagrant") vm.provision "file", source: "./setupfiles/vagrant_testkey.pub", destination: "/tmp/vagrant_testkey.pub" vm.provision "shell", inline: <<-EOS - cat /tmp/vagrant_testkey.pub >> /home/vagrant/.ssh/authorized_keys - chmod 600 /home/vagrant/.ssh/authorized_keys + cat /tmp/vagrant_testkey.pub >> "#{home_dir}/.ssh/authorized_keys" + chmod 600 "#{home_dir}/.ssh/authorized_keys" EOS end # Add a synced folder to emulate the cluster's network drive. def add_data_dir(vm) Dir.mkdir("./data") if not Dir.exists?("./data") - vm.synced_folder "./data", "/data", mount_options: ["dmode=777", "fmode=664"] + # vm.synced_folder "./data", "/data", mount_options: ["dmode=777", "fmode=664"] end Vagrant.configure("2") do |config| - config.vm.box = "geerlingguy/centos8" + config.vm.box = "bento/ubuntu-22.04" + + # Give preference to VMware when it's available. + config.vm.provider "vmware_desktop" do |vmw| + vmw.gui = false + vmw.memory = 8 * 1024 + vmw.cpus = 2 + end config.vm.provider "virtualbox" do |vb| vb.gui = false vb.memory = 8 * 1024 - vb.cpus = 4 + vb.cpus = 2 end config.vm.define :head do |head| head.vm.hostname = "head" - head.vm.network "private_network", ip: HEAD_IP + + # Static IPs are not supported by the VMware provider, + # but we can use them when using Virtualbox. + head.vm.network "private_network" + # head.vm.provider "virtualbox" do |vb, override| + # head.vm.network "private_network", ip: HEAD_IP + # end + add_keys(head.vm) add_key_access(head.vm) + # add_keys(head.vm, home_dir="/root") + # add_key_access(head.vm, home_dir="/root") add_data_dir(head.vm) head.vm.provision "shell", path: "./setupfiles/install-ansible.sh" end config.vm.define :worker do |worker| worker.vm.hostname = "worker" - worker.vm.network "private_network", ip: WORKER_IP + + # As for the head node we set up Virtualbox and VMware providers differently. + worker.vm.network "private_network" + # worker.vm.provider "virtualbox" do |vb, override| + # override.vm.network "private_network", ip: WORKER_IP + # end + add_key_access(worker.vm) + # add_key_access(worker.vm, home_dir="/root") worker.vm.provision "shell", inline: <<-EOS - dnf install -q -y python3 epel-release - dnf config-manager --set-enabled PowerTools + apt update + apt install -y python3 EOS - add_data_dir(worker.vm) end - config.vm.provision "shell", inline: <<-EOS - echo "#{HEAD_IP}\thead\n#{WORKER_IP}\tworker" >> /etc/hosts - EOS + # # Note: when using a VMware provider, these IP addresses won't be meaningful + # # and you'll have to manually configure /etc/hosts. 
+ # config.vm.provision "shell", inline: <<-EOS + # echo "#{HEAD_IP}\thead\n#{WORKER_IP}\tworker" >> /etc/hosts + # EOS end diff --git a/cluster-setup/compose.yaml b/cluster-setup/compose.yaml new file mode 100644 index 000000000..d5cb749f4 --- /dev/null +++ b/cluster-setup/compose.yaml @@ -0,0 +1,8 @@ +version: '3.9' + +services: + kive_export_clusters: + image: ${KIVE_EXPORT_USERS_IMAGE:-kive_export_users:latest} + restart: unless-stopped + volumes: + - ${KIVE_CLUSTER_SETUP_PATH:-/usr/local/share/Kive/cluster_setup}:/app diff --git a/cluster-setup/configure_hosts_file.bash b/cluster-setup/configure_hosts_file.bash new file mode 100644 index 000000000..d628587d6 --- /dev/null +++ b/cluster-setup/configure_hosts_file.bash @@ -0,0 +1,6 @@ +#! /usr/bin/env bash + +name=$1 +ip=$2 + +echo -e "${ip}\t${name}" >> /etc/hosts diff --git a/cluster-setup/deploy_cluster_setup.bash b/cluster-setup/deploy_cluster_setup.bash new file mode 100644 index 000000000..3e6ebabdc --- /dev/null +++ b/cluster-setup/deploy_cluster_setup.bash @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Use rsync to copy the cluster setup code to a target server. + +# Command-line parameters: +# prod|test +# e.g. +# ./deploy_cluster_setup.sh prod + +# If you need to override the default login/server or upload path, set +# the environment variables CLUSTER_SETUP_LOGIN and/or CLUSTER_SETUP_PATH. +# Check out the version of the code you want before running, as this script +# does not check out a fresh repository; we want this script to transfer over +# config files that would not be in a stock repo. + +# Make sure you have (or the account you log into the server with has) appropriate +# permissions on the deployment path. + +prod_or_test=$1 + +echo "Deployed tag/commit/branch $git_tag on $(date)." > deployment_notes.txt +echo 'Output of "git describe":' >> deployment_notes.txt +git describe --tags >> deployment_notes.txt +echo 'Output of "git show --format=oneline --no-patch":' >> deployment_notes.txt +git show --format=oneline --no-patch >> deployment_notes.txt + +if [ "$prod_or_test" == "prod" ]; then + server="kive-int.cfenet.ubc.ca" +else + server="testkive-int.cfenet.ubc.ca" +fi +server_login=${CLUSTER_SETUP_LOGIN:-"${USER}@${server}"} + +deployment_path=${CLUSTER_SETUP_PATH:-"/usr/local/src/cluster-setup"} + +rsync -avz --exclude-from deploy_exclude_list.txt -a ./ ${server_login}:${deployment_path} + +echo "... done." diff --git a/cluster-setup/deploy_exclude_list.txt b/cluster-setup/deploy_exclude_list.txt new file mode 100644 index 000000000..2a4b28dba --- /dev/null +++ b/cluster-setup/deploy_exclude_list.txt @@ -0,0 +1,4 @@ +deployment/group_vars/all.yml +deployment/group_vars/all.yaml +deployment/ansible.cfg +initialization/worker/head_node_root_id_ed25519.pub diff --git a/cluster-setup/deployment/ansible_bulbasaur.cfg b/cluster-setup/deployment/ansible_bulbasaur.cfg new file mode 100644 index 000000000..ad384c3be --- /dev/null +++ b/cluster-setup/deployment/ansible_bulbasaur.cfg @@ -0,0 +1,3 @@ +[defaults] +inventory = ./inventory_bulbasaur.ini +interpreter_python = /usr/bin/python3 diff --git a/cluster-setup/deployment/ansible_dev.cfg b/cluster-setup/deployment/ansible_dev.cfg new file mode 100644 index 000000000..ff114fed3 --- /dev/null +++ b/cluster-setup/deployment/ansible_dev.cfg @@ -0,0 +1,8 @@ +# The main Ansible configuration file. Copy this to "ansible.cfg" and +# fill in the appropriate inventory file to use. 
+# See the following for available sections and keys: +# https://docs.ansible.com/ansible/latest/reference_appendices/config.html + +[defaults] +inventory = ./inventory_dev.ini +interpreter_python = /usr/bin/python3 diff --git a/cluster-setup/deployment/ansible_octomore.cfg b/cluster-setup/deployment/ansible_octomore.cfg new file mode 100644 index 000000000..5b6db308d --- /dev/null +++ b/cluster-setup/deployment/ansible_octomore.cfg @@ -0,0 +1,3 @@ +[defaults] +inventory = ./inventory_octomore.ini +interpreter_python = /usr/bin/python3 diff --git a/cluster-setup/deployment/ansible_template.cfg b/cluster-setup/deployment/ansible_template.cfg new file mode 100644 index 000000000..ff114fed3 --- /dev/null +++ b/cluster-setup/deployment/ansible_template.cfg @@ -0,0 +1,8 @@ +# The main Ansible configuration file. Copy this to "ansible.cfg" and +# fill in the appropriate inventory file to use. +# See the following for available sections and keys: +# https://docs.ansible.com/ansible/latest/reference_appendices/config.html + +[defaults] +inventory = ./inventory_dev.ini +interpreter_python = /usr/bin/python3 diff --git a/cluster-setup/deployment/bulbasaur_preliminary_setup.yaml b/cluster-setup/deployment/bulbasaur_preliminary_setup.yaml new file mode 100644 index 000000000..725e12e09 --- /dev/null +++ b/cluster-setup/deployment/bulbasaur_preliminary_setup.yaml @@ -0,0 +1,46 @@ +--- + +- name: preliminary setup to run on the Octomore head node + hosts: head + become: true + roles: + - head_node_internal_interface + tasks: + - name: create /opt + file: + path: /opt + owner: root + group: root + mode: '0755' + state: directory + + - name: create the /data mount point + file: + path: /data + owner: root + group: root + mode: '0755' + state: directory + + - name: mount the already-existing filesystem + mount: + path: /data + src: /dev/data-vg/data-lv + fstype: ext4 + state: mounted + + - name: move the existing home folder to the side + command: mv /data/home /data/home_old + args: + creates: /data/home_old + removes: /data/home + + - name: move the existing Kive folder to the side + command: mv /data/kive /data/kive_old + args: + creates: /data/kive_old + removes: /data/kive + + - name: set up head node networking + include_role: + name: head_node_networking diff --git a/cluster-setup/deployment/copy_users_and_groups.yaml b/cluster-setup/deployment/copy_users_and_groups.yaml new file mode 100644 index 000000000..70fad8b84 --- /dev/null +++ b/cluster-setup/deployment/copy_users_and_groups.yaml @@ -0,0 +1,6 @@ +--- + +- name: copy users and groups from head to workers + hosts: workers + roles: + - copy_users_and_groups diff --git a/cluster-setup/deployment/create_backup_filesystem.yaml b/cluster-setup/deployment/create_backup_filesystem.yaml new file mode 100644 index 000000000..c8ab86f7d --- /dev/null +++ b/cluster-setup/deployment/create_backup_filesystem.yaml @@ -0,0 +1,57 @@ +--- + +- name: create backup filesystem + hosts: head +# vars: +# backup_physical_volumes: +# - "ata-ST10000NM001G-2MW103_ZS51H7QX" + # Replaced the above with the `backup_physical_volumes` variable in group_vars. 
+ tasks: + - name: create a single partition on each of the physical volumes + loop: "{{ backup_physical_volumes }}" + community.general.parted: + device: "/dev/disk/by-id/{{ item }}" + number: 1 + state: present + label: gpt + + - name: construct a list of the partition names + block: + - name: initialize the list as empty + set_fact: + partition_names: [ ] + - name: append names to the list + loop: "{{ backup_physical_volumes }}" + set_fact: + partition_names: "{{ partition_names + ['/dev/disk/by-id/' ~ item ~ '-part1'] }}" + + - name: create a volume group out of the partitions + lvg: + vg: backup-vg + pvs: "{{ partition_names | join(',') }}" + + - name: create a logical volume from the volume group + community.general.lvol: + vg: backup-vg + lv: backup-lv + size: 100%VG + + - name: create the filesystem + community.general.filesystem: + fstype: ext4 + dev: /dev/backup-vg/backup-lv + + - name: create the /media/backup mount point + file: + path: /media/backup + owner: root + group: root + mode: '0755' + state: directory + + - name: mount the filesystem + mount: + path: /media/backup + src: /dev/backup-vg/backup-lv + fstype: ext4 + state: mounted diff --git a/cluster-setup/deployment/group_vars/bulbasaur_template.yaml b/cluster-setup/deployment/group_vars/bulbasaur_template.yaml new file mode 100644 index 000000000..d7bfa94c4 --- /dev/null +++ b/cluster-setup/deployment/group_vars/bulbasaur_template.yaml @@ -0,0 +1,161 @@ +--- +# These are settings appropriate for Octomore. Copy this file to +# "all.yml" and fill in the required passwords when deploying. +# Most of the network information should be already set appropriately. + +# Variables needed to set up Kive. +kive_allowed_hosts: "[\"192.168.69.179\", \"testkive-int.cfenet.ubc.ca\"]" +kive_listen_port: 80 +update_kive_source: yes +kive_server_email: testkive-noreply@bccfe.ca +kive_admins: "[[\"kive\", \"kive@bccfe.ca\"]]" +kive_subject_prefix: "Kive server on Bulbasaur" +kive_backup_path: /media/backup +kive_version: v0.16.2 +kive_python_package: python3.7 +kive_python_executable: python3.7 + +# Settings used by the Kive purge tasks. +kive_purge_start: "4TB" +kive_purge_stop: "3.5TB" +kive_log_level: INFO + +# Slurm configuration. +slurmctlnode: bulbasaur +slurm_nodes: + - name: bulbasaur + memory: 80000 + cpus: 24 + sockets: 2 + cores_per_socket: 6 + threads_per_core: 2 + - name: b06 + memory: 80000 + cpus: 40 + sockets: 2 + cores_per_socket: 10 + threads_per_core: 2 + +# Settings for network services running on the head node, +# e.g. firewall, NFS, and PostgreSQL. +nfs_export_to_hosts: 192.168.1.0/255.255.255.0 +kive_db_host: 192.168.1.1 +kive_db_subnet: 192.168.1.0/24 + +# Internal network (i.e. the network that connects the head node and compute nodes) +# configuration. +internal_subnet: 192.168.1.0 +internal_netmask: 255.255.255.0 +internal_mask_bits: 24 +internal_broadcast: 192.168.1.255 +internal_dhcp_range: [192.168.1.100, 192.168.1.200] + +head_internal_interface: eth1 +head_internal_interface_mac: "00:1e:67:fe:fb:76" +head_external_interface: eno0 +head_internal_address: 192.168.1.1 +head_internal_mask_bits: 24 + +# This creates DHCP reservations for the compute nodes. +# Slow nodes: b01 through B03. The rest are "fast nodes". +# As of August 2020, b01 is broken and not spun up. 
+compute_nodes: + - name: b01 + ip: 192.168.1.2 + mac: "06:B3:ED:E6:A2:66" + # Alternative MACs: 00:1E:67:54:9B:(E4-E7) + - name: b02 + ip: 192.168.1.3 + mac: "00:1E:67:80:3C:B1" + # Alternative MACs: 00:1E:67:80:3C:(B2-B4) + - name: b03 + ip: 192.168.1.4 + mac: "00:1E:67:92:D5:2A" + # Alternative MACs: 00:1E:67:92:D5:(2B-2D) + - name: b04 + ip: 192.168.1.5 + mac: "5A:83:62:41:FD:14" + # Alternative MACs: 00:1E:67:A9:14:(8D-90) + - name: b05 + ip: 192.168.1.6 + mac: "00:1E:67:BC:32:F3" + # Alternative MACs: 00:1E:67:BC:32:(F4-F6) + - name: b06 + ip: 192.168.1.7 + mac: "A4:BF:01:02:4A:A2" + # Alternative MAC: A4:BF:01:02:4A:A3 + - name: b07a + ip: 192.168.1.8 + mac: "00:0A:CD:2D:2A:FB" + # Alternative MACs: AC:1F:6B:91:C3:(FE-FF) + - name: b07b + ip: 192.168.1.9 + mac: "00:0A:CD:2D:2B:F4" + # Alternative MACs: AC:1F:6B:91:C3:(E8-E9) + - name: b08a + ip: 192.168.1.10 + mac: "00:0A:CD:2D:2A:FE" + # Alternative MACs: AC:1F:6B:91:C4:(24-25) + - name: b08b + ip: 192.168.1.11 + mac: "00:0A:CD:2D:2A:EC" + # Alternative MACs: AC:1F:6B:91:C3:(F2-F3) + +# The following are sensitive, and should be kept secret for a production system. +kive_db_password: fixme-14mPdzu5vTOQG2DgtDG1inghQpMX0TBdUqEK6nVNHVo +kive_server_secret_key: fixme-kpXk1iKLbHn6-T7zieLHgADFA8ZSh5itd8k_Sp932fM +barman_password: fixme-barman +streaming_barman_password: fixme-streaming-barman + +# Slurm installation: +slurm_src_basename: "slurm-23.02.5" +slurm_tarball: "slurm-23.02.5.tar.bz2" +slurm_source_url: "https://download.schedmd.com/slurm/slurm-23.02.5.tar.bz2" +slurm_sha1_checksum: b3f06d7030bd771a3a94be06e3c0d58a2630a21e + +# mod_wsgi installation: +mod_wsgi_source_url: "https://files.pythonhosted.org/packages/fe/12/b3756f3b72ae3410a83a9b98862925aea64e854c93ef16c1e46b11e32d59/mod_wsgi-4.9.4.tar.gz" +mod_wsgi_tarball: "mod_wsgi-4.9.4.tar.gz" +mod_wsgi_basename: "mod_wsgi-4.9.4" +mod_wsgi_sha256_checksum: "8e762662ea5b01afc386bbcfbaa079748eb6203ab1d6d3a3dac9237f5666cfc9" + +# The following are defaults, and probably don't need to be changed. +# In a typical deployment, Kive, its virtualenv, and slurm are only "installed" +# by the head node and shared to the compute nodes via NFS, so kive_venv, +# kive_slurm_path, and kive_root should be paths that are on the shared partitions +# (typically /opt and /usr/local). If one would rather install Kive and slurm +# locally to each node, then these paths and/or the NFS shares should be reworked. 
+kive_venv: /opt/venv_kive +kive_slurm_path: /opt/venv_kive/bin +kive_db_name: kive +kive_db_user: kive +kive_media_root: /data/kive/media_root +kive_static_root: /var/www/html/kive/static +kive_root: /usr/local/share/Kive +# - httpd configuration +kive_httpd_user: kive +kive_httpd_group: kive + +copied_groups: + - kive + - sudo + +default_shell: /usr/bin/bash + +data_physical_volumes: + - ata-ST10000VN0008-2PJ103_ZLW0388G + +backup_physical_volumes: + - ata-ST10000VN000-3AK101_WP015CNV + +macdatafile_username: "[fill this in]" +macdatafile_password: "[fill this in]" +macdatafile_domain: "[fill this in]" +macdatafile_read_only: false +macdatafile_network_share: "[fill this in]" + +raw_data_username: "[fill this in]" +raw_data_password: "[fill this in]" +raw_data_domain: "[fill this in]" +raw_data_read_only: false # set to true for a dev/test system +raw_data_network_share: "[fill this in]" diff --git a/cluster-setup/deployment/group_vars/default_template.yml b/cluster-setup/deployment/group_vars/default_template.yml new file mode 100644 index 000000000..c8b2fe7d0 --- /dev/null +++ b/cluster-setup/deployment/group_vars/default_template.yml @@ -0,0 +1,87 @@ +--- +# These are default settings for a "testing" environment, e.g. using +# VMs to simulate the procedure. Copy this to "all.yml" and fill in +# appropriate values for networking and passwords etc. + +# Variables needed to set up Kive. +kive_allowed_hosts: "[\"*\"]" +kive_listen_port: 8080 +update_kive_source: yes +kive_server_email: kive-noreply@bccfe.ca +kive_admins: "[[\"kive\", \"kive@bccfe.ca\"]]" +kive_subject_prefix: "Kive (development) server" +kive_backup_path: /data/kive_db_backup +kive_python_package: python3.7 +kive_python_executable: python3.7 + +# Settings used by the Kive purge tasks; uncomment if you need to customize +# for your system (the defaults are likely good for a development system). +# kive_purge_start: "20GB" +# kive_purge_stop: "15GB" +# kive_log_level: WARN + +# Settings for network services running on the head node, +# e.g. firewall, NFS, and PostgreSQL. +nfs_export_to_hosts: 192.168.64.0/255.255.255.0 +kive_db_host: 192.168.64.9 +kive_db_subnet: 192.168.64.0/24 + +# Internal network (i.e. the network that connects the head node and compute nodes) +# configuration. +internal_subnet: 192.168.1.0 +internal_netmask: 255.255.255.0 +internal_mask_bits: 24 +internal_broadcast: 192.168.1.255 +internal_dhcp_range: [192.168.1.100, 192.168.1.200] + +head_internal_interface: eth1 +head_internal_interface_mac: "00:1e:67:fe:fb:76" +head_external_interface: eth0 +head_internal_address: 192.168.1.1 +head_internal_mask_bits: 24 + +compute_nodes: + - name: worker + ip: 192.168.1.2 + mac: "ab:cd:ef:01:23:45" + +# The following are sensitive, and should be kept secret for a production system. 
+kive_db_password: fixme-14mPdzu5vTOQG2DgtDG1inghQpMX0TBdUqEK6nVNHVo +kive_server_secret_key: fixme-kpXk1iKLbHn6-T7zieLHgADFA8ZSh5itd8k_Sp932fM +barman_password: fixme-barman +streaming_barman_password: fixme-streaming-barman + +# Slurm installation: +slurm_src_basename: "slurm-23.02.5" +slurm_tarball: "slurm-23.02.5.tar.bz2" +slurm_source_url: "https://download.schedmd.com/slurm/slurm-23.02.5.tar.bz2" +slurm_sha1_checksum: b3f06d7030bd771a3a94be06e3c0d58a2630a21e + +# mod_wsgi installation: +mod_wsgi_source_url: "https://files.pythonhosted.org/packages/fe/12/b3756f3b72ae3410a83a9b98862925aea64e854c93ef16c1e46b11e32d59/mod_wsgi-4.9.4.tar.gz" +mod_wsgi_tarball: "mod_wsgi-4.9.4.tar.gz" +mod_wsgi_basename: "mod_wsgi-4.9.4" +mod_wsgi_sha256_checksum: "8e762662ea5b01afc386bbcfbaa079748eb6203ab1d6d3a3dac9237f5666cfc9" + +# The following are defaults, and probably don't need to be changed. +# In a typical deployment, Kive, its virtualenv, and slurm are only "installed" +# by the head node and shared to the compute nodes via NFS, so kive_venv, +# kive_slurm_path, and kive_root should be paths that are on the shared partitions +# (typically /opt and /usr/local). If one would rather install Kive and slurm +# locally to each node, then these paths and/or the NFS shares should be reworked. +kive_venv: /opt/venv_kive +kive_slurm_path: /opt/venv_kive/bin +kive_db_name: kive +kive_db_user: kive +kive_media_root: /data/kive/media_root +kive_static_root: /var/www/html/kive/static +kive_root: /usr/local/share/Kive +# - httpd configuration +kive_httpd_user: kive +kive_httpd_group: kive + +copied_groups: + - kive + - sudo + +default_shell: /usr/bin/bash diff --git a/cluster-setup/deployment/group_vars/octomore_template.yaml b/cluster-setup/deployment/group_vars/octomore_template.yaml new file mode 100644 index 000000000..8a03ccb15 --- /dev/null +++ b/cluster-setup/deployment/group_vars/octomore_template.yaml @@ -0,0 +1,161 @@ +--- +# These are settings appropriate for Octomore. Copy this file to +# "all.yml" and fill in the required passwords when deploying. +# Most of the network information should be already set appropriately. + +# Variables needed to set up Kive. +kive_allowed_hosts: "[\"192.168.69.179\", \"kive-int.cfenet.ubc.ca\"]" +kive_listen_port: 80 +update_kive_source: yes +kive_server_email: kive-noreply@bccfe.ca +kive_admins: "[[\"kive\", \"kive@bccfe.ca\"]]" +kive_subject_prefix: "Kive server on Octomore" +kive_backup_path: /media/backup +kive_version: v0.16.2 +kive_python_package: python3.7 +kive_python_executable: python3.7 + +# Settings used by the Kive purge tasks. +kive_purge_start: "4TB" +kive_purge_stop: "3.5TB" +kive_log_level: INFO + +# Slurm configuration. +slurmctlnode: octomore +slurm_nodes: + - name: octomore + memory: 96000 + cpus: 48 + sockets: 2 + cores_per_socket: 12 + threads_per_core: 2 + - name: b05 + memory: 80000 + cpus: 40 + sockets: 2 + cores_per_socket: 10 + threads_per_core: 2 + +# Settings for network services running on the head node, +# e.g. firewall, NFS, and PostgreSQL. +nfs_export_to_hosts: 192.168.1.0/255.255.255.0 +kive_db_host: 192.168.1.1 +kive_db_subnet: 192.168.1.0/24 + +# Internal network (i.e. the network that connects the head node and compute nodes) +# configuration. 
+internal_subnet: 192.168.1.0 +internal_netmask: 255.255.255.0 +internal_mask_bits: 24 +internal_broadcast: 192.168.1.255 +internal_dhcp_range: [192.168.1.100, 192.168.1.200] + +head_internal_interface: eth1 +head_internal_interface_mac: "00:1e:67:fe:fb:76" +head_external_interface: eno0 +head_internal_address: 192.168.1.1 +head_internal_mask_bits: 24 + +# This creates DHCP reservations for the compute nodes. +# Slow nodes: b01 through B03. The rest are "fast nodes". +# As of August 2020, b01 is broken and not spun up. +compute_nodes: + - name: b01 + ip: 192.168.1.2 + mac: "06:B3:ED:E6:A2:66" + # Alternative MACs: 00:1E:67:54:9B:(E4-E7) + - name: b02 + ip: 192.168.1.3 + mac: "00:1E:67:80:3C:B1" + # Alternative MACs: 00:1E:67:80:3C:(B2-B4) + - name: b03 + ip: 192.168.1.4 + mac: "00:1E:67:92:D5:2A" + # Alternative MACs: 00:1E:67:92:D5:(2B-2D) + - name: b04 + ip: 192.168.1.5 + mac: "5A:83:62:41:FD:14" + # Alternative MACs: 00:1E:67:A9:14:(8D-90) + - name: b05 + ip: 192.168.1.6 + mac: "00:1E:67:BC:32:F3" + # Alternative MACs: 00:1E:67:BC:32:(F4-F6) + - name: b06 + ip: 192.168.1.7 + mac: "A4:BF:01:02:4A:A2" + # Alternative MAC: A4:BF:01:02:4A:A3 + - name: b07a + ip: 192.168.1.8 + mac: "00:0A:CD:2D:2A:FB" + # Alternative MACs: AC:1F:6B:91:C3:(FE-FF) + - name: b07b + ip: 192.168.1.9 + mac: "00:0A:CD:2D:2B:F4" + # Alternative MACs: AC:1F:6B:91:C3:(E8-E9) + - name: b08a + ip: 192.168.1.10 + mac: "00:0A:CD:2D:2A:FE" + # Alternative MACs: AC:1F:6B:91:C4:(24-25) + - name: b08b + ip: 192.168.1.11 + mac: "00:0A:CD:2D:2A:EC" + # Alternative MACs: AC:1F:6B:91:C3:(F2-F3) + +# The following are sensitive, and should be kept secret for a production system. +kive_db_password: fixme-14mPdzu5vTOQG2DgtDG1inghQpMX0TBdUqEK6nVNHVo +kive_server_secret_key: fixme-kpXk1iKLbHn6-T7zieLHgADFA8ZSh5itd8k_Sp932fM +barman_password: fixme-barman +streaming_barman_password: fixme-streaming-barman + +# Slurm installation: +slurm_src_basename: "slurm-23.02.5" +slurm_tarball: "slurm-23.02.5.tar.bz2" +slurm_source_url: "https://download.schedmd.com/slurm/slurm-23.02.5.tar.bz2" +slurm_sha1_checksum: b3f06d7030bd771a3a94be06e3c0d58a2630a21e + +# mod_wsgi installation: +mod_wsgi_source_url: "https://files.pythonhosted.org/packages/fe/12/b3756f3b72ae3410a83a9b98862925aea64e854c93ef16c1e46b11e32d59/mod_wsgi-4.9.4.tar.gz" +mod_wsgi_tarball: "mod_wsgi-4.9.4.tar.gz" +mod_wsgi_basename: "mod_wsgi-4.9.4" +mod_wsgi_sha256_checksum: "8e762662ea5b01afc386bbcfbaa079748eb6203ab1d6d3a3dac9237f5666cfc9" + +# The following are defaults, and probably don't need to be changed. +# In a typical deployment, Kive, its virtualenv, and slurm are only "installed" +# by the head node and shared to the compute nodes via NFS, so kive_venv, +# kive_slurm_path, and kive_root should be paths that are on the shared partitions +# (typically /opt and /usr/local). If one would rather install Kive and slurm +# locally to each node, then these paths and/or the NFS shares should be reworked. 
+kive_venv: /opt/venv_kive +kive_slurm_path: /opt/venv_kive/bin +kive_db_name: kive +kive_db_user: kive +kive_media_root: /data/kive/media_root +kive_static_root: /var/www/html/kive/static +kive_root: /usr/local/share/Kive +# - httpd configuration +kive_httpd_user: kive +kive_httpd_group: kive + +copied_groups: + - kive + - sudo + +default_shell: /usr/bin/bash + +data_physical_volumes: + - ata-ST10000NM001G-2MW103_ZS51H7QX + +backup_physical_volumes: + - ata-ST10000NM0016-1TT101_ZA286TWE + +macdatafile_username: "[fill this in]" +macdatafile_password: "[fill this in]" +macdatafile_domain: "[fill this in]" +macdatafile_read_only: false +macdatafile_network_share: "[fill this in]" + +raw_data_username: "[fill this in]" +raw_data_password: "[fill this in]" +raw_data_domain: "[fill this in]" +raw_data_read_only: false # set to true for a dev/test system +raw_data_network_share: "[fill this in]" \ No newline at end of file diff --git a/cluster-setup/deployment/import_users.yaml b/cluster-setup/deployment/import_users.yaml new file mode 100644 index 000000000..f7639058c --- /dev/null +++ b/cluster-setup/deployment/import_users.yaml @@ -0,0 +1,6 @@ +--- + +- name: import users + hosts: head + roles: + - import_users_and_groups diff --git a/cluster-setup/deployment/install_fasttree.yaml b/cluster-setup/deployment/install_fasttree.yaml new file mode 100644 index 000000000..46d995b39 --- /dev/null +++ b/cluster-setup/deployment/install_fasttree.yaml @@ -0,0 +1,49 @@ +--- + +- hosts: head + become: true + tasks: + - name: make a directory to hold the source code + file: + path: /usr/local/src/FastTree-2.1.9 + owner: root + group: root + mode: '0755' + state: directory + + - name: copy FastTree source code to the host + copy: + src: FastTree-2.1.9.c + dest: /usr/local/src/FastTree-2.1.9/FastTree.c + owner: root + group: root + + - name: compile single-precision FastTree + command: + chdir: /usr/local/src/FastTree-2.1.9 + cmd: gcc -O3 -finline-functions -funroll-loops -Wall -o FastTree.single FastTree.c -lm + creates: FastTree.single + + - name: compile double-precision FastTree + command: + chdir: /usr/local/src/FastTree-2.1.9 + cmd: gcc -DUSE_DOUBLE -O3 -finline-functions -funroll-loops -Wall -o FastTree.double FastTree.c -lm + creates: FastTree.double + + - name: install FastTree.single + copy: + src: /usr/local/src/FastTree-2.1.9/FastTree.single + dest: /usr/local/bin/FastTree.single + mode: "0755" + + - name: install FastTree.double + copy: + src: /usr/local/src/FastTree-2.1.9/FastTree.double + dest: /usr/local/bin/FastTree.double + mode: "0755" + + - name: make an alias to FastTree.single + file: + src: /usr/local/bin/FastTree.single + path: /usr/local/bin/FastTree + state: link diff --git a/cluster-setup/deployment/install_postgresql_12.yaml b/cluster-setup/deployment/install_postgresql_12.yaml new file mode 100644 index 000000000..c085d4388 --- /dev/null +++ b/cluster-setup/deployment/install_postgresql_12.yaml @@ -0,0 +1,23 @@ +--- + +- name: install PostgreSQL 12 on the head node + hosts: head + tasks: + - name: add the apt signing key for the PostgreSQL apt repository + ansible.builtin.apt_key: + url: https://www.postgresql.org/media/keys/ACCC4CF8.asc + state: present + + - name: add PostgreSQL apt repository + become: true + ansible.builtin.apt_repository: + repo: deb https://apt.postgresql.org/pub/repos/apt jammy-pgdg main + state: present + + - name: install PostgreSQL 12 + become: true + apt: + name: + - postgresql-12 + - postgresql-client-12 + diff --git 
a/cluster-setup/deployment/install_smartmontools.yaml b/cluster-setup/deployment/install_smartmontools.yaml new file mode 100644 index 000000000..3d12a9812 --- /dev/null +++ b/cluster-setup/deployment/install_smartmontools.yaml @@ -0,0 +1,9 @@ +--- + +- hosts: head + become: true + tasks: + - name: install smartmontools + apt: + name: + - smartmontools diff --git a/cluster-setup/deployment/inventory_bulbasaur.ini b/cluster-setup/deployment/inventory_bulbasaur.ini new file mode 100644 index 000000000..ef27ab212 --- /dev/null +++ b/cluster-setup/deployment/inventory_bulbasaur.ini @@ -0,0 +1,21 @@ +# This is an inventory.ini file appropriate for Bulbasaur. +# This defines all of the compute nodes we have, which may or may not +# actually be connected to Bulbasaur. + +# Documentation on this file: +# https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#adding-variables-to-inventory + +[head] +localhost + +[workers] +# b01 +# b02 +# b03 +# b04 +# b05 +b06 +# b07a +# b07b +# b08a +# b08b diff --git a/cluster-setup/deployment/inventory_dev.ini b/cluster-setup/deployment/inventory_dev.ini new file mode 100644 index 000000000..4fc8b91a8 --- /dev/null +++ b/cluster-setup/deployment/inventory_dev.ini @@ -0,0 +1,11 @@ +# This is an inventory.ini file appropriate for a dev setup used for +# testing the Ansible playbooks. This defines a two-node cluster with +# one head node and one compute node ("worker"). + +# Documentation on this file: +# https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#adding-variables-to-inventory + +head + +[workers] +worker diff --git a/cluster-setup/deployment/inventory_octomore.ini b/cluster-setup/deployment/inventory_octomore.ini new file mode 100644 index 000000000..18bd521d8 --- /dev/null +++ b/cluster-setup/deployment/inventory_octomore.ini @@ -0,0 +1,21 @@ +# This is an inventory.ini file appropriate for Octomore. +# This defines all of the compute nodes we have, which may or may not +# actually be connected to Octomore. 
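+# Nodes commented out below (e.g. b01) are defined in group_vars but are currently +# broken or not attached; uncomment a node once it is connected so that plays targeting +# the "workers" group include it. As an illustrative example, a playbook can be run +# against this inventory with `sudo ansible-playbook -i inventory_octomore.ini kive_setup.yml` +# if your ansible.cfg does not already select an inventory.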
+ +# Documentation on this file: +# https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#adding-variables-to-inventory + +[head] +localhost + +[workers] +# b01 +b02 +b03 +# b04 +b05 +# b06 +b07a +b07b +b08a +b08b diff --git a/cluster-setup/deployment/kive_setup.yml b/cluster-setup/deployment/kive_setup.yml new file mode 100644 index 000000000..f18a0f06b --- /dev/null +++ b/cluster-setup/deployment/kive_setup.yml @@ -0,0 +1,33 @@ +--- + +- name: configure head node + hosts: head + tasks: + - name: build Slurm + include_role: + name: slurm_builder + - name: configure and start slurmctld and supporting services + include_role: + name: slurm_controller + - name: configure and start slurmd + include_role: + name: slurm_node + - name: set up Kive and supporting services on the head node + include_role: + name: kive_server + +- name: configure workers + hosts: workers + tasks: + - name: synchronize users and groups from the head node + include_role: + name: copy_users_and_groups + - name: set up worker node networking + include_role: + name: worker_node_networking + - name: configure and start slurmd + include_role: + name: slurm_node + - name: prepare node to work as a Kive node + include_role: + name: kive_node diff --git a/cluster-setup/deployment/lock_bootstrap_user.yaml b/cluster-setup/deployment/lock_bootstrap_user.yaml new file mode 100644 index 000000000..c73ceff37 --- /dev/null +++ b/cluster-setup/deployment/lock_bootstrap_user.yaml @@ -0,0 +1,12 @@ +--- + +- name: lock and expire the bootstrap user + hosts: all + vars: + user_name: ubuntu + tasks: + - name: lock and expire the user + user: + name: "{{ user_name }}" + password_lock: true + expires: 1 diff --git a/cluster-setup/deployment/mount_network_drives.yaml b/cluster-setup/deployment/mount_network_drives.yaml new file mode 100644 index 000000000..8caf1749c --- /dev/null +++ b/cluster-setup/deployment/mount_network_drives.yaml @@ -0,0 +1,100 @@ +--- + +- name: install CIFS utilities + hosts: all + tasks: + - name: install cifs-utils + become: true + apt: + name: + - cifs-utils + +- name: create CIFS credentials for the mounts + hosts: head + tasks: + - name: prepare credentials for macdatafile + become: true + vars: + cifs_username: "{{ macdatafile_username }}" + cifs_password: "{{ macdatafile_password }}" + cifs_domain: "{{ macdatafile_domain }}" + template: + src: templates/cifs_credentials.j2 + dest: /opt/smbcredentials + owner: root + group: root + mode: "0600" + + - name: prepare credentials for RAW_DATA + become: true + vars: + cifs_username: "{{ raw_data_username }}" + cifs_password: "{{ raw_data_password }}" + cifs_domain: "{{ raw_data_domain }}" + template: + src: templates/cifs_credentials.j2 + dest: /opt/smbcredentials-nextgen + owner: root + group: root + mode: "0600" + + +- name: mount the macdatafile CIFS network volume + hosts: all + tasks: + - name: create the macdatafile mount point + become: true + file: + path: /media/macdatafile + owner: root + group: root + mode: '0777' + state: directory + + - name: set mount options + set_fact: + macdatafile_mount_options: credentials=/opt/smbcredentials,x-systemd.requires-mounts-for=/opt,noperm,file_mode=0777,dir_mode=0777 + + - name: set read-only if configured + when: macdatafile_read_only | bool + set_fact: + macdatafile_mount_options: "{{ macdatafile_mount_options }},ro" + + - name: mount the macdatafile CIFS volume + become: true + mount: + path: /media/macdatafile + src: "{{ macdatafile_network_share }}" + opts: "{{ macdatafile_mount_options }}" + 
fstype: cifs + state: mounted + +- name: mount the RAW_DATA CIFS network volume + hosts: all + tasks: + - name: create the RAW_DATA mount point + become: true + file: + path: /media/RAW_DATA + owner: root + group: root + mode: '0777' + state: directory + + - name: set mount options + set_fact: + raw_data_mount_options: credentials=/opt/smbcredentials-nextgen,x-systemd.requires-mounts-for=/opt,noperm,file_mode=0777,dir_mode=0777 + + - name: set read-only if configured + when: raw_data_read_only | bool + set_fact: + raw_data_mount_options: "{{ raw_data_mount_options }},ro" + + - name: mount the RAW_DATA CIFS volume + become: true + mount: + path: /media/RAW_DATA + src: "{{ raw_data_network_share }}" + opts: "{{ raw_data_mount_options }}" + fstype: cifs + state: mounted diff --git a/cluster-setup/deployment/network_drives_read_only.yaml b/cluster-setup/deployment/network_drives_read_only.yaml new file mode 100644 index 000000000..9bf3e0390 --- /dev/null +++ b/cluster-setup/deployment/network_drives_read_only.yaml @@ -0,0 +1,27 @@ +--- + +- name: mount macdatafile and RAW_DATA read-only on all nodes + hosts: all + become: true + tasks: + - name: do the mounts on all the nodes + loop: + - mount_point: /media/macdatafile + network_share: "{{ macdatafile_network_share }}" + cifs_credentials: /opt/smbcredentials + cifs_credentials_volume: /opt + read_only: true + + - mount_point: /media/RAW_DATA + network_share: "{{ raw_data_network_share }}" + cifs_credentials: /opt/smbcredentials-nextgen + cifs_credentials_volume: /opt + read_only: true + include_role: + name: mount_network_drives + vars: + mount_point: "{{ item.mount_point }}" + network_share: "{{ item.network_share }}" + cifs_credentials: "{{ item.cifs_credentials }}" + cifs_credentials_volume: "{{ item.cifs_credentials_volume }}" + read_only: "{{ item.read_only }}" diff --git a/cluster-setup/deployment/network_drives_read_write.yaml b/cluster-setup/deployment/network_drives_read_write.yaml new file mode 100644 index 000000000..3be3965b4 --- /dev/null +++ b/cluster-setup/deployment/network_drives_read_write.yaml @@ -0,0 +1,27 @@ +--- + +- name: mount macdatafile and RAW_DATA read-write on all nodes + hosts: all + become: true + tasks: + - name: do the mounts on all the nodes + loop: + - mount_point: /media/macdatafile + network_share: "{{ macdatafile_network_share }}" + cifs_credentials: /opt/smbcredentials + cifs_credentials_volume: /opt + read_only: false + + - mount_point: /media/RAW_DATA + network_share: "{{ raw_data_network_share }}" + cifs_credentials: /opt/smbcredentials-nextgen + cifs_credentials_volume: /opt + read_only: false + include_role: + name: mount_network_drives + vars: + mount_point: "{{ item.mount_point }}" + network_share: "{{ item.network_share }}" + cifs_credentials: "{{ item.cifs_credentials }}" + cifs_credentials_volume: "{{ item.cifs_credentials_volume }}" + read_only: "{{ item.read_only }}" diff --git a/cluster-setup/deployment/network_drives_standard.yaml b/cluster-setup/deployment/network_drives_standard.yaml new file mode 100644 index 000000000..96fcc8398 --- /dev/null +++ b/cluster-setup/deployment/network_drives_standard.yaml @@ -0,0 +1,53 @@ +--- + +- name: "mount macdatafile and RAW_DATA read-write on the head node" + hosts: head + become: true + tasks: + - name: do the mounts on the head node + loop: + - mount_point: /media/macdatafile + network_share: "{{ macdatafile_network_share }}" + cifs_credentials: /opt/smbcredentials + cifs_credentials_volume: /opt + read_only: false + + - mount_point: /media/RAW_DATA 
+ network_share: "{{ raw_data_network_share }}" + cifs_credentials: /opt/smbcredentials-nextgen + cifs_credentials_volume: /opt + read_only: false + include_role: + name: mount_network_drives + vars: + mount_point: "{{ item.mount_point }}" + network_share: "{{ item.network_share }}" + cifs_credentials: "{{ item.cifs_credentials }}" + cifs_credentials_volume: "{{ item.cifs_credentials_volume }}" + read_only: "{{ item.read_only }}" + +- name: "mount macdatafile and RAW_DATA read-only on the worker nodes" + hosts: workers + become: true + tasks: + - name: do the mounts on the worker nodes + loop: + - mount_point: /media/macdatafile + network_share: "{{ macdatafile_network_share }}" + cifs_credentials: /opt/smbcredentials + cifs_credentials_volume: /opt + read_only: true + + - mount_point: /media/RAW_DATA + network_share: "{{ raw_data_network_share }}" + cifs_credentials: /opt/smbcredentials-nextgen + cifs_credentials_volume: /opt + read_only: true + include_role: + name: mount_network_drives + vars: + mount_point: "{{ item.mount_point }}" + network_share: "{{ item.network_share }}" + cifs_credentials: "{{ item.cifs_credentials }}" + cifs_credentials_volume: "{{ item.cifs_credentials_volume }}" + read_only: "{{ item.read_only }}" diff --git a/cluster-setup/testenv/notes-for-prod.md b/cluster-setup/deployment/notes-for-prod.md similarity index 100% rename from cluster-setup/testenv/notes-for-prod.md rename to cluster-setup/deployment/notes-for-prod.md diff --git a/cluster-setup/deployment/octomore_preliminary_setup.yaml b/cluster-setup/deployment/octomore_preliminary_setup.yaml new file mode 100644 index 000000000..34fe20b4e --- /dev/null +++ b/cluster-setup/deployment/octomore_preliminary_setup.yaml @@ -0,0 +1,18 @@ +--- + +- name: preliminary setup to run on the Octomore head node + hosts: head + roles: + - create_data_filesystem + - head_node_internal_interface + tasks: + - name: create /opt + file: + path: /opt + owner: root + group: root + mode: '0755' + state: directory + - name: set up head node networking + include_role: + name: head_node_networking diff --git a/cluster-setup/deployment/reassign_bootstrap_user_uid.yaml b/cluster-setup/deployment/reassign_bootstrap_user_uid.yaml new file mode 100644 index 000000000..1d4d51cc2 --- /dev/null +++ b/cluster-setup/deployment/reassign_bootstrap_user_uid.yaml @@ -0,0 +1,22 @@ +--- + +- name: assign a different UID and GID to the bootstrap user + hosts: all + vars: + user_name: ubuntu + group_name: ubuntu + old_gid: 1000 + new_uid: 1020 + new_gid: 1020 + tasks: + - name: change UID + user: + name: "{{ user_name }}" + uid: "{{ new_uid }}" + - name: change GID + group: + name: "{{ group_name }}" + gid: "{{ new_gid }}" + - name: change group perms of the home folders + become: true + command: find /home/{{ user_name }} -group {{ old_gid }} -exec chgrp -h {{ group_name }} {} \; diff --git a/cluster-setup/deployment/roles/copy_users_and_groups/defaults/main.yml b/cluster-setup/deployment/roles/copy_users_and_groups/defaults/main.yml new file mode 100644 index 000000000..126e48421 --- /dev/null +++ b/cluster-setup/deployment/roles/copy_users_and_groups/defaults/main.yml @@ -0,0 +1,3 @@ +--- + +copied_groups: [] diff --git a/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml b/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml new file mode 100644 index 000000000..819dfdcdd --- /dev/null +++ b/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml @@ -0,0 +1,83 @@ +--- + +- name: read users + delegate_to: 
localhost + register: user_list + community.general.read_csv: + path: /etc/passwd + delimiter: ":" + fieldnames: + - name + - passwdx + - uid + - gid + - info + - home + - shell + +- name: read groups + delegate_to: localhost + register: group_list + community.general.read_csv: + path: /etc/group + delimiter: ":" + fieldnames: + - name + - passwdx + - gid + - users + +- name: record group members + loop: "{{ group_list.list }}" + when: item.name in copied_groups + set_fact: + group_name: "{{ item.name }}" + group_members: "{{ item.users.split(',') }}" + register: system_groups + +- name: build user groups + with_subelements: + - "{{ system_groups.results }}" + - ansible_facts.group_members + when: item.1 != '' + set_fact: + user_groups: "{{ user_groups | default({}) | combine({ item.1: [item.0.ansible_facts.group_name] }, list_merge='append') }}" + +- name: read system passwords + delegate_to: localhost + register: shadow_dict + community.general.read_csv: + path: /etc/shadow + delimiter: ":" + key: name + fieldnames: + - name + - passwd + - lastchanged + - min + - max + - warn + - inactive + - expire + +- name: copy system groups + loop: "{{ group_list.list }}" + when: > + (item.name in copied_groups) or + (item.name in shadow_dict.dict and shadow_dict.dict[item.name]['passwd'].startswith("$")) + group: + gid: "{{ item.gid }}" + name: "{{ item.name }}" + +- name: copy system users + loop: "{{ user_list.list }}" + when: shadow_dict.dict[item.name]['passwd'].startswith("$") + user: + uid: "{{ item.uid }}" + create_home: no + name: "{{ item.name }}" + password: "{{ shadow_dict.dict[item.name]['passwd'] }}" + group: "{{ item.name }}" + groups: "{{ user_groups[item.name] | default([]) }}" + append: true + shell: "{{ default_shell }}" diff --git a/cluster-setup/deployment/roles/create_data_filesystem/tasks/main.yaml b/cluster-setup/deployment/roles/create_data_filesystem/tasks/main.yaml new file mode 100644 index 000000000..2b2ba4305 --- /dev/null +++ b/cluster-setup/deployment/roles/create_data_filesystem/tasks/main.yaml @@ -0,0 +1,50 @@ +--- + +- name: create a single partition on each of the physical volumes + loop: "{{ data_physical_volumes }}" + community.general.parted: + device: "/dev/disk/by-id/{{ item }}" + number: 1 + state: present + label: gpt + +- name: construct a list of the partition names + block: + - name: initialize the list as empty + set_fact: + data_partition_names: [] + - name: append names to the list + loop: "{{ data_physical_volumes }}" + set_fact: + data_partition_names: "{{ data_partition_names + ['/dev/disk/by-id/' ~ item ~ '-part1'] }}" + +- name: create a volume group out of the data partitions + lvg: + vg: data-vg + pvs: "{{ data_partition_names | join(',') }}" + +- name: create a logical volume from the volume group + community.general.lvol: + vg: data-vg + lv: data-lv + size: 100%VG + +- name: create the filesystem + community.general.filesystem: + fstype: ext4 + dev: /dev/data-vg/data-lv + +- name: create the /data mount point + file: + path: /data + owner: root + group: root + mode: '0755' + state: directory + +- name: mount the filesystem + mount: + path: /data + src: /dev/data-vg/data-lv + fstype: ext4 + state: mounted diff --git a/cluster-setup/deployment/roles/head_node_internal_interface/tasks/main.yaml b/cluster-setup/deployment/roles/head_node_internal_interface/tasks/main.yaml new file mode 100644 index 000000000..b56910779 --- /dev/null +++ b/cluster-setup/deployment/roles/head_node_internal_interface/tasks/main.yaml @@ -0,0 +1,15 @@ +--- + +- name: 
generate and install netplan configuration for the internal-facing interface + become: true + template: + src: 60-internal-interface.yaml.j2 + dest: /etc/netplan/60-internal-interface.yaml + owner: root + group: root + mode: "644" + +- name: apply the configuration + become: true + command: + argv: [netplan, apply] diff --git a/cluster-setup/deployment/roles/head_node_internal_interface/templates/60-internal-interface.yaml.j2 b/cluster-setup/deployment/roles/head_node_internal_interface/templates/60-internal-interface.yaml.j2 new file mode 100644 index 000000000..7191f05a8 --- /dev/null +++ b/cluster-setup/deployment/roles/head_node_internal_interface/templates/60-internal-interface.yaml.j2 @@ -0,0 +1,10 @@ +network: + ethernets: + internal0: + dhcp4: false + addresses: + - "{{ head_internal_address }}/{{ head_internal_mask_bits }}" + match: + macaddress: "{{ head_internal_interface_mac }}" + set-name: {{ head_internal_interface }} + version: 2 diff --git a/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml b/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml new file mode 100644 index 000000000..30e8f6dc4 --- /dev/null +++ b/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml @@ -0,0 +1,163 @@ +--- + +- name: set timezone + block: + - name: change the timezone + community.general.timezone: + name: America/Vancouver + - name: restart cron to reflect the new timezone + systemd: + name: cron + state: restarted + +- name: check if original /home has been renamed + stat: path=/data/home + register: data_home + +- name: move /home to /data to make it accessible to workers + become: true + block: + - name: create /data + file: + path: /data + owner: root + group: root + mode: '0755' + state: directory + - name: move /home to /data/home + command: mv /home /data/home + when: not data_home.stat.exists + +- name: symbolic link for /home + become: true + file: + path: /home + src: /data/home + state: link + +- name: enable packet forwarding + become: true + blockinfile: + path: /etc/ufw/sysctl.conf + block: | + net/ipv4/ip_forward=1 + +- name: configure IP masquerading + become: true + blockinfile: + path: /etc/ufw/before.rules + block: | + *nat + :POSTROUTING ACCEPT [0:0] + -A POSTROUTING -s {{ internal_subnet }}/{{ internal_mask_bits }} -o {{ head_external_interface }} -j MASQUERADE + COMMIT + +- name: restart ufw to allow the new rules to take effect + become: true + systemd: + name: ufw + state: restarted + enabled: true + +- name: allow forwarded packets from the compute nodes to traverse the firewall + become: true + community.general.ufw: + rule: allow + route: true + interface_in: "{{ head_internal_interface }}" + interface_out: "{{ head_external_interface }}" + from: "{{ internal_subnet }}/{{ internal_mask_bits }}" + +- name: open port for SSH access + become: true + community.general.ufw: + rule: allow + port: ssh + protocol: tcp + +- name: open NFS ports + become: true + block: + - name: open TCP port + community.general.ufw: + rule: allow + port: nfs + proto: tcp + - name: open UDP port + community.general.ufw: + rule: allow + port: nfs + proto: udp + +- name: open port for workers to communicate with slurmctld + become: true + community.general.ufw: + rule: allow + port: 6817 + protocol: tcp + +- name: enable ufw + become: true + community.general.ufw: + state: enabled + +- name: install NFS server + become: true + apt: + name: + - nfs-kernel-server + state: present + +- name: start NFS service + systemd: + name: nfs-server + state: started 
+ enabled: true + +- name: set up NFS exports + become: true + register: nfs_exports_file + blockinfile: + path: /etc/exports + block: | + /data {{ nfs_export_to_hosts }}(rw,sync,no_all_squash,no_root_squash) + /usr/local {{ nfs_export_to_hosts }}(ro,sync,no_root_squash) + /opt {{ nfs_export_to_hosts }}(ro,sync,no_root_squash) + +- name: reload NFS exports + become: true + when: nfs_exports_file.changed + command: exportfs -r + +- name: install DHCP server + become: true + apt: + name: + - isc-dhcp-server + +- name: configure DHCP server + become: true + block: + - name: write conf file + template: + src: dhcpd.conf.j2 + dest: /etc/dhcp/dhcpd.conf + owner: root + group: root + mode: "644" + backup: true + - name: tell dhcpd which interface to serve DHCP requests on + template: + src: isc-dhcp-server.j2 + dest: /etc/default/isc-dhcp-server + owner: root + group: root + mode: "644" + backup: true + +- name: (re)start DHCP server + become: true + systemd: + name: isc-dhcp-server + state: restarted + enabled: true diff --git a/cluster-setup/deployment/roles/head_node_networking/templates/dhcpd.conf.j2 b/cluster-setup/deployment/roles/head_node_networking/templates/dhcpd.conf.j2 new file mode 100644 index 000000000..998ded0ec --- /dev/null +++ b/cluster-setup/deployment/roles/head_node_networking/templates/dhcpd.conf.j2 @@ -0,0 +1,21 @@ +# This dhcpd configuration managed by Ansible. Changes to this file will not be +# persisted if the Ansible playbooks used to set up this machine are rerun! + +default-lease-time 600; +max-lease-time 7200; + +option domain-name-servers 192.168.168.101; + +subnet {{ internal_subnet }} netmask {{ internal_netmask }} { + range {{ internal_dhcp_range[0] }} {{ internal_dhcp_range [1] }}; + option routers {{ head_internal_address }}; + option broadcast-address {{ internal_broadcast }}; +} + +{% for node in compute_nodes %} +host {{ node.name }} { + option host-name {{ node.name }}; + hardware ethernet {{ node.mac }}; + fixed-address {{ node.ip }}; +} +{% endfor %} diff --git a/cluster-setup/deployment/roles/head_node_networking/templates/isc-dhcp-server.j2 b/cluster-setup/deployment/roles/head_node_networking/templates/isc-dhcp-server.j2 new file mode 100644 index 000000000..3e5a3763d --- /dev/null +++ b/cluster-setup/deployment/roles/head_node_networking/templates/isc-dhcp-server.j2 @@ -0,0 +1,6 @@ +# This isc-dhcp-server configuration file is managed by Ansible. +# Changes to this file will not be persisted if the Ansible playbooks +# used to set up this machine are rerun! 
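+# +# As an illustration, with the example value head_internal_interface: eth1 from the +# group_vars template, this renders as INTERFACESv4="eth1" and leaves INTERFACESv6 +# empty, so DHCP is served only on the internal-facing interface.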
+ +INTERFACESv4="{{ head_internal_interface }}" +INTERFACESv6="" diff --git a/cluster-setup/deployment/roles/import_users_and_groups/tasks/main.yml b/cluster-setup/deployment/roles/import_users_and_groups/tasks/main.yml new file mode 100644 index 000000000..9f6af9225 --- /dev/null +++ b/cluster-setup/deployment/roles/import_users_and_groups/tasks/main.yml @@ -0,0 +1,29 @@ +--- + +- name: create groups + become: true + block: + - name: create imported users' primary groups + loop: "{{ primary_groups }}" + group: + gid: "{{ item.gid }}" + name: "{{ item.name }}" + - name: create all other imported groups + loop: "{{ other_groups }}" + group: + gid: "{{ item.gid }}" + name: "{{ item.name }}" + +- name: create users + become: true + loop: "{{ users }}" + user: + uid: "{{ item.uid }}" + create_home: yes + home: "{{ item.home }}" + name: "{{ item.name }}" + password: "{{ item.hashed_password }}" + group: "{{ item.primary_group }}" + groups: "{{ item.groups }}" + append: true + shell: "{{ default_shell }}" diff --git a/roles/kive_node/README.md b/cluster-setup/deployment/roles/kive_node/README.md similarity index 100% rename from roles/kive_node/README.md rename to cluster-setup/deployment/roles/kive_node/README.md diff --git a/roles/kive_node/meta/main.yml b/cluster-setup/deployment/roles/kive_node/meta/main.yml similarity index 100% rename from roles/kive_node/meta/main.yml rename to cluster-setup/deployment/roles/kive_node/meta/main.yml diff --git a/cluster-setup/deployment/roles/kive_node/tasks/main.yml b/cluster-setup/deployment/roles/kive_node/tasks/main.yml new file mode 100644 index 000000000..d49395bfe --- /dev/null +++ b/cluster-setup/deployment/roles/kive_node/tasks/main.yml @@ -0,0 +1,65 @@ +--- + +- name: add the deadsnakes PPA to get old versions of Python + become: true + apt_repository: + repo: ppa:deadsnakes/ppa + +- name: install the version of Python required by kive + become: true + apt: + name: + - "{{ kive_python_package }}" + - "{{ kive_python_package }}-distutils" + - "{{ kive_python_package }}-venv" + - "{{ kive_python_package }}-dev" + - "lib{{ kive_python_package }}-dev" + +- name: install pip for this version of Python + become: true + shell: "curl https://bootstrap.pypa.io/get-pip.py | sudo {{ kive_python_executable }}" + +- name: kive package dependencies + become: true + apt: + name: +# - python3-dev +# - python3-venv + - libsqlite3-dev + - wcanadian + - lsof + - graphviz + - libgraphviz-dev + +- name: install postgres database libraries + become: true + apt: + name: + - postgresql-client + +- name: configure mail service for error logging + block: + - name: install postfix + become: true + apt: + name: + - postfix + - name: start postfix daemon + systemd: + name: postfix + state: started + enabled: true + +- name: create kive group + become: true + group: + name: kive + gid: 762 # random gid in system uid range (200, 999); hard-coded for consistency across hosts + +- name: create kive user + become: true + user: + name: kive + system: yes + uid: 762 # random uid in system uid range (200, 999); hard-coded for consistency across hosts + group: kive diff --git a/roles/kive_server/README.md b/cluster-setup/deployment/roles/kive_server/README.md similarity index 89% rename from roles/kive_server/README.md rename to cluster-setup/deployment/roles/kive_server/README.md index 9d16ad651..1dfb7393b 100644 --- a/roles/kive_server/README.md +++ b/cluster-setup/deployment/roles/kive_server/README.md @@ -3,7 +3,7 @@ server. 
It also applies the [Slurm controller] and [Slurm worker] roles. [Kive node]: ../kive_node [Slurm controller]: ../slurm_controller -[Slurm worker]: ../slurm_worker +[Slurm worker]: ../worker_node_networking It includes: diff --git a/cluster-setup/deployment/roles/kive_server/files/001-kive-ssl.conf b/cluster-setup/deployment/roles/kive_server/files/001-kive-ssl.conf new file mode 100644 index 000000000..748a7d1cf --- /dev/null +++ b/cluster-setup/deployment/roles/kive_server/files/001-kive-ssl.conf @@ -0,0 +1,30 @@ +# SSL configuration for the Kive web portal. This file was created by copying +# and modifying `/etc/apache2/sites-available/default-ssl.conf`. That file +# has some helpful comments that may be useful to look at if you ever need +# to further adjust this file. + + + + ServerAdmin webmaster@localhost + + DocumentRoot /var/www/html + + ErrorLog ${APACHE_LOG_DIR}/error.log + CustomLog ${APACHE_LOG_DIR}/access.log combined + + SSLEngine on + + SSLCertificateFile /etc/ssl/certs/star_cfe.crt + SSLCertificateKeyFile /etc/ssl/private/star_cfe.key + SSLCertificateChainFile /etc/ssl/certs/DigiCertCA.crt + + #SSLOptions +FakeBasicAuth +ExportCertData +StrictRequire + + SSLOptions +StdEnvVars + + + SSLOptions +StdEnvVars + + + + diff --git a/roles/kive_server/files/barman_backup.timer b/cluster-setup/deployment/roles/kive_server/files/barman_backup.timer similarity index 100% rename from roles/kive_server/files/barman_backup.timer rename to cluster-setup/deployment/roles/kive_server/files/barman_backup.timer diff --git a/roles/kive_server/files/crontab_mail.py b/cluster-setup/deployment/roles/kive_server/files/crontab_mail.py similarity index 100% rename from roles/kive_server/files/crontab_mail.py rename to cluster-setup/deployment/roles/kive_server/files/crontab_mail.py diff --git a/roles/kive_server/files/kive_purge.service b/cluster-setup/deployment/roles/kive_server/files/kive_purge.service similarity index 100% rename from roles/kive_server/files/kive_purge.service rename to cluster-setup/deployment/roles/kive_server/files/kive_purge.service diff --git a/roles/kive_server/files/kive_purge.timer b/cluster-setup/deployment/roles/kive_server/files/kive_purge.timer similarity index 100% rename from roles/kive_server/files/kive_purge.timer rename to cluster-setup/deployment/roles/kive_server/files/kive_purge.timer diff --git a/roles/kive_server/files/kive_purge_synch.service b/cluster-setup/deployment/roles/kive_server/files/kive_purge_synch.service similarity index 100% rename from roles/kive_server/files/kive_purge_synch.service rename to cluster-setup/deployment/roles/kive_server/files/kive_purge_synch.service diff --git a/roles/kive_server/files/kive_purge_synch.timer b/cluster-setup/deployment/roles/kive_server/files/kive_purge_synch.timer similarity index 100% rename from roles/kive_server/files/kive_purge_synch.timer rename to cluster-setup/deployment/roles/kive_server/files/kive_purge_synch.timer diff --git a/roles/kive_server/files/purge_apache_logs b/cluster-setup/deployment/roles/kive_server/files/purge_apache_logs similarity index 54% rename from roles/kive_server/files/purge_apache_logs rename to cluster-setup/deployment/roles/kive_server/files/purge_apache_logs index 69bf4a03e..33d81bc7b 100644 --- a/roles/kive_server/files/purge_apache_logs +++ b/cluster-setup/deployment/roles/kive_server/files/purge_apache_logs @@ -1,5 +1,5 @@ #!/usr/bin/bash # This removes all but the 10 most recent error_log files. 
-ls -r /var/log/httpd/error_log*|tail -n+11|xargs rm -f +ls -r /var/log/apache2/error_log*|tail -n+11|xargs rm -f # This removes all but the 10 most recent access_log files. -ls -r /var/log/httpd/access_log*|tail -n+11|xargs rm -f +ls -r /var/log/apache2/access_log*|tail -n+11|xargs rm -f diff --git a/roles/kive_server/files/rsnapshot_alpha.timer b/cluster-setup/deployment/roles/kive_server/files/rsnapshot_alpha.timer similarity index 100% rename from roles/kive_server/files/rsnapshot_alpha.timer rename to cluster-setup/deployment/roles/kive_server/files/rsnapshot_alpha.timer diff --git a/roles/kive_server/files/rsnapshot_beta.timer b/cluster-setup/deployment/roles/kive_server/files/rsnapshot_beta.timer similarity index 100% rename from roles/kive_server/files/rsnapshot_beta.timer rename to cluster-setup/deployment/roles/kive_server/files/rsnapshot_beta.timer diff --git a/roles/kive_server/files/rsnapshot_gamma.timer b/cluster-setup/deployment/roles/kive_server/files/rsnapshot_gamma.timer similarity index 100% rename from roles/kive_server/files/rsnapshot_gamma.timer rename to cluster-setup/deployment/roles/kive_server/files/rsnapshot_gamma.timer diff --git a/cluster-setup/deployment/roles/kive_server/files/wsgi.load b/cluster-setup/deployment/roles/kive_server/files/wsgi.load new file mode 100644 index 000000000..d76d1d7a3 --- /dev/null +++ b/cluster-setup/deployment/roles/kive_server/files/wsgi.load @@ -0,0 +1 @@ +LoadModule wsgi_module /usr/lib/apache2/modules/mod_wsgi.so diff --git a/roles/kive_server/handlers/main.yml b/cluster-setup/deployment/roles/kive_server/handlers/main.yml similarity index 68% rename from roles/kive_server/handlers/main.yml rename to cluster-setup/deployment/roles/kive_server/handlers/main.yml index 8b42c90cb..66639bd0e 100644 --- a/roles/kive_server/handlers/main.yml +++ b/cluster-setup/deployment/roles/kive_server/handlers/main.yml @@ -2,10 +2,10 @@ become: true become_user: root systemd: - name: "postgresql-12" + name: "postgresql" state: restarted -- name: restart http server +- name: restart web server become: true systemd: - name: httpd + name: apache2 state: restarted diff --git a/roles/kive_server/meta/main.yml b/cluster-setup/deployment/roles/kive_server/meta/main.yml similarity index 100% rename from roles/kive_server/meta/main.yml rename to cluster-setup/deployment/roles/kive_server/meta/main.yml diff --git a/roles/kive_server/tasks/main.yml b/cluster-setup/deployment/roles/kive_server/tasks/main.yml similarity index 50% rename from roles/kive_server/tasks/main.yml rename to cluster-setup/deployment/roles/kive_server/tasks/main.yml index 599c34fd0..354590718 100644 --- a/roles/kive_server/tasks/main.yml +++ b/cluster-setup/deployment/roles/kive_server/tasks/main.yml @@ -1,23 +1,233 @@ --- +# NOTE(nknight): this is done with `file` instead of during user creation so that we +# can set the permissions explicitly. 
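+# (The symbolic mode "go-rx" below removes read/execute from group and other, so +# only the kive user itself can traverse its home directory.)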
+- name: create kive home directory + file: + path: /home/kive/ + state: directory + mode: "go-rx" + group: kive + owner: kive + + +- name: create kive app directories + become: true + loop: + - /etc/kive/ + - /var/kive/ + - /var/log/kive/ + - "{{ kive_media_root }}" + file: + path: "{{ item }}" + state: directory + mode: "2770" + owner: kive + group: kive + + +- name: kive environment configuration + become: true + become_user: kive + block: + - name: set kive environment variables and activate the virtualenv + blockinfile: + path: /home/kive/.bash_profile + block: | + export KIVE_DB_NAME={{ kive_db_name }} + export KIVE_DB_USER={{ kive_db_user }} + export KIVE_DB_HOST={{ kive_db_host }} + export KIVE_DB_PASSWORD={{ kive_db_password }} + + export KIVE_MEDIA_ROOT={{ kive_media_root }} + export KIVE_STATIC_ROOT={{ kive_static_root }} + export KIVE_SLURM_PATH={{ kive_slurm_path }} + source {{ kive_venv }}/bin/activate + create: true # create the file if it doesn't exist + backup: true + owner: kive + group: kive + - name: install package dependencies + become: true + apt: + update_cache: true + name: + - postgresql + - apache2 + - apache2-utils + - apache2-dev + # - libapache2-mod-wsgi-py3 + - barman + - barman-cli + - rsnapshot + +- name: fetch kive source code + become: true + git: + dest: "{{ kive_root }}" + repo: https://github.com/cfe-lab/Kive.git + version: "{{ kive_version | default('master') }}" + update: "{{ update_kive_source | default('no') }}" + +- name: set up the Kive Python virtualenv become: true block: - - name: install postgresql - dnf: - name: - - postgresql12-server - - httpd - - python3-mod_wsgi + - name: create directory for virtualenv + file: + path: "{{ kive_venv }}" + state: directory + - name: copy requirements file to track changes + register: kive_requirements + copy: + dest: "{{ kive_venv }}/requirements.txt" + src: "{{ kive_root }}/requirements.txt" + - name: kive python dependencies + when: kive_requirements.changed + pip: + requirements: "{{ kive_root }}/requirements.txt" + virtualenv_command: "{{ kive_python_executable }} -m venv" + virtualenv: "{{ kive_venv }}" +- name: install mod_wsgi from source + become: true + block: + - name: check if mod_wsgi source files are already downloaded + stat: + path: "/usr/local/src/{{ mod_wsgi_tarball }}" + register: mod_wsgi_download -- name: httpd configuration + - name: fetch mod_wsgi source files + become: true + get_url: + url: "{{ mod_wsgi_source_url }}" + dest: "/usr/local/src/{{ mod_wsgi_tarball }}" + checksum: "sha256:{{ mod_wsgi_sha256_checksum }}" + when: not mod_wsgi_download.stat.exists + + - name: decompress mod_wsgi tarball + unarchive: + remote_src: true + src: "/usr/local/src/{{ mod_wsgi_tarball }}" + dest: "/usr/local/src" + owner: root + group: root + + - name: make a link to the mod_wsgi source code directory + file: + src: "/usr/local/src/{{ mod_wsgi_basename }}" + dest: "/usr/local/src/mod_wsgi" + state: link + + - name: configure mod_wsgi build + command: + argv: + - "/usr/local/src/mod_wsgi/configure" + - "--with-python=/usr/bin/python3.7" + chdir: "/usr/local/src/mod_wsgi" + creates: "/usr/local/src/mod_wsgi/Makefile" + + - name: build and install mod_wsgi + make: + chdir: "/usr/local/src/mod_wsgi" + target: install + + - name: add wsgi to the "modules available" + copy: + src: wsgi.load + dest: /etc/apache2/mods-available + + - name: enable the wsgi module in apache + community.general.apache2_module: + state: present + name: wsgi + +# This is following the general instructions for Ubuntu SSL
support +# in `/usr/share/doc/apache2/README.Debian.gz`. +# For the certificates to be installed, they should be placed in the +# directory you're running Ansible in, with the desired names. +- name: enable Apache SSL support become: true block: - - name: httpd conf file + - name: enable the SSL module + community.general.apache2_module: + state: present + name: ssl + + - name: install SSL certificate + copy: + src: star_cfe.crt + dest: /etc/ssl/certs/star_cfe.crt + owner: root + group: root + + - name: install SSL key copy: - src: 001-kive.conf - dest: /etc/httpd/conf.d/ + src: star_cfe.key + dest: /etc/ssl/private/star_cfe.key + owner: root + group: root + mode: "0600" + + - name: install SSL certificate chain file + copy: + src: DigiCertCA.crt + dest: /etc/ssl/certs/DigiCertCA.crt + owner: root + group: root + + +- name: set up log purging + become: true + block: + - name: apache log purge script + copy: + src: purge_apache_logs + dest: /usr/sbin + mode: "755" + - name: purge service files + loop: + - kive_purge.service + - kive_purge.timer + - kive_purge_synch.service + - kive_purge_synch.timer + copy: + src: "{{ item }}" + dest: /etc/systemd/system + - name: purge config file + template: + src: kive_purge.conf.j2 + dest: /etc/kive/kive_purge.conf + owner: kive + group: kive + mode: "640" + - name: enable and start kive_purge timers + ignore_errors: "{{ ansible_check_mode }}" # Unit files not copied yet + loop: + - kive_purge.timer + - kive_purge_synch.timer + systemd: + name: "{{ item }}" + enabled: true + state: started + + +- name: web server configuration + become: true + block: + - name: Kive-specific configuration + block: + - name: install Kive-specific config file + template: + src: 001-kive.conf.j2 + dest: /etc/apache2/conf-available/001-kive.conf + owner: root + group: root + mode: "644" + - name: enable Kive-specific configuration + command: + cmd: "a2enconf 001-kive" + - name: kive web conf file template: src: kive_apache.conf.j2 @@ -25,161 +235,185 @@ owner: kive group: kive mode: "640" + - name: server environment variables via systemd blockinfile: - path: /etc/systemd/system/httpd.service.d/override.conf + path: /etc/systemd/system/apache2.service.d/override.conf create: true backup: true mode: "644" block: | [Service] EnvironmentFile=/etc/kive/kive_apache.conf - Environment=APACHE_RUN_USER=kive - Environment=APACHE_RUN_GROUP=kive - - name: update httpd.conf - loop: - - from: "Listen 80$" - to: "Listen {{ kive_listen_port }}" - - from: "User apache$" - to: "User {{ kive_httpd_user }}" - - from: "Group apache$" - to: "Group {{kive_httpd_group }}" - replace: - path: /etc/httpd/conf/httpd.conf - regexp: "{{ item.from }}" - replace: "{{ item.to }}" - - name: apache log purge settings - loop: - - from: 'ErrorLog "logs/error_log"' - to: ErrorLog "|/usr/sbin/rotatelogs -l -p /usr/sbin/purge_apache_logs /var/log/httpd/error_log.%Y-%m-%d-%H%M%S 15M - - from: 'CustomLog "logs/access_log" combined' - to: CustomLog "|/usr/sbin/rotatelogs -l -p /usr/sbin/purge_apache_logs /var/log/httpd/access_log.%Y-%m-%d-%H%M%S 15M" combined - replace: - path: /etc/httpd/conf/httpd.conf - regexp: "{{ item.from }}" - replace: "{{ item.to }}" - - name: enable httpd + + - name: update apache2 envvars + blockinfile: + path: /etc/apache2/envvars + backup: true + block: | + export APACHE_RUN_USER=kive + export APACHE_RUN_GROUP=kive + +# - name: update apache2 port +# blockinfile: +# path: /etc/apache2/ports.conf +# backup: true +# block: | +# Listen {{ kive_listen_port }} + + - name: set up rotating 
apache logs + block: + - name: add rotating logs configuration file + blockinfile: + path: /etc/apache2/conf-available/rotate-kive-logs.conf + create: true + backup: true + mode: "644" + block: | + ErrorLog "|/usr/bin/rotatelogs -l -p /usr/sbin/purge_apache_logs /var/log/apache2/error_log.%Y-%m-%d-%H%M%S 15M" + CustomLog "|/usr/bin/rotatelogs -l -p /usr/sbin/purge_apache_logs /var/log/apache2/access_log.%Y-%m-%d-%H%M%S 15M" combined + - name: activate rotating logs configuration + command: + cmd: "a2enconf rotate-kive-logs" + + - name: install the httpd site configuration for Kive + copy: + src: 001-kive-ssl.conf + dest: /etc/apache2/sites-available + owner: root + group: root + + - name: enable the httpd site configuration for Kive + command: + cmd: "a2ensite 001-kive-ssl" + + - name: enable and (re)start apache2 systemd: - name: httpd - state: started + name: apache2 + state: restarted enabled: true - - name: ensure firewalld is running + daemon_reload: true + + - name: ensure ufw (the firewall) is running systemd: - name: firewalld - state: started + name: ufw + state: restarted enabled: true - - name: firewall's internal interface - ansible.posix.firewalld: - zone: internal - interface: "{{ kive_internal_interface }}" - permanent: yes - immediate: yes - state: enabled - - name: firewall's external interface - ansible.posix.firewalld: - zone: external - interface: "{{ kive_external_interface }}" - permanent: yes - immediate: yes - state: enabled - - name: firewall's internal interface should ACCEPT - register: internal_accept - ansible.posix.firewalld: - zone: internal - target: ACCEPT - permanent: yes + +# - name: firewall's internal interface should ACCEPT +# register: internal_accept +# community.general.ufw: +# default: allow +# interface: "{{ head_internal_interface }}" +# direction: incoming +# state: enabled +# +# - name: reload internal interface firewall config +# when: internal_accept.changed +# community.general.ufw: +# interface: "{{ head_internal_interface }}" +# state: reloaded + + - name: permit access to Postgres on the internal interface + community.general.ufw: + interface: "{{ head_internal_interface }}" + direction: in + port: 5432 + protocol: tcp + rule: allow state: enabled - - name: reload firewall config - when: internal_accept.changed - command: firewall-cmd --reload + - name: permit https service through firewall - ansible.posix.firewalld: - zone: external - service: https - permanent: yes - immediate: yes + community.general.ufw: + interface: "{{ head_external_interface }}" + direction: in + port: https + protocol: tcp + rule: allow state: enabled + - name: permit http service through firewall - ansible.posix.firewalld: - zone: external - service: http - permanent: yes - immediate: yes + community.general.ufw: + interface: "{{ head_external_interface }}" + direction: in + port: http + protocol: tcp + rule: allow state: enabled +- name: enable and start database service + become: true + systemd: + name: "postgresql@14-main" + state: started + enabled: true - name: configure postgres server become: true become_user: postgres block: - - name: ensure database is initialized - become: true - become_user: root - command: - cmd: "/usr/pgsql-12/bin/postgresql-12-setup initdb" - creates: "/var/lib/pgsql/12/data/PG_VERSION" - - name: start database service - become: true - become_user: root - systemd: - name: "postgresql-12" - state: started - enabled: true + - name: add kive entries to pg_hba.conf block: - name: local connections notify: Restart postgresql service 
postgresql_pg_hba: contype: local - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: all users: kive method: "scram-sha-256" + - name: host connections notify: Restart postgresql service postgresql_pg_hba: contype: host - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: all users: kive - source: "{{kive_db_host}}/24" + source: "{{ kive_db_subnet }}" method: "scram-sha-256" + - name: barman connections notify: Restart postgresql service postgresql_pg_hba: contype: host - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: all users: barman source: 127.0.0.1/32 method: "scram-sha-256" + - name: barman IPv6 connections notify: Restart postgresql service postgresql_pg_hba: contype: host - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: all users: barman source: "::1/128" method: "scram-sha-256" + - name: streaming_barman connections notify: Restart postgresql service postgresql_pg_hba: contype: host - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: replication users: streaming_barman source: 127.0.0.1/32 method: "scram-sha-256" + - name: streaming_barman IPv6 connections notify: Restart postgresql service postgresql_pg_hba: contype: host - dest: "/var/lib/pgsql/12/data/pg_hba.conf" + dest: "/etc/postgresql/14/main/pg_hba.conf" databases: replication users: streaming_barman source: "::1/128" method: "scram-sha-256" + - name: local database settings in postgresql.conf notify: Restart postgresql service loop: @@ -197,16 +431,16 @@ value: 10 - option: max_replication_slots value: 10 - # Additionally, setting up standard WAL archiving. - - option: archive_mode - value: "on" - - option: archive_command - value: "'barman-wal-archive localhost kive %p'" +# # Additionally, setting up standard WAL archiving. +# - option: archive_mode +# value: "on" +# - option: archive_command +# value: "'barman-wal-archive localhost kive %p'" # Remove old option that was accidentally added. 
- option: archiver community.general.ini_file: backup: yes - path: /var/lib/pgsql/12/data/postgresql.conf + path: "/etc/postgresql/14/main/postgresql.conf" create: no section: null option: "{{ item.option }}" @@ -214,6 +448,10 @@ state: "{{ 'present' if item.value is defined else 'absent' }}" +- name: Flush handlers + meta: flush_handlers + + - name: kive installation block: - name: build kive API @@ -223,7 +461,7 @@ chdir: "{{ kive_root }}/api/" creates: "{{ kive_root }}/api/build/" - name: collect kive's static files - notify: restart http server + # notify: restart web server environment: KIVE_STATIC_ROOT: "{{ kive_static_root }}" community.general.django_manage: @@ -232,42 +470,6 @@ virtualenv: "{{ kive_venv }}" -- name: set up log purging - become: true - block: - - name: apache log purge script - copy: - src: purge_apache_logs - dest: /usr/sbin - mode: "755" - - name: purge service files - loop: - - kive_purge.service - - kive_purge.timer - - kive_purge_synch.service - - kive_purge_synch.timer - copy: - src: "{{ item }}" - dest: /etc/systemd/system - - name: purge config file - template: - src: kive_purge.conf.j2 - dest: /etc/kive/kive_purge.conf - owner: kive - group: kive - mode: "640" - - name: enable and start kive_purge timers - ignore_errors: "{{ ansible_check_mode }}" # Unit files not copied yet - loop: - - kive_purge.timer - - kive_purge_synch.timer - systemd: - name: "{{ item }}" - enabled: true - state: started - - - - name: set up kive database block: - name: create kive database @@ -284,6 +486,13 @@ name: kive password: "{{ kive_db_password }}" db: kive + - name: grant kive database user all privileges + become: true + become_user: postgres + postgresql_privs: + role: kive + db: kive + type: database priv: ALL - name: create barman database user become: true @@ -309,8 +518,8 @@ environment: KIVE_DB_NAME: "{{ kive_db_name }}" KIVE_DB_USER: "{{ kive_db_user }}" - KIVE_DB_HOST: "{{kive_db_host }}" - KIVE_DB_PASSWORD: "{{ kive_db_password}}" + KIVE_DB_HOST: "{{ kive_db_host }}" + KIVE_DB_PASSWORD: "{{ kive_db_password }}" become: true become_user: kive community.general.django_manage: @@ -318,22 +527,42 @@ app_path: "{{ kive_root }}/kive" virtualenv: "{{ kive_venv }}" -- name: database backup global config - loop: - - option: path_prefix - value: /usr/pgsql-12/bin/ - - option: archiver - value: "on" - community.general.ini_file: - backup: yes - path: /etc/barman.conf - create: no - section: barman - option: "{{ item.option }}" - value: "{{ item.value }}" +#- name: database backup global config +# loop: +# - option: path_prefix +# value: /usr/pgsql-12/bin/ +# - option: archiver +# value: "on" +# community.general.ini_file: +# backup: yes +# path: /etc/barman.conf +# create: no +# section: barman +# option: "{{ item.option }}" +# value: "{{ item.value }}" + +- name: create the Barman backup directory + file: + path: "{{ kive_backup_path }}/BarmanDBBackup" + state: directory + owner: barman + group: barman + +- name: barman passwords file + blockinfile: + path: /var/lib/barman/.pgpass + create: yes + owner: barman + group: barman + mode: u=rw,g=,o= + block: | + localhost:*:*:barman:{{ barman_password }} + localhost:*:*:streaming_barman:{{ streaming_barman_password }} - name: database backup kive config loop: + - option: path_prefix + value: /usr/lib/postgresql/14/bin - option: description value: Kive database - option: conninfo @@ -346,9 +575,9 @@ value: bzip2 # archiver = on is necessary for the "fallback" WAL backup that happens via # PostgreSQL's archive_command setting in 
postgresql.conf (which we have - # configured to use barman-wal-archive - - option: archiver - value: "on" + # configured to use barman-wal-archive) +# - option: archiver +# value: "on" - option: streaming_conninfo value: host=localhost user=streaming_barman dbname=kive - option: streaming_archiver @@ -367,6 +596,11 @@ option: "{{ item.option }}" value: "{{ item.value }}" +#- name: force a WAL switch to verify the WAL archiving process +# become: true +# become_user: barman +# command: barman switch-wal --force --archive kive + - name: SSH keys block: - name: Set key locations @@ -403,17 +637,8 @@ user: postgres key: "{{ lookup('file', '/var/lib/barman/.ssh/id_rsa.pub') }}" -- name: barman passwords file - blockinfile: - path: /var/lib/barman/.pgpass - create: yes - owner: barman - group: barman - mode: u=rw,g=,o= - block: | - localhost:*:*:barman:{{ barman_password }} - localhost:*:*:streaming_barman:{{ streaming_barman_password }} - +# This variable should be a JSON-formatted list of 2-lists, like +# [["User One", "userone@bccfe.ca"], ..., ["User N", "userN@bccfe.ca"]] - name: parse admin e-mails from json set_fact: mail_admins_list: "{{ kive_admins | from_json }}" @@ -421,7 +646,7 @@ - name: build admin e-mails list loop: "{{ mail_admins_list }}" set_fact: - mail_admins_emails: "{{ (mail_admins_emails | default([])) + [item.1] }}" + mail_admins_emails: "{{ (mail_admins_emails | default([])) + [item[1]] }}" # Copy crontab_mail.py out of the Kive source code, because it gets run by root. # This way, it can be locked down more than the Kive source code. @@ -432,10 +657,17 @@ owner: root group: root mode: u=rw,g=r,o=r + +- name: create the rsnapshot backup directory + file: + path: "{{ kive_backup_path }}/rsnapshot" + state: directory + - name: rsnapshot config file template: src: rsnapshot.conf.j2 dest: /etc/rsnapshot.conf + - name: list of scheduled services set_fact: scheduled_service_names: @@ -443,6 +675,7 @@ - rsnapshot_alpha - rsnapshot_beta - rsnapshot_gamma + - name: scheduled service files loop: "{{ scheduled_service_names }}" template: diff --git a/cluster-setup/deployment/roles/kive_server/templates/001-kive.conf.j2 b/cluster-setup/deployment/roles/kive_server/templates/001-kive.conf.j2 new file mode 100644 index 000000000..991574ea0 --- /dev/null +++ b/cluster-setup/deployment/roles/kive_server/templates/001-kive.conf.j2 @@ -0,0 +1,15 @@ +WSGIScriptAlias / {{ kive_root }}/kive/kive/wsgi.py +WSGIPythonPath {{ kive_root }}/kive:{{ kive_venv }}/lib/{{ kive_python_executable }}/site-packages + + + +Require all granted + + + +Alias /static {{ kive_static_root }} + + +Order deny,allow +Allow from all + diff --git a/roles/kive_server/templates/barman_backup.service.j2 b/cluster-setup/deployment/roles/kive_server/templates/barman_backup.service.j2 similarity index 82% rename from roles/kive_server/templates/barman_backup.service.j2 rename to cluster-setup/deployment/roles/kive_server/templates/barman_backup.service.j2 index 0a0bd07e4..de64f46d3 100644 --- a/roles/kive_server/templates/barman_backup.service.j2 +++ b/cluster-setup/deployment/roles/kive_server/templates/barman_backup.service.j2 @@ -7,7 +7,7 @@ User=barman ExecStart=/opt/venv_kive/bin/python /opt/crontab_mail.py \ --log /var/log/barman/kive.log \ --level ERROR \ - --subject {{ (kive_subject_prefix + "barman") | quote }} \ + --subject {{ (kive_subject_prefix + " barman") | quote }} \ --from {{ kive_server_email | quote }} \ {{ mail_admins_emails | join(',') | quote }} \ /bin/barman backup kive diff --git 
a/roles/kive_server/templates/kive_apache.conf.j2 b/cluster-setup/deployment/roles/kive_server/templates/kive_apache.conf.j2 similarity index 100% rename from roles/kive_server/templates/kive_apache.conf.j2 rename to cluster-setup/deployment/roles/kive_server/templates/kive_apache.conf.j2 diff --git a/roles/kive_server/templates/kive_purge.conf.j2 b/cluster-setup/deployment/roles/kive_server/templates/kive_purge.conf.j2 similarity index 72% rename from roles/kive_server/templates/kive_purge.conf.j2 rename to cluster-setup/deployment/roles/kive_server/templates/kive_purge.conf.j2 index 70b53edaf..aa9ad0eec 100644 --- a/roles/kive_server/templates/kive_purge.conf.j2 +++ b/cluster-setup/deployment/roles/kive_server/templates/kive_purge.conf.j2 @@ -8,7 +8,9 @@ KIVE_SERVER_EMAIL={{ kive_server_email | quote }} KIVE_ADMINS={{ kive_admins | quote }} KIVE_SUBJECT_PREFIX={{ kive_subject_prefix | quote }} -# Set these in /root/ansible-rundir/env_vars.yml if you don't like the defaults in settings.py +# The KIVE_PURGE_START, KIVE_PURGE_STOP, and KIVE_LOG_LEVEL variables +# can be set in Ansible prior to deployment if you don't like the defaults +# in settings.py. # KIVE_PURGE_START=20GB # KIVE_PURGE_STOP=15GB # KIVE_PURGE_DATASET_AGING=1.0 @@ -16,15 +18,15 @@ KIVE_SUBJECT_PREFIX={{ kive_subject_prefix | quote }} # KIVE_PURGE_CONTAINER_AGING=10.0 # KIVE_PURGE_WAIT='0 days, 1:00:00' # KIVE_PURGE_BATCH_SIZE=100 -# KIVE_LOG_LEVEL=WARN +# KIVE_LOG_LEVEL=WARNING {% if kive_purge_start is defined %} -KIVE_PURGE_START={{kive_purge_start}} +KIVE_PURGE_START={{ kive_purge_start }} {% endif %} {% if kive_purge_stop is defined %} -KIVE_PURGE_STOP={{kive_purge_stop}} +KIVE_PURGE_STOP={{ kive_purge_stop }} {% endif %} {% if kive_log_level is defined %} -KIVE_LOG_LEVEL={{kive_log_level}} +KIVE_LOG_LEVEL={{ kive_log_level }} {% endif %} # KIVE_LOG is set separately for each service in the .service files. 
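+# +# For example, with hypothetical group_vars of kive_purge_start: 20GB, +# kive_purge_stop: 15GB and kive_log_level: INFO, the rendered file would end with: +# KIVE_PURGE_START=20GB +# KIVE_PURGE_STOP=15GB +# KIVE_LOG_LEVEL=INFO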
diff --git a/roles/kive_server/templates/rsnapshot.conf.j2 b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot.conf.j2 similarity index 100% rename from roles/kive_server/templates/rsnapshot.conf.j2 rename to cluster-setup/deployment/roles/kive_server/templates/rsnapshot.conf.j2 diff --git a/roles/kive_server/templates/rsnapshot_alpha.service.j2 b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_alpha.service.j2 similarity index 92% rename from roles/kive_server/templates/rsnapshot_alpha.service.j2 rename to cluster-setup/deployment/roles/kive_server/templates/rsnapshot_alpha.service.j2 index 4158512e4..fa2bfaee9 100644 --- a/roles/kive_server/templates/rsnapshot_alpha.service.j2 +++ b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_alpha.service.j2 @@ -8,4 +8,4 @@ ExecStart=/opt/venv_kive/bin/python /opt/crontab_mail.py \ --subject {{ (kive_subject_prefix + "rsnapshot alpha") | quote }} \ --from {{ kive_server_email | quote }} \ {{ mail_admins_emails | join(',') | quote }} \ - /bin/rsnapshot alpha + /usr/bin/rsnapshot alpha diff --git a/roles/kive_server/templates/rsnapshot_beta.service.j2 b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_beta.service.j2 similarity index 92% rename from roles/kive_server/templates/rsnapshot_beta.service.j2 rename to cluster-setup/deployment/roles/kive_server/templates/rsnapshot_beta.service.j2 index ed3f0f081..eab8867ec 100644 --- a/roles/kive_server/templates/rsnapshot_beta.service.j2 +++ b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_beta.service.j2 @@ -8,4 +8,4 @@ ExecStart=/opt/venv_kive/bin/python /opt/crontab_mail.py \ --subject {{ (kive_subject_prefix + "rsnapshot beta") | quote }} \ --from {{ kive_server_email | quote }} \ {{ mail_admins_emails | join(',') | quote }} \ - /bin/rsnapshot beta + /usr/bin/rsnapshot beta diff --git a/roles/kive_server/templates/rsnapshot_gamma.service.j2 b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_gamma.service.j2 similarity index 92% rename from roles/kive_server/templates/rsnapshot_gamma.service.j2 rename to cluster-setup/deployment/roles/kive_server/templates/rsnapshot_gamma.service.j2 index 3a9b3e90e..7f905c815 100644 --- a/roles/kive_server/templates/rsnapshot_gamma.service.j2 +++ b/cluster-setup/deployment/roles/kive_server/templates/rsnapshot_gamma.service.j2 @@ -8,4 +8,4 @@ ExecStart=/opt/venv_kive/bin/python /opt/crontab_mail.py \ --subject {{ (kive_subject_prefix + "rsnapshot gamma") | quote }} \ --from {{ kive_server_email | quote }} \ {{ mail_admins_emails | join(',') | quote }} \ - /bin/rsnapshot gamma + /usr/bin/rsnapshot gamma diff --git a/cluster-setup/deployment/roles/mount_network_drives/defaults/main.yml b/cluster-setup/deployment/roles/mount_network_drives/defaults/main.yml new file mode 100644 index 000000000..e717a8a4e --- /dev/null +++ b/cluster-setup/deployment/roles/mount_network_drives/defaults/main.yml @@ -0,0 +1,6 @@ +--- + +cifs_credentials: /opt/smbcredentials +cifs_credentials_volume: /opt +mount_point: /media/macdatafile +network_share: //localhost/macdatafile # this won't work, you have to customize this diff --git a/cluster-setup/deployment/roles/mount_network_drives/tasks/main.yml b/cluster-setup/deployment/roles/mount_network_drives/tasks/main.yml new file mode 100644 index 000000000..2dc84c253 --- /dev/null +++ b/cluster-setup/deployment/roles/mount_network_drives/tasks/main.yml @@ -0,0 +1,19 @@ +--- + +- name: set mount options for the network drive + set_fact: + mount_options: 
"credentials={{ cifs_credentials }},x-systemd.requires-mounts-for={{ cifs_credentials_volume }},noperm,file_mode=0777,dir_mode=0777" + +- name: set read-only if configured + when: read_only | bool + set_fact: + mount_options: "{{ mount_options }},ro" + +- name: mount macdatafile with the appropriate options + become: true + mount: + path: "{{ mount_point }}" + src: "{{ network_share }}" + opts: "{{ mount_options }}" + fstype: cifs + state: mounted diff --git a/roles/munge_node/README.md b/cluster-setup/deployment/roles/munge_node/README.md similarity index 100% rename from roles/munge_node/README.md rename to cluster-setup/deployment/roles/munge_node/README.md diff --git a/roles/munge_node/files/munge-test.key b/cluster-setup/deployment/roles/munge_node/files/munge-test.key similarity index 100% rename from roles/munge_node/files/munge-test.key rename to cluster-setup/deployment/roles/munge_node/files/munge-test.key diff --git a/roles/munge_node/tasks/main.yml b/cluster-setup/deployment/roles/munge_node/tasks/main.yml similarity index 87% rename from roles/munge_node/tasks/main.yml rename to cluster-setup/deployment/roles/munge_node/tasks/main.yml index 2dfdd84cf..e69c8efe3 100644 --- a/roles/munge_node/tasks/main.yml +++ b/cluster-setup/deployment/roles/munge_node/tasks/main.yml @@ -2,10 +2,10 @@ become: true block: - name: install munge - dnf: + apt: name: - munge - - munge-libs + - libmunge2 state: present - name: deploy munge testing key copy: @@ -17,5 +17,5 @@ - name: start munge service systemd: name: munge - state: started + state: restarted enabled: true diff --git a/roles/singularity_node/README.md b/cluster-setup/deployment/roles/singularity_node/README.md similarity index 100% rename from roles/singularity_node/README.md rename to cluster-setup/deployment/roles/singularity_node/README.md diff --git a/cluster-setup/deployment/roles/singularity_node/tasks/main.yml b/cluster-setup/deployment/roles/singularity_node/tasks/main.yml new file mode 100644 index 000000000..d32abed87 --- /dev/null +++ b/cluster-setup/deployment/roles/singularity_node/tasks/main.yml @@ -0,0 +1,9 @@ +--- + +# This role installs Singularity from the released .deb file. + +- name: install singularity + become: true + apt: + deb: https://github.com/sylabs/singularity/releases/download/v3.11.4/singularity-ce_3.11.4-jammy_amd64.deb + state: present diff --git a/cluster-setup/deployment/roles/slurm_builder/README.md b/cluster-setup/deployment/roles/slurm_builder/README.md new file mode 100644 index 000000000..dc4d0a025 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_builder/README.md @@ -0,0 +1,12 @@ +This role fetches the Slurm source code and builds +Slurm. It depends on the [slurm dependencies] role to install the dependencies +needed to build Slurm. + +It's used by the [slurm controller] role to build Slurm and place it on /usr/local, +which is shared via NFS with the worker nodes. The [slurm node] role fails if +`/usr/local/lib/systemd/system/slurmd.service` isn't present (it should be if you've +run this role on the head node). 
+ +[slurm node]: ../slurm_node +[slurm controller]: ../slurm_controller +[slurm dependencies]: ../slurm_dependencies diff --git a/cluster-setup/deployment/roles/slurm_builder/meta/main.yml b/cluster-setup/deployment/roles/slurm_builder/meta/main.yml new file mode 100644 index 000000000..0fda72054 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_builder/meta/main.yml @@ -0,0 +1,4 @@ +--- + +dependencies: + - slurm_dependencies diff --git a/cluster-setup/deployment/roles/slurm_builder/tasks/main.yml b/cluster-setup/deployment/roles/slurm_builder/tasks/main.yml new file mode 100644 index 000000000..d6b0ad4dd --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_builder/tasks/main.yml @@ -0,0 +1,58 @@ +--- + +# Installing packages used in the building of Slurm, as per +# https://slurm.schedmd.com/quickstart_admin.html +# Certain packages need to be installed when Slurm is compiled to enable +# support for certain features. In the first play we indicate with +# comments which packages are used for which features. + +- name: check if slurm source files are already downloaded + stat: + path: "/usr/local/src/{{ slurm_tarball }}" + register: slurm_download + +- name: fetch slurm source files + become: true + get_url: + url: "{{ slurm_source_url }}" + dest: "/usr/local/src/{{ slurm_tarball }}" + checksum: "sha1:{{ slurm_sha1_checksum }}" + when: not slurm_download.stat.exists + +- name: decompress Slurm tarball + become: true + unarchive: + remote_src: true + src: "/usr/local/src/{{ slurm_tarball }}" + dest: "/usr/local/src" + owner: root + group: root + +- name: make a link to the Slurm source code directory + become: true + file: + src: "/usr/local/src/{{ slurm_src_basename }}" + dest: "/usr/local/src/slurm" + state: link + +- name: configure Slurm build + become: true + command: + argv: + - "/usr/local/src/slurm/configure" + - "--sysconfdir=/usr/local/etc/slurm" + - "--with-systemdsystemunitdir=/usr/local/lib/systemd/system" + chdir: "/usr/local/src/slurm" + creates: "/usr/local/src/slurm/Makefile" + +- name: build and install Slurm + become: true + make: + chdir: "/usr/local/src/slurm" + target: install + +- name: make Slurm libraries accessible to the system + become: true + command: + cmd: "ldconfig -n /usr/local/lib" + diff --git a/cluster-setup/deployment/roles/slurm_configuration/README.md b/cluster-setup/deployment/roles/slurm_configuration/README.md new file mode 100644 index 000000000..f647bc908 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_configuration/README.md @@ -0,0 +1,2 @@ +This role sets up the `slurm` user on a host along with some system directories +required for `slurmd` or `slurmctld` (if they aren't already set up). 
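The `slurm_builder` tasks above download and unpack a release tarball using the `slurm_source_url`, `slurm_tarball`, `slurm_src_basename`, and `slurm_sha1_checksum` variables, which are expected to be supplied elsewhere (e.g. in `group_vars`). A hypothetical sketch of those entries; the version number, the `slurm_version` helper variable, and the checksum are placeholders, not values taken from this change:

```yaml
# Hypothetical group_vars entries consumed by the slurm_builder role.
# slurm_version is an illustrative helper variable; the checksum below is a
# placeholder and must be replaced with the SHA-1 of the tarball you fetch.
slurm_version: "23.02.6"
slurm_src_basename: "slurm-{{ slurm_version }}"
slurm_tarball: "{{ slurm_src_basename }}.tar.bz2"
slurm_source_url: "https://download.schedmd.com/slurm/{{ slurm_tarball }}"
slurm_sha1_checksum: "0000000000000000000000000000000000000000"
```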
diff --git a/cluster-setup/deployment/roles/slurm_configuration/tasks/main.yml b/cluster-setup/deployment/roles/slurm_configuration/tasks/main.yml new file mode 100644 index 000000000..3555c4c67 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_configuration/tasks/main.yml @@ -0,0 +1,22 @@ +--- + +- name: create slurm user on all hosts + become: true + user: + name: slurm + system: yes + create_home: no + uid: 9634 + register: slurm_user + +- name: create directories used by both slurmd and slurmctld + become: true + loop: + - /usr/local/etc/slurm + - /var/log/slurm + file: + path: "{{ item }}" + owner: slurm + group: slurm + mode: '0755' + state: directory diff --git a/cluster-setup/deployment/roles/slurm_controller/README.md b/cluster-setup/deployment/roles/slurm_controller/README.md new file mode 100644 index 000000000..91c8ecce3 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_controller/README.md @@ -0,0 +1,21 @@ +This role sets up `slurmctld` on the node it runs on. + +Note that this does *not* set up `slurmd`; a node that should function +as a Slurm compute node should also run the [slurm node] role to set up `slurmd`. + +Like the [slurm node] role, this node depends on: +- the [munge node] role to set up the MUNGE authentication service; +- the [slurm dependencies] role to install slurmctld's dependencies; and +- the [slurm configuration] role to create the `slurm` user and system directories + used by slurmctld. + +[slurm node]: ../slurm_node + +To set up the Slurm controller and database daemons, it will: + +- install and configure a MariaDB server; +- deploy required configuration files (including those needed for `slurmd`); and +- spin up `slurmctld`. + +Note that the config files deployed by this role are the ones required for +`slurmd`, and in our cluster compute nodes will use these files via NFS mounts. 
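The `slurm_controller` role renders `slurm.conf` from the `slurm_nodes` list. The defaults that follow appear to describe a small test topology; for a real cluster the list would normally be overridden in `group_vars`, and the template also accepts optional `cores_per_socket` and `threads_per_core` keys (each defaulting to 1). An illustrative override, with node names taken from `initialization/head/cluster_hosts` but made-up hardware figures:

```yaml
# Illustrative group_vars override; the memory and CPU figures are invented
# for the example and should be replaced with the real node specifications.
slurmctlnode: head
slurm_nodes:
  - name: head
    memory: 64000        # RealMemory in MB
    cpus: 16
    sockets: 2
    cores_per_socket: 8
    threads_per_core: 1
  - name: b01
    memory: 128000
    cpus: 32
    sockets: 2
    cores_per_socket: 8
    threads_per_core: 2
```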
diff --git a/cluster-setup/deployment/roles/slurm_controller/defaults/main.yml b/cluster-setup/deployment/roles/slurm_controller/defaults/main.yml new file mode 100644 index 000000000..68610077f --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_controller/defaults/main.yml @@ -0,0 +1,12 @@ +--- + +slurmctlnode: head +slurm_nodes: + - name: head + memory: 4000 + cpus: 2 + sockets: 2 + - name: worker + memory: 4000 + cpus: 2 + sockets: 2 diff --git a/roles/slurm_node/files/cgroup.conf b/cluster-setup/deployment/roles/slurm_controller/files/cgroup.conf similarity index 69% rename from roles/slurm_node/files/cgroup.conf rename to cluster-setup/deployment/roles/slurm_controller/files/cgroup.conf index 4b8e9192b..2b79b7410 100644 --- a/roles/slurm_node/files/cgroup.conf +++ b/cluster-setup/deployment/roles/slurm_controller/files/cgroup.conf @@ -5,6 +5,9 @@ # See man slurm.conf and man cgroup.conf for further # information on cgroup configuration parameters #-- -CgroupAutomount=no +CgroupAutomount=yes +# CgroupPlugin="cgroup/v2" +ConstrainCores=yes +ConstrainDevices=yes ConstrainRAMSpace=yes ConstrainSwapSpace=yes diff --git a/roles/slurm_controller/files/slurmdbd.conf b/cluster-setup/deployment/roles/slurm_controller/files/slurmdbd.conf similarity index 96% rename from roles/slurm_controller/files/slurmdbd.conf rename to cluster-setup/deployment/roles/slurm_controller/files/slurmdbd.conf index b2c402d9c..1722ca565 100644 --- a/roles/slurm_controller/files/slurmdbd.conf +++ b/cluster-setup/deployment/roles/slurm_controller/files/slurmdbd.conf @@ -31,7 +31,7 @@ SlurmUser=slurm DebugLevel=4 #DefaultQOS=normal,standby LogFile=/var/log/slurm/slurmdbd.log -PidFile=/var/run/slurm/slurmdbd.pid +PidFile=/var/run/slurmdbd.pid #PluginDir=/usr/lib/slurm #PrivateData=accounts,users,usage,jobs #TrackWCKey=yes diff --git a/roles/slurm_node/handlers/main.yml b/cluster-setup/deployment/roles/slurm_controller/handlers/main.yml similarity index 78% rename from roles/slurm_node/handlers/main.yml rename to cluster-setup/deployment/roles/slurm_controller/handlers/main.yml index bb5a524e8..5db8706ab 100644 --- a/roles/slurm_node/handlers/main.yml +++ b/cluster-setup/deployment/roles/slurm_controller/handlers/main.yml @@ -1,3 +1,4 @@ +# FIXME we may not need this - name: reconfigure slurm become: true become_user: root diff --git a/cluster-setup/deployment/roles/slurm_controller/meta/main.yml b/cluster-setup/deployment/roles/slurm_controller/meta/main.yml new file mode 100644 index 000000000..e8045539d --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_controller/meta/main.yml @@ -0,0 +1,6 @@ +--- + +dependencies: + - munge_node + - slurm_dependencies + - slurm_configuration diff --git a/cluster-setup/deployment/roles/slurm_controller/tasks/main.yml b/cluster-setup/deployment/roles/slurm_controller/tasks/main.yml new file mode 100644 index 000000000..c90b2cc22 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_controller/tasks/main.yml @@ -0,0 +1,100 @@ +--- + +- name: install and start mariadb + become: true + become_user: root + tags: slurmdb + block: + - name: install mariadb + apt: + name: + - mariadb-server + - libmariadb-dev + state: present + - name: start mariadb service + systemd: + name: mariadb + state: started + enabled: true + - name: create slurm database user + tags: slurmdb + block: + - mysql_db: + name: slurm_acct_db + login_unix_socket: /var/run/mysqld/mysqld.sock + check_implicit_admin: true + config_file: '' + state: present + - mysql_user: + name: slurm + login_unix_socket: 
/var/run/mysqld/mysqld.sock + check_implicit_admin: true + config_file: '' + priv: "slurm_acct_db.*:all" + +- name: add slurmctld config files + become: true + block: + - name: copy cgroup config file + notify: reconfigure slurm + copy: + src: cgroup.conf + dest: /usr/local/etc/slurm/ + owner: slurm + group: slurm + mode: "644" + - name: generate and copy slurm config file + notify: reconfigure slurm + template: + src: slurm.conf.j2 + dest: /usr/local/etc/slurm/slurm.conf + owner: slurm + group: slurm + mode: "644" + +- name: copy slurmdbd configuration + become: true + copy: + src: "slurmdbd.conf" + dest: /usr/local/etc/slurm/ + owner: slurm + group: slurm + mode: "600" + +- name: create directory for slurmctld spooling + become: true + file: + path: /var/spool/slurmctld + owner: slurm + group: slurm + mode: '0755' + state: directory + +- name: enable Slurm head-node-only services + become: true + block: + - name: check if the slurmctld service is in place + stat: path=/usr/local/lib/systemd/system/slurmctld.service + register: slurmctld_service + + - name: fail if the slurmctld service isn't there + fail: + msg: "slurmctld service is not installed." + when: not slurmctld_service.stat.exists + + - name: check if the slurmdbd service is in place + stat: path=/usr/local/lib/systemd/system/slurmdbd.service + register: slurmdbd_service + + - name: fail if the slurmdbd service isn't there + fail: + msg: "slurmdbd service is not installed." + when: not slurmdbd_service.stat.exists + + - loop: + - slurmdbd + - slurmctld + systemd: + name: "{{ item }}" + state: started + enabled: true diff --git a/roles/slurm_node/templates/slurm.conf.j2 b/cluster-setup/deployment/roles/slurm_controller/templates/slurm.conf.j2 similarity index 72% rename from roles/slurm_node/templates/slurm.conf.j2 rename to cluster-setup/deployment/roles/slurm_controller/templates/slurm.conf.j2 index 23847c750..3c731b7bc 100644 --- a/roles/slurm_node/templates/slurm.conf.j2 +++ b/cluster-setup/deployment/roles/slurm_controller/templates/slurm.conf.j2 @@ -1,34 +1,38 @@ -ControlMachine={{slurmctlnode}} -#ControlAddr= -#BackupController= -#BackupAddr= +# slurm.conf file generated by https://slurm.schedmd.com/configurator.html +# and manually edited to work as an Ansible template. + +# This file is generated by Ansible. If you edit it by hand your +# changes may be overwritten next time this cluster's playbook is run. +# +# To change the settings in this file, see the template at +# +# roles/slurm_node/templates/slurm.conf.j2 # -AuthType=auth/munge -CacheGroups=0 -#CheckpointType=checkpoint/none -CryptoType=crypto/munge +# To change the list of nodes, edit the `slurm_nodes` setting in this cluster's +# environment variables file. 
+ +ClusterName=kivecluster +SlurmctldHost={{ slurmctlnode }} + #DisableRootJobs=NO #EnforcePartLimits=NO #Epilog= #EpilogSlurmctld= #FirstJobId=1 -#MaxJobId=999999 +#MaxJobId=67043328 #GresTypes= #GroupUpdateForce=0 #GroupUpdateTime=600 -#JobCheckpointDir=/var/slurm/checkpoint -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= #JobFileAppend=0 #JobRequeue=1 -#JobSubmitPlugins=1 +#JobSubmitPlugins=lua #KillOnBadExit=0 #LaunchType=launch/slurm #Licenses=foo*4,bar #MailProg=/bin/mail -#MaxJobCount=5000 +#MaxJobCount=10000 #MaxStepCount=40000 -#MaxTasksPerNode=128 +#MaxTasksPerNode=512 MpiDefault=none #MpiParams=ports=#-# #PluginDir= @@ -43,21 +47,19 @@ ProctrackType=proctrack/cgroup #PropagateResourceLimitsExcept= #RebootProgram= ReturnToService=1 -#SallocDefaultCommand= -SlurmctldPidFile=/var/run/slurm/slurmctld.pid +SlurmctldPidFile=/var/run/slurmctld.pid SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurm/slurmd.pid +SlurmdPidFile=/var/run/slurmd.pid SlurmdPort=6818 -SlurmdSpoolDir=/var/lib/slurm/slurmd +SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm #SlurmdUser=root #SrunEpilog= #SrunProlog= -StateSaveLocation=/var/lib/slurm/slurmctld +StateSaveLocation=/var/spool/slurmctld SwitchType=switch/none #TaskEpilog= TaskPlugin=task/cgroup -#TaskPluginParam= #TaskProlog= #TopologyPlugin=topology/tree #TmpFS=/tmp @@ -90,11 +92,8 @@ Waittime=0 # SCHEDULING #DefMemPerCPU=0 #MaxMemPerCPU=0 -#SchedulerRootFilter=1 #SchedulerTimeSlice=30 SchedulerType=sched/builtin -SchedulerPort=7321 -#SelectType=select/linear SelectType=select/cons_res SelectTypeParameters=CR_CPU_Memory # @@ -116,17 +115,15 @@ SelectTypeParameters=CR_CPU_Memory # # LOGGING AND ACCOUNTING #AccountingStorageEnforce=0 -AccountingStorageHost={{slurmctlnode}} -#AccountingStorageLoc=/var/log/slurm/accounting +AccountingStorageHost={{ slurmctlnode }} #AccountingStoragePass= AccountingStoragePort=6819 AccountingStorageType=accounting_storage/slurmdbd #AccountingStorageUser= -AccountingStoreJobComment=YES -ClusterName=kivetestcluster -#DebugFlags= +AccountingStoreFlags=job_comment #JobCompHost= JobCompLoc=/var/log/slurm/job_completions +#JobCompParams= #JobCompPass= #JobCompPort= JobCompType=jobcomp/filetxt @@ -134,12 +131,13 @@ JobCompType=jobcomp/filetxt #JobContainerType=job_container/none JobAcctGatherFrequency=30 JobAcctGatherType=jobacct_gather/linux -SlurmctldDebug=3 +SlurmctldDebug=info SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdDebug=3 +SlurmdDebug=info SlurmdLogFile=/var/log/slurm/slurmd.log #SlurmSchedLogFile= #SlurmSchedLogLevel= +#DebugFlags= # # # POWER SAVE SUPPORT FOR IDLE NODES (optional) @@ -153,20 +151,10 @@ SlurmdLogFile=/var/log/slurm/slurmd.log #SuspendRate= #SuspendTime= # - -# NOTE(nknight): This file is generated by Ansible. If you edit it by hand your -# changes may be overwritten next time this cluster's playbook is run. # -# To change the settings in this file, see the template at -# -# roles/slurm_node/templates/slurm.conf.j2 -# -# To change the list of nodes, edit the `slurm_nodes` setting in this cluster's -# environment variables file. 
- # COMPUTE NODES {% for node in slurm_nodes %} -NodeName={{ node.name }} CPUs={{ node.cpus | default('1') }} Sockets={{ node.sockets | default('1') }} CoresPerSocket={{ node.cores_per_socket | default('1') }} ThreadsPerCore={{ node.threads_per_core | default('1') }} RealMemory={{ node.memory }} State=UNKNOWN +NodeName={{ node.name }} CPUs={{ node.cpus | default('1') }} RealMemory={{ node.memory }} Sockets={{ node.sockets | default('1') }} CoresPerSocket={{ node.cores_per_socket | default('1') }} ThreadsPerCore={{ node.threads_per_core | default('1') }} State=UNKNOWN {% endfor %} PartitionName=debug Priority=3500 Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/cluster-setup/deployment/roles/slurm_dependencies/README.md b/cluster-setup/deployment/roles/slurm_dependencies/README.md new file mode 100644 index 000000000..a4ddae8fa --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_dependencies/README.md @@ -0,0 +1 @@ +This role installs system dependencies for Slurm using apt. diff --git a/cluster-setup/deployment/roles/slurm_dependencies/tasks/main.yml b/cluster-setup/deployment/roles/slurm_dependencies/tasks/main.yml new file mode 100644 index 000000000..bfc2d8667 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_dependencies/tasks/main.yml @@ -0,0 +1,56 @@ +--- + +# Installing packages used in the building and running of Slurm, as per +# https://slurm.schedmd.com/quickstart_admin.html +# Certain packages need to be installed when Slurm is compiled to enable +# support for certain features. In the first play we indicate with +# comments which packages are used for which features. + +- name: install Slurm dependencies + become: true + apt: + name: + # cgroups: + - libdbus-1-dev + - hwloc + - libhwloc-dev + # AMD GPU support: + # FIXME this does not compile correctly; can we find a proper dev package? + - rocm-device-libs + # HDF5 job profiling: + - libhdf5-dev + # To generate HTML man pages: + - man2html + # InfiniBand accounting: + - libibmad-dev + - libibumad-dev + # Intel GPU support: + - libvpl-dev + # IPMI energy consumption: + # FIXME this does not compile correctly; are all the required headers in place? + - libfreeipmi-dev + # lua support: + - liblua5.4-dev + # MUNGE support: + - libmunge-dev + # MariaDB support: + - libmariadb-dev + # NUMA affinity: + - libnuma-dev + # NVIDIA GPU support: + - libnvidia-ml-dev + # PAM support: + - libpam0g-dev + # PMIx support: + - libpmix-dev + # Readline support: + - libreadline-dev + # REST API: + - libhttp-parser-dev + - libjson-c-dev + - libyaml-dev + - libjwt-dev + # RRD external sensor data collection: + - librrd-dev + # sview: + - libgtk2.0-dev diff --git a/cluster-setup/deployment/roles/slurm_node/README.md b/cluster-setup/deployment/roles/slurm_node/README.md new file mode 100644 index 000000000..f46cc6860 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_node/README.md @@ -0,0 +1,13 @@ +This role confirms that configuration files required by slurmd are in place +and then spins up the slurmd service. It depends on: +- the [munge node] role to set up the MUNGE authentication service; +- the [slurm dependencies] role to install slurmd's dependencies; and +- the [slurm configuration] role to create the `slurm` user and system directories + used by slurmd. + +In a typical cluster configuration, the configuration files required will be mounted +via NFS, so we don't actually install them in this role or in the dependencies. 
+ +[munge node]: ../munge_node +[slurm dependencies]: ../slurm_dependencies +[slurm configuration]: ../slurm_configuration diff --git a/cluster-setup/deployment/roles/slurm_node/meta/main.yml b/cluster-setup/deployment/roles/slurm_node/meta/main.yml new file mode 100644 index 000000000..e8045539d --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_node/meta/main.yml @@ -0,0 +1,6 @@ +--- + +dependencies: + - munge_node + - slurm_dependencies + - slurm_configuration diff --git a/cluster-setup/deployment/roles/slurm_node/tasks/main.yml b/cluster-setup/deployment/roles/slurm_node/tasks/main.yml new file mode 100644 index 000000000..c938d9453 --- /dev/null +++ b/cluster-setup/deployment/roles/slurm_node/tasks/main.yml @@ -0,0 +1,45 @@ +--- + +- name: check if Slurm is ready to go + block: + - name: check if the Slurm systemd service exists + stat: path=/usr/local/lib/systemd/system/slurmd.service + register: slurmd_service + + - name: fail if the systemd service isn't there + fail: + msg: "slurmd systemd service is not in place." + when: not slurmd_service.stat.exists + + - name: check if slurm.conf exists + stat: path=/usr/local/etc/slurm/slurm.conf + register: slurm_conf + + - name: fail if the conf file isn't there + fail: + msg: "slurm.conf is not in place." + when: not slurm_conf.stat.exists + + - name: check if cgroup.conf exists + stat: path=/usr/local/etc/slurm/cgroup.conf + register: cgroup_conf + + - name: fail if cgroup.conf file isn't there + fail: + msg: "cgroup.conf is not in place." + when: not cgroup_conf.stat.exists + +- name: create directory for Slurm spooling + become: true + file: + path: /var/spool/slurmd + owner: slurm + group: slurm + mode: '0755' + state: directory + +- name: enable slurmd service + systemd: + name: slurmd + state: started + enabled: true diff --git a/cluster-setup/deployment/roles/worker_node_networking/README.md b/cluster-setup/deployment/roles/worker_node_networking/README.md new file mode 100644 index 000000000..cb4677d30 --- /dev/null +++ b/cluster-setup/deployment/roles/worker_node_networking/README.md @@ -0,0 +1,5 @@ +This role sets up the networking infrastructure used by worker nodes, such as: +- the NFS client is installed and NFS volumes are mounted; +- ports are opened for slurmd and ssh; +- the original `/home` directory is moved aside so that `/data/home` on the head node + will be used as the home directory. 
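The tasks that follow mount `/data`, `/opt`, and `/usr/local` from the head node over NFS using a `head_internal_address` variable, which is expected to be defined in the inventory or group variables. A minimal sketch, assuming the cluster-internal addressing from `initialization/head/cluster_hosts`:

```yaml
# Illustrative variable assumed by the worker_node_networking role:
# the head node's address on the cluster-internal network
# (192.168.1.1 per initialization/head/cluster_hosts).
head_internal_address: 192.168.1.1
```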
diff --git a/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml b/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml new file mode 100644 index 000000000..2802588a9 --- /dev/null +++ b/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml @@ -0,0 +1,107 @@ +--- + +- name: set timezone + block: + - name: change the timezone + community.general.timezone: + name: America/Vancouver + - name: restart cron to reflect the new timezone + systemd: + name: cron + state: restarted + +- name: install NFS client software + become: true + apt: + name: + - nfs-common + state: present + +- name: ensure ufw is running + become: true + systemd: + name: ufw + state: started + enabled: true + +- name: open port for SSH access + become: true + community.general.ufw: + rule: allow + port: ssh + protocol: tcp + +# Originally this task opened ports: +# - 6817-6819/tcp +# - 6817-6819/udp +# - 7321/tcp +- name: open port for slurmctld to communicate with slurmd + become: true + community.general.ufw: + rule: allow + port: 6818 + protocol: tcp + +- name: mount /data + become: true + block: + - name: create the mount point + file: + path: /data + state: directory + - name: mount the drive + ansible.posix.mount: + path: /data + src: "{{ head_internal_address }}:/data" + fstype: nfs + state: mounted + +- name: mount /opt + become: true + ansible.posix.mount: + path: /opt + src: "{{ head_internal_address }}:/opt" + fstype: nfs + state: mounted + opts: ro + +- name: mount /usr/local + become: true + ansible.posix.mount: + path: /usr/local + src: "{{ head_internal_address }}:/usr/local" + fstype: nfs + state: mounted + opts: ro + +- name: use /data/home as the home directory + block: + - name: check if /data/home exists + stat: path=/data/home + register: data_home + + - name: fail if /data/home isn't prepared + fail: + msg: "/data/home is not in place." + when: not data_home.stat.exists + + rescue: + - name: inform user to set up the head node first and propagate failure to stop the playbook + fail: + msg: "Before setting up this node, make sure the head node is configured first!" 
+ +- name: check if original /home has been renamed + stat: path=/original_home + register: home_backed_up + +- name: rename original /home + become: true + command: mv /home /original_home + when: not home_backed_up.stat.exists + +- name: symbolic link for /home + become: true + file: + path: /home + src: /data/home + state: link diff --git a/cluster-setup/deployment/set_locale_to_canada.yml b/cluster-setup/deployment/set_locale_to_canada.yml new file mode 100644 index 000000000..4d58a5f5f --- /dev/null +++ b/cluster-setup/deployment/set_locale_to_canada.yml @@ -0,0 +1,14 @@ +--- + +- name: change the default locale to Canada English + hosts: all + tasks: + - name: make the locale available + become: true + community.general.locale_gen: + name: "en_CA.UTF-8" + state: present + + - name: set the default locale + become: true + command: update-locale LANG=en_CA.UTF-8 diff --git a/cluster-setup/deployment/slurm_setup.yml b/cluster-setup/deployment/slurm_setup.yml new file mode 100644 index 000000000..da41d1c66 --- /dev/null +++ b/cluster-setup/deployment/slurm_setup.yml @@ -0,0 +1,30 @@ +--- + +- name: configure head node + hosts: head + tasks: + - name: set up head node networking + include_role: + name: head_node_networking + - name: build Slurm + include_role: + name: slurm_builder + - name: configure and start slurmctld and supporting services + include_role: + name: slurm_controller + - name: configure and start slurmd + include_role: + name: slurm_node + +- name: configure workers + hosts: workers + tasks: + - name: synchronize users and groups from the head node + include_role: + name: copy_users_and_groups + - name: set up worker node networking + include_role: + name: worker_node_networking + - name: configure and start slurmd + include_role: + name: slurm_node diff --git a/cluster-setup/deployment/templates/cifs_credentials.j2 b/cluster-setup/deployment/templates/cifs_credentials.j2 new file mode 100644 index 000000000..2cb02ed30 --- /dev/null +++ b/cluster-setup/deployment/templates/cifs_credentials.j2 @@ -0,0 +1,3 @@ +username={{ cifs_username }} +password={{ cifs_password }} +domain={{ cifs_domain }} diff --git a/cluster-setup/export_users_and_groups.py b/cluster-setup/export_users_and_groups.py new file mode 100644 index 000000000..f44a63168 --- /dev/null +++ b/cluster-setup/export_users_and_groups.py @@ -0,0 +1,294 @@ +#! 
/usr/bin/env python + +import csv +from typing import Optional, Iterable, Mapping, TypedDict +from collections.abc import Container +from dataclasses import dataclass, field, asdict +from io import TextIOBase + +import argparse +import yaml + + +@dataclass +class ShadowEntry: + name: str + hashed_password: str + last_changed: Optional[int] + min: Optional[int] + max: Optional[int] + warn: Optional[int] + inactive: Optional[int] + expire: Optional[int] + +def int_or_none(possible_int_string: str) -> Optional[int]: + try: + return int(possible_int_string) + except ValueError: + return None + +def parse_shadow(shadow_file: TextIOBase) -> dict[str, ShadowEntry]: + shadow_csv = csv.reader(shadow_file, delimiter=":") + shadow_entries: dict[str, ShadowEntry] = {} + for row in shadow_csv: + name: str = row[0] + shadow_entries[name] = ShadowEntry( + name=name, + hashed_password=row[1], + last_changed=int_or_none(row[2]), + min=int_or_none(row[3]), + max=int_or_none(row[4]), + warn=int_or_none(row[5]), + inactive=int_or_none(row[6]), + expire=int_or_none(row[7]), + ) + return shadow_entries + +@dataclass +class PasswdEntry: + name: str + passwdx: str + uid: int + gid: int + info: str + home: str + shell: str + +def parse_passwd(passwd_file: TextIOBase) -> dict[str, PasswdEntry]: + passwd_csv = csv.reader(passwd_file, delimiter=":") + passwd_entries: dict[str, PasswdEntry] = {} + for row in passwd_csv: + name: str = row[0] + passwd_entries[name] = PasswdEntry( + name=name, + passwdx=row[1], + uid=int(row[2]), + gid=int(row[3]), + info=row[4], + home=row[5], + shell=row[6], + ) + return passwd_entries + +@dataclass +class GroupEntry: + name: str + passwdx: str + gid: int + users: list[str] + +def parse_group(group_file: TextIOBase) -> dict[int, GroupEntry]: + group_csv = csv.reader(group_file, delimiter=":") + group_entries: dict[GroupEntry] = {} + for row in group_csv: + gid: int = int(row[2]) + group_entries[gid] = GroupEntry( + name=row[0], + passwdx=row[1], + gid=gid, + users=row[3].split(","), + ) + return group_entries + +def get_other_groups_by_user( + users_to_export: Iterable[str], + groups_to_export: Container[str], + passwd_entries: dict[int, PasswdEntry], + group_entries: dict[int, GroupEntry], + old_sudo: Optional[str], + new_sudo: Optional[str], +) -> dict[str, list[str]]: + """ + Assemble a mapping of username to the (non-primary) groups that this user belongs to. 
+ """ + if old_sudo is not None: + assert new_sudo is not None, "Either both old and new sudo group names must be specified or neither" + + groups_by_user: dict[str, list[str]] = {} + for user in users_to_export: + groups_by_user[user] = [] + + for gid, group_entry in group_entries.items(): + group_name: str = group_entry.name + + if old_sudo is not None and group_name == old_sudo: + for user in group_entry.users: + if user in groups_by_user: + groups_by_user[user].append(new_sudo) + + elif group_name in groups_to_export: + for user in group_entry.users: + if user in groups_by_user: + passwd_entry: PasswdEntry = passwd_entries[user] + if gid != passwd_entry.gid: # check if this is the primary group + groups_by_user[user].append(group_name) + + return groups_by_user + + +@dataclass +class User: + name: str + hashed_password: str + uid: int + home: str + primary_group: str + groups: list[str] = field(default_factory=list) + +def create_user( + name: str, + passwd_entries: dict[str, PasswdEntry], + shadow_entries: dict[str, ShadowEntry], + group_entries: dict[str, GroupEntry], + groups_by_user: dict[str, list[str]], +) -> User: + passwd_entry: PasswdEntry = passwd_entries[name] + return User( + name=name, + hashed_password=shadow_entries[name].hashed_password, + uid=passwd_entry.uid, + home=passwd_entry.home, + primary_group=group_entries[passwd_entry.gid].name, + groups=groups_by_user[name] + ) + + +def get_user_primary_groups( + users: Iterable[User], + group_entries: dict[int, GroupEntry], +) -> dict[str, GroupEntry]: + primary_groups: dict[str, GroupEntry] = {} + groups_by_name: dict[str, GroupEntry] = {} + for group_entry in group_entries.values(): + groups_by_name[group_entry.name] = group_entry + for user in users: + primary_groups[user.name] = groups_by_name[user.primary_group] + return primary_groups + + +@dataclass +class ExportedUsersAndGroups: + users: list[User] + primary_groups: list[GroupEntry] + other_groups: list[GroupEntry] + + +def exported_users_and_groups( + users_to_export: Iterable[str], + groups_to_export: Container[str], + passwd_entries: Mapping[str, PasswdEntry], + shadow_entries: Mapping[str, ShadowEntry], + group_entries: Mapping[int, GroupEntry], + old_sudo: Optional[str], + new_sudo: Optional[str], +) -> ExportedUsersAndGroups: + + other_groups_by_user: dict[str, list[str]] = get_other_groups_by_user( + users_to_export, + groups_to_export, + passwd_entries, + group_entries, + old_sudo, + new_sudo, + ) + + users: dict[str, User] = {} + for username in users_to_export: + users[username] = create_user( + username, + passwd_entries, + shadow_entries, + group_entries, + other_groups_by_user, + ) + + primary_groups: dict[int, GroupEntry] = get_user_primary_groups( + users.values(), + group_entries, + ) + other_groups: dict[int, GroupEntry] = {} + for gid, group_entry in group_entries.items(): + if gid in primary_groups or group_entry.name not in groups_to_export: + continue + other_groups[gid] = group_entry + + return ExportedUsersAndGroups( + list(users.values()), + list(primary_groups.values()), + list(other_groups.values()), + ) + + +class SudoGroup(TypedDict): + old: Optional[str] = None + new: Optional[str] = None + +def main(): + parser = argparse.ArgumentParser( + "Collate user and group information for recreating them on a new server" + ) + parser.add_argument( + "--passwd", + help="The passwd file (as it appears in /etc/passwd on the original server)", + default="/etc/passwd", + ) + parser.add_argument( + "--shadow", + help="The shadow file (as it appears in 
/etc/shadow on the original server)", + default="/etc/shadow", + ) + parser.add_argument( + "--group", + help="The group file (as it appears in /etc/group on the original server)", + default="/etc/group", + ) + parser.add_argument( + "--out", + help="File to write the output YAML to (default out.yaml)", + default="out.yaml", + ) + parser.add_argument( + "users_and_groups", + help="YAML file with `users` (list of usernames to export) and `groups` (list of group names to export)", + ) + args = parser.parse_args() + + with open(args.users_and_groups, "r") as f: + users_and_groups = yaml.safe_load(f) + + users_to_export: list[str] = users_and_groups["users"] + groups_to_export: list[str] = users_and_groups["groups"] + sudo_group: SudoGroup = SudoGroup() + if users_and_groups.get("sudo_group") is not None: + sudo_group["old"] = users_and_groups["sudo_group"]["old"] + sudo_group["new"] = users_and_groups["sudo_group"]["new"] + + with open(args.passwd, "r") as f: + passwd_entries: dict[str, PasswdEntry] = parse_passwd(f) + + with open(args.shadow, "r") as f: + shadow_entries: dict[str, ShadowEntry] = parse_shadow(f) + + with open(args.group, "r") as f: + group_entries: dict[str, GroupEntry] = parse_group(f) + + for_export: ExportedUsersAndGroups = exported_users_and_groups( + users_to_export, + groups_to_export, + passwd_entries=passwd_entries, + shadow_entries=shadow_entries, + group_entries=group_entries, + old_sudo=sudo_group["old"], + new_sudo=sudo_group["new"], + ) + serialized = { + "users": [asdict(x) for x in for_export.users], + "primary_groups": [asdict(x) for x in for_export.primary_groups], + "other_groups": [asdict(x) for x in for_export.other_groups], + } + with open(args.out, "w") as f: + yaml.dump(serialized, f) + + +if __name__ == "__main__": + main() diff --git a/cluster-setup/initialization/head/cluster_hosts b/cluster-setup/initialization/head/cluster_hosts new file mode 100644 index 000000000..f5e637fd7 --- /dev/null +++ b/cluster-setup/initialization/head/cluster_hosts @@ -0,0 +1,14 @@ +192.168.69.179 bulbasaur bulby +192.168.69.86 octomore octy + +192.168.1.1 octomore head +192.168.1.2 b01 +192.168.1.3 b02 +192.168.1.4 b03 +192.168.1.5 b04 +192.168.1.6 b05 +192.168.1.7 b06 +192.168.1.8 b07a +192.168.1.9 b07b +192.168.1.10 b08a +192.168.1.11 b08b diff --git a/cluster-setup/initialization/head/create_head_user_data.py b/cluster-setup/initialization/head/create_head_user_data.py new file mode 100644 index 000000000..9c15619ed --- /dev/null +++ b/cluster-setup/initialization/head/create_head_user_data.py @@ -0,0 +1,40 @@ +#! 
/usr/bin/env python + +import argparse +import textwrap + +import yaml + + +def main(): + parser = argparse.ArgumentParser("Create cloud-init user-data for the head node") + parser.add_argument( + "--template", + help="Template file to insert the host mappings into", + default="user-data.template", + ) + parser.add_argument( + "--output", + help="File to write the resulting user-data file to", + default="user-data", + ) + parser.add_argument( + "host_mapping_yaml", + help="YAML file containing the compute node details in `compute_nodes`", + ) + args = parser.parse_args() + + with open(args.host_mapping_yaml, "r") as f: + host_mappings = yaml.safe_load(f)["compute_nodes"] + + host_mapping_str: str = "\n".join( + [f'{hm["name"]}\t{hm["ip"]}' for hm in host_mappings] + ) + host_mapping_str = textwrap.indent(host_mapping_str, " ") + with open(args.template, "r") as template: + with open(args.output, "w") as output: + output.write(template.read().format(host_mappings=host_mapping_str)) + + +if __name__ == "__main__": + main() diff --git a/cluster-setup/initialization/head/head_configuration.bash b/cluster-setup/initialization/head/head_configuration.bash new file mode 100644 index 000000000..21cf04e66 --- /dev/null +++ b/cluster-setup/initialization/head/head_configuration.bash @@ -0,0 +1,12 @@ +#! /usr/bin/bash + +# Run this as root on a vanilla installation of Jammy. + +apt update -y +apt upgrade -y +apt install -y python3 python3-pip + +python3 -m pip install -r requirements.txt +ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N "" +cat /root/.ssh/id_ed25519.pub >> /root/.ssh/authorized_keys +cat cluster_hosts >> /etc/hosts diff --git a/cluster-setup/requirements.txt b/cluster-setup/initialization/head/requirements.txt similarity index 100% rename from cluster-setup/requirements.txt rename to cluster-setup/initialization/head/requirements.txt diff --git a/cluster-setup/initialization/head/user-data.template b/cluster-setup/initialization/head/user-data.template new file mode 100644 index 000000000..0a3d5f727 --- /dev/null +++ b/cluster-setup/initialization/head/user-data.template @@ -0,0 +1,25 @@ +#cloud-config + +package_update: true + +packages: + - python3 + - python3-pip + +write_files: + - content: | + ansible==8.0.0 + PyMySQL==1.0.3 + psycopg2-binary==2.9.6 + path: /usr/local/src/requirements.txt + owner: root + permissions: '0644' + - content: | +{host_mappings} + path: /etc/hosts + append: true + +runcmd: + - [python3, -m, pip, install, -r, /usr/local/src/requirements.txt] + - [sudo, ssh-keygen, -t, ed25519, -f, /root/.ssh/id_ed25519, -N, ""] + - "sudo cat /root/.ssh/id_ed25519.pub >> /root/.ssh/authorized_keys" diff --git a/cluster-setup/initialization/worker/cluster_hosts_bulbasaur b/cluster-setup/initialization/worker/cluster_hosts_bulbasaur new file mode 100644 index 000000000..aa7ce4293 --- /dev/null +++ b/cluster-setup/initialization/worker/cluster_hosts_bulbasaur @@ -0,0 +1,14 @@ +192.168.69.179 bulbasaur bulby +192.168.69.86 octomore octy + +192.168.1.1 bulbasaur head +192.168.1.2 b01 +192.168.1.3 b02 +192.168.1.4 b03 +192.168.1.5 b04 +192.168.1.6 b05 +192.168.1.7 b06 +192.168.1.8 b07a +192.168.1.9 b07b +192.168.1.10 b08a +192.168.1.11 b08b diff --git a/cluster-setup/initialization/worker/cluster_hosts_octomore b/cluster-setup/initialization/worker/cluster_hosts_octomore new file mode 100644 index 000000000..f5e637fd7 --- /dev/null +++ b/cluster-setup/initialization/worker/cluster_hosts_octomore @@ -0,0 +1,14 @@ +192.168.69.179 bulbasaur bulby +192.168.69.86 octomore octy + 
+192.168.1.1 octomore head +192.168.1.2 b01 +192.168.1.3 b02 +192.168.1.4 b03 +192.168.1.5 b04 +192.168.1.6 b05 +192.168.1.7 b06 +192.168.1.8 b07a +192.168.1.9 b07b +192.168.1.10 b08a +192.168.1.11 b08b diff --git a/cluster-setup/initialization/worker/create_worker_user_data.py b/cluster-setup/initialization/worker/create_worker_user_data.py new file mode 100644 index 000000000..398689832 --- /dev/null +++ b/cluster-setup/initialization/worker/create_worker_user_data.py @@ -0,0 +1,33 @@ +#! /usr/bin/env python + +import argparse + + +def main(): + parser = argparse.ArgumentParser("Create cloud-init user-data.template for the worker nodes") + parser.add_argument( + "--template", + help="Template file to insert the root SSH public key into", + default="user-data.template", + ) + parser.add_argument( + "--output", + help="File to write the resulting user-data.template file to", + default="user-data", + ) + parser.add_argument( + "ssh_public_key", + help="SSH public key file to insert into the template" + ) + args = parser.parse_args() + + with open(args.ssh_public_key, "r") as f: + ssh_key: str = f.read().strip() + + with open(args.template, "r") as template: + with open(args.output, "w") as output: + output.write(template.read().format(root_ssh_public_key=ssh_key)) + + +if __name__ == "__main__": + main() diff --git a/cluster-setup/initialization/worker/user-data.template b/cluster-setup/initialization/worker/user-data.template new file mode 100644 index 000000000..03dba0edf --- /dev/null +++ b/cluster-setup/initialization/worker/user-data.template @@ -0,0 +1,11 @@ +#cloud-config + +package_update: true + +users: + - name: root + ssh_authorized_keys: + - {root_ssh_public_key} + +packages: + - python3 diff --git a/cluster-setup/initialization/worker/worker_configuration.bash b/cluster-setup/initialization/worker/worker_configuration.bash new file mode 100644 index 000000000..6277cc959 --- /dev/null +++ b/cluster-setup/initialization/worker/worker_configuration.bash @@ -0,0 +1,10 @@ +#! /usr/bin/bash + +# Run this as root on a vanilla installation of Jammy on the compute nodes. + +apt update -y +apt upgrade -y +apt install -y python3 + +cat head_node_root_id_ed25519.pub >> /root/.ssh/authorized_keys +cat cluster_hosts >> /etc/hosts diff --git a/cluster-setup/setup_ssh_access.bash b/cluster-setup/setup_ssh_access.bash new file mode 100644 index 000000000..ee514aeec --- /dev/null +++ b/cluster-setup/setup_ssh_access.bash @@ -0,0 +1,6 @@ +#! /usr/bin/env bash + +# Run this as root to set up passwordless SSH access. + +cat /vagrant/setupfiles/vagrant_testkey.pub >> /root/.ssh/authorized_keys +chmod 600 /root/.ssh/authorized_keys diff --git a/cluster-setup/setup_ssh_keys.bash b/cluster-setup/setup_ssh_keys.bash new file mode 100644 index 000000000..9f9b1c1da --- /dev/null +++ b/cluster-setup/setup_ssh_keys.bash @@ -0,0 +1,17 @@ +#! /usr/bin/env bash + +# Run this as root (using sudo) to install our "stock" SSH keys. 
+if [ -f /root/.ssh/id_ed25519 ] +then + cp /root/.ssh/id_ed25519 /root/.ssh/id_ed25519.bak +fi + +if [ -f /root/.ssh/id_ed25519.pub ] +then + cp /root/.ssh/id_ed25519.pub /root/.ssh/id_ed25519.pub.bak +fi + +cp /vagrant/setupfiles/vagrant_testkey /root/.ssh/id_ed25519 +cp /vagrant/setupfiles/vagrant_testkey.pub /root/.ssh/id_ed25519.pub +chmod 600 /root/.ssh/id_ed25519 +chmod 644 /root/.ssh/id_ed25519.pub diff --git a/cluster-setup/setupfiles/install-ansible.sh b/cluster-setup/setupfiles/install-ansible.sh index b005716e3..8080985f3 100644 --- a/cluster-setup/setupfiles/install-ansible.sh +++ b/cluster-setup/setupfiles/install-ansible.sh @@ -2,12 +2,14 @@ set -eu -o pipefail IFS=$'\t\n' -# Enable extra repositories -dnf install -q -y epel-release -dnf config-manager --set-enabled PowerTools +# # Enable extra repositories +# dnf install -q -y epel-release +# dnf config-manager --set-enabled PowerTools # Install Python3 -dnf install -q -y python3 +# dnf install -q -y python3 +apt update +apt install -y python3 python3-pip # Install Python packages python3 -m pip install -r /vagrant/requirements.txt \ No newline at end of file diff --git a/cluster-setup/testenv/ansible.cfg b/cluster-setup/testenv/ansible.cfg deleted file mode 100644 index 4a56a6b93..000000000 --- a/cluster-setup/testenv/ansible.cfg +++ /dev/null @@ -1,7 +0,0 @@ -# Ansible configuration for the test environment. -# See the following for available sections and keys: -# https://docs.ansible.com/ansible/latest/reference_appendices/config.html - -[defaults] -inventory = ./inventory.ini -interpreter_python = /usr/bin/python3.6 \ No newline at end of file diff --git a/cluster-setup/testenv/inventory.ini b/cluster-setup/testenv/inventory.ini deleted file mode 100644 index e38bf32f7..000000000 --- a/cluster-setup/testenv/inventory.ini +++ /dev/null @@ -1,7 +0,0 @@ -# Documentation on this file: -# https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#adding-variables-to-inventory - -head - -[workers] -worker diff --git a/cluster-setup/testenv/kive_dev_vars.yml b/cluster-setup/testenv/kive_dev_vars.yml deleted file mode 100644 index b8e0761ad..000000000 --- a/cluster-setup/testenv/kive_dev_vars.yml +++ /dev/null @@ -1,26 +0,0 @@ ---- -# Variables needed to set up Kive. -kive_allowed_hosts: "[\"*\"]" -kive_listen_port: 8080 -update_kive_source: yes - -# The following are sensitive, and should be kept secret for a production system. -kive_db_password: fixme-14mPdzu5vTOQG2DgtDG1inghQpMX0TBdUqEK6nVNHVo -kive_server_secret_key: fixme-kpXk1iKLbHn6-T7zieLHgADFA8ZSh5itd8k_Sp932fM - -# The following are defaults, and probably don't need to be changed. -# - DJango app settings -kive_venv: /opt/venv_kive -kive_slurm_path: "{{ kive_venv }}/bin" -kive_db_name: kive -kive_db_user: kive -kive_db_host: head -kive_media_root: /data/kive/media_root -kive_static_root: /var/www/html/kive/static -kive_root: /usr/local/share/Kive -# - httpd configuration -kive_httpd_user: kive -kive_httpd_group: kive -# - package variables -slurmbuilddir: "/root" - diff --git a/kive/kive/settings.py b/kive/kive/settings.py index 9aa6dc8f9..21962a0ea 100644 --- a/kive/kive/settings.py +++ b/kive/kive/settings.py @@ -212,7 +212,7 @@ LOG_HANDLER_NAMES.append('console') if ADMINS: LOG_HANDLER_NAMES.append('mail_admins') -LOG_LEVEL = os.environ.get('KIVE_LOG_LEVEL', 'WARN') +LOG_LEVEL = os.environ.get('KIVE_LOG_LEVEL', 'WARNING') # See http://docs.djangoproject.com/en/dev/topics/logging for # more details on how to customize your logging configuration. 
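For the `export_users_and_groups.py` script introduced above: it expects a YAML file listing the `users` and `groups` to export, plus an optional `sudo_group` mapping with `old` and `new` keys for translating the old system's admin group name. A hypothetical input file; the account and group names are placeholders:

```yaml
# Hypothetical users_and_groups input for export_users_and_groups.py.
# The usernames and group names are placeholders, not real accounts.
users:
  - alice
  - bob
groups:
  - labmembers
  - developers
# Optional: map the old distribution's admin group onto the new one,
# e.g. "wheel" on the old server to "sudo" on Ubuntu.
sudo_group:
  old: wheel
  new: sudo
```

Run against the preserved `/etc/passwd`, `/etc/shadow`, and `/etc/group` files, the script writes an `out.yaml` collating the users and groups, presumably the input consumed by the `copy_users_and_groups` role referenced in `slurm_setup.yml`.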
diff --git a/roles/kive_node/tasks/main.yml b/roles/kive_node/tasks/main.yml deleted file mode 100644 index b06806060..000000000 --- a/roles/kive_node/tasks/main.yml +++ /dev/null @@ -1,130 +0,0 @@ ---- - -- name: create kive user - become: true - user: - name: kive - system: yes - uid: 762 # random uid in system uid range (200, 999); hard-coded for consistency across hosts - - -# NOTE(nknight): this is done with `file` instead of during user creation so that we -# can set the permissions explicitly. -- name: create kive home directory - file: - path: /home/kive/ - state: directory - mode: "go-rx" - group: kive - owner: kive - - -- name: create kive app directories - become: true - loop: - - /etc/kive/ - - /var/kive/ - - /var/log/kive/ - - "{{ kive_media_root }}" - file: - path: "{{ item }}" - state: directory - mode: "2770" - owner: kive - group: kive - - -- name: kive environment configuration - become: true - become_user: kive - block: - - name: set kive environment variables - blockinfile: - path: /home/kive/.bash_profile - block: | - export KIVE_DB_NAME={{ kive_db_name }} - export KIVE_DB_USER={{ kive_db_user }} - export KIVE_DB_HOST={{ kive_db_host }} - export KIVE_DB_PASSWORD={{ kive_db_password }} - - export KIVE_MEDIA_ROOT={{ kive_media_root }} - export KIVE_STATIC_ROOT={{ kive_static_root }} - export KIVE_SLURM_PATH={{ kive_slurm_path }} - create: true # create the file if it doesn't exist - backup: true - owner: kive - group: kive - - -- name: fetch kive source code - become: true - git: - dest: "{{ kive_root }}" - repo: https://github.com/cfe-lab/Kive.git - version: "{{ kive_version | default('master') }}" - update: "{{ update_kive_source | default('no') }}" - - -- name: kive package dependencies - become: true - dnf: - name: - - platform-python-devel - - sqlite-devel - - words - - lsof - - graphviz - - graphviz-devel - - -- name: install kive python dependencies - become: true - block: - - name: create directory for virtualenv - file: - path: "{{ kive_venv }}" - state: directory - - name: copy requirements file to track changes - register: kive_requirements - copy: - dest: "{{ kive_venv }}/requirements.txt" - src: "{{ kive_root }}/requirements.txt" - - name: kive python dependencies - when: kive_requirements.changed - pip: - requirements: "{{ kive_root }}/requirements.txt" - virtualenv: "{{ kive_venv }}" - - -- name: install postgres database libraries - become: true - block: - - name: add postgresql GPG key - rpm_key: - state: present - key: https://download.postgresql.org/pub/repos/yum/RPM-GPG-KEY-PGDG - - name: add postgresql package repository - dnf: - name: https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x86_64/pgdg-redhat-repo-latest.noarch.rpm - - name: check if postgres is disabled - register: is_builtin_postgresql_disabled - # Check mode means this never writes the line, just checks if it's there. 
- check_mode: yes - lineinfile: - name: /etc/dnf/modules.d/postgresql.module - line: state=disabled - - name: disable built-in postgres module - when: (is_builtin_postgresql_disabled is changed) or (is_builtin_postgresql_disabled is failed) - command: - cmd: dnf -qy module disable postgresql - warn: false # dnf module doesn't include this sub-command, so have to use command directly - - name: install client libraries - dnf: - update_cache: true - name: postgresql12 - -- name: configure mail service for error logging - systemd: - name: postfix - state: started - enabled: true \ No newline at end of file diff --git a/roles/kive_server/files/001-kive.conf b/roles/kive_server/files/001-kive.conf deleted file mode 100644 index aa8bbca78..000000000 --- a/roles/kive_server/files/001-kive.conf +++ /dev/null @@ -1,15 +0,0 @@ -WSGIScriptAlias / /usr/local/share/Kive/kive/kive/wsgi.py -WSGIPythonPath /usr/local/share/Kive/kive:/opt/venv_kive/lib/python3.6/site-packages - - - -Require all granted - - - -Alias /static/ /var/www/html/kive/static/ - - -Order deny,allow -Allow from all - diff --git a/roles/singularity_node/tasks/main.yml b/roles/singularity_node/tasks/main.yml deleted file mode 100644 index 01db83ad4..000000000 --- a/roles/singularity_node/tasks/main.yml +++ /dev/null @@ -1,9 +0,0 @@ ---- - -# This role installs Singularity from dnf. - -- name: install singularity - become: true - dnf: - name: singularity-3.7.1 - state: present diff --git a/roles/slurm_controller/README.md b/roles/slurm_controller/README.md deleted file mode 100644 index b22837c1c..000000000 --- a/roles/slurm_controller/README.md +++ /dev/null @@ -1,10 +0,0 @@ -This role sets up a Slurm controller node. It builds on the [slurm node] -role. - -[slurm node]: ../slurm_node - -To set up the slurm controller and database daemons, it will: - -- Install and configure a MariaDB server -- Deploy additional configuration files -- Install the Slurm controller components diff --git a/roles/slurm_controller/meta/main.yml b/roles/slurm_controller/meta/main.yml deleted file mode 100644 index 4375d8034..000000000 --- a/roles/slurm_controller/meta/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- -dependencies: - - role: slurm_rpms - - role: slurm_node \ No newline at end of file diff --git a/roles/slurm_controller/tasks/main.yml b/roles/slurm_controller/tasks/main.yml deleted file mode 100644 index 7c8970fdd..000000000 --- a/roles/slurm_controller/tasks/main.yml +++ /dev/null @@ -1,103 +0,0 @@ ---- - -- name: NFS exports file - register: nfs_exports_file - blockinfile: - path: /etc/exports - block: | - /data 192.168.1.0/255.255.255.0(rw,sync,no_all_squash,no_root_squash) - /usr/local 192.168.1.0/255.255.255.0(ro,sync,no_root_squash) - /opt 192.168.1.0/255.255.255.0(ro,sync,no_root_squash) -- name: reload NFS exports - when: nfs_exports_file.changed - command: exportfs -r -- name: start NFS service - systemd: - name: nfs-server - state: started - enabled: true - -- name: install and start mariadb - become: true - become_user: root - tags: slurmdb - block: - - name: install mariadb - dnf: - name: - - mariadb-server - - mariadb-devel - state: present - - name: start mariadb service - systemd: - name: mariadb - state: started - enabled: true - - name: create slurm database user - tags: slurmdb - block: - - mysql_db: - name: slurm_acct_db - - mysql_user: - name: slurm - priv: "slurm_acct_db.*:all" - - -- name: copy slurmdbd configuration - become: true - copy: - src: "slurmdbd.conf" - dest: /etc/slurm/ - owner: slurm - group: slurm - mode: "644" - 
- -- name: install slurm - become: true - block: - - name: install slurm runtime requirements - dnf: - name: - - hwloc - - libibmad - - libibumad - - lua - - man2html - - numactl - - openssl - - pam-devel - - perl-devel - - rpm-build - - rrdtool-devel - - name: install slurm from rpm files - dnf: - name: - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-20.02.2-1.el8.x86_64.rpm" - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-example-configs-20.02.2-1.el8.x86_64.rpm" - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-slurmctld-20.02.2-1.el8.x86_64.rpm" - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-slurmdbd-20.02.2-1.el8.x86_64.rpm" - - name: configure slurm tmpfiles - copy: - content: "d /var/run/slurm 0755 slurm slurm" - dest: /usr/lib/tmpfiles.d/slurm.conf - - block: - - name: fix slurmdbd pidfile path in systemd unit - replace: - path: /usr/lib/systemd/system/slurmdbd.service - regexp: /var/run/slurmdbd.pid - replace: /var/run/slurm/slurmdbd.pid - - name: fix slurmctld pidfile path in systemd unit - replace: - path: /usr/lib/systemd/system/slurmctld.service - regexp: /var/run/slurmctld.pid - replace: /var/run/slurm/slurmctld.pid - - name: enable slurm services - loop: - - slurmdbd - - slurmctld - systemd: - daemon_reload: true # necessary because we've edited the unit file - name: "{{ item }}" - state: started - enabled: true diff --git a/roles/slurm_node/README.md b/roles/slurm_node/README.md deleted file mode 100644 index d6c0bfdb7..000000000 --- a/roles/slurm_node/README.md +++ /dev/null @@ -1,14 +0,0 @@ -This role creates users and deploys files that are common to the [Slurm -Controller] and [Slurm Workers]. It uses the [munge node] role to set up -the Munge authentication service - -[munge node]: ../munge_node -[Slurm Controller]: ../slurm_controller -[SLurm Workers]: ../slurm_worker - -It will: - -- Create a user called `slurm` with a consistent UID -- Creates the directories that Slurm requires -- Put copies of the shared configuration files (that `slurmd` and - `slurmctld` both use) in the appropriate places \ No newline at end of file diff --git a/roles/slurm_node/meta/main.yml b/roles/slurm_node/meta/main.yml deleted file mode 100644 index f6707d8e4..000000000 --- a/roles/slurm_node/meta/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- - -dependencies: - - munge_node \ No newline at end of file diff --git a/roles/slurm_node/tasks/main.yml b/roles/slurm_node/tasks/main.yml deleted file mode 100644 index e46d6ec74..000000000 --- a/roles/slurm_node/tasks/main.yml +++ /dev/null @@ -1,83 +0,0 @@ ---- - -- name: create slurm user - become: true - user: - name: slurm - system: yes - create_home: no - uid: 9634 - -- name: create slurm data directories - become: true - loop: - - /var/log/slurm - - /var/lib/slurm - - /etc/slurm - file: - path: "{{ item }}" - state: directory - owner: slurm - group: slurm - -- name: add slurm config files - become: true - block: - - name: copy cgroup config file - notify: reconfigure slurm - copy: - src: cgroup.conf - dest: /etc/slurm/ - owner: slurm - group: slurm - mode: "644" - - name: generate and copy slurm config file - notify: reconfigure slurm - template: - src: slurm.conf.j2 - dest: /etc/slurm/slurm.conf - owner: slurm - group: slurm - mode: "644" - - -- name: install slurmd - become: true - block: - - name: install slurm from rpm files - dnf: - disable_gpg_check: yes # We built these rpm files, so they're unsigned. 
- name: - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-20.02.2-1.el8.x86_64.rpm" - - "{{ slurmbuilddir }}/rpmbuild/RPMS/x86_64/slurm-slurmd-20.02.2-1.el8.x86_64.rpm" - - name: fix slurmd pidfile path in systemd unit - replace: - path: /usr/lib/systemd/system/slurmd.service - regexp: /var/run/slurmd.pid - replace: /var/run/slurm/slurmd.pid - - name: Create /var/run/slurm folder - blockinfile: - path: /usr/lib/systemd/system/slurmd.service - insertafter: \[Service\] - block: | - RuntimeDirectory=slurm - - - name: enable slurmd service - systemd: - daemon_reload: true # necessary because we edited the unit file - name: slurmd - state: started - enabled: true - - -- name: network drive mounts - become: true - block: - - loop: "{{ network_mounts }}" - name: Load network drive mounts from env_vars.yml - ansible.posix.mount: - path: "{{ item.path }}" - src: "{{ item.src }}" - fstype: "{{ item.fstype }}" - state: "{{ item.state }}" - opts: "{{ item.opts }}" diff --git a/roles/slurm_rpms/README.md b/roles/slurm_rpms/README.md deleted file mode 100644 index 7879401be..000000000 --- a/roles/slurm_rpms/README.md +++ /dev/null @@ -1,8 +0,0 @@ -This role fetches the Slurm source code, installs its dependencies, and builds -RPMs that can install the Slurm controller or worker daemons. - -It's used by the [slurm node] and [slurm controller] roles to install -different components of Slurm. - -[slurm node]: ../slurm_node -[slurm controller]: ../slurm_controller \ No newline at end of file diff --git a/roles/slurm_rpms/tasks/main.yml b/roles/slurm_rpms/tasks/main.yml deleted file mode 100644 index 58eb3609c..000000000 --- a/roles/slurm_rpms/tasks/main.yml +++ /dev/null @@ -1,49 +0,0 @@ ---- - -- name: build slurm RPM files - become: true - block: - - name: install development tools - dnf: - name: "@Development Tools" - - name: install slurm build requirements - dnf: - name: - - hwloc - - hwloc-devel - - libibmad - - libibumad - - lua - - lua-devel - - man2html - - mariadb-server - - mariadb-devel - - munge-devel - - ncurses-devel - - numactl - - numactl-devel - - openssl - - openssl-devel - - pam-devel - - perl-devel - - readline-devel - - rpm-build - - rrdtool-devel - - name: create temporary build directory - file: - path: "{{ slurmbuilddir }}" - state: directory - - name: check if slurm source files are already downloaded - stat: - path: "{{ slurmbuilddir }}/slurm-20.02.2.tar.bz2" - register: slurm_download - - name: fetch slurm source files - get_url: - url: "https://download.schedmd.com/slurm/slurm-20.02.2.tar.bz2" - dest: "{{ slurmbuilddir }}/slurm-20.02.2.tar.bz2" - when: not slurm_download.stat.exists - - name: build slurm rpm file - command: - cmd: rpmbuild -ta slurm-20.02.2.tar.bz2 - chdir: "{{ slurmbuilddir }}" - creates: "{{ slurmbuilddir }}/rpmbuild/" \ No newline at end of file diff --git a/roles/slurm_worker/README.md b/roles/slurm_worker/README.md deleted file mode 100644 index 4bf3fd6b7..000000000 --- a/roles/slurm_worker/README.md +++ /dev/null @@ -1,5 +0,0 @@ -This role installs and enables the Slurm worker daemon, `slurmd`. It uses the -[slurm node] role to perform the tasks that are common between Slurm node types. -This role is not needed on the head node. 
- -[slurm node]: ../slurm_rpms diff --git a/roles/slurm_worker/meta/main.yml b/roles/slurm_worker/meta/main.yml deleted file mode 100644 index 4375d8034..000000000 --- a/roles/slurm_worker/meta/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- -dependencies: - - role: slurm_rpms - - role: slurm_node \ No newline at end of file diff --git a/roles/slurm_worker/tasks/main.yml b/roles/slurm_worker/tasks/main.yml deleted file mode 100644 index 2bc199b04..000000000 --- a/roles/slurm_worker/tasks/main.yml +++ /dev/null @@ -1,127 +0,0 @@ ---- - -- become: true - block: - - name: ensure firewalld is running - systemd: - name: firewalld - state: started - enabled: true - - name: open slurm ports - loop: - - 6817-6819/tcp - - 6817-6819/udp - - 7321/tcp - ansible.posix.firewalld: - port: "{{ item }}" - state: enabled - permanent: true - immediate: true - - name: mount /data - ansible.posix.mount: - path: /data - src: "{{kive_db_host}}:/data" - fstype: nfs - state: mounted - - - name: mount /opt - ansible.posix.mount: - path: /opt - src: "{{kive_db_host}}:/opt" - fstype: nfs - state: mounted - opts: ro - - - name: mount /usr/local - ansible.posix.mount: - path: /usr/local - src: "{{kive_db_host}}:/usr/local" - fstype: nfs - state: mounted - opts: ro - - - name: check if original /home has been renamed - stat: path=/original_home - register: home_backed_up - - - name: rename original /home - command: mv /home /original_home - when: not home_backed_up.stat.exists - - - name: symbolic link for /home - file: - path: /home - src: /data/home - state: link - - - name: read system users - delegate_to: localhost - register: user_list - community.general.read_csv: - path: /etc/passwd - delimiter: ":" - fieldnames: - - name - - passwdx - - uid - - gid - - info - - home - - shell - - name: read system groups - delegate_to: localhost - register: group_list - community.general.read_csv: - path: /etc/group - delimiter: ":" - fieldnames: - - name - - passwdx - - gid - - users - - name: record group members - loop: "{{ group_list.list }}" - when: item.name in copied_groups - set_fact: - group_name: "{{ item.name }}" - group_members: "{{ item.users.split(',') }}" - register: system_groups - - name: build user groups - with_subelements: - - "{{ system_groups.results }}" - - ansible_facts.group_members - when: item.1 != '' - set_fact: - user_groups: "{{ user_groups | default({}) | combine({ item.1: [item.0.ansible_facts.group_name] }, list_merge='append') }}" - - name: read system passwords - delegate_to: localhost - register: shadow_dict - community.general.read_csv: - path: /etc/shadow - delimiter: ":" - key: name - fieldnames: - - name - - passwd - - lastchanged - - min - - max - - warn - - inactive - - expire - - name: copy system groups - loop: "{{ group_list.list }}" - when: > - (item.name in copied_groups) or - (item.name in shadow_dict.dict and shadow_dict.dict[item.name]['passwd'].startswith("$")) - group: - gid: "{{ item.gid }}" - name: "{{ item.name }}" - - name: copy system users - loop: "{{ user_list.list }}" - when: shadow_dict.dict[item.name]['passwd'].startswith("$") - user: - uid: "{{ item.uid }}" - create_home: no - name: "{{ item.name }}" - password: "{{ shadow_dict.dict[item.name]['passwd'] }}"