From c43f2cad516312282e1201906b9a577a843ff65a Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 29 Nov 2023 11:45:17 -0800 Subject: [PATCH] Some more tweaks and some WIP changes on the documentation. --- cluster-setup/{deployment => }/Dockerfile | 0 cluster-setup/README.md | 303 +++++++++++++----- cluster-setup/{deployment => }/compose.yaml | 0 .../deployment/create_backup_filesystem.yaml | 4 +- .../copy_users_and_groups/tasks/main.yml | 2 + .../roles/head_node_networking/tasks/main.yml | 9 +- .../deployment/roles/kive_node/tasks/main.yml | 1 + .../roles/kive_server/files/wsgi.load | 1 + .../roles/kive_server/tasks/main.yml | 110 ++++--- .../worker_node_networking/tasks/main.yml | 9 +- .../export_users_and_groups.py | 0 .../head/cluster_hosts | 0 .../head/create_head_user_data.py | 0 .../head/head_configuration.bash | 0 .../head/user-data.template | 0 .../worker/cluster_hosts | 0 .../worker/create_worker_user_data.py | 0 .../worker/user-data.template | 0 .../worker/worker_configuration.bash | 0 cluster-setup/requirements.txt | 3 - 20 files changed, 298 insertions(+), 144 deletions(-) rename cluster-setup/{deployment => }/Dockerfile (100%) rename cluster-setup/{deployment => }/compose.yaml (100%) create mode 100644 cluster-setup/deployment/roles/kive_server/files/wsgi.load rename cluster-setup/{deployment => }/export_users_and_groups.py (100%) rename cluster-setup/{cloud-init => initialization}/head/cluster_hosts (100%) rename cluster-setup/{cloud-init => initialization}/head/create_head_user_data.py (100%) rename cluster-setup/{cloud-init => initialization}/head/head_configuration.bash (100%) rename cluster-setup/{cloud-init => initialization}/head/user-data.template (100%) rename cluster-setup/{cloud-init => initialization}/worker/cluster_hosts (100%) rename cluster-setup/{cloud-init => initialization}/worker/create_worker_user_data.py (100%) rename cluster-setup/{cloud-init => initialization}/worker/user-data.template (100%) rename cluster-setup/{cloud-init => initialization}/worker/worker_configuration.bash (100%) delete mode 100644 cluster-setup/requirements.txt diff --git a/cluster-setup/deployment/Dockerfile b/cluster-setup/Dockerfile similarity index 100% rename from cluster-setup/deployment/Dockerfile rename to cluster-setup/Dockerfile diff --git a/cluster-setup/README.md b/cluster-setup/README.md index 84dcfa5d..5f3ec5f0 100644 --- a/cluster-setup/README.md +++ b/cluster-setup/README.md @@ -8,24 +8,25 @@ This procedure, as of November 22, 2023, looks like the following. ### Before you wipe the old machine -Make sure your backups are in order. System backups are typically kept using `rsnapshot`, +If you're planning to restore the data from the old machine after the deployment, +make sure your backups are in order. System backups are typically kept using `rsnapshot`, and a backup of the Kive PostgreSQL database is kept using `barman`. For example, on our production server, these are kept on a NAS mounted at `/media/dragonite`. -Preserve copies of your system's `/etc/passwd`, `/etc/group`, and `/etc/shadow`. This +There a few files that are worth preserving in particular and having available to you +during the deployment process: + +* Preserve copies of your system's `/etc/passwd`, `/etc/group`, and `/etc/shadow`. This information will be used to populate the new system with the same users and groups from the old system. - -Create a dump of the Kive PostgreSQL database using `pg_dumpall`. As the upgrade may +* Create a dump of the Kive PostgreSQL database using `pg_dumpall`. 
As the upgrade may
involve moving to a newer version of PostgreSQL, we likely can't use the Barman
backups to migrate from; thus we must do it the "old-fashioned" way.
-
-Preserve a copy of `/etc/kive/kive_apache.conf`. This file contains the database
+* Preserve a copy of `/etc/kive/kive_apache.conf`. This file contains the database
password used by Kive (via `apache2`) to access PostgreSQL. (You can also just
preserve this password and discard the file; the file should be present in the old
system's `rsnapshot` backups anyway if needed later.)
-
-Preserve a copy of the `barman` user's `.pgpass` file. This contains the passwords
+* Preserve a copy of the `barman` user's `.pgpass` file. This contains the passwords
used by the `barman` and `streaming_barman` users when connecting to PostgreSQL,
and keeping these makes it easier to get the database set back up after importing
the database from the old system.
@@ -41,10 +42,15 @@ First, manually install Ubuntu Jammy on the head node using an Ubuntu live USB d
with gateway 192.168.68.1 and DHCP server 192.168.168.101.
Once this is done, you can interact with the head node via SSH.

-Next, upload the contents of [cloud-init/head] to the server and run `head_configuration.bash`.
-This sets up the root user's SSH key and `/etc/hosts`, and installs Ansible on the head node.
+Next, upload the contents of [initialization/head] to the server and run `head_configuration.bash`
+using `sudo`.
+This sets up the root user's SSH key and `/etc/hosts`, and installs Ansible on the head node.
+Accept the defaults whenever it asks which services should be restarted.

Now that Ansible is available on the root node, most of the rest of the procedure will be done
-using Ansible playbooks defined in the [deployment] directory.
+using Ansible playbooks defined in the [deployment] directory. Copy the `cluster-setup` directory
+to the head node, e.g. using `rsync -avz`, placing it in a sensible location with the appropriate
+permissions. If you make changes, you can also use `rsync -avz` to keep them synchronized between
+your workstation and the head node.

#### Prepare Ansible configuration

@@ -66,12 +72,14 @@ probably `ansible_octomore.cfg`. These files will be necessary for Ansible to w

#### General preliminary setup

-The first thing to do with Ansible is to run the `octomore_preliminary_setup.yaml`
-playbook. Find the `/dev/disk/by-id/` entry that corresponds to the 10GB volume on the system
+The first playbook we will run sets up the `/data` partition, so the first thing we do
+is find the `/dev/disk/by-id/` entry that corresponds to the drive you want to use as `/data`
and put the *basename* (i.e. the name of the soft link in the directory without the
`/dev/disk/by-id/` part of the path) into `group_vars/all.yml` as the lone entry in the
`data_physical_volumes` list. (Or, if you wish to use several volumes combined into
-one logical volume, put all their names in this list.) This sets up the `/data` partition,
+one logical volume, put all their names in this list.)
+
+Now we can run the playbook `octomore_preliminary_setup.yaml`. This sets up the `/data` partition,
prepares some other system stuff on the head node, and configures the internal-facing
networking. With this in place, the playbook should set up an `ext4` volume at `/data` on the
drive you specified.
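+
+For illustration, a minimal sketch of this step (the device name below is hypothetical; use
+whatever basename `ls -l /dev/disk/by-id/` reports for your data drive):
+
+    ls -l /dev/disk/by-id/
+
+with the corresponding entry in `group_vars/all.yml` looking something like
+
+    data_physical_volumes:
+      - "ata-EXAMPLE_VENDOR_MODEL_SERIAL0000"
+
+after which the playbook is run (with `sudo`, like the other playbooks) from the [deployment]
+directory:
+
+    sudo ansible-playbook octomore_preliminary_setup.yaml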
@@ -90,62 +98,112 @@ These machines only have one hard drive, and their ethernet should automatically by default (the head node provides NAT and DHCP), so this should be a very straightforward installation. Again, create a user with username `ubuntu` to be the bootstrap user. -Now, upload the contents of [cloud-init/worker] to each compute node, along with the SSH -public key generated by the root user on the head node during the running of -`head_configuration.bash`. Then, run `worker_configuration.bash`, which will install +Fetch the SSH public key generated by the root user on the head node during the running of +`head_configuration.bash` and place it in the [initialization/worker] directory as +`head_node_root_id_ed25519.pub` (don't commit +this file to source control; it isn't a security risk, but it isn't needed and might +cause confusion later). Now, upload the contents of [initialization/worker] to each compute node, +along with the aforementioned SSH public key. Then, run `worker_configuration.bash` using `sudo`, +which will install the necessary packages and set up the necessary SSH access for the node to be used with Ansible. ### Annoying detour: reassign the bootstrap user's UID and GID -At this point, you can run `reassign_bootstrap_user_uid.yaml`, which is necessary because -the `ubuntu` bootstrap user on both machines has a UID and GID that overlaps with -a user account that will later be imported into this machine. You may need to create a *second* -bootstrap user to do this, as running the playbook as `ubuntu` may fail because the user -is currently being used (even if you use `sudo`). +At this point, your `ubuntu` user on all the machines may have a UID and GID of 1000. +Unfortunately, this likely conflicts with one of the user accounts that will later be +imported into this machine. If this is the case, you can run `reassign_bootstrap_user_uid.yaml`. +You may need to create a *second* bootstrap user to do this, as running the playbook as `ubuntu` +may fail because the user is currently being used (even if you use `sudo`). This second bootstrap +user can be removed right after this playbook is done, and you can proceed again as the `ubuntu` +user. ### Import users and groups from the old system The next playbook to run imports users from the old system. First, a YAML file must be prepared using `export_users_and_groups.py` from the old system's `/etc/shadow`, `/etc/passwd`, and -`/etc/group`. Next, run +`/etc/group`. (A Dockerfile and docker compose file are provided in this directory if you +need a simple environment with Python 3 to run the script.) Next, run sudo ansible-playbook --extra-vars "@[name of the produced YAML file]" import_users.yaml This will import user accounts into the head node. (These will later be synchronized to the compute node as part of a subsequent playbook.) +From here, you can lock and expire the `ubuntu` user and start using one of the just-imported accounts, +if you have one. Make sure that your uploaded `cluster-setup` directory is accessible by +the account you're using if you do so. + +### Get SSL credentials for the webserver + +Before you install Kive in the next step, you must get the SSL credentials for the server. +These must be acquired securely from IT or within the software group, and placed into the +[deployment] directory. *DO NOT* commit these files to source! + +The files needed are: + +* `DigiCertCA.crt`: the DigiCert certificate authority (CA) key, which specifies that DigiCert + issued the key. 
+* `star_cfe.crt`: the wildcard certificate issued by DigiCert, which certifies that this server + belongs to the `cfenet.ubc.ca` or `bccfe.ca` domain. +* `star_cfe.key`: our private signing key, used to issue a public key for HTTPS connections. + +These will then be used in the next step to configure Apache. + ### Install Kive With all of that table-setting in place, the main playbook to run is `kive_setup.yml`. This is the "main" playbook, and will take longer to run. -### Restore the Kive database +At this point, you should have a fresh, "empty" server. If that's your goal, then +you can stop here. The next steps restore data from an old server. + +## Restore from an old system + +If you are restoring an old system, make the backups available somewhere on +your system; e.g. at `/media/old_data` or a similar mount point. + +### Shut down Kive and backup services + +First, shut down the backup tasks that were created in the previous step: -At this point, you should have a fresh, "empty" server. You can now restore the Kive database -from the database dump you made earlier on the old system. + sudo systemctl stop barman_backup.timer + sudo systemctl stop rsnapshot_alpha.timer + sudo systemctl stop rsnapshot_beta.timer + sudo systemctl stop rsnapshot_gamma.timer -First, restore the Kive data folders from the old backups. On our prod and dev +Barman installs a cron job by default at the system level. For now, disable this +by commenting out the entry in `/etc/cron.d/barman`. + +Shut down the PostgreSQL database and `apache2` as well: + + sudo systemctl stop apache2 + sudo systemctl stop postgresql@14-main + +### Restoring the database + +Now, restore the Kive data folders from the old backups. On our prod and dev clusters this folder was `/data/kive`; use `rsync -avz` to copy this information into place on your new server. Assuming all has gone correctly with importing users and groups, the ownership of the files should be as they were on the old system. -With these in place, you can now restore the PostgreSQL database. First, -shut down `apache2` and `postgresql`: +Next, move the just-created PostgreSQL "cluster" to a backup location (or simply +delete it if you're very confident). On a fresh install, the cluster is at +`/var/lib/postgresql/14/main`. Move this to, for example, `/var/lib/postgresql/14/main_backup`. +Create a fresh empty cluster in the original location using `initdb`: -``` -sudo systemctl stop apache2 -sudo systemctl stop postgresql@14-main -``` + sudo -u postgres /usr/lib/postgresql/14/bin/initdb /var/lib/postgresql/14/main -Next you can restore the data using `psql` as the `postgres` user: +Next you can restore the database using `psql` as the `postgres` user. Bring up the database +again (this time with the fresh empty cluster) and use `psql` to load the data: ``` -sudo su -l postgres -psql -f [dumped file from the old system] postgres +sudo systemctl start postgresql@14-main +sudo -u postgres psql -f [dumped file from the old system] postgres ``` -Note that in the `psql` command, we specified the database `postgres`. This will actually -be ignored but should still be specified or else `psql` will complain. +Note that in the `psql` command, we specified the database `postgres`. This must be +specified (it's a mandatory parameter to `psql`) but will actually +be ignored. At this point, the database will have been restored to the old settings. If you didn't use it before in your Ansible configuration (i.e. 
in `group_vars/all.yaml`), you should
@@ -153,8 +211,60 @@ now either specify the PostgreSQL passwords preserved from the old system in
`/etc/kive/kive_apache.conf` and the `barman` user's `.pgpass`, or reset the passwords
using `psql` to the ones you used in your Ansible settings.

-[cloud-init/head]: ./cloud-init/head
-[cloud-init/worker]: ./cloud-init/worker
+### Restore other old user data
+
+This can be done at the leisure of each user, so long as the old backups are mounted.
+Use `rsync -avz` to move whatever user data you like back into place.
+
+### Start the backup tasks
+
+With everything in place, we're almost ready to start the regularly-scheduled
+backup `systemd` tasks. First, reactivate the Barman cron job by uncommenting
+the entry you commented out before in `/etc/cron.d/barman`. Then check on the
+`barman` configuration by running, as the `barman` user,
+
+    barman check kive
+
+There may still be problems with the configuration. If so, the log at `/var/log/barman/barman.log`
+may be helpful in diagnosing them. Some problems I experienced
+while going through the process:
+
+* The `barman` and `streaming_barman` PostgreSQL user passwords may be incorrect. This happened
+  because I didn't preserve these passwords from before I wiped out the database.
+  This can easily be remedied by changing these users' PostgreSQL passwords
+  in `psql` (as the `postgres` system user) with the command `\password [username]`;
+  use the passwords in the `barman` system user's `.pgpass` file.
+* The "replication slot" entry in the `barman check kive` output may report a failure.
+  One possible reason for this is that `barman cron` has not run successfully yet,
+  as in the previous steps we had disabled the system-level cron job that runs this
+  every minute. This task is what invokes `barman receive-wal`. If this appears to
+  be the problem, you can manually invoke `barman cron` as the `barman` user. (It's
+  worth trying this anyway, as it won't do any harm.) Or, you can wait one minute for
+  the cron job to run it and see if this error clears up.
+
+Even then, the setup may not be ready to go yet, as the WAL archiving must first be
+verified. To do this, as the `barman` user, run
+
+    barman switch-wal --force --archive kive
+
+This may fail at first due to a timeout, but try again; it's likely to succeed
+eventually if all is configured well. Check the configuration again to confirm
+that things are ready to go.
+
+Once everything checks out, restart the regularly-scheduled backup `systemd` tasks:
+
+* `barman_backup`
+* `rsnapshot_alpha`
+* `rsnapshot_beta`
+* `rsnapshot_gamma`
+
+For example, run `sudo systemctl start barman_backup.timer` to start `barman_backup`, and
+similarly for the others.
+
+[initialization/head]: initialization/head
+[initialization/worker]: initialization/worker
+[initialization]: initialization
[deployment]: ./deployment

## Test Environment

@@ -164,8 +274,28 @@ Vagrant.

### Multipass

-The [cloud-init] directory contains templates and scripts for generating cloud-init
-files to use when setting up a "head" VM and a "worker" VM. FIXME more instructions to come
+The [initialization] directory contains templates and scripts for generating cloud-init
+files to use when setting up a "head" VM and a "worker" VM.
+
+For the head configuration, you must supply a YAML file containing the names and IPs of
+the compute nodes in the same format as they appear in the Ansible `group_vars`; for example,
+simply copy `deployment/group_vars/default_template.yml` (these values are not hugely useful
+for this test deployment anyway). Specify this as a parameter to the `create_head_user_data.py`
+script and it will generate a `user_data` file suitable for use with Multipass:
+
+    multipass launch --name TestHead --cloud-init [user data file you generated] --mount [path to the cluster-setup directory]:/app
+
+For the worker configuration, you must put the SSH public key generated for the root user
+on the "head node" somewhere accessible to whoever will run `create_worker_user_data.py`,
+and specify it as the parameter. This creates a `user_data` file suitable for use with
+Multipass; then, similarly to the above,
+
+    multipass launch --name TestWorker --cloud-init [user data file you generated] --mount [path to the cluster-setup directory]:/app
+
+These commands launch the machines and also mount the `cluster-setup` directory at `/app`
+on both nodes. Now that both machines are online and have IP addresses, you can run
+`configure_hosts_file.bash` on the head node to configure its `/etc/hosts` file so that
+Ansible will know how to reach the worker node.

### Vagrant

@@ -175,56 +305,78 @@ cluster management tasks. Ansible is installed on the `head` node, and this dire
is mounted at `/vagrant`. Playbooks can be edited from the host machine, but should
be run from the `head` node.

-
-# Quickstart
-
-This will guide you through setting up your test environment and running your
-first Ansible commands. You'll need to have [Vagrant] and [VirtualBox] installed.
-
+You'll need to have [Vagrant] and [VirtualBox] or VMware installed.
To begin, bring up the Vagrant VMs. This will create two VMs (`head` and `worker`)
and install Ansible on `head`.

    vagrant up

-Next, log in to `head` and move into the test environment directory. This is where
-we'll do most of our testing and practice.
+On the head node, run (as root) `setup_ssh_keys.bash` and `setup_ssh_access.bash`; this will
+install some dummy keys to enable passwordless SSH from the root user to itself,
+which is necessary for Ansible.

-    vagrant ssh head
-    cd /vagrant/testenv
+On the compute node, run (as root) `setup_ssh_access.bash`, which will allow the head
+node's root user to SSH into the compute node without a password. This is also needed
+for Ansible.

-`ansible.cfg` contains the configuration for the test environment. Most
-importantly, it directs ansible to load its inventory from
-`testenv/inventory.ini` instead of from the default location under `/etc`.
+With both nodes running, you can use `configure_hosts_file.bash` on the head node,
+also as root, to fill in the head node's `/etc/hosts` file so that Ansible will know
+how to reach the compute node.

-From `./testenv`, you can run Ansible commands against the inventoried
-hosts (including the head node).
+At this point, you can log into the head node and work with the code in this directory
+at `/vagrant`. In particular, the Ansible scripts are located in `/vagrant/deployment`.

-This command runs the Ansible's `ping` module against all hosts, which checks that
-they can be accessed.
+To confirm that your Ansible configuration is correct, you can run this command:

    ansible -m ping all

+This command runs Ansible's `ping` module against all hosts, which checks that they can be accessed.
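+
+Putting the Vagrant steps together, a first session looks roughly like the following sketch
+(the in-VM locations of the helper scripts are assumptions here; run them from wherever they
+actually live under the mounted `/vagrant` directory):
+
+    # on the host machine: create the head and worker VMs
+    vagrant up
+
+    # on the head node (vagrant ssh head), as root:
+    bash setup_ssh_keys.bash          # dummy SSH keys for the root user
+    bash setup_ssh_access.bash        # allow passwordless SSH from root to itself
+
+    # on the worker node (vagrant ssh worker), as root:
+    bash setup_ssh_access.bash        # allow the head node's root user in
+
+    # back on the head node, as root:
+    bash configure_hosts_file.bash    # fill in /etc/hosts so the worker is reachable
+    cd /vagrant/deployment
+    ansible -m ping all
+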
[Vagrant]: https://www.vagrantup.com/downloads.html [VirtualBox]: https://www.virtualbox.org/wiki/Downloads +## Using Ansible -# Architecture (for lack of a better name) +`ansible.cfg` contains the configuration for the test environment. Most +importantly, it directs ansible to load its inventory from +`deployment/inventory.ini` instead of from the default location under `/etc`. + +From `./deployment`, you can run Ansible commands against the inventoried +hosts (including the head node). + +### Architecture (for lack of a better name) Ansible executes *tasks* against one or more managed machines. Tasks may also -depend on *variables*, *files*, or *templates*. These can be grouped into *roles*. +depend on *variables*, *files*, or *templates*. These can also be grouped into *roles*, +which we make use of in this project to help organize our code. + +### Running playbooks + +Run playbooks using `ansible-playbook`, e.g. + + ansible-playbook kive_setup.yml + +For all of our playbooks, you're intended to use `sudo` as well. -This project uses roles to configure servers (e.g. Slurm worker, Kive server). +#### Debugging a single role +Per [this](https://stackoverflow.com/questions/38350674/ansible-can-i-execute-role-from-command-line) +stack overflow answer, a single role can be run with the following command: + + ansible -m include_role -a name= + +This has more verbose output and can be run in isolation, making it suitable +for development and debugging. -# Ansible Docs +### Ansible documentation -Essential: +#### Essential - [Concepts](https://docs.ansible.com/ansible/latest/user_guide/basic_concepts.html) - [Quickstart](https://docs.ansible.com/ansible/latest/user_guide/quickstart.html) -Thorough: +#### Thorough - [Playbooks](https://docs.ansible.com/ansible/2.3/playbooks.html) - [How to build your inventory](https://docs.ansible.com/ansible/latest/user_guide/intro_inventory.html#intro-inventory) @@ -233,7 +385,7 @@ Thorough: - [Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html#playbooks-best-practices) - [Interpreter Discovery](https://docs.ansible.com/ansible/latest/reference_appendices/interpreter_discovery.html#interpreter-discovery) -Extended: +#### Extended - [Installation](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html#installation-guide) - [Become (privesc)](https://docs.ansible.com/ansible/2.3/become.html) @@ -241,8 +393,7 @@ Extended: - [Asynchronous Actions and Polling](https://docs.ansible.com/ansible/2.3/playbooks_async.html) - [Vault](https://docs.ansible.com/ansible/2.3/playbooks_vault.html) - -# Useful modules +#### Useful modules - [copy](https://docs.ansible.com/ansible/latest/modules/copy_module.html#copy-module) - [user](https://docs.ansible.com/ansible/latest/modules/user_module.html#user-module) @@ -259,18 +410,4 @@ Extended: - [lineinfile](https://docs.ansible.com/ansible/latest/modules/lineinfile_module.html) - [blockinfile](https://docs.ansible.com/ansible/latest/modules/blockinfile_module.html#blockinfile-module) - [git](https://docs.ansible.com/ansible/latest/modules/git_module.html#git-module) -- [unarchive](https://docs.ansible.com/ansible/latest/modules/unarchive_module.html) - -# Applying a single role - -Per [this](https://stackoverflow.com/questions/38350674/ansible-can-i-execute-role-from-command-line) -stack overflow answer, a single role can be run with the following command: - - ansible -m include_role -a name= - -This has more verbose output and can be run in isolation, 
making it suitable -for development and debugging. - - - - +- [unarchive](https://docs.ansible.com/ansible/latest/modules/unarchive_module.html) \ No newline at end of file diff --git a/cluster-setup/deployment/compose.yaml b/cluster-setup/compose.yaml similarity index 100% rename from cluster-setup/deployment/compose.yaml rename to cluster-setup/compose.yaml diff --git a/cluster-setup/deployment/create_backup_filesystem.yaml b/cluster-setup/deployment/create_backup_filesystem.yaml index 20114fdf..fc52345a 100644 --- a/cluster-setup/deployment/create_backup_filesystem.yaml +++ b/cluster-setup/deployment/create_backup_filesystem.yaml @@ -4,9 +4,7 @@ hosts: head vars: drive_identifiers: - - "ata-ST8000AS0002-1NA17Z_Z840VBQ5" - - "scsi-0ATA_WDC_WD2000FYYZ-0_WD-WMC1P0H2KH10" - - "scsi-0ATA_WDC_WD2000FYYZ-0_WD-WMC1P0H3J8XT" + - "ata-ST10000NM001G-2MW103_ZS51H7QX" tasks: - name: create a single partition on each of the physical volumes loop: "{{ drive_identifiers }}" diff --git a/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml b/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml index 24185afb..819dfdcd 100644 --- a/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml +++ b/cluster-setup/deployment/roles/copy_users_and_groups/tasks/main.yml @@ -79,3 +79,5 @@ password: "{{ shadow_dict.dict[item.name]['passwd'] }}" group: "{{ item.name }}" groups: "{{ user_groups[item.name] | default([]) }}" + append: true + shell: "{{ default_shell }}" diff --git a/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml b/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml index 22d4c7fe..30e8f6dc 100644 --- a/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml +++ b/cluster-setup/deployment/roles/head_node_networking/tasks/main.yml @@ -1,7 +1,14 @@ --- - name: set timezone - community.general.timezone: America/Vancouver + block: + - name: change the timezone + community.general.timezone: + name: America/Vancouver + - name: restart cron to reflect the new timezone + systemd: + name: cron + state: restarted - name: check if original /home has been renamed stat: path=/data/home diff --git a/cluster-setup/deployment/roles/kive_node/tasks/main.yml b/cluster-setup/deployment/roles/kive_node/tasks/main.yml index 54eefd5f..d49395bf 100644 --- a/cluster-setup/deployment/roles/kive_node/tasks/main.yml +++ b/cluster-setup/deployment/roles/kive_node/tasks/main.yml @@ -13,6 +13,7 @@ - "{{ kive_python_package }}-distutils" - "{{ kive_python_package }}-venv" - "{{ kive_python_package }}-dev" + - "lib{{ kive_python_package }}-dev" - name: install pip for this version of Python become: true diff --git a/cluster-setup/deployment/roles/kive_server/files/wsgi.load b/cluster-setup/deployment/roles/kive_server/files/wsgi.load new file mode 100644 index 00000000..d76d1d7a --- /dev/null +++ b/cluster-setup/deployment/roles/kive_server/files/wsgi.load @@ -0,0 +1 @@ +LoadModule wsgi_module /usr/lib/apache2/modules/mod_wsgi.so diff --git a/cluster-setup/deployment/roles/kive_server/tasks/main.yml b/cluster-setup/deployment/roles/kive_server/tasks/main.yml index a2c6c164..35459071 100644 --- a/cluster-setup/deployment/roles/kive_server/tasks/main.yml +++ b/cluster-setup/deployment/roles/kive_server/tasks/main.yml @@ -62,6 +62,33 @@ - barman-cli - rsnapshot +- name: fetch kive source code + become: true + git: + dest: "{{ kive_root }}" + repo: https://github.com/cfe-lab/Kive.git + version: "{{ kive_version | default('master') }}" + update: "{{ 
update_kive_source | default('no') }}" + +- name: set up the Kive Python virtualenv + become: true + block: + - name: create directory for virtualenv + file: + path: "{{ kive_venv }}" + state: directory + - name: copy requirements file to track changes + register: kive_requirements + copy: + dest: "{{ kive_venv }}/requirements.txt" + src: "{{ kive_root }}/requirements.txt" + - name: kive python dependencies + when: kive_requirements.changed + pip: + requirements: "{{ kive_root }}/requirements.txt" + virtualenv_command: "{{ kive_python_executable }} -m venv" + virtualenv: "{{ kive_venv }}" + - name: install mod_wsgi from source become: true block: @@ -96,7 +123,7 @@ command: argv: - "/usr/local/src/mod_wsgi/configure" - - "--with-python={{ kive_venv }}/bin/python" + - "--with-python=/usr/bin/python3.7" chdir: "/usr/local/src/mod_wsgi" creates: "/usr/local/src/mod_wsgi/Makefile" @@ -105,6 +132,11 @@ chdir: "/usr/local/src/mod_wsgi" target: install + - name: add wsgi to the "modules available" + copy: + src: wsgi.load + dest: /etc/apache2/mods-available + - name: enable the wsgi module in apache community.general.apache2_module: state: present @@ -145,35 +177,6 @@ group: root -- name: fetch kive source code - become: true - git: - dest: "{{ kive_root }}" - repo: https://github.com/cfe-lab/Kive.git - version: "{{ kive_version | default('master') }}" - update: "{{ update_kive_source | default('no') }}" - - -- name: install kive python dependencies - become: true - block: - - name: create directory for virtualenv - file: - path: "{{ kive_venv }}" - state: directory - - name: copy requirements file to track changes - register: kive_requirements - copy: - dest: "{{ kive_venv }}/requirements.txt" - src: "{{ kive_root }}/requirements.txt" - - name: kive python dependencies - when: kive_requirements.changed - pip: - requirements: "{{ kive_root }}/requirements.txt" - virtualenv_command: "{{ kive_python_executable }} -m venv" - virtualenv: "{{ kive_venv }}" - - - name: set up log purging become: true block: @@ -284,7 +287,7 @@ command: cmd: "a2ensite 001-kive-ssl" - - name: enable apache2 + - name: enable and (re)start apache2 systemd: name: apache2 state: restarted @@ -338,17 +341,18 @@ rule: allow state: enabled +- name: enable and start database service + become: true + systemd: + name: "postgresql@14-main" + state: started + enabled: true + - name: configure postgres server become: true become_user: postgres block: - - name: start database service - systemd: - name: "postgresql" - state: started - enabled: true - - name: add kive entries to pg_hba.conf block: - name: local connections @@ -457,7 +461,7 @@ chdir: "{{ kive_root }}/api/" creates: "{{ kive_root }}/api/build/" - name: collect kive's static files - notify: restart web server + # notify: restart web server environment: KIVE_STATIC_ROOT: "{{ kive_static_root }}" community.general.django_manage: @@ -544,6 +548,17 @@ owner: barman group: barman +- name: barman passwords file + blockinfile: + path: /var/lib/barman/.pgpass + create: yes + owner: barman + group: barman + mode: u=rw,g=,o= + block: | + localhost:*:*:barman:{{ barman_password }} + localhost:*:*:streaming_barman:{{ streaming_barman_password }} + - name: database backup kive config loop: - option: path_prefix @@ -581,10 +596,10 @@ option: "{{ item.option }}" value: "{{ item.value }}" -- name: force a WAL switch to verify the WAL archiving process - become: true - become_user: barman - command: barman switch-wal --force --archive kive +#- name: force a WAL switch to verify the WAL 
archiving process +# become: true +# become_user: barman +# command: barman switch-wal --force --archive kive - name: SSH keys block: @@ -622,17 +637,6 @@ user: postgres key: "{{ lookup('file', '/var/lib/barman/.ssh/id_rsa.pub') }}" -- name: barman passwords file - blockinfile: - path: /var/lib/barman/.pgpass - create: yes - owner: barman - group: barman - mode: u=rw,g=,o= - block: | - localhost:*:*:barman:{{ barman_password }} - localhost:*:*:streaming_barman:{{ streaming_barman_password }} - # This variable should be a JSON-formatted list of 2-lists, like # [["User One", "userone@bccfe.ca"], ..., ["User N", "userN@bccfe.ca"]] - name: parse admin e-mails from json diff --git a/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml b/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml index 7c54286f..2802588a 100644 --- a/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml +++ b/cluster-setup/deployment/roles/worker_node_networking/tasks/main.yml @@ -1,7 +1,14 @@ --- - name: set timezone - community.general.timezone: America/Vancouver + block: + - name: change the timezone + community.general.timezone: + name: America/Vancouver + - name: restart cron to reflect the new timezone + systemd: + name: cron + state: restarted - name: install NFS client software become: true diff --git a/cluster-setup/deployment/export_users_and_groups.py b/cluster-setup/export_users_and_groups.py similarity index 100% rename from cluster-setup/deployment/export_users_and_groups.py rename to cluster-setup/export_users_and_groups.py diff --git a/cluster-setup/cloud-init/head/cluster_hosts b/cluster-setup/initialization/head/cluster_hosts similarity index 100% rename from cluster-setup/cloud-init/head/cluster_hosts rename to cluster-setup/initialization/head/cluster_hosts diff --git a/cluster-setup/cloud-init/head/create_head_user_data.py b/cluster-setup/initialization/head/create_head_user_data.py similarity index 100% rename from cluster-setup/cloud-init/head/create_head_user_data.py rename to cluster-setup/initialization/head/create_head_user_data.py diff --git a/cluster-setup/cloud-init/head/head_configuration.bash b/cluster-setup/initialization/head/head_configuration.bash similarity index 100% rename from cluster-setup/cloud-init/head/head_configuration.bash rename to cluster-setup/initialization/head/head_configuration.bash diff --git a/cluster-setup/cloud-init/head/user-data.template b/cluster-setup/initialization/head/user-data.template similarity index 100% rename from cluster-setup/cloud-init/head/user-data.template rename to cluster-setup/initialization/head/user-data.template diff --git a/cluster-setup/cloud-init/worker/cluster_hosts b/cluster-setup/initialization/worker/cluster_hosts similarity index 100% rename from cluster-setup/cloud-init/worker/cluster_hosts rename to cluster-setup/initialization/worker/cluster_hosts diff --git a/cluster-setup/cloud-init/worker/create_worker_user_data.py b/cluster-setup/initialization/worker/create_worker_user_data.py similarity index 100% rename from cluster-setup/cloud-init/worker/create_worker_user_data.py rename to cluster-setup/initialization/worker/create_worker_user_data.py diff --git a/cluster-setup/cloud-init/worker/user-data.template b/cluster-setup/initialization/worker/user-data.template similarity index 100% rename from cluster-setup/cloud-init/worker/user-data.template rename to cluster-setup/initialization/worker/user-data.template diff --git a/cluster-setup/cloud-init/worker/worker_configuration.bash 
b/cluster-setup/initialization/worker/worker_configuration.bash similarity index 100% rename from cluster-setup/cloud-init/worker/worker_configuration.bash rename to cluster-setup/initialization/worker/worker_configuration.bash diff --git a/cluster-setup/requirements.txt b/cluster-setup/requirements.txt deleted file mode 100644 index 9e284baf..00000000 --- a/cluster-setup/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -ansible==8.0.0 -PyMySQL==1.0.3 -psycopg2-binary==2.9.6 \ No newline at end of file