diff --git a/.envrc b/.envrc index a4b55848..0eb5c961 100644 --- a/.envrc +++ b/.envrc @@ -1,5 +1,6 @@ #shellcheck disable=SC2148,SC2155 export KUBECONFIG="$(expand_path ./kubeconfig)" export SOPS_AGE_KEY_FILE="$(expand_path ./age.key)" +export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES export TALOSCONFIG="$(expand_path ./kubernetes/talos/clusterconfig/talosconfig)" use flake diff --git a/.sops.yaml b/.sops.yaml index ef98b50f..a2bd7ee9 100644 --- a/.sops.yaml +++ b/.sops.yaml @@ -10,3 +10,7 @@ creation_rules: key_groups: - age: - "age1k5xl02aujw4rsgghnnd0sdymmwd095w5nqgjvf76warwrdc0uqpqsm2x8m" + - path_regex: .*\.sops\.ya?ml + key_groups: + - age: + - "age1k5xl02aujw4rsgghnnd0sdymmwd095w5nqgjvf76warwrdc0uqpqsm2x8m" diff --git a/.taskfiles/Ansible/Taskfile.yaml b/.taskfiles/Ansible/Taskfile.yaml new file mode 100644 index 00000000..41ba250c --- /dev/null +++ b/.taskfiles/Ansible/Taskfile.yaml @@ -0,0 +1,18 @@ +--- +# yaml-language-server: $schema=https://taskfile.dev/schema.json +version: "3" + +vars: + ANSIBLE_INVENTORY_DIR: "{{.ANSIBLE_DIR}}/inventory" + ANSIBLE_PLAYBOOK_DIR: "{{.ANSIBLE_DIR}}/playbooks" + +tasks: + proxmox-setup: + desc: Run Ansible setup playbook on the nodes + cmds: + - ansible-playbook -i {{.ANSIBLE_INVENTORY_DIR}}/hosts.yaml {{.ANSIBLE_PLAYBOOK_DIR}}/proxmox-setup.yaml -v + + proxmox-update: + desc: Update proxmox packages + cmds: + - ansible-playbook -i {{.ANSIBLE_INVENTORY_DIR}}/hosts.yaml {{.ANSIBLE_PLAYBOOK_DIR}}/proxmox-apt-upgrade.yaml diff --git a/README.md b/README.md index 439c18ec..26643d5b 100644 --- a/README.md +++ b/README.md @@ -29,16 +29,30 @@ exploring Kubernetes and Infrastructure as Code (IaC) practices using tools like ## ๐Ÿ“– Table of contents - [๐Ÿผ Overview](#-overview) - - [๐Ÿ“– Table of contents](#-table-of-contents) - - [๐Ÿ“š Documentation](#-documentation) - - [๐Ÿ–ฅ๏ธ Technological Stack](#-technological-stack) - - [๐Ÿ”ง Hardware](#-hardware) - - [โ˜๏ธ External Dependencies](#-external-dependencies) - - [๐Ÿค– Automation](#-automation) - - [๐Ÿค Thanks](#-thanks) +- [๐Ÿ“– Table of contents](#-table-of-contents) +- [๐Ÿ“š Documentation](#-documentation) +- [๐Ÿ–ฅ๏ธ Technological Stack](#-technological-stack) +- [๐Ÿ”ง Hardware](#-hardware) +- [โ˜๏ธ External Dependencies](#-external-dependencies) +- [๐Ÿค– Automation](#-automation) +- [๐Ÿค Thanks](#-thanks) ## ๐Ÿ“š Documentation +1. [Prerequisites](docs/prerequisites.md) + - [Cloudflare](docs/prerequisites.md#1-set-up-cloudflare) + - [Secrets store](docs/prerequisites.md#2-set-up-secrets-store) + - [UDM](docs/prerequisites.md#3-set-up-udm) + - [Discord](docs/prerequisites.md#4-get-discord-token) + - [PiHole](docs/prerequisites.md#5-set-up-pihole-and-generate-token-for-homepage) + - [NAS and Minio](docs/prerequisites.md#6-nas-set-up) +2. [Setup Guide](docs/set-up.md) + - [Install and Configure Proxmox](docs/set-up.md#install-and-configure-proxmox) + - [Create and Install Talos Images](docs/set-up.md#create-and-install-talos-images) + - [Bootstrap Kubernetes Cluster](docs/set-up.md#bootstrap-kubernetes-cluster) + - [Install Flux](docs/set-up.md#install-flux) +3. 
[How To](docs/howto.md)
+
 ## 🖥️ Technological Stack

 | | Name | Description |
@@ -74,17 +88,17 @@ exploring Kubernetes and Infrastructure as Code (IaC) practices using tools like
 rack

-| Device | Count | Disk Size | RAM | OS | Purpose |
-|----------------------------|-------|-----------|------|---------|-------------------------|
-| Lenovo M910Q Tiny i5-6500T | 3 | 256G | 32GB | Talos | Kubernetes Master Nodes |
-| Raspberry Pi 5 | 1 | | 8GB | RpiOS | DNS, SmartHome |
-| Synology RS422+ | 1 | 4x16TB | 2GB | DSM | NAS |
-| UPS 5UTRA91227 | 1 | | | | UPS |
-| UniFi UDM Pro | 1 | | | UnifiOS | Router |
-| UniFi USW PRO 24 Gen2 | 1 | | | | Switch |
-| UniFi USW Lite 8 | 1 | | | | Switch |
-| UniFi U6 In-Wall | 1 | | | | Access Point |
-| UniFi U6 Mesh | 1 | | | | Access Point |
+| Device | Count | Disk Size | RAM | OS | Purpose |
+|----------------------------|-------|------------|------|---------|-------------------------|
+| Lenovo M910Q Tiny i5-6500T | 3 | 2x1TB SSD | 32GB | Talos | Kubernetes Master Nodes |
+| Raspberry Pi 5 | 1 | | 8GB | RpiOS | DNS, SmartHome |
+| Synology RS422+ | 1 | 4x16TB HDD | 2GB | DSM | NAS |
+| UPS 5UTRA91227 | 1 | | | | UPS |
+| UniFi UDM Pro | 1 | | | UnifiOS | Router |
+| UniFi USW PRO 24 Gen2 | 1 | | | | Switch |
+| UniFi USW Lite 8 | 1 | | | | Switch |
+| UniFi U6 In-Wall | 1 | | | | Access Point |
+| UniFi U6 Mesh | 1 | | | | Access Point |

 ## ☁️ External Dependencies

diff --git a/Taskfile.yaml b/Taskfile.yaml
index 0940a1c3..b5c85d94 100644
--- a/Taskfile.yaml
+++ b/Taskfile.yaml
@@ -5,13 +5,17 @@ version: "3"
 vars:
   # Directories
   KUBERNETES_DIR: "{{.ROOT_DIR}}/kubernetes"
+  INFRA_DIR: "{{.ROOT_DIR}}/infrastructure"
+  ANSIBLE_DIR: "{{.INFRA_DIR}}/ansible"
   # Files
   AGE_FILE: "{{.ROOT_DIR}}/age.key"
   KUBECONFIG_FILE: "{{.ROOT_DIR}}/kubeconfig"
+  INFRA_SECRETS_FILE: "{{.INFRA_DIR}}/secrets.sops.yaml"

 env:
   KUBECONFIG: "{{.KUBECONFIG_FILE}}"
   SOPS_AGE_KEY_FILE: "{{.AGE_FILE}}"
+  INFRA_SECRETS_FILE: "{{.INFRA_SECRETS_FILE}}"

 includes:
   kubernetes:
@@ -21,6 +25,7 @@ includes:
   talos: .taskfiles/Talos/Taskfile.yaml
   sops: .taskfiles/Sops/Taskfile.yaml
   volsync: .taskfiles/VolSync/Taskfile.yaml
+  ansible: .taskfiles/Ansible/Taskfile.yaml
   secrets: .taskfiles/ExternalSecrets/Taskfile.yaml

 tasks:

diff --git a/docs/howto.md b/docs/howto.md
new file mode 100644
index 00000000..27ffbc95
--- /dev/null
+++ b/docs/howto.md
@@ -0,0 +1,72 @@
+## How to
+
+### Reset node ephemeral storage
+
+If local hostpath PVs consume all of a node's storage and fill up the disk, the only way to recover is to completely
+wipe the ephemeral partition. This can be done with the following command:
+
+```sh
+talosctl --talosconfig=./kubernetes/bootstrap/talos/clusterconfig/talosconfig --nodes=[NODE_IP] reset --system-labels-to-wipe EPHEMERAL
+```
+
+1. Start the node from the Proxmox UI.
+2. Manually delete all previous PVCs and PVs for a local-hostpath storage class that were hosted on the node.
+3. Manually delete the pods so they are recreated.
+
+### Upgrade SSD storage
+
+1. Add a new SSD to the machine.
+2. Wipe it from the Proxmox UI and press "Initialize Disk with GPT."
+3. Create a new LVM volume group. LVM also supports snapshots, though they are probably not needed here.
+4. Add the disk as hardware to the VM. Don't forget to disable backup.
+
+### Replace a node
+
+1. Reset the Talos node:
+   ```sh
+   talosctl --talosconfig=./kubernetes/bootstrap/talos/clusterconfig/talosconfig --nodes=[node-ip] reset
+   ```
+2. Delete the node from Kubernetes:
+   ```sh
+   kubectl delete node [node-name]
+   ```
+3. Delete the node from the Proxmox cluster. SSH to an existing node and run:
+   ```sh
+   pvecm delnode [node-name]
+   ```
+   where `[node-name]` is the node's name in the Proxmox cluster configuration.
+4. Delete the leftover information about the node from `/etc/pve/nodes` on the remaining Proxmox machines.
+5. Continue with the [setup guide](./set-up.md) until the cluster bootstrapping step.
+6. Apply the configuration to the new node:
+   ```sh
+   talosctl apply-config --talosconfig=./clusterconfig/talosconfig --nodes=[node-ip] --file=./clusterconfig/home-kubernetes-k8s-control-1.yaml --insecure
+   ```
+
+### Remove Cluster Info from Proxmox Node
+
+```sh
+systemctl stop pve-cluster corosync
+pmxcfs -l
+rm -rf /etc/corosync/*
+rm /etc/pve/corosync.conf
+killall pmxcfs
+systemctl start pve-cluster
+```
+
+Then delete the information about the remaining nodes in `/etc/pve/nodes`.
+
+### Set Up GitHub App for a New Repository
+
+1. Create a GitHub app following
+   the [guide](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app).
+2. Copy the app ID and save it to a `BOT_APP_ID` repository secret and to an `ACTION_RUNNER_CONTROLLER_GITHUB_APP_ID`
+   property of an `actions-runner-controller` 1Password secret.
+3. Generate a new app private key and add it to a `BOT_APP_PRIVATE_KEY` repository secret and to
+   the `ACTION_RUNNER_CONTROLLER_GITHUB_PRIVATE_KEY` property of an `actions-runner-controller` 1Password secret in the
+   format
+   ```
+   -----BEGIN RSA PRIVATE KEY-----
+   ...
+   -----END RSA PRIVATE KEY-----
+   ```

diff --git a/docs/prerequisites.md b/docs/prerequisites.md
new file mode 100644
index 00000000..b92784f0
--- /dev/null
+++ b/docs/prerequisites.md
@@ -0,0 +1,114 @@
+## Prerequisites
+
+### 1. Set up Cloudflare
+
+1. Go to the [Cloudflare API Tokens](https://dash.cloudflare.com/profile/api-tokens) page.
+2. Under the `API Tokens` section, click the blue `Create Token` button.
+3. Select the `Edit zone DNS` template by clicking the blue `Use template` button.
+4. Under `Permissions`, click `+ Add More` and add the following permissions:
+   - `Zone - DNS - Edit`
+   - `Account - Cloudflare Tunnel - Read`
+5. Limit the permissions to specific account and zone resources.
+6. Click the blue `Continue to Summary` button and then the blue `Create Token` button.
+7. Copy the token and save it to the secrets store under a `CF_API_TOKEN` field.
+
+### 2. Set up secrets store
+
+I use 1Password as the secrets store for my homelab cluster. To execute the IaC scripts that provision the
+infrastructure, [1Password Connect](https://developer.1password.com/docs/connect/) must be set up separately with access
+to the 1Password vault. Once the cluster setup is complete, 1Password Connect will be hosted inside the cluster.
+
+Ensure you update `OP_CONNECT_HOST` and `OP_CONNECT_TOKEN` in the [env file](../infrastructure/secrets.sops.yaml).
+
+The 1Password vault should contain the following items:
+
<details>
<summary>1Password Vault Items</summary>

| Item name | Fields | Description |
|---------------------------|-------------------------------------------------|-----------------------------------------------------------|
| mino | MINIO_ROOT_USER | |
| | MINO_ROOT_PASSWORD | |
| | MINO_LOKI_BUCKET | |
| | MINO_LOKI_SECRET_KEY | |
| | MINO_LOKI_ACCESS_KEY | |
| | MINO_THANOS_BUCKET | |
| | MINO_THANOS_SECRET_KEY | |
| | MINO_THANOS_ACCESS_KEY | |
| cloudnative-pg | POSTGRESS_SUPER_USER | |
| | POSTGRESS_SUPER_PASS | |
| cloudflare | CLOUDFLARE_ACCOUNT_TAG | |
| | CLOUDFLARE_TUNNEL_SECRET | |
| | CLUSTER_CLOUDFLARE_TUNNEL_ID | |
| | CLOUDFLARE_HOMEPAGE_TUNNEL_SECRET | |
| | CF_API_TOKEN | |
| proxmox | username | |
| | password | |
| | HOMEPAGE_PROXMOX_USERNAME | |
| | HOMEPAGE_PROXMOX_PASSWORD | |
| actions-runner-controller | ACTION_RUNNER_CONTROLLER_GITHUB_APP_ID | |
| | ACTION_RUNNER_CONTROLLER_GITHUB_INSTALLATION_ID | |
| | ACTION_RUNNER_CONTROLLER_GITHUB_PRIVATE_KEY | In a format starting with -----BEGIN RSA PRIVATE KEY----- |
| unifipoller | username | |
| | password | |
| discord | GATUS_DISCORD_WEBHOOK | |
| | ALERTMANAGER_DISCORD_WEBHOOK | |
| gatus | GATUS_POSTGRES_USER | |
| | GATUS_POSTGRES_PASS | |
| nodered | CREDENTIAL_SECRET | Used to encrypt nodered secrets |
| overseerr | OVERSEERR_TOKEN | Used in homepage |
| pihole | HOMEPAGE_PI_HOLE_TOKEN | |
| synology | HOMEPAGE_SYNOLOGY_USERNAME | |
| | HOMEPAGE_SYNOLOGY_PASSWORD | |
| plex | PLEX_TOKEN | Used in homepage |
| prowlarr | PROWLARR_API_KEY | Used in homepage |
| | PROWLARR_POSTGRES_USER | |
| | PROWLARR_POSTGRES_PASSWORD | |
| sonarr | SONARR_API_KEY | Used in homepage |
| | SONARR_POSTGRES_USER | |
| | SONARR_POSTGRES_PASSWORD | |
| radarr | RADARR_API_KEY | Used in homepage |
| | RADARR_POSTGRES_USER | |
| | RADARR_POSTGRES_PASSWORD | |
| qbittorrent | username | |
| | password | |
| grafana | GRAFANA_POSTGRESS_USER | |
| | GRAFANA_POSTGRESS_PASS | |

</details>
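
Once the vault is populated, it is worth verifying that 1Password Connect can actually read it. A minimal sanity check, assuming `OP_CONNECT_HOST` and `OP_CONNECT_TOKEN` are exported (for example via the direnv hook in `infrastructure/.envrc`):

```sh
# List the vaults visible to the Connect token; the homelab vault should appear
curl -s -H "Authorization: Bearer ${OP_CONNECT_TOKEN}" "${OP_CONNECT_HOST}/v1/vaults"
```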
+ +### 3. Set up UDM + +1. Set up the unifipoller user (TODO docs). +2. Forward port for qBittorrent (TODO docs). + +### 4. Get discord token + +1. Go to Server settings -> Integrations and create two webhooks: + - Webhook for Prometheus alerts. Save it to the `ALERTMANAGER_DISCORD_WEBHOOK` item in 1Password. + - Webhook for Gatus alerts. Save it to the `GATUS_DISCORD_WEBHOOK` item in 1Password. + +### 5. Set up pihole and generate token for Homepage + +1. Set up Pi-hole on a separate Raspberry Pi. +2. Generate a token for the Homepage widget in Pi-hole and save it to the `HOMEPAGE_PI_HOLE_TOKEN` item in 1Password. + +### 6. NAS set up + +#### Install and Configure Minio on NAS + +1. **Install Synology Container Manager:** + 1. Install the `Synology Container Manager` package from the Package Center. + 2. Open the `Synology Container Manager` and run a Docker container using the `minio/minio` image. Ensure that port `9000` is forwarded. + +2. **Create Minio Buckets:** + - Manually create the following buckets: + - `cloudnative-pg` for PostgreSQL backups. + - `loki-bucket` to store logs. + - `thanos` to store old metrics data with Thanos. + - Update the corresponding 1Password items with the necessary details. + +#### Configure NFS Connections + +1. **Create a Shared Folder:** + 1. Open the Synology Control Panel and navigate to `Shared Folders`. + 2. Create a shared folder for the Kubernetes cluster. + 3. Go to the folder settings and select `NFS Permissions`. + 4. Add the IP addresses of all Kubernetes nodes. Select `Squash` as `No`. diff --git a/docs/set-up.md b/docs/set-up.md index 18ca2ef4..ff37e77a 100644 --- a/docs/set-up.md +++ b/docs/set-up.md @@ -1,66 +1,56 @@ -# Set up guide - -## Install and configure Proxmox - -1. Download official image from an official Proxmox [site](https://www.proxmox.com/en/downloads/proxmox-virtual-environment/iso) -2. Flush image and install it to the machines. During installation specify and write down static ip address that will be -used by the machine. -3. Disable subscription repositories. Go to Repositories setting menu and disable all components marked as `enterprise` and -`pve-enterprise` -4. ssh to the node and run `apt get update` following by `apt get upgrade` -5. Go to Network, select Linux Bridge and check `VLAN aware checkox` in order to be able to assign virtual machines to a -different VLANs. -6. Set up a simple proxmox cluster using menu wizard. No need to make it HA since kubernetes will handle the HA. - -### Set up GPU passthrough -1. Edit `/etc/default/grub` with the following changes: - ``` - GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on" - ``` -2. Run `update-grub` and reboot the node -3. Verify that IOMMU is enabled -``` -dmesg | grep -e DMAR -e IOMMU -``` -There should be a line that looks like `DMAR: IOMMU enabled` -4. For any troubleshouting check out [this guide](https://3os.org/infrastructure/proxmox/gpu-passthrough/igpu-passthrough-to-vm/#proxmox-configuration-for-igpu-full-passthrough) - -## Create and install Talos images -1. Head over to https://factory.talos.dev and follow the instructions which will eventually lead you to download a Talos -Linux iso file. Make sure to note the schematic ID you will need this later on. Add following extensions - - siderolabs/iscsi-tools -- for longhorn - - siderolabs/util-linux-tools -- for longhorn - - siderolabs/qemu-guest-agent -- for being able to manage VM from a proxmox UI -2. 
Create VM with following configuration:
-   - Startup on boot
-   - Bios: SeaBios
-   - Machine: q35
-   - Memory: baloon disabled
-   - CPU: type host, cpu units 1024
-   - Network: vlan 20, firewall disabled, mac address one of the following: BC:24:11:B5:DD:1F, BC:24:11:0C:FD:22, BC:24:11:A8:19:33
-3. Add PCI device `Inter HD Graphics`
+# Setup Guide
+
+## Install and Configure Proxmox
+
+1. Download the official image from
+   the [Proxmox site](https://www.proxmox.com/en/downloads/proxmox-virtual-environment/iso).
+2. Flash the image and install it on the machines. During installation, specify and write down the static IP address
+   that will be used by the machine.
+3. Go to the machine disks, click on an SSD, and select "Initialize disk with GPT."
+4. Go to the LVM subsection and add a new Volume Group based on the disk, named "SSD."
+5. Inspect the [Ansible inventory file](../infrastructure/ansible/inventory/hosts.yaml) and
+   run `task ansible:proxmox-setup` to configure the Proxmox nodes. This will provision the SSH key, update Proxmox to the
+   latest versions, and set up GPU passthrough. For any troubleshooting with the GPU, check
+   out [this guide](https://3os.org/infrastructure/proxmox/gpu-passthrough/igpu-passthrough-to-vm/#proxmox-configuration-for-igpu-full-passthrough).
+6. Go to Network, select Linux Bridge, and check the `VLAN aware` checkbox to assign virtual machines to different
+   VLANs.
+7. Set up a simple Proxmox cluster using the menu wizard. No need to make it HA since Kubernetes will handle the HA.
+
+## Create and Install Talos Images
+
+1. Head over to [Talos Factory](https://factory.talos.dev) and follow the instructions to download a Talos Linux ISO
+   file. Note the schematic ID; you will need it later on. Add the following extensions:
+   - siderolabs/iscsi-tools -- for Longhorn
+   - siderolabs/util-linux-tools -- for Longhorn
+   - siderolabs/qemu-guest-agent -- for managing VMs from the Proxmox UI
+2. Go to `/infrastructure/terraform/proxmox/talos.tf` and update the ISO URL if needed.
+3. Check the Terraform changes with `terraform plan`.
+4. Run Terraform to create VMs with Talos nodes:
+   ```sh
+   terraform apply
+   ```

 ## Bootstrap kubernetes cluster

-1. Deploy the talos cluster to machines
-```
-task talos:bootstrap
-```
-
-2. It might take a while for the cluster to be setup (10+ minutes is normal), during which time you will see a variety of
-error messages like: "couldn't get current server API group list," "error: no matching resources found", etc. This is a
-normal. If this step gets interrupted, e.g. by pressing Ctrl + C, you likely will need to nuke the cluster
-before trying again.
-
-This task will create a `talosconfig` in a `/kubernetes/bootstrap/talos/clusterconfig` directory. You can use it to
-get access to a Talos cluster for troubleshooting
-```
-talosctl --talosconfig=./kubernetes/bootstrap/talos/clusterconfig/talosconfig --nodes=192.168.20.51 health
-```
-
-3. The `kubeconfig` for interacting with the cluster will be generated in the root directory.
-
-  Verify the nodes are online:
-  ```shell
+
+1. Deploy the Talos cluster to machines:
+   ```sh
+   task talos:bootstrap
+   ```
+   It might take a while for the cluster to be set up (10+ minutes is normal), during which time you will see various
+   error messages like: "couldn't get current server API group list," "error: no matching resources found," etc. This is
+   normal. If this step gets interrupted, e.g., by pressing Ctrl + C, you likely will need to nuke the cluster before
+   trying again.
+
+   This task will create a `talosconfig` in the `/kubernetes/bootstrap/talos/clusterconfig` directory. You can use it to
+   get access to the Talos cluster for troubleshooting:
+   ```sh
+   talosctl --talosconfig=./kubernetes/bootstrap/talos/clusterconfig/talosconfig --nodes=192.168.20.51 health
+   ```
+
+2. The `kubeconfig` for interacting with the cluster will be generated in the root directory. Verify the nodes are online:
+   ```sh
    kubectl get nodes -o wide
    # NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
    # k8s-control-1 Ready control-plane 4d21h v1.30.1 192.168.20.51 Talos (v1.7.2) 6.6.30-talos containerd://1.7.16
@@ -68,6 +58,14 @@ talosctl --talosconfig=./kubernetes/bootstrap/talos/clusterconfig/talosconfig --
    # k8s-control-3 Ready control-plane 4d21h v1.30.1 192.168.20.53 Talos (v1.7.2) 6.6.30-talos containerd://1.7.16
    ```

+3. Add Longhorn annotations to each node:
+
+   ```shell
+   kubectl annotate node k8s-control-1 node.longhorn.io/default-disks-config='[{"name": "nvme","path":"/var/lib/longhorn","tags":["nvme"]},{"name": "ssd","path":"/var/mnt/ssd/longhorn","allowScheduling":true,"tags":["ssd"]}]'
+   kubectl annotate node k8s-control-2 node.longhorn.io/default-disks-config='[{"name": "nvme","path":"/var/lib/longhorn","tags":["nvme"]},{"name": "ssd","path":"/var/mnt/ssd/longhorn","allowScheduling":true,"tags":["ssd"]}]'
+   kubectl annotate node k8s-control-3 node.longhorn.io/default-disks-config='[{"name": "nvme","path":"/var/lib/longhorn","tags":["nvme"]},{"name": "ssd","path":"/var/mnt/ssd/longhorn","allowScheduling":true,"tags":["ssd"]}]'
+   ```
+
 4. Continue with installing flux

 ## Install Flux

diff --git a/infrastructure/.envrc b/infrastructure/.envrc
new file mode 100644
index 00000000..bcbff56c
--- /dev/null
+++ b/infrastructure/.envrc
@@ -0,0 +1,9 @@
+use_sops() {
+  local path=${1:-$PWD/secrets.sops.yaml}
+  eval "$(sops -d --output-type dotenv "$path" | direnv dotenv bash /dev/stdin)"
+  watch_file "$path"
+}
+
+source_up
+# will load secrets.sops.yaml
+use sops

diff --git a/infrastructure/ansible/collections/requirements.yml b/infrastructure/ansible/collections/requirements.yml
new file mode 100644
index 00000000..8282a646
--- /dev/null
+++ b/infrastructure/ansible/collections/requirements.yml
@@ -0,0 +1,5 @@
+---
+collections:
+  - name: prometheus.prometheus
+    version: 0.17.0
+  - name: onepassword.connect

diff --git a/infrastructure/ansible/inventory/hosts.yaml b/infrastructure/ansible/inventory/hosts.yaml
new file mode 100644
index 00000000..1f45d623
--- /dev/null
+++ b/infrastructure/ansible/inventory/hosts.yaml
@@ -0,0 +1,15 @@
+---
+all:
+  hosts:
+  children:
+    proxmox:
+      hosts:
+        proxmox-1:
+          ansible_host: 192.168.0.41
+        proxmox-2:
+          ansible_host: 192.168.0.42
+        proxmox-3:
+          ansible_host: 192.168.0.43
+      vars:
+        ansible_user: root
+        sops: "{{ lookup('community.sops.sops', lookup('env', 'INFRA_SECRETS_FILE')) | ansible.builtin.from_yaml }}"

diff --git a/infrastructure/ansible/playbooks/proxmox-apt-upgrade.yaml b/infrastructure/ansible/playbooks/proxmox-apt-upgrade.yaml
new file mode 100644
index 00000000..5261eeb7
--- /dev/null
+++ b/infrastructure/ansible/playbooks/proxmox-apt-upgrade.yaml
@@ -0,0 +1,18 @@
+---
+- name: Upgrade Proxmox packages
+  hosts:
+    - proxmox
+  become: true
+  gather_facts: true
+  any_errors_fatal: true
+  tasks:
+    - name: Upgrade packages
+      ansible.builtin.apt:
+        upgrade: full
+        update_cache: true
+        cache_valid_time: 3600
+        autoclean: true
+        autoremove: true
+      register: apt_upgrade
+      retries: 5
+      until: apt_upgrade is success
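
The playbooks above are normally driven through the new `ansible:*` Taskfile targets. Before a full run it can be worth sanity-checking the inventory and SSH access first; a minimal sketch, assuming the collections from `requirements.yml` are installed and the SSH key has already been provisioned on the nodes:

```sh
# Confirm every Proxmox host in the inventory answers over SSH
ansible -i infrastructure/ansible/inventory/hosts.yaml proxmox -m ping

# Then run the upgrade playbook through its Taskfile wrapper
task ansible:proxmox-update
```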
diff --git a/infrastructure/ansible/playbooks/proxmox-setup.yaml b/infrastructure/ansible/playbooks/proxmox-setup.yaml
new file mode 100644
index 00000000..cbedb2e9
--- /dev/null
+++ b/infrastructure/ansible/playbooks/proxmox-setup.yaml
@@ -0,0 +1,188 @@
+---
+- name: Setup Proxmox nodes
+  hosts:
+    - proxmox
+  become: false
+  gather_facts: false # facts are gathered manually once the connection password is set
+  any_errors_fatal: true
+
+  tasks:
+    - name: Get proxmox password from 1Password
+      onepassword.connect.field_info:
+        token: "{{ sops.OP_CONNECT_TOKEN }}"
+        hostname: "{{ sops.OP_CONNECT_HOST }}"
+        item: Proxmox root
+        field: password
+        vault: 4ebapsbutjt6t66s3y22ne4l4u
+      no_log: true # Turn off logs to avoid logging sensitive data
+      delegate_to: localhost # this task is only run on localhost
+      register: password
+
+    - name: Use the registered password and vars to set up the connection
+      ansible.builtin.set_fact:
+        ansible_password: "{{ password.field.value }}"
+      no_log: true
+
+    - name: Gathering facts
+      ansible.builtin.setup:
+
+    - name: Remove PVE Enterprise repo
+      ansible.builtin.apt_repository:
+        repo: deb https://enterprise.proxmox.com/debian/pve {{ ansible_distribution_release }} pve-enterprise
+        state: absent
+        filename: pve-enterprise
+
+    - name: Remove PVE Enterprise repo ceph
+      ansible.builtin.apt_repository:
+        repo: deb https://enterprise.proxmox.com/debian/ceph-quincy {{ ansible_distribution_release }} enterprise
+        state: absent
+        filename: pve-enterprise
+
+    - name: Add PVE no-subscription repo
+      ansible.builtin.apt_repository:
+        repo: deb http://download.proxmox.com/debian/pve {{ ansible_distribution_release }} pve-no-subscription
+        state: present
+        filename: pve-no-subscription
+
+    - name: Upgrade all system packages
+      ansible.builtin.apt:
+        upgrade: full
+        update_cache: true
+        cache_valid_time: 3600
+        autoclean: true
+        autoremove: true
+      register: apt_upgrade
+      retries: 5
+      until: apt_upgrade is success
+
+    - name: Print the SSH public key that will be provisioned
+      ansible.builtin.debug:
+        msg: "{{ sops.SSH_PUB_KEY | regex_replace(\"[']\", '') }}"
+
+    - name: Provision ssh key
+      ansible.builtin.lineinfile:
+        path: /{{ ansible_user }}/.ssh/authorized_keys
+        line: "{{ sops.SSH_PUB_KEY | regex_replace(\"[']\", '') }}"
+        create: yes
+        state: present
+
+    - name: Install common packages
+      ansible.builtin.apt:
+        name:
+          - vim
+          - htop
+          - linux-cpupower
+          - lm-sensors
+        install_recommends: true
+        update_cache: true
+        cache_valid_time: 3600
+        autoclean: true
+        autoremove: true
+
+    - name: Load lm_sensors and vfio modules
+      community.general.modprobe:
+        name: "{{ item }}"
+        state: present
+      loop:
+        - coretemp
+        - drivetemp
+        - vfio
+        - vfio_iommu_type1
+        - vfio_pci
+        - kvmgt
+
+    - name: Enable lm_sensors and vfio modules on boot
+      ansible.builtin.copy:
+        mode: "0644"
+        content: "{{ item }}"
+        dest: "/etc/modules-load.d/{{ item }}.conf"
+      loop:
+        - coretemp
+        - drivetemp
+        - vfio
+        - vfio_iommu_type1
+        - vfio_pci
+        - kvmgt
+
+    - name: Gather installed packages
+      ansible.builtin.package_facts:
+        manager: auto
+
+    - name: Install pve fake subscription
+      when: "'pve-fake-subscription' not in ansible_facts.packages"
+      block:
+        - name: Get newest pve-fake-subscription release
+          ansible.builtin.uri:
+            url: https://api.github.com/repos/Jamesits/pve-fake-subscription/releases/latest
+            return_content: true
+          register: json_response
+
+        - name: Create tmpdir
+          ansible.builtin.tempfile:
+            state: directory
+          register: tmpdir
+
+        - name: Download pve-fake-subscription
+          ansible.builtin.get_url:
+            url: "{{ json_response.json.assets[0].browser_download_url }}"
+            dest: "{{ tmpdir.path }}"
+            mode: "0644"
+          register: download
+
+        - name: Install pve-fake-subscription
+          ansible.builtin.apt:
+            deb: "{{ download.dest }}"
+
+    - name: Install node-exporter
+      ansible.builtin.include_role:
+        name: prometheus.prometheus.node_exporter
+
+    - name: Set up GPU Passthrough
+      block:
+        - name: Read /etc/default/grub content
+          ansible.builtin.slurp:
+            path: /etc/default/grub
+          register: grub_file_content
+
+        - name: Check if GRUB_CMDLINE_LINUX_DEFAULT contains intel_iommu=on
+          ansible.builtin.set_fact:
+            grub_cmdline_contains_iommu: "{{ (grub_file_content.content | b64decode).split('\n') | select('search', 'GRUB_CMDLINE_LINUX_DEFAULT=.*intel_iommu=on') | list | length > 0 }}"
+
+        - name: Ensure GRUB_CMDLINE_LINUX_DEFAULT contains intel_iommu=on
+          ansible.builtin.lineinfile:
+            path: /etc/default/grub
+            regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
+            line: 'GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on"'
+            backrefs: yes
+            create: yes
+          register: grub_config
+          when: not grub_cmdline_contains_iommu
+
+        - name: Update grub configuration if /etc/default/grub was modified
+          ansible.builtin.command:
+            cmd: update-grub
+          when: grub_config.changed
+
+        - name: Reboot the machine if /etc/default/grub was modified
+          ansible.builtin.reboot:
+          when: grub_config.changed
+          register: reboot_result
+
+        - name: Wait for the machine to reboot
+          ansible.builtin.wait_for_connection:
+            timeout: 300
+          when: reboot_result is changed
+
+        - name: Check dmesg for DMAR or IOMMU messages
+          # a shell task is needed here because the command module does not support pipes
+          ansible.builtin.shell: dmesg | grep -e DMAR -e IOMMU
+          register: dmesg_output
+          when: reboot_result is changed
+
+        - name: Fail if IOMMU is not enabled
+          ansible.builtin.fail:
+            msg: "IOMMU is not enabled"
+          when: reboot_result is changed and 'IOMMU enabled' not in dmesg_output.stdout

diff --git a/infrastructure/secrets.sops.yaml b/infrastructure/secrets.sops.yaml
new file mode 100644
index 00000000..514e8299
--- /dev/null
+++ b/infrastructure/secrets.sops.yaml
@@ -0,0 +1,24 @@
+OP_CONNECT_HOST: ENC[AES256_GCM,data:8iBWmj/iPcysoAECfXRoe1qODKTpRgzibAy7BmmOKV75rQlhzPaF+yo=,iv:BrYlRkvczZg/thrTtMfxq1fWw98zlhriK8wSvQ89rL0=,tag:DaKCPb3KslIDb0aqYsf9PQ==,type:str]
+OP_CONNECT_TOKEN: ENC[AES256_GCM,data:AdO8tkxiWIu88UxX3RhqScDWn0xZ4jzTTvvtCRY3fPDF29eXWyxv6+VlMFDW0u5H6Mk7FrjmCBfuF3ask0xgFVYQHAzX5BUXYbOfv3cbwuw5bd/fkTWrI2Oy82Obx0wGgQkYZc43zvMYuYssZizndhmfKolMnC9vTNCW/XZp2ODRveMpwhPaRuZojO5ZmWA14JATDaAI5pIpo8VlG8dy1the7NgboqabFk/ZTDbdlPpsr+B8VtePurezF++Zi3dGHyHla8UtX67oNO9hToynyE4QbLnkQ5LpB7cFOhtCv7NcWg/t66gaoI/78MIlxDxhS9qpazTIeZTEDaV+z2v/IIHCPBwcvOe2DD1nPl9tVUzWdmVQPKtN7z/YkVX85zKUWFL52EE3/KnOa0YQzEmIyWuwavkkYXJW80Od8ZX0xUkAGnAoQkl0P0pb6rzUOgmA4A8CQCA5LLpLoSPBFqcHApdFOGyjkduFHUTd3XnUdrwBQSctMIYVk3yi+DIhz7OURZYnW/pD4VApV8kWlDJkuAmghNbZnnxne4I/tM3FC4g+hhtpMV8IeRnGUOksA+hAMGhtOvZqP/PA2C8heBjsDQqp7+ymHobttttKa+JojTfO57yB3YtHTvHiUO/8Ky8rMcMAm9hfqFlo+iCG2U2N2r2ahydmafbEMcjYfZNI98Gso1bb7NjEpTTOmJB9jajAdnBIWd0muOY6hty6gpxU9P0JXovhgvIR9RR7x2TG800/kipQiLZQbwogyK9HFUVkH4sgGGNRaPZTIwx4BHnOZ+vp27QCO636HEr9ZgGB5Wlr+MxE3AEOtMdIExzoxui1T3tA9vpcox5u1GfpLUnU/cQ8,iv:m2+xlBhkHDFbNSUYKHIWToOd+rD3EDjBeBjouvVpu8w=,tag:H3PPO+xZ/gAZe8nVHTz48A==,type:str]
+PROXMOX_ROOT_PASSWORD: ENC[AES256_GCM,data:LClyIbFR5uCNZX057v0=,iv:961DtGX6oneEuNRJTDoDTNRnvX1iiowvYC/PhqZiyHc=,tag:vxCLP373KcCgQjp3TskoGg==,type:str]
+SSH_PUB_KEY: 
ENC[AES256_GCM,data:ZAphxKS/0+hWhngvQqPROefI/bSmyxCEJO8N3tT/SQyhC8+kxzArS+09oFVVO7/GRnQe4RFhyuZsbIbZ7JqdVlLRzhrxYmT4wUCNDVHAycWTACkYlneut19b7duigl0RLs0EIlPSStzADvwkmBdpBv5V/tF717QG6FKP15FowrGC+zc0UqKIDFSvztO+zvjdtQwcjcUZ7kMvV50LKunGQk0nrJzNKzPD+zz/3i3aMc6w++ltuqICAOR2SIQNHHGuhLo0IWdGFLivn7n7wz87CtRypjzcdTn4f3aVjaG2LOPFUE71MZxjptqHlci3lu/5i48A9p7VooBNPN53IysEUqZSqULDSXJybBCP1lkuOTBCkeu/+C8T7+HlSXuQW5mhrI0C81z6dgFSL65hx7Yz1WlShAFy5jCcJW72KjMc+oYF7y2otNxyrXRAaweoXVXOq4JlyfXp/MhVeFnoUUCdl5pgerEQJtZt0q8khAfNU984/EioU8o2+//WsLvCATokrbiIZ59jwwN1oNru75XPl4D5rg0T+by/UYHPVJRXJb3KKQ0F/i8WLXG1IanI3S6IJbscWwjCb10NlPEULgzBQSam9+YqxP2pmSQK7WZsoGfGq2asY2PQOn8uHbDPswgwHRWzBUkPwJ95ePylA4GtJbltEV4A61+4ydB2w2uo1zOLSF0p14UKO+8kLyfV9W1r+C+zdXNXa7N/mg3k5gXZ5Pv/SdsphOiRB5KadUs0eftLcXv/0NmCQOZx41HYNLH5hOAdCwRK61P9SDF82X8saCYI1mFSzC6ItJl9ATtLohxro7jZjj11m5hlub/O4SlLZSmYQD5RNXcIkKXUwwsgdANoGz7FzfGNP4YlxNsD0pt03AZqMjLt721Dw+uDH2GSMZ4XveyyijjEIJOkIeDuodOWSKUrCdPnEuBCSkfXqN1EeDs6T2J+e+/pS/QIKyOhLEAnd1DCQKFyoSEkfk82bpkBKq94FuNZok8oFDz1,iv:cP9OMaySJ30N5yycoMia6iC4lCGSg2wdRYpwL/qojFk=,tag:MRbrZqjQcrxLGPFoksSQPQ==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1k5xl02aujw4rsgghnnd0sdymmwd095w5nqgjvf76warwrdc0uqpqsm2x8m + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAvcjRRMWIvM3QxUHBrWElK + SmRkVFBCdWZvVVdzRi91Q3R1WmR6ZlhSbUIwCmVSaEhCUEhYWlhNS3F5SkJGaCs4 + S2dJK2ZseHJ6QjRRSUdHa0lraUVsOTAKLS0tIEtXaXdUcXlDZnZ1TWVubHZ1SG1i + ZmJuUE5tYWhrMmtFeFFnZ3dWK25saUEKZElOdUu9mUmYvQkrYBzTwDQJwffqjxvp + jBBUbAypdjQTP3clZ9QEPtNqsXafIdHNPWoTzqg4r9zi5NVJqUJzQw== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2024-07-21T22:38:05Z" + mac: ENC[AES256_GCM,data:dEdZkoNQPMK+LGYpQnU/wAooBtid/vtPTBk2wQwWNxFyRbP0/SgVf5jR8KG6HtAgnVfKm3+Ia3ftX+3eaH3343fx5cbVLJuhJs57S8PZ4nPIEJLxSVGkgghwByaXZ7u82VUzfVLgnkLrvHW9fGvUdZxk0wdLgPUi6+BOdc0wbvo=,iv:DDurd9ouR6BAKnP+q/BvFehHJIb1NFMZ4S5BOiTairY=,tag:D8jqoIfHDe4+lQrgYsFGEg==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.8.1 diff --git a/infrastructure/terraform/proxmox/.terraform.lock.hcl b/infrastructure/terraform/proxmox/.terraform.lock.hcl new file mode 100644 index 00000000..d4e0f9f3 --- /dev/null +++ b/infrastructure/terraform/proxmox/.terraform.lock.hcl @@ -0,0 +1,70 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/1password/onepassword" { + version = "2.1.0" + constraints = "2.1.0" + hashes = [ + "h1:WaVFLtfvlx899m6rNsL3qr4aS+v2lIhO8QgfcDgC4NM=", + "zh:0d6f803760aa7cae0e841cfca17ef04411231170b2844cc0b30556d5476d9dff", + "zh:17badbffb56309f28aee1893a6b93d1cd87ed5157704fb17b93889f0ccf8cc2d", + "zh:185e0c7c66cc159769d7b91c37ab51a546efc13fb99eb206481739a521f75236", + "zh:19e213f8265445a29d8bb7c7b1f0d4e3c1fdfd538178704f8e8378db2dcdf359", + "zh:49929666304f97301f44ee0fdd39f40f63e35ccfb4c81588439bdab6d5bafde0", + "zh:4de33f5630350d6a561d5d62994d525beb8849c94287c2658f39242fe3170cf8", + "zh:4f212a8fbbbaa7a47f1b31857be3bad2d590f92be845c6b252c9716bb70076d9", + "zh:596cc2bd9aaafd2e649aabcff0125afa9d4270f702813c935fbd5694eed002e7", + "zh:618e703a43608c502066c5b909ead45b1f4202f7cebc993f447278477d32cda2", + "zh:61fde3651bcb2e691ee9d82ce1de03588d006f53b2e8e2516910321da8627228", + "zh:890df766e9b839623b1f0437355032a3c006226a6c200cd911e15ee1a9014e9f", + "zh:db05022113841a00174bba5e24cfc77195bbc03d24339c5e8ac4346069901e45", + "zh:dcc7792a24c74890081a96ba2bc360d90ab71a4d25232ca18046d9868c835e21", + "zh:f2e67a298d20bf52cb208611767b420962d3f0d518e89cf41cc432551b1faf63", + "zh:f7e587814506c7e74fc1d80b29465c8e4b7bdbf803f7f8c0a8bb498968cdd58d", + ] +} + +provider "registry.terraform.io/bpg/proxmox" { + version = "0.61.1" + constraints = "0.61.1" + hashes = [ + "h1:SQSHTHj2ThcF08cON2gHYcnkS/XLmoF8E4cRIgpagtE=", + "zh:27d8b589a2dc1e0a5b0f8ab299b9f3704a2f0b69799d1d4d8845c68056986d1f", + "zh:46dfa6b33ddd7007a2144f38090457604eb56a59a303b37bb0ad1be5c84ddaca", + "zh:47a1b14a759393c5ecc76f2feb950677c418c910b8c677fde0dd3e4675c41579", + "zh:582e49d109d1c2b1f3b1268a7cbc43548f3c6d96a87c92a5428767097a5e383e", + "zh:5e98ad6afae5969a4c3ffb14c0484936550c66c8313d7686551c29b633ff32f2", + "zh:7b9e24b76f947ab8f1e571cf61beefc983b7d2aa1b85df35c4f015728fe37a38", + "zh:8255ca210f279a0f7b8ca2762df26d2ea1a01704298c5e3d5cf601bd39a743f0", + "zh:85d7655fdc95dedced9cf8105a0beeb0d7bc8f668c55f62019a7215a76d60300", + "zh:8aeea5a1d001b06baaf923b754e1a14d06c75eb8c8b87a7f65a3c8205fc8b079", + "zh:a9cfab6c06f613658c5fdd83742cd22c0eb7563778924b1407965ef8c36c1ce0", + "zh:ceaab67801d49a92eb5858b1ddae6df2569462e5ffbe31f9dbd79dcb684ea142", + "zh:dc25b506d5c55d1d78a335d3ebd03213c99b4b2a5859812349a955c2f746ff7e", + "zh:e04b477fd77a0d37a0bdb76a7cf69184dad9e7fbba9b4f3a378a8901b82b75e5", + "zh:f1e6838d9141557f73340df9b21fce5a82b41cc16ae36f063a920ccc36bc0758", + "zh:f26e0763dbe6a6b2195c94b44696f2110f7f55433dc142839be16b9697fa5597", + ] +} + +provider "registry.terraform.io/paultyng/unifi" { + version = "0.41.0" + constraints = "0.41.0" + hashes = [ + "h1:fc0gUNc7ddxeLKVgVbt2iuYBn0U9GKE9lxK78w8oRF4=", + "zh:03ddd3aee05a08e1446f75a7b3f52810181d3307728cba08ce8fb67f109a9c00", + "zh:11b14b79ad02b0a55fd6116b10c0eb6fab432dd7d1f3527af0e2055adf292451", + "zh:18c0eb19889927f115a1e05d64f59b4e8d530ccdf1a8b574940a86be20973564", + "zh:2df9ca0c21830d2757758e574b19d0d4e54965ce80dbbfb3f124db1dac3d7e8f", + "zh:36274af3b7e8b08ba69c04a226c63e0dd2ec386c583288ebd7bc2a30e349ee8f", + "zh:413eb222ef30889bab33ccbfc46c9fb64307555da34eac4625d51e696ac72e1d", + "zh:4839814ff9f405a13397ffadd6f1052c770b88802280a4d8cde066f9a19718c7", + "zh:9547b7831852cc5b9c0fd13ab447d48539eae94582c8725ad255af36e31fb5d9", + "zh:a855c89b12326eb1c89bbf292a2bb1de3651794e3409d5012076ada89aabdc8a", + "zh:aef12a33b90fd77a9bf4e9d397966ccbfa4a037a648a1725074aff2db2d90fb0", + "zh:b3c72a6a02e29b4d21aa0d0831a272ca7cb82c3f8c2c3c7f09fcc2d2dcd78752", + 
"zh:c8354eaaab5f526e8e530b098544c7583a0f0b5b27d67500c7b3e9da56a3a7e5", + "zh:dc29f1e70f20ce86d3c6a66c7a817616f993a1cf9d941604dfd5222a06992c4c", + "zh:e772779333419f34d2c6da333c7f7d235a5a34f21ea47636b548e132aed74f3b", + ] +} diff --git a/infrastructure/terraform/proxmox/main.tf b/infrastructure/terraform/proxmox/main.tf new file mode 100644 index 00000000..5bb6c28f --- /dev/null +++ b/infrastructure/terraform/proxmox/main.tf @@ -0,0 +1,49 @@ +terraform { +# backend "s3" { +# bucket = "terraform" +# key = "proxmox/state.tfstate" +# skip_credentials_validation = true +# skip_metadata_api_check = true +# skip_region_validation = true +# force_path_style = true +# } + + required_providers { + proxmox = { + source = "bpg/proxmox" + version = "0.61.1" + } + unifi = { + source = "paultyng/unifi" + version = "0.41.0" + } + } +} + +module "secret_pve" { + # Remember to export OP_CONNECT_HOST and OP_CONNECT_TOKEN + source = "github.com/bjw-s/terraform-1password-item?ref=main" + vault = "homelab" + item = "Proxmox root" +} + +module "secret_unifi" { + # Remember to export OP_CONNECT_HOST and OP_CONNECT_TOKEN + source = "github.com/bjw-s/terraform-1password-item?ref=main" + vault = "homelab" + item = "Unifi" +} + +provider "proxmox" { + endpoint = "https://192.168.0.41:8006/" + username = "${module.secret_pve.fields.username}@pam" + password = module.secret_pve.fields.password + insecure = true +} + +provider "unifi" { + username = module.secret_unifi.fields.username + password = module.secret_unifi.fields.password + api_url = "https://192.168.0.1" + allow_insecure = true +} diff --git a/infrastructure/terraform/proxmox/talos-node/instance.tf b/infrastructure/terraform/proxmox/talos-node/instance.tf new file mode 100644 index 00000000..31eff9a3 --- /dev/null +++ b/infrastructure/terraform/proxmox/talos-node/instance.tf @@ -0,0 +1,80 @@ +resource "proxmox_virtual_environment_vm" "node" { + name = var.machine_name + node_name = var.target_node + vm_id = var.vmid + + on_boot = true + tablet_device = false + timeout_stop_vm = 600 + boot_order = ["scsi0", "ide0"] + + operating_system { + type = "l26" + } + + agent { + enabled = true + type = "virtio" + timeout = "10s" + } + + bios = "seabios" + + machine = "q35" + + cpu { + cores = var.cpu_cores + type = "host" + } + + memory { + dedicated = var.memory + floating = var.memory + } + + scsi_hardware = "virtio-scsi-single" + + dynamic "disk" { + for_each = var.disks + content { + datastore_id = disk.value.datastore_id + discard = "on" + interface = disk.value.interface + iothread = true + size = disk.value.size + file_format = "raw" + ssd = true + } + } + + network_device { + model = "virtio" + bridge = "vmbr0" + mac_address = var.mac_address + vlan_id = var.vlan_id + } + + cdrom { + enabled = true + file_id = proxmox_virtual_environment_download_file.talos_img.id + interface = "ide0" + } + + hostpci { + device = "hostpci0" + id = "0000:00:02.0" + } + + lifecycle { + ignore_changes = [ + cpu["architecture"] + ] + } +} + +resource "proxmox_virtual_environment_download_file" "talos_img" { + content_type = "iso" + datastore_id = "local" + node_name = var.target_node + url = var.iso_path +} diff --git a/infrastructure/terraform/proxmox/talos-node/main.tf b/infrastructure/terraform/proxmox/talos-node/main.tf new file mode 100644 index 00000000..e5901a2c --- /dev/null +++ b/infrastructure/terraform/proxmox/talos-node/main.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + proxmox = { + source = "bpg/proxmox" + } + } +} diff --git 
a/infrastructure/terraform/proxmox/talos-node/variables.tf b/infrastructure/terraform/proxmox/talos-node/variables.tf
new file mode 100644
index 00000000..bcfc5cc2
--- /dev/null
+++ b/infrastructure/terraform/proxmox/talos-node/variables.tf
@@ -0,0 +1,55 @@
+variable "machine_name" {
+  type = string
+}
+
+variable "mac_address" {
+  type = string
+}
+
+variable "vmid" {
+  type = number
+  default = 0
+}
+
+variable "target_node" {
+  type = string
+}
+
+variable "iso_path" {
+  type = string
+  default = ""
+}
+
+variable "oncreate" {
+  type = bool
+  default = true
+}
+
+variable "startup" {
+  type = string
+  default = ""
+}
+
+variable "cpu_cores" {
+  type = number
+  default = 1
+}
+
+variable "memory" {
+  type = number
+  default = 1024
+}
+
+variable "vlan_id" {
+  type = number
+  default = 0
+}
+
+variable "disks" {
+  type = list(object({
+    datastore_id = string
+    interface = string
+    size = string
+  }))
+  default = []
+}

diff --git a/infrastructure/terraform/proxmox/talos.tf b/infrastructure/terraform/proxmox/talos.tf
new file mode 100644
index 00000000..f5fcd9bb
--- /dev/null
+++ b/infrastructure/terraform/proxmox/talos.tf
@@ -0,0 +1,36 @@
+data "unifi_network" "Servers" {
+  name = "Servers Trusted"
+}
+
+locals {
+  // if changing, don't forget to change it in other places. DRY is too hard
+  mac_addresses = ["BC:24:11:B5:DD:1F", "BC:24:11:0C:FD:22", "BC:24:11:A8:19:33"]
+  # renovate: datasource=docker depName=ghcr.io/siderolabs/installer
+  talos_version = "v1.7.5"
+}
+
+module "talos-controlplanes" {
+  source = "./talos-node"
+  oncreate = false
+  count = 3
+  machine_name = "k8s-control-${count.index + 1}"
+  vmid = sum([100, count.index])
+  target_node = "proxmox${count.index + 1}"
+  iso_path = "https://factory.talos.dev/image/88d1f7a5c4f1d3aba7df787c448c1d3d008ed29cfb34af53fa0df4336a56040b/${local.talos_version}/nocloud-amd64.iso"
+  cpu_cores = 4
+  memory = 29 * 1024
+  vlan_id = data.unifi_network.Servers.vlan_id
+  mac_address = local.mac_addresses[count.index]
+  disks = [
+    {
+      datastore_id : "local-lvm"
+      interface : "scsi0"
+      size : "900"
+    },
+    {
+      datastore_id : "ssd"
+      interface : "scsi1"
+      size : "900"
+    }
+  ]
+}

diff --git a/kubernetes/apps/cert-manager/cert-manager/issuers/externalsecret.yaml b/kubernetes/apps/cert-manager/cert-manager/issuers/externalsecret.yaml
new file mode 100644
index 00000000..8fa616e1
--- /dev/null
+++ b/kubernetes/apps/cert-manager/cert-manager/issuers/externalsecret.yaml
@@ -0,0 +1,19 @@
+---
+# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret
+metadata:
+  name: cloudflare
+spec:
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: onepassword-connect
+  target:
+    name: cloudflare-secret
+    template:
+      engineVersion: v2
+      data:
+        CF_API_TOKEN: "{{ .CF_API_TOKEN }}"
+  dataFrom:
+    - extract:
+        key: cloudflare

diff --git a/kubernetes/apps/cert-manager/cert-manager/issuers/issuers.yaml b/kubernetes/apps/cert-manager/cert-manager/issuers/issuers.yaml
index 1cf7148a..ce09da8b 100644
--- a/kubernetes/apps/cert-manager/cert-manager/issuers/issuers.yaml
+++ b/kubernetes/apps/cert-manager/cert-manager/issuers/issuers.yaml
@@ -12,8 +12,8 @@ spec:
     - dns01:
         cloudflare:
           apiTokenSecretRef:
-            name: cert-manager-secret
-            key: api-token
+            name: cloudflare-secret
+            key: CF_API_TOKEN
         selector:
           dnsZones:
             - "${SECRET_DOMAIN}"
@@ -32,8 +32,8 @@ spec:
     - dns01:
         cloudflare:
           apiTokenSecretRef:
-            name: cert-manager-secret
-            key: api-token
+            name: 
cloudflare-secret + key: CF_API_TOKEN selector: dnsZones: - "${SECRET_DOMAIN}" diff --git a/kubernetes/apps/cert-manager/cert-manager/issuers/kustomization.yaml b/kubernetes/apps/cert-manager/cert-manager/issuers/kustomization.yaml index fd43d965..d6ac943f 100644 --- a/kubernetes/apps/cert-manager/cert-manager/issuers/kustomization.yaml +++ b/kubernetes/apps/cert-manager/cert-manager/issuers/kustomization.yaml @@ -3,5 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ./secret.sops.yaml + - ./externalsecret.yaml - ./issuers.yaml diff --git a/kubernetes/apps/cert-manager/cert-manager/issuers/secret.sops.yaml b/kubernetes/apps/cert-manager/cert-manager/issuers/secret.sops.yaml deleted file mode 100644 index eb1a98c9..00000000 --- a/kubernetes/apps/cert-manager/cert-manager/issuers/secret.sops.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: cert-manager-secret -stringData: - api-token: ENC[AES256_GCM,data:V/OeW+bpuNGXDAiNZ2WmawliZ8JakYzZvSqNhuLRCif3e1nXDXXL+Q==,iv:yq3rE8ZsK2ih6FMNtFRvak7xNNTTB/VCz0+Mp8CiJ5M=,tag:2eY19fzMjg99TAlbC44ntw==,type:str] -sops: - kms: [] - gcp_kms: [] - azure_kv: [] - hc_vault: [] - age: - - recipient: age1k5xl02aujw4rsgghnnd0sdymmwd095w5nqgjvf76warwrdc0uqpqsm2x8m - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBnSm52UU95ZVJMUE52cjc3 - THplYTFpbFd4ZDJSV2RIaDNLWVRxZFd6TEVzCi9OSmIvYUhvVUhHQldoalJzMEpX - dXNPV3AreUowSHBBY1NYUlh5b24wZDAKLS0tIHVwVkViazFmVmhqQjBqNkJiVlN4 - VGFML3ZMZzk4WlF3NjJ6SXpobzJPMlEKheLxsJRKPxsPwGOKZ8kb5viGJ07RT9eq - id87ugUEST/+c5l0YE4Q5DDRpikoiT3uoDS7X+PfIGHgQWiQUq4uNQ== - -----END AGE ENCRYPTED FILE----- - lastmodified: "2024-02-17T21:45:25Z" - mac: ENC[AES256_GCM,data:bKwwkxj+C5/dPsKsiFi599+d31RpAbcQQ5HugHBNIANGT0nwmYx9Cj8gDGcAeY4OBs9fWzZ2uHVW9ZbgrzyOdsSH2VdurPvOruJZ2kuWZ1BYZm1pbsFXRWhuWxaaJLTK9mP4YlOEQ76uYVMaaXORS7Pt4AHmliDReOyGJF4X+lI=,iv:SVsKQ7xOkLcODOWe7A/IFoQpIMB4Cbb7p8P4st3lZjo=,tag:Wq6M1In3czCDwUXsFBHMGQ==,type:str] - pgp: [] - encrypted_regex: ^(data|stringData)$ - version: 3.7.3 diff --git a/kubernetes/apps/observability/kustomization.yaml b/kubernetes/apps/observability/kustomization.yaml index d070a45b..f345c413 100644 --- a/kubernetes/apps/observability/kustomization.yaml +++ b/kubernetes/apps/observability/kustomization.yaml @@ -7,7 +7,7 @@ resources: - ./alert.yaml - ./portainer/ks.yaml - ./gatus/ks.yaml -# - ./thanos/ks.yaml + - ./thanos/ks.yaml - ./kube-prometheus-stack/ks.yaml - ./prometheus-operator-crds/ks.yaml - ./grafana/ks.yaml diff --git a/kubernetes/apps/observability/thanos/app/helmrelease.yaml b/kubernetes/apps/observability/thanos/app/helmrelease.yaml index 112662b0..82ad8f47 100644 --- a/kubernetes/apps/observability/thanos/app/helmrelease.yaml +++ b/kubernetes/apps/observability/thanos/app/helmrelease.yaml @@ -65,8 +65,8 @@ spec: - --retention.resolution-1h=60d persistence: &persistence enabled: true - storageClass: longhorn-local - size: 20Gi + storageClass: local-hostpath + size: 25Gi query: replicas: 2 extraArgs: ["--alert.query-url=https://thanos.${SECRET_DOMAIN}"] diff --git a/kubernetes/apps/storage/local-path-provisioner/app/helmrelease.yaml b/kubernetes/apps/storage/local-path-provisioner/app/helmrelease.yaml index 2d96e0bb..ac2907d2 100644 --- a/kubernetes/apps/storage/local-path-provisioner/app/helmrelease.yaml +++ b/kubernetes/apps/storage/local-path-provisioner/app/helmrelease.yaml @@ -57,7 +57,7 @@ spec: config: driver: local-hostpath local-hostpath: - shareBasePath: &storagePath 
/var/democratic-csi/local + shareBasePath: &storagePath /var/mnt/ssd/democratic-csi/local controllerBasePath: *storagePath dirPermissionsMode: "0770" dirPermissionsUser: 0 diff --git a/kubernetes/bootstrap/talos/talconfig.yaml b/kubernetes/bootstrap/talos/talconfig.yaml index 930dc418..cff4fbf0 100644 --- a/kubernetes/bootstrap/talos/talconfig.yaml +++ b/kubernetes/bootstrap/talos/talconfig.yaml @@ -27,7 +27,7 @@ nodes: talosImageURL: factory.talos.dev/installer/88d1f7a5c4f1d3aba7df787c448c1d3d008ed29cfb34af53fa0df4336a56040b controlPlane: true nodeLabels: - "node.longhorn.io/create-default-disk": "true" + "node.longhorn.io/create-default-disk": "config" networkInterfaces: - deviceSelector: hardwareAddr: "bc:24:11:b5:dd:1f" @@ -46,7 +46,7 @@ nodes: talosImageURL: factory.talos.dev/installer/88d1f7a5c4f1d3aba7df787c448c1d3d008ed29cfb34af53fa0df4336a56040b controlPlane: true nodeLabels: - "node.longhorn.io/create-default-disk": "true" + "node.longhorn.io/create-default-disk": "config" networkInterfaces: - deviceSelector: hardwareAddr: "bc:24:11:0c:fd:22" @@ -65,7 +65,7 @@ nodes: talosImageURL: factory.talos.dev/installer/88d1f7a5c4f1d3aba7df787c448c1d3d008ed29cfb34af53fa0df4336a56040b controlPlane: true nodeLabels: - "node.longhorn.io/create-default-disk": "true" + "node.longhorn.io/create-default-disk": "config" networkInterfaces: - deviceSelector: hardwareAddr: "bc:24:11:a8:19:33" @@ -139,7 +139,15 @@ patches: net.core.rmem_max: "2500000" net.core.wmem_max: "2500000" - # Mount longhorn in kubelet + # Add additional ssd + - |- + machine: + disks: + - device: /dev/sdb + partitions: + - mountpoint: /var/mnt/ssd + + # Mount longhorn and local-hostpath in kubelet - |- machine: kubelet: @@ -151,9 +159,16 @@ patches: - bind - rshared - rw - - destination: /var/democratic-csi/local + - destination: /var/mnt/ssd/longhorn + type: bind + source: /var/mnt/ssd/longhorn + options: + - bind + - rshared + - rw + - destination: /var/mnt/ssd/democratic-csi/local type: bind - source: /var/democratic-csi/local + source: /var/mnt/ssd/democratic-csi/local options: - bind - rshared diff --git a/kubernetes/talos/clusterconfig/talosconfig b/kubernetes/talos/clusterconfig/talosconfig deleted file mode 100644 index e35e5db5..00000000 --- a/kubernetes/talos/clusterconfig/talosconfig +++ /dev/null @@ -1,2 +0,0 @@ -context: "" -contexts: {}