From 6286fff56fb6af8b42f15469f4199e133937c1b4 Mon Sep 17 00:00:00 2001 From: Piotr Rojek Date: Fri, 13 Dec 2024 10:18:09 +0100 Subject: [PATCH 1/3] feat: add observability stack installation option --- ansible/roles/observability/defaults/main.yml | 21 + ansible/roles/observability/handlers/main.yml | 32 ++ ansible/roles/observability/tasks/main.yml | 216 ++++++++++ .../templates/gpu-dashboard.json.j2 | 301 +++++++++++++ .../observability/templates/grafana.ini.j2 | 16 + .../templates/node-dashboard.json.j2 | 401 ++++++++++++++++++ .../templates/node_exporter.service.j2 | 15 + .../templates/prometheus.service.j2 | 17 + .../observability/templates/prometheus.yml.j2 | 20 + ansible/slurm.yml | 7 + main.tf | 2 + variables.tf | 13 + 12 files changed, 1061 insertions(+) create mode 100644 ansible/roles/observability/defaults/main.yml create mode 100644 ansible/roles/observability/handlers/main.yml create mode 100644 ansible/roles/observability/tasks/main.yml create mode 100644 ansible/roles/observability/templates/gpu-dashboard.json.j2 create mode 100644 ansible/roles/observability/templates/grafana.ini.j2 create mode 100644 ansible/roles/observability/templates/node-dashboard.json.j2 create mode 100644 ansible/roles/observability/templates/node_exporter.service.j2 create mode 100644 ansible/roles/observability/templates/prometheus.service.j2 create mode 100644 ansible/roles/observability/templates/prometheus.yml.j2 diff --git a/ansible/roles/observability/defaults/main.yml b/ansible/roles/observability/defaults/main.yml new file mode 100644 index 0000000..88cfc4c --- /dev/null +++ b/ansible/roles/observability/defaults/main.yml @@ -0,0 +1,21 @@ +--- +# Prometheus version +prometheus_version: "2.53.3" +node_exporter_version: "1.8.2" +dcgm_exporter_version: "3.3.9-3.6.1" + +# Grafana settings +grafana_admin_password: "admin" # Should be overridden in vault +grafana_version: "latest" + +# Ports +prometheus_port: 9090 +node_exporter_port: 9100 +dcgm_exporter_port: 9400 
+grafana_port: 3000 + +# Users and groups +prometheus_user: "prometheus" +prometheus_group: "prometheus" +node_exporter_user: "node_exporter" +node_exporter_group: "node_exporter" \ No newline at end of file diff --git a/ansible/roles/observability/handlers/main.yml b/ansible/roles/observability/handlers/main.yml new file mode 100644 index 0000000..bed0340 --- /dev/null +++ b/ansible/roles/observability/handlers/main.yml @@ -0,0 +1,32 @@ +--- +- name: reload systemd + systemd: + daemon_reload: yes + +- name: restart prometheus + systemd: + name: prometheus + state: restarted + enabled: yes + +- name: restart node_exporter + systemd: + name: node_exporter + state: restarted + enabled: yes + +- name: restart grafana + systemd: + name: grafana-server + state: restarted + enabled: yes + +- name: restart services + systemd: + name: "{{ item }}" + state: restarted + enabled: yes + with_items: + - prometheus + - node_exporter + - grafana-server \ No newline at end of file diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml new file mode 100644 index 0000000..e50ff94 --- /dev/null +++ b/ansible/roles/observability/tasks/main.yml @@ -0,0 +1,216 @@ +--- +- name: Create directories for observability stack + file: + path: "{{ item }}" + state: directory + mode: '0755' + with_items: + - /opt/prometheus + - /opt/grafana + - /opt/node_exporter + - /opt/dcgm_exporter + - /etc/prometheus + - /etc/grafana + - /var/lib/grafana/dashboards + +- name: Install required packages + apt: + name: + - curl + - wget + - gnupg2 + - apt-transport-https + - software-properties-common + state: present + update_cache: yes + +- name: Add Docker GPG key + apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + state: present + +- name: Add Docker repository + apt_repository: + repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable + state: present + filename: docker + +- name: Install Docker 
packages + apt: + name: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + state: present + update_cache: yes + +- name: Ensure Docker service is running + systemd: + name: docker + state: started + enabled: yes + +- name: Install Python Docker package + apt: + name: python3-docker + state: present + +- name: Download and install Prometheus + unarchive: + src: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz" + dest: /opt/prometheus + remote_src: yes + creates: /opt/prometheus/prometheus + +- name: Download and install Node Exporter + unarchive: + src: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-amd64.tar.gz" + dest: /opt/node_exporter + remote_src: yes + creates: /opt/node_exporter/node_exporter + +- name: Install NVIDIA Container Toolkit repository + apt_repository: + repo: deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/$(ARCH) / + state: present + filename: nvidia-container-toolkit + +- name: Install NVIDIA Container Toolkit + apt: + name: nvidia-container-toolkit + state: present + update_cache: yes + +- name: Pull DCGM Exporter container + docker_container: + name: dcgm-exporter + image: "nvidia/dcgm-exporter:{{ dcgm_exporter_version }}-ubuntu20.04" + state: started + restart_policy: always + ports: + - "{{ dcgm_exporter_port }}:9400" + devices: + - "/dev/nvidiactl:/dev/nvidiactl" + - "/dev/nvidia0:/dev/nvidia0" + volumes: + - "/sys/class/nvidia-gpu:/sys/class/nvidia-gpu" + - "/run/nvidia-persistenced:/run/nvidia-persistenced" + +- name: Add Grafana GPG key + apt_key: + url: https://packages.grafana.com/gpg.key + state: present + +- name: Add Grafana repository + apt_repository: + repo: deb https://packages.grafana.com/oss/deb stable main + state: present + filename: grafana + +- name: Install Grafana + 
apt: + name: grafana + state: present + update_cache: yes + +- name: Configure Prometheus + template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + notify: restart prometheus + +- name: Configure Grafana + template: + src: grafana.ini.j2 + dest: /etc/grafana/grafana.ini + notify: restart grafana + +- name: Create systemd service files + template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + with_items: + - { src: 'prometheus.service.j2', dest: '/etc/systemd/system/prometheus.service' } + - { src: 'node_exporter.service.j2', dest: '/etc/systemd/system/node_exporter.service' } + notify: + - reload systemd + - restart services + +- name: Create Grafana dashboards directory + file: + path: /var/lib/grafana/dashboards + state: directory + owner: grafana + group: grafana + mode: '0755' + +- name: Configure Grafana dashboard provisioning + copy: + content: | + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + dest: /etc/grafana/provisioning/dashboards/default.yaml + owner: grafana + group: grafana + mode: '0644' + notify: restart grafana + +- name: Configure Grafana datasource provisioning + copy: + content: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:{{ prometheus_port }} + isDefault: true + dest: /etc/grafana/provisioning/datasources/prometheus.yaml + owner: grafana + group: grafana + mode: '0644' + notify: restart grafana + +- name: Copy Grafana dashboards + template: + src: "{{ item }}" + dest: "/var/lib/grafana/dashboards/{{ item | basename | regex_replace('\\.j2$', '') }}" + owner: grafana + group: grafana + mode: '0644' + with_items: + - gpu-dashboard.json.j2 + - node-dashboard.json.j2 + notify: restart grafana + +- name: Create users and set permissions + user: + name: "{{ item.user }}" + system: yes + 
create_home: no + shell: /sbin/nologin + with_items: + - { user: "{{ prometheus_user }}" } + - { user: "{{ node_exporter_user }}" } + +- name: Set directory permissions + file: + path: "{{ item.path }}" + owner: "{{ item.owner }}" + group: "{{ item.group }}" + recurse: yes + with_items: + - { path: '/opt/prometheus', owner: '{{ prometheus_user }}', group: '{{ prometheus_group }}' } + - { path: '/opt/node_exporter', owner: '{{ node_exporter_user }}', group: '{{ node_exporter_group }}' } + - { path: '/etc/prometheus', owner: '{{ prometheus_user }}', group: '{{ prometheus_group }}' } \ No newline at end of file diff --git a/ansible/roles/observability/templates/gpu-dashboard.json.j2 b/ansible/roles/observability/templates/gpu-dashboard.json.j2 new file mode 100644 index 0000000..7ca2ad5 --- /dev/null +++ b/ansible/roles/observability/templates/gpu-dashboard.json.j2 @@ -0,0 +1,301 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "GPU Utilization", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_GPU_UTIL", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "GPU Memory Usage", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_FB_USED", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "GPU Temperature", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_GPU_TEMP", + "refId": "A" + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["gpu", "nvidia", "dcgm"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "GPU Monitoring", + "uid": "gpu-monitoring", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/ansible/roles/observability/templates/grafana.ini.j2 b/ansible/roles/observability/templates/grafana.ini.j2 new file mode 100644 index 0000000..078d955 --- /dev/null +++ b/ansible/roles/observability/templates/grafana.ini.j2 @@ -0,0 +1,16 @@ +[server] +http_addr = 0.0.0.0 +http_port = {{ grafana_port }} + +[security] +admin_user = admin +admin_password = """{{ grafana_admin_password | default('admin') }}""" + 
+[auth.anonymous] +enabled = false + +[analytics] +reporting_enabled = false + +[users] +allow_sign_up = false \ No newline at end of file diff --git a/ansible/roles/observability/templates/node-dashboard.json.j2 b/ansible/roles/observability/templates/node-dashboard.json.j2 new file mode 100644 index 0000000..9bac7e9 --- /dev/null +++ b/ansible/roles/observability/templates/node-dashboard.json.j2 @@ -0,0 +1,401 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "CPU Usage", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "100 - (avg by (instance) 
(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Memory Usage", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Disk Usage", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "node_filesystem_size_bytes{mountpoint=\"/\"} - node_filesystem_free_bytes{mountpoint=\"/\"}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], 
+ "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Network Traffic", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])", + "refId": "A", + "legendFormat": "{% raw %}{{device}}{% endraw %} Receive" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])", + "refId": "B", + "legendFormat": "{% raw %}{{device}}{% endraw %} Transmit" + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["node", "system"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node Metrics", + "uid": "node-metrics", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/ansible/roles/observability/templates/node_exporter.service.j2 b/ansible/roles/observability/templates/node_exporter.service.j2 new file mode 100644 index 0000000..cddf2e7 --- /dev/null +++ b/ansible/roles/observability/templates/node_exporter.service.j2 @@ -0,0 +1,15 @@ +[Unit] +Description=Node Exporter +Wants=network-online.target +After=network-online.target + +[Service] +User={{ node_exporter_user }} +Group={{ node_exporter_group }} +Type=simple +ExecStart=/opt/node_exporter/node_exporter-{{ node_exporter_version }}.linux-amd64/node_exporter \ + --collector.systemd \ + --collector.processes + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/roles/observability/templates/prometheus.service.j2 b/ansible/roles/observability/templates/prometheus.service.j2 new file mode 100644 index 0000000..f6e9cb8 --- /dev/null +++ b/ansible/roles/observability/templates/prometheus.service.j2 @@ -0,0 +1,17 @@ +[Unit] +Description=Prometheus +Wants=network-online.target +After=network-online.target + +[Service] +User={{ prometheus_user }} +Group={{ prometheus_group }} +Type=simple 
+ExecStart=/opt/prometheus/prometheus-{{ prometheus_version }}.linux-amd64/prometheus \ + --config.file=/etc/prometheus/prometheus.yml \ + --storage.tsdb.path=/opt/prometheus/data \ + --web.console.templates=/opt/prometheus/prometheus-{{ prometheus_version }}.linux-amd64/consoles \ + --web.console.libraries=/opt/prometheus/prometheus-{{ prometheus_version }}.linux-amd64/console_libraries + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/roles/observability/templates/prometheus.yml.j2 b/ansible/roles/observability/templates/prometheus.yml.j2 new file mode 100644 index 0000000..7b31cb9 --- /dev/null +++ b/ansible/roles/observability/templates/prometheus.yml.j2 @@ -0,0 +1,20 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:{{ prometheus_port }}'] + + - job_name: 'node_exporter' + static_configs: + - targets: ['localhost:{{ node_exporter_port }}'] + + - job_name: 'dcgm_exporter' + static_configs: + - targets: ['localhost:{{ dcgm_exporter_port }}'] + + - job_name: 'slurm_nodes' + static_configs: + - targets: {{ groups['slurm_compute_nodes'] | default([]) | map('regex_replace', '^(.*)$', '\\1:' ~ node_exporter_port) | list | to_json }} \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 23fcb7f..17f1164 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -15,12 +15,19 @@ become: yes roles: - slurm_head_node + - { role: observability, + tags: ['observability'], + when: enable_observability | default(false) | bool } - hosts: slurm_compute_nodes remote_user: ubuntu become: yes roles: - slurm_compute_node + - { role: observability, + tags: ['observability'], + prometheus_server: false, + when: enable_observability | default(false) | bool } - hosts: all remote_user: ubuntu diff --git a/main.tf b/main.tf index 36eada8..289891e 100644 --- a/main.tf +++ b/main.tf @@ -193,6 +193,8 @@ resource "ansible_group" "all" { variables = { slurm_users = jsonencode(var.slurm_users) partitions = jsonencode(var.partitions) + enable_observability = var.enable_observability + grafana_admin_password = var.grafana_admin_password } } diff --git 
a/variables.tf b/variables.tf index 7be15e8..b31ba82 100644 --- a/variables.tf +++ b/variables.tf @@ -161,3 +161,16 @@ variable "slurm_shared_volumes" { })) default = [] } + +variable "enable_observability" { + description = "Enable observability stack (Prometheus, Grafana, GPU monitoring)" + type = bool + default = false +} + +variable "grafana_admin_password" { + description = "Admin password for Grafana (if observability is enabled)" + type = string + default = "admin" + sensitive = true +} From 1fc29e9c68fa7f8ef31d857e27efa30224e2c3a6 Mon Sep 17 00:00:00 2001 From: Piotr Rojek Date: Fri, 13 Dec 2024 15:20:32 +0100 Subject: [PATCH 2/3] Changes to ansible scripts --- ansible/roles/observability/defaults/main.yml | 2 +- ansible/roles/observability/tasks/main.yml | 48 ++++++++++--------- ansible/roles/requirements.yml | 2 + examples/observability.tfvars | 32 +++++++++++++ 4 files changed, 61 insertions(+), 23 deletions(-) create mode 100644 examples/observability.tfvars diff --git a/ansible/roles/observability/defaults/main.yml b/ansible/roles/observability/defaults/main.yml index 88cfc4c..15af841 100644 --- a/ansible/roles/observability/defaults/main.yml +++ b/ansible/roles/observability/defaults/main.yml @@ -2,7 +2,7 @@ # Prometheus version prometheus_version: "2.53.3" node_exporter_version: "1.8.2" -dcgm_exporter_version: "3.3.9-3.6.1" +dcgm_exporter_version: "3.3.9-3.6.1-ubuntu22.04" # Grafana settings grafana_admin_password: "admin" # Should be overridden in vault diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml index e50ff94..0d0f3a6 100644 --- a/ansible/roles/observability/tasks/main.yml +++ b/ansible/roles/observability/tasks/main.yml @@ -24,16 +24,20 @@ state: present update_cache: yes -- name: Add Docker GPG key - apt_key: - url: https://download.docker.com/linux/ubuntu/gpg - state: present +# - name: Add Docker GPG key +# apt_key: +# url: https://download.docker.com/linux/ubuntu/gpg +# state: present -- 
name: Add Docker repository - apt_repository: - repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable - state: present - filename: docker +# - name: Add Docker repository +# apt_repository: +# repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable +# state: present +# filename: docker + +# - name: Refresh apt +# apt: +# update_cache: true - name: Install Docker packages apt: @@ -71,11 +75,11 @@ remote_src: yes creates: /opt/node_exporter/node_exporter -- name: Install NVIDIA Container Toolkit repository - apt_repository: - repo: deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/$(ARCH) / - state: present - filename: nvidia-container-toolkit +# - name: Install NVIDIA Container Toolkit repository +# apt_repository: +# repo: deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/$(ARCH) / +# state: present +# filename: nvidia-container-toolkit - name: Install NVIDIA Container Toolkit apt: @@ -84,19 +88,19 @@ update_cache: yes - name: Pull DCGM Exporter container - docker_container: + community.docker.docker_container: name: dcgm-exporter - image: "nvidia/dcgm-exporter:{{ dcgm_exporter_version }}-ubuntu20.04" + image: "nvcr.io/nvidia/k8s/dcgm-exporter:{{ dcgm_exporter_version }}" state: started restart_policy: always ports: - "{{ dcgm_exporter_port }}:9400" - devices: - - "/dev/nvidiactl:/dev/nvidiactl" - - "/dev/nvidia0:/dev/nvidia0" - volumes: - - "/sys/class/nvidia-gpu:/sys/class/nvidia-gpu" - - "/run/nvidia-persistenced:/run/nvidia-persistenced" + capabilities: + - SYS_ADMIN + device_requests: + driver: nvidia + count: -1 + - name: Add Grafana GPG key apt_key: diff --git a/ansible/roles/requirements.yml b/ansible/roles/requirements.yml index ffbac44..24bb175 100644 --- a/ansible/roles/requirements.yml +++ b/ansible/roles/requirements.yml @@ -1,3 +1,5 @@ collections: - name: cloud.terraform version: "2.0.0" + - name: community.docker 
+ version: "4.1.0" \ No newline at end of file diff --git a/examples/observability.tfvars b/examples/observability.tfvars new file mode 100644 index 0000000..a02e35e --- /dev/null +++ b/examples/observability.tfvars @@ -0,0 +1,32 @@ +# common configuration +location = "us-northcentral1-a" +project_id = "4ba4b775-28a3-4481-bb93-9037a23fb8e0" +ssh_public_key_path = "~/.ssh/id_ed25519.pub" +vpc_subnet_id = "5963d82e-59cd-43ee-8b15-3e7b6fb9839b" + +# head node +slurm_head_node_count = 1 +slurm_head_node_type = "c1a.8x" + +# login node +slurm_login_node_count = 1 +slurm_login_node_type = "c1a.8x" + +# nfs node +slurm_nfs_node_type = "s1a.20x" +slurm_nfs_home_size = "1024GiB" + +# slurm-compute-node configuration +slurm_compute_node_type = "a40.1x" +slurm_compute_node_count = 1 + +# slurm users configuration +slurm_users = [{ + name = "user1" + uid = 1001 + ssh_pubkey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDQD5doToJjyyq0BH8TDlHZqqVy+kZpuGgJP5gbDanpF piotr.rojek (at) deepsense.ai" +}] + +# observability +enable_observability = true +grafana_admin_password = "admin123" \ No newline at end of file From 7fce2c24e1ee7066edd718e545ece549e7cedfb9 Mon Sep 17 00:00:00 2001 From: Piotr Rojek Date: Thu, 19 Dec 2024 13:15:01 +0100 Subject: [PATCH 3/3] Small changes to ansible script --- ansible/roles/observability/tasks/main.yml | 20 ++++----- ansible/slurm.yml | 50 ++++++++++++---------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml index 0d0f3a6..ff551e6 100644 --- a/ansible/roles/observability/tasks/main.yml +++ b/ansible/roles/observability/tasks/main.yml @@ -22,7 +22,7 @@ - apt-transport-https - software-properties-common state: present - update_cache: yes + update_cache: true # - name: Add Docker GPG key # apt_key: @@ -48,13 +48,13 @@ - docker-buildx-plugin - docker-compose-plugin state: present - update_cache: yes + update_cache: true - name: Ensure Docker service is 
running systemd: name: docker state: started - enabled: yes + enabled: true - name: Install Python Docker package apt: @@ -65,14 +65,14 @@ unarchive: src: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz" dest: /opt/prometheus - remote_src: yes + remote_src: true creates: /opt/prometheus/prometheus - name: Download and install Node Exporter unarchive: src: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-amd64.tar.gz" dest: /opt/node_exporter - remote_src: yes + remote_src: true creates: /opt/node_exporter/node_exporter # - name: Install NVIDIA Container Toolkit repository @@ -85,7 +85,7 @@ apt: name: nvidia-container-toolkit state: present - update_cache: yes + update_cache: true - name: Pull DCGM Exporter container community.docker.docker_container: @@ -100,7 +100,7 @@ device_requests: driver: nvidia count: -1 - + - name: Add Grafana GPG key apt_key: @@ -117,7 +117,7 @@ apt: name: grafana state: present - update_cache: yes + update_cache: true - name: Configure Prometheus template: @@ -201,7 +201,7 @@ - name: Create users and set permissions user: name: "{{ item.user }}" - system: yes + system: true create_home: no shell: /sbin/nologin with_items: @@ -213,7 +213,7 @@ path: "{{ item.path }}" owner: "{{ item.owner }}" group: "{{ item.group }}" - recurse: yes + recurse: true with_items: - { path: '/opt/prometheus', owner: '{{ prometheus_user }}', group: '{{ prometheus_group }}' } - { path: '/opt/node_exporter', owner: '{{ node_exporter_user }}', group: '{{ node_exporter_group }}' } diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 17f1164..417b9b8 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -1,44 +1,48 @@ -- hosts: all +- name: Set hostname + hosts: all remote_user: ubuntu - become: yes + become: true roles: - hostname -- hosts: slurm_nfs_nodes +- name: Configure 
NFS nodes + hosts: slurm_nfs_nodes remote_user: ubuntu - become: yes + become: true roles: - slurm_nfs_node -- hosts: slurm_head_nodes +- name: Configure head nodes + hosts: slurm_head_nodes remote_user: ubuntu - become: yes + become: true roles: - slurm_head_node - - { role: observability, - tags: ['observability'], + - { role: observability, + tags: ['observability'], when: enable_observability | default(false) | bool } -- hosts: slurm_compute_nodes +- name: Configure compute nodes + hosts: slurm_compute_nodes remote_user: ubuntu - become: yes + become: true roles: - slurm_compute_node - - { role: observability, - tags: ['observability'], - prometheus_server: false, + - { role: observability, + tags: ['observability'], + observability_prometheus_server: false, when: enable_observability | default(false) | bool } - hosts: all remote_user: ubuntu - become: yes + become: true tasks: - - name: "Add users" - ansible.builtin.include_role: - name: user - vars: - user_name: "{{ item.name }}" - user_uid: "{{ item.uid }}" - user_ssh_pubkey: "{{ item.ssh_pubkey }}" - loop: "{{ slurm_users }}" - when: slurm_users is defined + - name: "Add users" + ansible.builtin.include_role: + name: user + vars: + user_name: "{{ item.name }}" + user_uid: "{{ item.uid }}" + user_ssh_pubkey: "{{ item.ssh_pubkey }}" + loop: "{{ slurm_users }}" + when: slurm_users is defined