From 6e15ae128b122c20aa80108f04143fa6243a2c9a Mon Sep 17 00:00:00 2001
From: Jussi Nummelin
Date: Thu, 25 Apr 2024 13:46:47 +0300
Subject: [PATCH] Add docs how to collect support bundles

This also adds common "baseline" yamls for both controller & worker
roles on what to collect.

Signed-off-by: Jussi Nummelin
---
 docs/support-bundle-controller.yaml | 245 +++++++++++++++++++++++
 docs/support-bundle-worker.yaml     | 294 ++++++++++++++++++++++++++++
 docs/support-dump.md                |  44 +++++
 mkdocs.yml                          |   1 +
 4 files changed, 584 insertions(+)
 create mode 100644 docs/support-bundle-controller.yaml
 create mode 100644 docs/support-bundle-worker.yaml
 create mode 100644 docs/support-dump.md

diff --git a/docs/support-bundle-controller.yaml b/docs/support-bundle-controller.yaml
new file mode 100644
index 000000000000..8a436d47be72
--- /dev/null
+++ b/docs/support-bundle-controller.yaml
@@ -0,0 +1,245 @@
+# Baseline support bundle spec for k0s controller nodes.
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: controller
+spec:
+  uri: # TODO
+  collectors:
+    - clusterInfo: {}
+    - clusterResources:
+        namespaces:
+          - kube-system
+          - k0s-autopilot
+          - kube-node-lease
+          - default  # so we get kubernetes svc endpoints
+    - nodeMetrics: {}
+  hostCollectors:
+    # System Info Collectors
+    - cpu: {}
+    - hostOS: {}
+    - hostServices: {}
+    - ipv4Interfaces: {}
+    - memory: {}
+    - time: {}
+    # Certificate Info for ETCD and K8s API
+    - certificate:
+        collectorName: k8s-api-keypair
+        certificatePath: /var/lib/k0s/pki/server.crt
+        keyPath: /var/lib/k0s/pki/server.key
+    - certificate:
+        collectorName: etcd-keypair
+        certificatePath: /var/lib/k0s/pki/etcd/server.crt
+        keyPath: /var/lib/k0s/pki/etcd/server.key
+    # Disk usage for the directories checked by the diskUsage hostAnalyzers below
+    - diskUsage:
+        collectorName: root
+        path: /
+    - diskUsage:
+        collectorName: tmp
+        path: /tmp
+    # Run collectors for system information
+    - run:
+        collectorName: k8s-api-healthz-6443
+        command: "curl"
+        args: ["-k", "--cert", "/var/lib/k0s/pki/admin.crt", "--key", "/var/lib/k0s/pki/admin.key", "https://localhost:6443/healthz?verbose"]
+    - run:
+        collectorName: curl-etcd-health-2379
+        command: "curl"
+        args: ["-ki", "https://localhost:2379/health", "--cert", "/var/lib/k0s/pki/apiserver-etcd-client.crt", "--key", "/var/lib/k0s/pki/apiserver-etcd-client.key"]
+    - run:
+        collectorName: etcd-members
+        command: "k0s"
+        args: ["etcd", "member-list"]
+    - run:
+        collectorName: "free"
+        command: "free"
+        args: ["-m"]
+    - run:
+        collectorName: "top"
+        command: "top"
+        args: ["-b", "-n", "1"]
+    - run:
+        collectorName: "uptime"
+        command: "uptime"
+        args: []
+    - run:
+        collectorName: "uname"
+        command: "uname"
+        args: ["-a"]
+    - run:
+        collectorName: "df"
+        command: "df"
+        args: ["-h"]
+    - run:
+        collectorName: "iostat"
+        command: "iostat"
+        args: ["-x"]
+    # Systemctl service statuses
+    - run:
+        collectorName: "systemctl-firewalld-status"
+        command: "systemctl"
+        args: ["status", "firewalld"]
+    - run:
+        collectorName: "systemctl-ufw-status"
+        command: "systemctl"
+        args: ["status", "ufw"]
+    - run:
+        collectorName: "systemctl-k0s-status"
+        command: "systemctl"
+        args: ["status", "k0s*"]
+    # Systemd Service Configurations
+    - run:
+        collectorName: "systemctl-cat-journald"
+        command: "systemctl"
+        args: ["cat", "systemd-journald"]
+    - run:
+        collectorName: "systemctl-cat-k0s"
+        command: "systemctl"
+        args: ["cat", "k0s*"]
+    # TODO Add same checks for rc-service
+    # Logs for k0s
+    - run:
+        collectorName: "journalctl-k0s"
+        command: "journalctl"
+        args: ["-u", "k0s*", "--no-pager", "-S", "7 days ago"]
+    - run:
+        collectorName: "journalctl-dmesg"
+        command: "journalctl"
+        args: ["--dmesg", "--no-pager", "-S", "7 days ago"]
+    # k0s status
+    - run:
+        collectorName: k0s-status
+        command: "k0s"
+        args: ["status", "-o", "yaml"]
+    # Gathering hostname info to help troubleshoot scenarios with a hostname mismatch
+    - run:
+        collectorName: "hostnames"
+        command: "sh"
+        args:
+          - -c
+          - |
+            echo "hostname = $(hostname)"
+            echo "/proc/sys/kernel/hostname = $(cat /proc/sys/kernel/hostname)"
+            echo "uname -n = $(uname -n)"
+    # System Info Collectors
+    - run:
+        collectorName: "vmstat"
+        command: "vmstat"
+        args: ["-w"]
+    - run:
+        collectorName: "ps-high-load"
+        command: "sh"
+        args: ["-c", "ps -eo s,user,cmd | grep '^[RD]' | sort | uniq -c | sort -nbr | head -20"]
+    # Benchmark of the etcd data directory; switched off via `exclude: true`
+    - filesystemPerformance:
+        collectorName: filesystem-latency-two-minute-benchmark
+        timeout: 2m
+        directory: /var/lib/k0s/etcd
+        fileSize: 22Mi
+        operationSizeBytes: 2300
+        datasync: true
+        enableBackgroundIOPS: true
+        backgroundIOPSWarmupSeconds: 10
+        backgroundWriteIOPS: 300
+        backgroundWriteIOPSJobs: 6
+        backgroundReadIOPS: 50
+        backgroundReadIOPSJobs: 1
+        exclude: true
+    - run:
+        collectorName: "localhost-ips"
+        command: "sh"
+        args: ["-c", "host localhost"]
+  hostAnalyzers:
+    - time:
+        checkName: "ntp-status"
+        outcomes:
+          - fail:
+              when: "ntp == unsynchronized+inactive"
+              message: "System clock is not synchronized"
+          - warn:
+              when: "ntp == unsynchronized+active"
+              message: System clock not yet synchronized
+          - pass:
+              when: "ntp == synchronized+active"
+              message: "System clock is synchronized"
+          - warn:
+              when: "timezone != UTC"
+              message: "Non UTC timezone can interfere with system function"
+          - pass:
+              when: "timezone == UTC"
+              message: "Timezone is set to UTC"
+    - diskUsage:
+        checkName: "root"
+        collectorName: "root"
+        outcomes:
+          - fail:
+              when: "total < 40Gi"
+              message: The disk containing directory / has less than 40Gi of total space
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory / is more than 80% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory / has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory / has sufficient space
+    - diskUsage:
+        checkName: "tmp"
+        collectorName: "tmp"
+        outcomes:
+          - warn:
+              when: "total < 8Gi"
+              message: The disk containing directory /tmp has less than 8Gi of total space
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory /tmp is more than 80% full
+          - warn:
+              when: "available < 2Gi"
+              message: The disk containing directory /tmp has less than 2Gi of disk space available
+          - pass:
+              message: The disk containing directory /tmp has sufficient space
+    - filesystemPerformance:
+        collectorName: filesystem-latency-two-minute-benchmark
+        outcomes:
+          - pass:
+              when: "p99 < 10ms"
+              message: "Write latency is ok (p99 target < 10ms)"
+          - warn:
+              message: "Write latency is high (p99 >= 10ms, target < 10ms)"
+        exclude: true
+  analyzers:
+    # Each textAnalyze reads a run collector's output: host-collectors/run-host/<collectorName>.txt
+    - textAnalyze:
+        checkName: Kubernetes API health check
+        fileName: host-collectors/run-host/k8s-api-healthz-6443.txt
+        regex: "healthz check passed"
+        outcomes:
+          - fail:
+              when: "false"
+              message: "Kubernetes API health check did not pass. One or more components are not working."
+          - pass:
+              when: "true"
+              message: "Kubernetes API health check passed"
+    - textAnalyze:
+        checkName: ETCD API Health
+        fileName: host-collectors/run-host/curl-etcd-health-2379.txt
+        regex: '"health":"true"'
+        outcomes:
+          - fail:
+              when: "false"
+              message: "ETCD status returned: unhealthy"
+          - pass:
+              when: "true"
+              message: "ETCD status returned: healthy"
+    - textAnalyze:
+        checkName: Check if localhost resolves to 127.0.0.1
+        fileName: host-collectors/run-host/localhost-ips.txt
+        regex: 'localhost has address 127\.0\.0\.1'
+        outcomes:
+          - fail:
+              when: "false"
+              message: "'localhost' does not resolve to 127.0.0.1 ip address"
+          - pass:
+              when: "true"
+              message: "'localhost' resolves to 127.0.0.1 ip address"
diff --git a/docs/support-bundle-worker.yaml b/docs/support-bundle-worker.yaml
new file mode 100644
index 000000000000..30f1493572ac
--- /dev/null
+++ b/docs/support-bundle-worker.yaml
@@ -0,0 +1,294 @@
+# Host-level spec for k0s workers; usable when the cluster is down and in-cluster specs can't be run
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: worker
+spec:
+  uri: # TODO
+  hostCollectors:
+    # System Info Collectors
+    - blockDevices: {}
+    - cpu: {}
+    - hostOS: {}
+    - hostServices: {}
+    - ipv4Interfaces: {}
+    - memory: {}
+    - time: {}
+    # Disk usage for the directories checked by the diskUsage hostAnalyzers below
+    - diskUsage:
+        collectorName: root
+        path: /
+    - diskUsage:
+        collectorName: tmp
+        path: /tmp
+    - diskUsage:
+        collectorName: var-lib-k0s
+        path: /var/lib/k0s
+    # Run collectors for system information
+    - run:
+        collectorName: k8s-api-healthz-6443
+        command: "k0s"
+        args: ["kubectl", "--kubeconfig", "/var/lib/k0s/kubelet.conf", "get", "--raw", "/healthz?verbose"]
+    - run:
+        collectorName: "free"
+        command: "free"
+        args: ["-m"]
+    - run:
+        collectorName: "top"
+        command: "top"
+        args: ["-b", "-n", "1"]
+    - run:
+        collectorName: "uptime"
+        command: "uptime"
+        args: []
+    - run:
+        collectorName: "uname"
+        command: "uname"
+        args: ["-a"]
+    - run:
+        collectorName: "df"
+        command: "df"
+        args: ["-h"]
+    - run:
+        collectorName: "iostat"
+        command: "iostat"
+        args: ["-x"]
+    # SELinux and AppArmor status, containers, firewall rules, block devices, networking
+    - run:
+        collectorName: "sestatus"
+        command: "sestatus"
+        args: []
+    - run:
+        collectorName: "apparmor-status"
+        command: "apparmor_status"
+        args: []
+    - run:
+        collectorName: "crictl-ps"
+        command: "k0s"
+        args: ["ctr", "c", "ls"]
+    - run:
+        collectorName: "iptables"
+        command: "/var/lib/k0s/bin/iptables"
+        args: ["-L", "-v"]
+    - run:
+        collectorName: "iptables-version"
+        command: "/var/lib/k0s/bin/iptables"
+        args: ["-V"]
+    - run:
+        collectorName: "lsblk"
+        command: "lsblk"
+        args: ["--fs"]
+    - run:
+        collectorName: "netstat-ports"
+        command: "netstat"
+        args: ["-t", "-u", "-l", "-p", "-n"]
+    - run:
+        collectorName: "netstat-route-table"
+        command: "netstat"
+        args: ["-r", "-n"]
+    - run:
+        collectorName: "ip-route-table"
+        command: "ip"
+        args: ["route"]
+    - run:
+        collectorName: "sysctl"
+        command: "sysctl"
+        args: ["-a"]
+    # Systemctl service status for the k0s units
+    - run:
+        collectorName: "systemctl-k0s-status"
+        command: "systemctl"
+        args: ["status", "k0s*"]
+    # Systemd Service Configurations for journald and the k0s units
+    - run:
+        collectorName: "systemctl-cat-journald"
+        command: "systemctl"
+        args: ["cat", "systemd-journald"]
+    - run:
+        collectorName: "systemctl-cat-k0s"
+        command: "systemctl"
+        args: ["cat", "k0s*"]
+    # Logs for the k0s units and the kernel, last 7 days
+    - run:
+        collectorName: "journalctl-k0s"
+        command: "journalctl"
+        args: ["-u", "k0s*", "--no-pager", "-S", "7 days ago"]
+    - run:
+        collectorName: "journalctl-dmesg"
+        command: "journalctl"
+        args: ["--dmesg", "--no-pager", "-S", "7 days ago"]
+
+    # sysctl parameters (stderr discarded)
+    - run:
+        collectorName: "sysctl-all"
+        command: "sh"
+        args: ["-c", "sysctl --all 2>/dev/null"]
+    - run:
+        collectorName: "k0s-sysinfo"
+        command: "k0s"
+        args: ["sysinfo"]
+    # Gathering hostname info to help troubleshoot scenarios with a hostname mismatch
+    - run:
+        collectorName: "hostnames"
+        command: "sh"
+        args:
+          - -c
+          - |
+            echo "hostname = $(hostname)"
+            echo "/proc/sys/kernel/hostname = $(cat /proc/sys/kernel/hostname)"
+            echo "uname -n = $(uname -n)"
+
+    # System Info Collectors
+    - run:
+        collectorName: "du-root"
+        command: "sh"
+        args: ["-c", "du -Shax / --exclude /proc | sort -rh | head -20"]
+    - run:
+        collectorName: "mount"
+        command: "mount"
+        args: ["-l"]
+    - run:
+        collectorName: "vmstat"
+        command: "vmstat"
+        args: ["-w"]
+    - run:
+        collectorName: "ps-high-load"
+        command: "sh"
+        args: ["-c", "ps -eo s,user,cmd | grep '^[RD]' | sort | uniq -c | sort -nbr | head -20"]
+    - run:
+        collectorName: "ps-detect-antivirus-and-security-tools"
+        command: "sh"
+        args: ["-c", "ps -ef | grep -E 'clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio' | grep -v grep"]
+    - run:
+        collectorName: "localhost-ips"
+        command: "sh"
+        args: ["-c", "host localhost"]
+  hostAnalyzers:
+    - time:
+        checkName: "ntp-status"
+        outcomes:
+          - fail:
+              when: "ntp == unsynchronized+inactive"
+              message: "System clock is not synchronized"
+          - warn:
+              when: "ntp == unsynchronized+active"
+              message: System clock not yet synchronized
+          - pass:
+              when: "ntp == synchronized+active"
+              message: "System clock is synchronized"
+          - warn:
+              when: "timezone != UTC"
+              message: "Non UTC timezone can interfere with system function"
+          - pass:
+              when: "timezone == UTC"
+              message: "Timezone is set to UTC"
+    - diskUsage:
+        checkName: "root"
+        collectorName: "root"
+        outcomes:
+          - fail:
+              when: "total < 40Gi"
+              message: The disk containing directory / has less than 40Gi of total space
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory / is more than 80% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory / has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory / has sufficient space
+    - diskUsage:
+        checkName: "tmp"
+        collectorName: "tmp"
+        outcomes:
+          - warn:
+              when: "total < 8Gi"
+              message: The disk containing directory /tmp has less than 8Gi of total space
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory /tmp is more than 80% full
+          - warn:
+              when: "available < 2Gi"
+              message: The disk containing directory /tmp has less than 2Gi of disk space available
+          - pass:
+              message: The disk containing directory /tmp has sufficient space
+    - diskUsage:
+        checkName: "var-lib-k0s"
+        collectorName: "var-lib-k0s"
+        outcomes:
+          - warn:
+              when: "used/total > 80%"
+              message: The disk containing directory /var/lib/k0s is more than 80% full
+          - warn:
+              when: "available < 10Gi"
+              message: The disk containing directory /var/lib/k0s has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory /var/lib/k0s has sufficient space
+
+    - filesystemPerformance:
+        collectorName: filesystem-latency-two-minute-benchmark
+        outcomes:
+          - pass:
+              when: "p99 < 10ms"
+              message: "Write latency is ok (p99 target < 10ms)"
+          - warn:
+              message: "Write latency is high (p99 >= 10ms, target < 10ms)"
+        exclude: true
+  analyzers:
+    - textAnalyze:
+        checkName: Hostname Mismatch
+        fileName: host-collectors/run-host/journalctl-k0s.txt
+        regex: ".*can only access node lease with the same name as the requesting node.*"
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Possible hostname change. Verify that the current hostname matches what's expected by the k8s control plane"
+          - pass:
+              when: "false"
+              message: "No signs of hostname changes found"
+    - textAnalyze:
+        checkName: Kubernetes API health check
+        fileName: host-collectors/run-host/k8s-api-healthz-6443.txt
+        regex: "healthz check passed"
+        outcomes:
+          - fail:
+              when: "false"
+              message: "Kubernetes API health check did not pass. One or more components are not working."
+          - pass:
+              when: "true"
+              message: "Kubernetes API health check passed"
+    - textAnalyze:
+        checkName: Check if localhost resolves to 127.0.0.1
+        fileName: host-collectors/run-host/localhost-ips.txt
+        regex: 'localhost has address 127\.0\.0\.1'
+        outcomes:
+          - fail:
+              when: "false"
+              message: "'localhost' does not resolve to 127.0.0.1 ip address"
+          - pass:
+              when: "true"
+              message: "'localhost' resolves to 127.0.0.1 ip address"
+    - textAnalyze:
+        checkName: Check if SELinux is enabled
+        fileName: host-collectors/run-host/sestatus.txt
+        regex: '(?m)^Current mode:\s+enforcing'
+        ignoreIfNoFiles: true
+        outcomes:
+          - fail:
+              when: "true"
+              message: "SELinux is enabled when it should be disabled for kubernetes to work properly"
+          - pass:
+              when: "false"
+              message: "SELinux is disabled as expected"
+    - textAnalyze:
+        checkName: "Detect Threat Management and Network Security Tools"
+        fileName: host-collectors/run-host/ps-detect-antivirus-and-security-tools.txt
+        regex: '\b(clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio)\b'
+        ignoreIfNoFiles: true
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Antivirus or Network Security tools detected. These tools can interfere with kubernetes operation."
+          - pass:
+              when: "false"
+              message: "No Antivirus or Network Security tools detected."
diff --git a/docs/support-dump.md b/docs/support-dump.md
new file mode 100644
index 000000000000..dc9fda12d135
--- /dev/null
+++ b/docs/support-dump.md
@@ -0,0 +1,44 @@
+# Support Insights
+
+In many cases, especially when looking for [commercial support](commercial-support.md), there's a need to share the cluster state with other people.
+While one could always give access to the live cluster, that is not always desired or even possible.
+
+For those kinds of cases we can lean on the work our friends at [troubleshoot.sh](https://troubleshoot.sh) have done.
+
+With the troubleshoot tool you can essentially take a dump of the cluster state and share it with other people. You can even use the [sbctl](https://github.com/replicatedhq/sbctl) tool to make the dump tarball act as a Kubernetes API.
+
+Let's look at how this works with k0s.
+
+## Setting up
+
+To gather all the needed data we need another tool called [`support-bundle`](https://troubleshoot.sh/docs/support-bundle/introduction/).
+
+You can download it from the [releases page](https://github.com/replicatedhq/troubleshoot/releases); make sure you download the build for the right architecture.
+
+## Creating support bundle
+
+A Support Bundle needs to know what to collect and, optionally, what to analyze. This is defined in a YAML file.
+
+While you can customize the data collection and analysis for your specific needs, we've made good reference specs for k0s. These cover the core k0s things like:
+
+- collecting info on the host
+- collecting system component statuses from `kube-system` namespace
+- checking health of Kubernetes API, Etcd etc. components
+- collecting k0s logs
+- checking status of firewalls, anti-virus etc. services which are known to interfere with Kubernetes
+
+As we need to collect host-level info, you should run the commands on the hosts directly, on controllers and/or workers.
+
+To get a support bundle, after setting up the [tooling](#setting-up), you simply run:
+
+```shell
+support-bundle --kubeconfig /var/lib/k0s/pki/admin.conf https://docs.k0sproject.io/stable/support-bundle-<role>.yaml
+```
+
+Above, `<role>` refers to either `controller` or `worker`. For different roles we collect different things. If you are running a controller with `--enable-worker` or `--single`, where it also becomes a worker, you can also get a combined dump:
+
+```shell
+support-bundle --kubeconfig /var/lib/k0s/pki/admin.conf https://docs.k0sproject.io/stable/support-bundle-controller.yaml https://docs.k0sproject.io/stable/support-bundle-worker.yaml
+```
+
+Once the data collection and analysis finish, you will get a file named like `support-bundle-<timestamp>.tar.gz`. The file contains all the collected info, which you can share with other people.
diff --git a/mkdocs.yml b/mkdocs.yml
index 20922a4016c7..a6d9ed4c2a41 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -67,6 +67,7 @@ nav:
   - Troubleshooting:
       - FAQ: FAQ.md
       - Common Pitfalls: troubleshooting.md
+      - Support Insights: support-dump.md
   - Reference:
       - Architecture: architecture.md
       - Kube-bench Security Benchmark: cis_benchmark.md