From bd1ee9f472aee9686664304e582f39326dfb3970 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?=
 <clement.nussbaumer@postfinance.ch>
Date: Tue, 12 Mar 2024 16:09:57 +0100
Subject: [PATCH] docs: neighbour filtering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Clément Nussbaumer <clement.nussbaumer@postfinance.ch>
---
 README.md                               | 52 +++++++++++++++++++++++--
 helm/kubenurse/templates/daemonset.yaml |  2 +
 helm/kubenurse/values.yaml              |  2 +
 internal/kubenurse/server.go            |  2 +
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b6dc7691..cb67bac5 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,12 @@
 ![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/postfinance/kubenurse)
 
 # Kubenurse
+
 kubenurse is a little service that monitors all network connections in a Kubernetes
 cluster. Kubenurse measures request durations, records errors and exports those metrics in Prometheus format.
 
 ## Deployment
+
 You can get the Docker image from [Docker Hub](https://hub.docker.com/r/postfinance/kubenurse/).
 The [examples](https://github.com/postfinance/kubenurse/tree/master/examples) directory
 contains manifests which can be used to deploy kubenurse to the kube-system namespace of your cluster.
@@ -45,6 +47,7 @@ The following command can be used to install kubenurse with Helm: `helm upgrade
 | insecure                           | Set `KUBENURSE_INSECURE` environment variable                                                                        | `true`                             |
 | allow_unschedulable                | Sets `KUBENURSE_ALLOW_UNSCHEDULABLE` environment variable                                                            | `false`                            |
 | neighbour_filter                   | Sets `KUBENURSE_NEIGHBOUR_FILTER` environment variable                                                               | `app.kubernetes.io/name=kubenurse` |
+| neighbour_limit                    | Sets `KUBENURSE_NEIGHBOUR_LIMIT` environment variable                                                                | `10`                               |
 | extra_ca                           | Sets `KUBENURSE_EXTRA_CA` environment variable                                                                       |                                    |
 | check_api_server_direct            | Sets `KUBENURSE_CHECK_API_SERVER_DIRECT` environment variable                                                        | `true`                             |
 | check_api_server_dns               | Sets `KUBENURSE_CHECK_API_SERVER_DNS` environment variable                                                           | `true`                             |
@@ -74,7 +77,6 @@ dashboards [as this example](./doc/grafana-kubenurse.json) that show network lat
 ![Grafana ingress view](doc/grafana_ingress.png "Grafana ingress view")
 ![Grafana path view](doc/grafana_path.png "Grafana path view")
 
-
 ## Configuration
 
 kubenurse is configured with environment variables:
@@ -85,12 +87,13 @@ kubenurse is configured with environment variables:
 - `KUBENURSE_EXTRA_CA`: Additional CA cert path for TLS connections
 - `KUBENURSE_NAMESPACE`: Namespace in which to look for the neighbour kubenurses
 - `KUBENURSE_NEIGHBOUR_FILTER`: A Kubernetes label selector (eg. `app=kubenurse`) to filter neighbour kubenurses
+- `KUBENURSE_NEIGHBOUR_LIMIT`: The maximum number of neighbours each kubenurse will query
 - `KUBENURSE_ALLOW_UNSCHEDULABLE`: If this is `"true"`, path checks to neighbouring kubenurses are made even if they are running on unschedulable nodes.
 - `KUBENURSE_CHECK_API_SERVER_DIRECT`: If this is `"true"` kubenurse will perform the check [API Server Direct](#API Server Direct). default is "true"
 - `KUBENURSE_CHECK_API_SERVER_DNS`: If this is `"true"`, kubenurse will perform the check [API Server DNS](#API Server DNS). default is "true"
 - `KUBENURSE_CHECK_ME_INGRESS`: If this is `"true"`, kubenurse will perform the check [Me Ingress](#Me Ingress). default is "true"
 - `KUBENURSE_CHECK_ME_SERVICE`: If this is `"true"`, kubenurse will perform the check [Me Service](#Me Service). default is "true"
-- `KUBENURSE_CHECK_NEIGHBOURHOOD`: If this is `"true"`, kubenurse will perform the check [Neighbourhood](#Neighbourhood). default is "true"
+- `KUBENURSE_CHECK_NEIGHBOURHOOD`: If this is `"true"`, kubenurse will perform the check [Neighbourhood](#neighbourhood). default is "true"
 - `KUBENURSE_CHECK_INTERVAL`: the frequency to perform kubenurse checks. the string should be formatted for [time.ParseDuration](https://pkg.go.dev/time#ParseDuration). defaults to `5s`
 - `KUBENURSE_REUSE_CONNECTIONS`: whether to reuse connections or not for all checks. default is "false"
 - `KUBENURSE_HISTOGRAM_BUCKETS`: optional comma-separated list of float64, used in place of the [default prometheus histogram buckets](https://pkg.go.dev/github.com/prometheus/client_golang@v1.16.0/prometheus#DefBuckets)
@@ -152,8 +155,8 @@ The `/alive` endpoint returns a JSON like this with status code 200 if everythin
 }
 ```
 
-
 ## Health Checks
+
 Every five seconds and on every access of `/alive`, the checks described below are run.
 Check results are cached for 3 seconds in order to prevent excessive network traffic.
 
@@ -162,12 +165,14 @@ A little illustration of what communication occurs, is here:
 ![Communication](doc/Communication.png "Communication")
 
 ### API Server Direct
+
 Checks the `/version` endpoint of the Kubernetes API Server through
 the direct link (`KUBERNETES_SERVICE_HOST`, `KUBERNETES_SERVICE_PORT`).
 
 Metric type: `api_server_direct`
 
 ### API Server DNS
+
 Checks the `/version` endpoint of the Kubernetes API Server through
 the Cluster DNS URL `https://kubernetes.default.svc:$KUBERNETES_SERVICE_PORT`.
 This also verifies a working `kube-dns` deployment.
@@ -175,6 +180,7 @@ This also verifies a working `kube-dns` deployment.
 Metric type: `api_server_dns`
 
 ### Me Ingress
+
 Checks if the kubenurse is reachable at the `/alwayshappy` endpoint behind the ingress.
 This address is provided by the environment variable `KUBENURSE_INGRESS_URL` that
 could look like `https://kubenurse.example.com`.
@@ -183,6 +189,7 @@ This also verifies a correct upstream DNS resolution.
 Metric type: `me_ingress`
 
 ### Me Service
+
 Checks if the kubenurse is reachable at the `/alwayshappy` endpoint through the Kubernetes service.
 The address is provided by the environment variable `KUBENURSE_SERVICE_URL` that
 could look like `http://kubenurse.mynamespace.default.svc:8080`.
@@ -191,6 +198,7 @@ This also verifies a working `kube-proxy` setup.
 Metric type: `me_service`
 
 ### Neighbourhood
+
 Checks if every neighbour kubenurse is reachable at the `/alwayshappy` endpoint.
 Neighbours are discovered by querying the kube-apiserver for every Pod in the
 `KUBENURSE_NAMESPACE` with label `KUBENURSE_NEIGHBOUR_FILTER`.
@@ -201,7 +209,44 @@ this can be changed by setting `KUBENURSE_ALLOW_UNSCHEDULABLE="true"`.
 
 Metric type: `path_$KUBELET_HOSTNAME`
 
+#### Neighbourhood filtering
+
+The number of checks for the neighbourhood used to grow as $O(N^2)$, which
+rendered `kubenurse` impractical on large clusters, as documented in issue
+[#55](https://github.com/postfinance/kubenurse/issues/55).
+To combat this, a node filtering feature was implemented, which works as follows:
+
+- kubenurse computes the `sha256` checksums for all neighbours' node names
+- it sorts those checksums (this is actually implemented with a max-heap)
+- it computes its own node name checksum, and queries the next 10 (per default)
+  nodes in the sorted checksums list
+
+Thanks to this, each kubenurse keeps querying its own stable set of 10 nodes,
+unless one of those nodes disappears, in which case it picks the next node in
+the sorted checksums list. This comes with several advantages:
+
+- because we first hash the node names, the checks are distributed
+  pseudo-randomly, independent of the node names. If we only picked the next 10
+  nodes in a sorted list of the node names, then we might have biased the
+  results in environments where node names are sequential
+- metrics-wise, a `kubenurse` pod should typically only have entries for ca. 10
+  other neighbouring nodes worth of checks, which greatly reduces the load on
+  your monitoring infrastructure
+- because we use a deterministic algorithm to choose which nodes to query, the
+  metrics churn rate stays minimal. (that is, if we randomly picked 10 nodes
+  for every check, then in the end there would be one prometheus bucket for
+  every node on the cluster, which would put useless load on the monitoring
+  infrastructure)
+
+By default, the neighbourhood filtering is set to 10 nodes, which means that
+on clusters with more than 10 nodes, each kubenurse will query 10 nodes, as
+described above.
+
+To bypass the node filtering feature, you simply need to set the
+`KUBENURSE_NEIGHBOUR_LIMIT` environment variable to 0.
+
 ## Metrics
+
 All performed checks expose metrics which can be used to monitor/alert:
 
 - SDN network latencies and errors
@@ -214,5 +259,6 @@ All performed checks expose metrics which can be used to monitor/alert:
 - External DNS resolution errors (ingress URL resolution)
 
 At `/metrics` you will find these:
+
 - `kubenurse_errors_total`: Kubenurse error counter partitioned by error type
 - `kubenurse_request_duration`: a histogram for Kubenurse request duration partitioned by error type
diff --git a/helm/kubenurse/templates/daemonset.yaml b/helm/kubenurse/templates/daemonset.yaml
index 1f56a3ce..fa5d0118 100644
--- a/helm/kubenurse/templates/daemonset.yaml
+++ b/helm/kubenurse/templates/daemonset.yaml
@@ -56,6 +56,8 @@ spec:
           value: {{ .Release.Namespace }}
         - name: KUBENURSE_NEIGHBOUR_FILTER
           value: {{ .Values.neighbour_filter }}
+        - name: KUBENURSE_NEIGHBOUR_LIMIT
+          value: {{ .Values.neighbour_limit | quote }}
           {{- if .Values.extra_ca }}
         - name: KUBENURSE_EXTRA_CA
           value: {{ .Values.extra_ca }}
diff --git a/helm/kubenurse/values.yaml b/helm/kubenurse/values.yaml
index ad0561d5..2b7312a2 100644
--- a/helm/kubenurse/values.yaml
+++ b/helm/kubenurse/values.yaml
@@ -35,6 +35,8 @@ service_url: ""
 allow_unschedulable: false
 # KUBENURSE_NEIGHBOUR_FILTER
 neighbour_filter: app.kubernetes.io/name=kubenurse
+# KUBENURSE_NEIGHBOUR_LIMIT
+neighbour_limit: 10
 # KUBENURSE_EXTRA_CA
 extra_ca: ""
 # KUBENURSE_CHECK_API_SERVER_DIRECT
diff --git a/internal/kubenurse/server.go b/internal/kubenurse/server.go
index e91a5a86..cc794fe8 100644
--- a/internal/kubenurse/server.go
+++ b/internal/kubenurse/server.go
@@ -148,6 +148,8 @@ func New(ctx context.Context, c client.Client) (*Server, error) { //nolint:funle
 		if err != nil {
 			return nil, err
 		}
+	} else {
+		chk.NeighbourLimit = 10
 	}
 
 	//nolint:goconst // No need to make "false" a constant in my opinion, readability is better like this.