From 94925a0013298106c45c2eb23a3dacd0a035d96e Mon Sep 17 00:00:00 2001 From: Domenic Rosati Date: Tue, 5 Mar 2019 10:54:15 -0400 Subject: [PATCH] add initial valence manifests and instructions --- README.md | 257 +++ example-workloads.yaml | 1008 ++++++++++++ example/kustomization.yaml | 3 + .../kube-state-metrics/deployment.yaml | 22 + .../kube-state-metrics/kustomization.yaml | 7 + example/tooling/kube-state-metrics/rbac.yaml | 91 ++ .../tooling/kube-state-metrics/service.yaml | 18 + example/tooling/kustomization.yaml | 3 + .../tooling/metrics-server/api-service.yaml | 14 + .../metrics-server/auth-delegator.yaml | 13 + .../tooling/metrics-server/auth-reader.yaml | 14 + .../tooling/metrics-server/deployment.yaml | 40 + .../tooling/metrics-server/kustomization.yaml | 10 + .../metrics-server/resource-reader.yaml | 38 + example/tooling/metrics-server/service.yaml | 15 + example/workloads/kustomization.yaml | 10 + example/workloads/postgres/deployment.yaml | 41 + example/workloads/postgres/kustomization.yaml | 6 + .../workloads/postgres/persistantVolume.yaml | 13 + example/workloads/postgres/service.yaml | 13 + example/workloads/slo-microservices.yaml | 14 + example/workloads/slo-webapps.yaml | 13 + .../deployment.yaml | 61 + .../kustomization.yaml | 6 + .../todo-backend-django-valence/service.yaml | 17 + .../todo-backend-django-valence/workload.yaml | 32 + .../todo-backend-django/deployment.yaml | 52 + .../todo-backend-django/kustomization.yaml | 6 + .../todo-backend-django/service.yaml | 17 + .../todo-backend-django/workload.yaml | 32 + .../todo-backend-express/deployment.yaml | 60 + .../todo-backend-express/kustomization.yaml | 6 + .../todo-backend-express/service.yaml | 17 + .../todo-backend-express/workload.yaml | 32 + .../todo-backend-golang/deployment.yaml | 56 + .../todo-backend-golang/kustomization.yaml | 6 + .../todo-backend-golang/service.yaml | 17 + .../todo-backend-golang/workload.yaml | 32 + .../todo-backend-java/deployment.yaml | 58 + 
.../todo-backend-java/kustomization.yaml | 6 + .../workloads/todo-backend-java/service.yaml | 17 + .../workloads/todo-backend-java/workload.yaml | 32 + makefile | 8 + manifests/kustomization.yaml | 2 + manifests/valence/grafana/configMap.yaml | 31 + .../valence/grafana/dashboard-valence.yaml | 632 ++++++++ manifests/valence/grafana/deployment.yaml | 52 + manifests/valence/grafana/kustomization.yaml | 10 + manifests/valence/grafana/service.yaml | 10 + manifests/valence/kustomization.yaml | 8 + manifests/valence/operator/crds.yaml | 11 + manifests/valence/operator/deployment.yaml | 27 + manifests/valence/operator/kustomization.yaml | 12 + manifests/valence/operator/namespace.yaml | 4 + manifests/valence/operator/rbac.yaml | 46 + manifests/valence/operator/service.yaml | 12 + manifests/valence/prometheus/config-map.yaml | 310 ++++ .../valence/prometheus/kustomization.yaml | 8 + .../prometheus-service-accounts.yaml | 42 + manifests/valence/prometheus/service.yaml | 13 + .../valence/prometheus/stateful-set.yaml | 50 + valence.yaml | 1403 +++++++++++++++++ 62 files changed, 4916 insertions(+) create mode 100644 README.md create mode 100644 example-workloads.yaml create mode 100644 example/kustomization.yaml create mode 100644 example/tooling/kube-state-metrics/deployment.yaml create mode 100644 example/tooling/kube-state-metrics/kustomization.yaml create mode 100644 example/tooling/kube-state-metrics/rbac.yaml create mode 100644 example/tooling/kube-state-metrics/service.yaml create mode 100644 example/tooling/kustomization.yaml create mode 100644 example/tooling/metrics-server/api-service.yaml create mode 100644 example/tooling/metrics-server/auth-delegator.yaml create mode 100644 example/tooling/metrics-server/auth-reader.yaml create mode 100644 example/tooling/metrics-server/deployment.yaml create mode 100644 example/tooling/metrics-server/kustomization.yaml create mode 100644 example/tooling/metrics-server/resource-reader.yaml create mode 100644 
example/tooling/metrics-server/service.yaml create mode 100644 example/workloads/kustomization.yaml create mode 100644 example/workloads/postgres/deployment.yaml create mode 100644 example/workloads/postgres/kustomization.yaml create mode 100644 example/workloads/postgres/persistantVolume.yaml create mode 100644 example/workloads/postgres/service.yaml create mode 100644 example/workloads/slo-microservices.yaml create mode 100644 example/workloads/slo-webapps.yaml create mode 100644 example/workloads/todo-backend-django-valence/deployment.yaml create mode 100644 example/workloads/todo-backend-django-valence/kustomization.yaml create mode 100644 example/workloads/todo-backend-django-valence/service.yaml create mode 100644 example/workloads/todo-backend-django-valence/workload.yaml create mode 100644 example/workloads/todo-backend-django/deployment.yaml create mode 100644 example/workloads/todo-backend-django/kustomization.yaml create mode 100644 example/workloads/todo-backend-django/service.yaml create mode 100644 example/workloads/todo-backend-django/workload.yaml create mode 100644 example/workloads/todo-backend-express/deployment.yaml create mode 100644 example/workloads/todo-backend-express/kustomization.yaml create mode 100644 example/workloads/todo-backend-express/service.yaml create mode 100644 example/workloads/todo-backend-express/workload.yaml create mode 100644 example/workloads/todo-backend-golang/deployment.yaml create mode 100644 example/workloads/todo-backend-golang/kustomization.yaml create mode 100644 example/workloads/todo-backend-golang/service.yaml create mode 100644 example/workloads/todo-backend-golang/workload.yaml create mode 100644 example/workloads/todo-backend-java/deployment.yaml create mode 100644 example/workloads/todo-backend-java/kustomization.yaml create mode 100644 example/workloads/todo-backend-java/service.yaml create mode 100644 example/workloads/todo-backend-java/workload.yaml create mode 100644 makefile create mode 100644 
manifests/kustomization.yaml create mode 100644 manifests/valence/grafana/configMap.yaml create mode 100644 manifests/valence/grafana/dashboard-valence.yaml create mode 100644 manifests/valence/grafana/deployment.yaml create mode 100644 manifests/valence/grafana/kustomization.yaml create mode 100644 manifests/valence/grafana/service.yaml create mode 100644 manifests/valence/kustomization.yaml create mode 100644 manifests/valence/operator/crds.yaml create mode 100644 manifests/valence/operator/deployment.yaml create mode 100644 manifests/valence/operator/kustomization.yaml create mode 100644 manifests/valence/operator/namespace.yaml create mode 100644 manifests/valence/operator/rbac.yaml create mode 100644 manifests/valence/operator/service.yaml create mode 100644 manifests/valence/prometheus/config-map.yaml create mode 100644 manifests/valence/prometheus/kustomization.yaml create mode 100644 manifests/valence/prometheus/prometheus-service-accounts.yaml create mode 100644 manifests/valence/prometheus/service.yaml create mode 100644 manifests/valence/prometheus/stateful-set.yaml create mode 100644 valence.yaml diff --git a/README.md b/README.md new file mode 100644 index 0000000..2e0a34a --- /dev/null +++ b/README.md @@ -0,0 +1,257 @@ +# Valence + +Valence is a cost and performance management operator for Kubernetes for right sizing and autoscaling containers intelligently to meet performance objectives. It learns how applications behave and optimizes resources according to defined Service Level Objectives manifests. Valence acts as a bidirectional pod autoscaling solution and/or intelligent right sizing solution in order to ensure maximum utility of your cluster without performance degradation. + +Valence is based on the notion of Declarative Performance. We believe you should be able to declare performance objectives and have an operator, Valence, figure out how to autoscale, right size, and pack your Kubernetes resources. 
In contrast, current Kubernetes scaling and performance management tools are largely imperative requiring overhead to determine right size, autoscaling metrics, and related configuration. Since code, traffic, and node utilization change - we believe this should be managed automatically by an operator, rather than by manual calculation and intervention. We also think the right unit of scaling isn't utilization or metrics thresholds but based, dynamically, on how an application's behaviour (utilization) responds to its use (such as HTTP Requests). + +1) [Suggested On-boarding](#suggested-on-boarding) +2) [Installation](#installation) +3) [Using Valence](#using-valence) +4) [Testing Valence with Example Workloads](#example-workloads) + +Want to get started quickly with example workloads? +- start on a fresh cluster such as docker-for-desktop +- if your cluster already has metrics-server remove `./metrics-server` from `./example/tooling/kustomization.yaml` and recompile `make example-workloads` +- `kubectl apply -f valence.yaml -f example-workloads.yaml` +- `kubectl proxy svc/grafana -n valence-system &` +- `open http://localhost:8001/api/v1/namespaces/valence-system/services/grafana/proxy` +- Recommendations for Replicas, Requests and Limits, and live changes to those should start coming in 5-20 minutes. + +## Suggested On-boarding +In order to get the most out of Valence, we recommend starting with Valence in recommendation mode. This will allow you to gain a comfortable level and understanding of the configuration options of Valence, before going into Live mode where Valence takes control of your deployments resourcing and scaling on your behalf. + +**Step 1 - Installation:** +Follow the installation instructions below (full support from the Valence team will be available) + +**Step 2 - Recommendation Mode:** +Pick a few deployments you’d like to see recommendations being made on and write SLO manifests for them. 
+We recommend you observe Valence recommendations for a couple days at this point. Please discuss any concerns you may have or feedback with the Valence team as you are observing recommendations. During this period you should manually use those recommendations as you please. +**Note: our prometheus only retains data for 6 hours so you will have to make your observations accordingly** + +**Step 3 - Live Mode, limited deployments:** +Now we recommend you let Valence take full control of those deployments by [using Valence Annotations](#using-valence-annotations). Again take a couple days to observe how Valence is operating those deployments and direct any feedback to the Valence team. + +**Step 4 - Full roll out:** +Add more deployments for recommendations or management by Valence. + +## Installation + +Installing Valence: +1. [Installing Valence Operator](#installing-valence-operator) +2. [Preparing Deployments and Services for Operation by Valence](#operating-with-valence) +3. [Setting SLOs](#setting-slos) + +### Installing Valence Operator + +Valence is an operator that lives in its own namespace with all the tools it needs. + + +You will need to have the following components installed to use Valence. +If you don't have these, you can take a look at the tooling manifests for examples. +**Prerequisites:** +- [metrics-server](https://github.com/kubernetes-incubator/metrics-server) +- [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) + +Valence can be installed by applying the valence.yaml you will find in the valence repo. +``` +kubectl apply -f valence.yaml +``` +Valence can be removed by deleting valence.yaml +``` +kubectl delete -f valence.yaml +``` + +Components installed in valence-system namespace: +- Prometheus (Valence’s own managed Prometheus) +- Grafana with Valence Dashboards (Valence’s own managed Grafana) +- Valence Operator + +If you need to modify these files you can use the make commands to recompile the manifests. (ie. 
`make valence` (you will need Kustomize `make install-kustomize` to install)) + +### Operating with Valence +There are four steps to operating a deployment with Valence. + +**1) Write a SLO for a deployment or group of deployments** +``` +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-microservices +spec: + selector: + slo: slo-microservices + objectives: + - type: HTTP + http: + latency: + # Valid values are 99, 95, 90, 75, 50. + percentile: 99 + responseTime: 100ms + # Omit this for autoscaling (ie. latency objective valid for all throughputs). + # This is throughput of queries per minute. + throughput: 500 +``` + +**2) Label the deployment with that SLO:** +``` +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-django + labels: + app: todo-backend-django + slo: slo-microservices +... + template: + metadata: + labels: + app: todo-backend-django + slo: slo-microservices +``` + +**3) Add the prometheus-proxy container to the deployment and modify the service to include prometheus.** + +Valence collects application metrics through a sidecar. If you’d prefer to collect metrics based on your ingress, load-balancer, envoy containers or otherwise, let the Valence team know. This will eventually be automated, all feedback is appreciated! + +Add the proxy container to your deployment and set the target address to where your application is normally serving. 
+ +``` + spec: + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.1.14 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_ADDRESS + value: "http://127.0.0.1:8000" # where your app is serving on + args: + - start +``` + +**4) Label the Service with the Valence proxy collection and replace your existing service with a Valence compatible service.** + +Change: +``` +apiVersion: v1 +kind: Service +metadata: + labels: + service: todo-backend-django + name: todo-backend-django +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8080 + selector: + app: todo-backend-django +``` +To: +``` +apiVersion: v1 +kind: Service +metadata: + name: todo-backend-django + labels: + service: todo-backend-django + # scrape prometheus metrics by valence + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 # this is the port prometheus-proxy is serving on + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-django +``` +## Using Valence + +Using Valence: +1. [Using Valence Annotations](#using-valence-annotations) +2. [Viewing Valence Recommendations and Changes](#viewing-valence-recommendations-and-changes) + +### Setting SLOs +Setting a SLO is done via writing the manifest, applying it, and registering a deployment using the label defined in the slo selector. + +Example: +``` +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-microservices +spec: + selector: + # The label you want to select on deployments. + slo: slo-microservices + objectives: + - type: HTTP + http: + latency: + # Percentile you'd like your response times to fall under. + # Valid values are 99, 95, 90, 75, 50. + percentile: 99 + # Response time you want your application to meet. + responseTime: 100ms + # The throughput objective you want the latency objective to be valid for. + # Omit this for throughput scaling (ie. 
latency objective valid for all throughputs). + # This is throughput of queries per minute. + throughput: 500 +``` + +## Using Valence Annotations +These annotations are optional: +``` + annotations: + # Whether to make changes automatically with recommendations. + valence.io/optimizer.configure: "true" + # Minimum amount of replicas to recommend. + valence.io/optimizer.min-replicas: "2" + # Minimum cpu requests to recommend. + valence.io/optimizer.min-cpu-requests: "100m" + # Minimum memory requests to recommend. + valence.io/optimizer.min-memory-requests: "500M" +``` + +## Viewing Valence Recommendations and Changes + +Open Grafana +``` +kubectl proxy svc/grafana -n valence-system +open http://localhost:8001/api/v1/namespaces/valence-system/services/grafana/proxy +``` + +Once you are in Grafana look at the Valence Recommendations dashboard. +You will see: +- Memory recommendations and resources +- CPU recommendations and resources +- HTTP Request Count in Queries per Second +- HTTP Latency at selected percentile +- Replica recommendations and current replicas + +## Example Workloads + +If you want to test out valence on example workloads we have provided example manifests that you can use. We generate synthetic workloads using our realistic workload generation tool Majin (see the workload.yaml files). See the `example/workloads` dir for more details. + +The workloads for testing are: +- todo-backend-django (this is a control workload not using valence) +- todo-backend-django-valence +- todo-backend-express +- todo-backend-golang +- todo-backend-java + +They will use two different SLO manifests: +- slo-microservices +- slo-webapps + +Want to get started quickly with example workloads? 
+- start on a fresh cluster such as docker-for-desktop +- if your cluster already has metrics-server remove `./metrics-server` from `./example/tooling/kustomization.yaml` and recompile `make example-workloads` +- `kubectl apply -f valence.yaml -f example-workloads.yaml` +- `kubectl proxy svc/grafana -n valence-system &` +- `open http://localhost:8001/api/v1/namespaces/valence-system/services/grafana/proxy` +- Recommendations for Replicas, Requests and Limits, and live changes to those should start coming in 5-20 minutes. diff --git a/example-workloads.yaml b/example-workloads.yaml new file mode 100644 index 0000000..7cc5062 --- /dev/null +++ b/example-workloads.yaml @@ -0,0 +1,1008 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metrics-server + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics-resizer + namespace: kube-system +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get +- apiGroups: + - extensions + resourceNames: + - kube-state-metrics + resources: + - deployments + verbs: + - get + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + - secrets + - configmaps + verbs: + - list + - watch +- apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- 
apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:metrics-server +rules: +- apiGroups: + - "" + resources: + - pods + - nodes + - nodes/stats + - namespaces + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - deployments + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: metrics-server-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:metrics-server +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:metrics-server +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: metrics-server:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: 
ServiceAccount + name: metrics-server + namespace: kube-system +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: "true" + labels: + app: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: metrics + - name: telemetry + port: 8081 + protocol: TCP + targetPort: telemetry + selector: + app: kube-state-metrics +--- +apiVersion: v1 +kind: Service +metadata: + labels: + kubernetes.io/name: metrics-server + name: metrics-server + namespace: kube-system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 443 + selector: + k8s-app: metrics-server +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: postgres + service: postgres + name: postgres +spec: + ports: + - name: "5432" + port: 5432 + targetPort: 5432 + selector: + app: postgres + service: postgres +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: todo-backend-django-valence + app.kubernetes.io/managed-by: valence + name: todo-backend-django-valence +spec: + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-django-valence + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: todo-backend-django + app.kubernetes.io/managed-by: valence + name: todo-backend-django +spec: + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-django + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: todo-backend-express + app.kubernetes.io/managed-by: valence + name: todo-backend-express +spec: + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-express + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: todo-backend-golang + 
app.kubernetes.io/managed-by: valence + name: todo-backend-golang +spec: + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-golang + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: todo-backend-java + app.kubernetes.io/managed-by: valence + name: todo-backend-java +spec: + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-java + type: NodePort +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + containers: + - image: gcr.io/google_containers/kube-state-metrics:v1.3.1 + name: kube-state-metrics + ports: + - containerPort: 8080 + name: metrics + resources: + limits: + cpu: 200m + memory: 500Mi + requests: + cpu: 100m + memory: 300Mi + serviceAccountName: kube-state-metrics +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + k8s-app: metrics-server + name: metrics-server + namespace: kube-system +spec: + selector: + matchLabels: + k8s-app: metrics-server + template: + metadata: + labels: + k8s-app: metrics-server + name: metrics-server + spec: + containers: + - command: + - /metrics-server + - --source=kubernetes.summary_api:'' + - --metric-resolution=5s + image: gcr.io/google_containers/metrics-server-amd64:v0.2.1 + imagePullPolicy: Always + name: metrics-server + resources: + limits: + cpu: 80m + memory: 200Mi + requests: + cpu: 40m + memory: 32Mi + serviceAccountName: metrics-server +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: postgres + service: postgres + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + strategy: + 
type: Recreate + template: + metadata: + creationTimestamp: null + labels: + app: postgres + service: postgres + spec: + containers: + - env: + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + - name: POSTGRES_USER + value: postgres + - name: POSTGRES_DB + value: todos + image: postgres:9.6-alpine + name: postgres + ports: + - containerPort: 5432 + name: http + protocol: TCP + resources: {} + volumeMounts: + - mountPath: /var/lib/postgresql/data/pgdata + name: postgres-data + subPath: postgres + restartPolicy: Always + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: postgres-data +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + annotations: + valence.io/optimizer.configure: "true" + labels: + app: todo-backend-django-valence + slo: slo-webapps + name: todo-backend-django-valence +spec: + replicas: 2 + revisionHistoryLimit: 1 + selector: + matchLabels: + app: todo-backend-django-valence + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-django-valence + prometheus: valence + slo: slo-webapps + spec: + containers: + - args: + - start + env: + - name: TARGET_ADDRESS + value: http://127.0.0.1:8000 + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + name: prometheus-proxy + resources: + requests: + cpu: 100m + - env: + - name: PORT + value: "8000" + image: manifoldco/todo-backend-django:latest + imagePullPolicy: IfNotPresent + name: todo-backend-django-valence + ports: + - containerPort: 8000 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: /todos + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + restartPolicy: Always +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: todo-backend-django + name: todo-backend-django +spec: + replicas: 2 + 
selector: + matchLabels: + app: todo-backend-django + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-django + spec: + containers: + - args: + - start + env: + - name: TARGET_ADDRESS + value: http://127.0.0.1:8000 + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + name: prometheus-proxy + - env: + - name: PORT + value: "8000" + image: manifoldco/todo-backend-django:latest + imagePullPolicy: IfNotPresent + name: todo-backend-django + ports: + - containerPort: 8000 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: /todos + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + restartPolicy: Always +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + annotations: + valence.io/optimizer.configure: "true" + labels: + app: todo-backend-express + slo: slo-webapps + name: todo-backend-express +spec: + replicas: 1 + revisionHistoryLimit: 1 + selector: + matchLabels: + app: todo-backend-express + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-express + slo: slo-webapps + spec: + containers: + - args: + - start + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + name: prometheus-proxy + resources: + requests: + cpu: 100m + - args: + - -c + - sleep 60; node node_modules/db-migrate/bin/db-migrate up; node server.js + command: + - /bin/sh + env: + - name: PORT + value: "8080" + - name: DATABASE_URL + value: postgres://postgres@postgres.default:5432 + image: manifoldco/todo-backend-express:latest + imagePullPolicy: IfNotPresent + name: todo-backend-express + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: / + port: 8080 + initialDelaySeconds: 90 + periodSeconds: 
60 + timeoutSeconds: 30 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + annotations: + valence.io/optimizer.configure: "true" + labels: + app: todo-backend-golang + prometheus: valence + slo: slo-microservices + name: todo-backend-golang +spec: + revisionHistoryLimit: 1 + selector: + matchLabels: + app: todo-backend-golang + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-golang + slo: slo-microservices + spec: + containers: + - args: + - start + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + name: prometheus-proxy + resources: + requests: + cpu: 100m + - env: + - name: PORT + value: "8080" + image: manifoldco/todo-backend-golang:latest + imagePullPolicy: IfNotPresent + name: todo-backend-golang + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: /todos + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + annotations: + valence.io/optimizer.configure: "true" + labels: + app: todo-backend-java + slo: slo-webapps + name: todo-backend-java +spec: + revisionHistoryLimit: 1 + selector: + matchLabels: + app: todo-backend-java + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-java + slo: slo-webapps + spec: + containers: + - args: + - start + env: + - name: TARGET_ADDRESS + value: http://127.0.0.1:80 + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + name: prometheus-proxy + resources: + requests: + cpu: 100m + - env: + - name: PORT + value: "80" + image: manifoldco/todo-backend-java:latest + imagePullPolicy: IfNotPresent + name: 
todo-backend-java + ports: + - containerPort: 80 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: /todos + port: 80 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M +--- +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.metrics.k8s.io +spec: + group: metrics.k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: metrics-server + namespace: kube-system + version: v1beta1 + versionPriority: 100 +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: todo-backend-django-valence + name: majin-todo-backend-django-valence +spec: + template: + metadata: + labels: + app: todo-backend-django-valence + spec: + containers: + - args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-django-valence.default/todos + image: valencenet/majin:0.2.0 + name: majin + restartPolicy: OnFailure +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: todo-backend-django + name: majin-todo-backend-django +spec: + template: + metadata: + labels: + app: todo-backend-django + spec: + containers: + - args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-django.default/todos + image: valencenet/majin:0.2.0 + name: majin + restartPolicy: OnFailure +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: todo-backend-express + name: majin-todo-backend-express +spec: + template: + metadata: + labels: + app: 
todo-backend-express + spec: + containers: + - args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-express.default/ + image: valencenet/majin:0.2.0 + name: majin + restartPolicy: OnFailure +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: todo-backend-golang + name: majin-todo-backend-golang +spec: + template: + metadata: + labels: + app: todo-backend-golang + spec: + containers: + - args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-golang.default/todos + image: valencenet/majin:0.2.0 + name: majin + restartPolicy: OnFailure +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: todo-backend-java + name: majin-todo-backend-java +spec: + template: + metadata: + labels: + app: todo-backend-java + spec: + containers: + - args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-java.default/todos + image: valencenet/majin:0.2.0 + name: majin + restartPolicy: OnFailure +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + app: postgres + service: postgres-data + name: postgres-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +status: {} +--- +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-microservices +spec: + objectives: + - http: + latency: + percentile: 95 + 
responseTime: 500ms + throughput: 500 + type: HTTP + selector: + slo: slo-microservices +--- +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-webapps +spec: + objectives: + - http: + latency: + percentile: 99 + responseTime: 100ms + type: HTTP + selector: + slo: slo-webapps diff --git a/example/kustomization.yaml b/example/kustomization.yaml new file mode 100644 index 0000000..2f9350b --- /dev/null +++ b/example/kustomization.yaml @@ -0,0 +1,3 @@ +bases: + - ./tooling + - ./workloads \ No newline at end of file diff --git a/example/tooling/kube-state-metrics/deployment.yaml b/example/tooling/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..ea8b536 --- /dev/null +++ b/example/tooling/kube-state-metrics/deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: kube-state-metrics +spec: + replicas: 1 + template: + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: gcr.io/google_containers/kube-state-metrics:v1.3.1 + ports: + - name: metrics + containerPort: 8080 + resources: + requests: + memory: 300Mi + cpu: 100m + limits: + memory: 500Mi + cpu: 200m diff --git a/example/tooling/kube-state-metrics/kustomization.yaml b/example/tooling/kube-state-metrics/kustomization.yaml new file mode 100644 index 0000000..dadace9 --- /dev/null +++ b/example/tooling/kube-state-metrics/kustomization.yaml @@ -0,0 +1,7 @@ +commonLabels: + app: kube-state-metrics +namespace: kube-system +resources: + - rbac.yaml + - deployment.yaml + - service.yaml diff --git a/example/tooling/kube-state-metrics/rbac.yaml b/example/tooling/kube-state-metrics/rbac.yaml new file mode 100644 index 0000000..405bc3f --- /dev/null +++ b/example/tooling/kube-state-metrics/rbac.yaml @@ -0,0 +1,91 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: 
kube-state-metrics-resizer +rules: +- apiGroups: [""] + resources: + - pods + verbs: ["get"] +- apiGroups: ["extensions"] + resources: + - deployments + resourceNames: ["kube-state-metrics"] + verbs: ["get", "update"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: [""] + resources: + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + - secrets + - configmaps + verbs: ["list", "watch"] +- apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] +- apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: +- kind: ServiceAccount + name: kube-state-metrics \ No newline at end of file diff --git a/example/tooling/kube-state-metrics/service.yaml b/example/tooling/kube-state-metrics/service.yaml new file mode 100644 index 0000000..a85c6d7 --- /dev/null +++ b/example/tooling/kube-state-metrics/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: kube-state-metrics +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + - name: telemetry 
+ port: 8081 + targetPort: telemetry + protocol: TCP + selector: + app: kube-state-metrics diff --git a/example/tooling/kustomization.yaml b/example/tooling/kustomization.yaml new file mode 100644 index 0000000..6d0a849 --- /dev/null +++ b/example/tooling/kustomization.yaml @@ -0,0 +1,3 @@ +bases: + - ./kube-state-metrics + - ./metrics-server diff --git a/example/tooling/metrics-server/api-service.yaml b/example/tooling/metrics-server/api-service.yaml new file mode 100644 index 0000000..08b0530 --- /dev/null +++ b/example/tooling/metrics-server/api-service.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.metrics.k8s.io +spec: + service: + name: metrics-server + namespace: kube-system + group: metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 diff --git a/example/tooling/metrics-server/auth-delegator.yaml b/example/tooling/metrics-server/auth-delegator.yaml new file mode 100644 index 0000000..e3442c5 --- /dev/null +++ b/example/tooling/metrics-server/auth-delegator.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: metrics-server:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system diff --git a/example/tooling/metrics-server/auth-reader.yaml b/example/tooling/metrics-server/auth-reader.yaml new file mode 100644 index 0000000..f0616e1 --- /dev/null +++ b/example/tooling/metrics-server/auth-reader.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: metrics-server-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: metrics-server 
+ namespace: kube-system diff --git a/example/tooling/metrics-server/deployment.yaml b/example/tooling/metrics-server/deployment.yaml new file mode 100644 index 0000000..81a12e3 --- /dev/null +++ b/example/tooling/metrics-server/deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metrics-server + namespace: kube-system + +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: metrics-server + namespace: kube-system + labels: + k8s-app: metrics-server +spec: + selector: + matchLabels: + k8s-app: metrics-server + template: + metadata: + name: metrics-server + labels: + k8s-app: metrics-server + spec: + serviceAccountName: metrics-server + containers: + - name: metrics-server + image: gcr.io/google_containers/metrics-server-amd64:v0.0.0 + imagePullPolicy: Always + command: + - /metrics-server + - --source=kubernetes.summary_api:'' + - --metric-resolution=5s + resources: + requests: + cpu: 40m + memory: 32Mi + limits: + cpu: 80m + memory: 200Mi diff --git a/example/tooling/metrics-server/kustomization.yaml b/example/tooling/metrics-server/kustomization.yaml new file mode 100644 index 0000000..11794bd --- /dev/null +++ b/example/tooling/metrics-server/kustomization.yaml @@ -0,0 +1,10 @@ +resources: + - api-service.yaml + - auth-delegator.yaml + - auth-reader.yaml + - deployment.yaml + - resource-reader.yaml + - service.yaml +imageTags: + - name: gcr.io/google_containers/metrics-server-amd64 + newTag: v0.2.1 diff --git a/example/tooling/metrics-server/resource-reader.yaml b/example/tooling/metrics-server/resource-reader.yaml new file mode 100644 index 0000000..34294a3 --- /dev/null +++ b/example/tooling/metrics-server/resource-reader.yaml @@ -0,0 +1,38 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:metrics-server +rules: +- apiGroups: + - "" + resources: + - pods + - nodes + - nodes/stats + - namespaces + verbs: + - get + - list + - watch +- apiGroups: + - "extensions" 
+ resources: + - deployments + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:metrics-server +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:metrics-server +subjects: +- kind: ServiceAccount + name: metrics-server + namespace: kube-system diff --git a/example/tooling/metrics-server/service.yaml b/example/tooling/metrics-server/service.yaml new file mode 100644 index 0000000..5c06f53 --- /dev/null +++ b/example/tooling/metrics-server/service.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: metrics-server + namespace: kube-system + labels: + kubernetes.io/name: "metrics-server" +spec: + selector: + k8s-app: metrics-server + ports: + - port: 443 + protocol: TCP + targetPort: 443 diff --git a/example/workloads/kustomization.yaml b/example/workloads/kustomization.yaml new file mode 100644 index 0000000..e25c4ed --- /dev/null +++ b/example/workloads/kustomization.yaml @@ -0,0 +1,10 @@ +bases: + - ./todo-backend-django + - ./todo-backend-django-valence + - ./todo-backend-express + - ./postgres + - ./todo-backend-golang + - ./todo-backend-java +resources: + - ./slo-microservices.yaml + - ./slo-webapps.yaml \ No newline at end of file diff --git a/example/workloads/postgres/deployment.yaml b/example/workloads/postgres/deployment.yaml new file mode 100644 index 0000000..a272563 --- /dev/null +++ b/example/workloads/postgres/deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: postgres + service: postgres + name: postgres +spec: + replicas: 1 + strategy: + type: Recreate + template: + metadata: + creationTimestamp: null + labels: + service: postgres + spec: + containers: + - env: + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + - name: POSTGRES_USER + value: postgres + - name: POSTGRES_DB + value: todos + image: postgres:9.6-alpine + name: postgres + ports: + - 
containerPort: 5432 + name: http + protocol: TCP + resources: {} + volumeMounts: + - mountPath: /var/lib/postgresql/data/pgdata + name: postgres-data + subPath: postgres + restartPolicy: Always + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: postgres-data diff --git a/example/workloads/postgres/kustomization.yaml b/example/workloads/postgres/kustomization.yaml new file mode 100644 index 0000000..7a45c9a --- /dev/null +++ b/example/workloads/postgres/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: postgres +resources: + - deployment.yaml + - service.yaml + - persistantVolume.yaml \ No newline at end of file diff --git a/example/workloads/postgres/persistantVolume.yaml b/example/workloads/postgres/persistantVolume.yaml new file mode 100644 index 0000000..e2e33e1 --- /dev/null +++ b/example/workloads/postgres/persistantVolume.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + service: postgres-data + name: postgres-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +status: {} diff --git a/example/workloads/postgres/service.yaml b/example/workloads/postgres/service.yaml new file mode 100644 index 0000000..f964e38 --- /dev/null +++ b/example/workloads/postgres/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + service: postgres + name: postgres +spec: + ports: + - name: "5432" + port: 5432 + targetPort: 5432 + selector: + service: postgres diff --git a/example/workloads/slo-microservices.yaml b/example/workloads/slo-microservices.yaml new file mode 100644 index 0000000..5619ad7 --- /dev/null +++ b/example/workloads/slo-microservices.yaml @@ -0,0 +1,14 @@ +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-microservices +spec: + selector: + slo: slo-microservices + objectives: + - type: HTTP + http: + latency: + percentile: 95 + responseTime: 500ms + throughput: 500 diff --git 
a/example/workloads/slo-webapps.yaml b/example/workloads/slo-webapps.yaml new file mode 100644 index 0000000..b3375df --- /dev/null +++ b/example/workloads/slo-webapps.yaml @@ -0,0 +1,13 @@ +apiVersion: optimizer.valence.io/v1alpha1 +kind: ServiceLevelObjective +metadata: + name: slo-webapps +spec: + selector: + slo: slo-webapps + objectives: + - type: HTTP + http: + latency: + percentile: 99 + responseTime: 100ms diff --git a/example/workloads/todo-backend-django-valence/deployment.yaml b/example/workloads/todo-backend-django-valence/deployment.yaml new file mode 100644 index 0000000..1493f6a --- /dev/null +++ b/example/workloads/todo-backend-django-valence/deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-django-valence + labels: + app: todo-backend-django-valence + slo: slo-webapps + annotations: + valence.io/optimizer.configure: "true" +spec: + revisionHistoryLimit: 1 + replicas: 2 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-django-valence + slo: slo-webapps + prometheus: valence + spec: + restartPolicy: Always + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_ADDRESS + value: "http://127.0.0.1:8000" + args: + - start + resources: + requests: + cpu: 100m + - image: manifoldco/todo-backend-django:latest + imagePullPolicy: IfNotPresent + name: todo-backend-django-valence + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + env: + - name: PORT + value: "8000" + ports: + - containerPort: 8000 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /todos + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 30 \ No newline at end of file diff --git a/example/workloads/todo-backend-django-valence/kustomization.yaml 
b/example/workloads/todo-backend-django-valence/kustomization.yaml new file mode 100644 index 0000000..df79bfa --- /dev/null +++ b/example/workloads/todo-backend-django-valence/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: todo-backend-django-valence +resources: + - deployment.yaml + - service.yaml + - workload.yaml \ No newline at end of file diff --git a/example/workloads/todo-backend-django-valence/service.yaml b/example/workloads/todo-backend-django-valence/service.yaml new file mode 100644 index 0000000..d889538 --- /dev/null +++ b/example/workloads/todo-backend-django-valence/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: todo-backend-django-valence + labels: + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-django-valence \ No newline at end of file diff --git a/example/workloads/todo-backend-django-valence/workload.yaml b/example/workloads/todo-backend-django-valence/workload.yaml new file mode 100644 index 0000000..4d28d8d --- /dev/null +++ b/example/workloads/todo-backend-django-valence/workload.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: majin-todo-backend-django-valence +spec: + template: + spec: + containers: + - name: majin + image: valencenet/majin:0.2.0 + args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-django-valence.default/todos + restartPolicy: OnFailure diff --git a/example/workloads/todo-backend-django/deployment.yaml b/example/workloads/todo-backend-django/deployment.yaml new file mode 100644 index 0000000..3117470 --- /dev/null +++ b/example/workloads/todo-backend-django/deployment.yaml 
@@ -0,0 +1,52 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-django + labels: + app: todo-backend-django +spec: + replicas: 2 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-django + spec: + restartPolicy: Always + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_ADDRESS + value: "http://127.0.0.1:8000" + args: + - start + - image: manifoldco/todo-backend-django:latest + imagePullPolicy: IfNotPresent + name: todo-backend-django + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + env: + - name: PORT + value: "8000" + ports: + - containerPort: 8000 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /todos + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 30 \ No newline at end of file diff --git a/example/workloads/todo-backend-django/kustomization.yaml b/example/workloads/todo-backend-django/kustomization.yaml new file mode 100644 index 0000000..514bf1d --- /dev/null +++ b/example/workloads/todo-backend-django/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: todo-backend-django +resources: + - deployment.yaml + - service.yaml + - workload.yaml \ No newline at end of file diff --git a/example/workloads/todo-backend-django/service.yaml b/example/workloads/todo-backend-django/service.yaml new file mode 100644 index 0000000..034d3c9 --- /dev/null +++ b/example/workloads/todo-backend-django/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: todo-backend-django + labels: + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-django \ No newline at end of file diff --git 
a/example/workloads/todo-backend-django/workload.yaml b/example/workloads/todo-backend-django/workload.yaml new file mode 100644 index 0000000..779f7f0 --- /dev/null +++ b/example/workloads/todo-backend-django/workload.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: majin-todo-backend-django +spec: + template: + spec: + containers: + - name: majin + image: valencenet/majin:0.2.0 + args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-django.default/todos + restartPolicy: OnFailure diff --git a/example/workloads/todo-backend-express/deployment.yaml b/example/workloads/todo-backend-express/deployment.yaml new file mode 100644 index 0000000..1868f2a --- /dev/null +++ b/example/workloads/todo-backend-express/deployment.yaml @@ -0,0 +1,60 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-express + labels: + app: todo-backend-express + slo: slo-webapps + annotations: + valence.io/optimizer.configure: "true" +spec: + revisionHistoryLimit: 1 + replicas: 1 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-express + slo: slo-webapps + spec: + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + args: + - start + resources: + requests: + cpu: 100m + - image: manifoldco/todo-backend-express:latest + imagePullPolicy: IfNotPresent + name: todo-backend-express + command: ["/bin/sh"] + args: ["-c", "sleep 60; node node_modules/db-migrate/bin/db-migrate up; node server.js"] + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + env: + - name: PORT + value: "8080" + - name: DATABASE_URL + value: postgres://postgres@postgres.default:5432 
+ ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 90 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 30 diff --git a/example/workloads/todo-backend-express/kustomization.yaml b/example/workloads/todo-backend-express/kustomization.yaml new file mode 100644 index 0000000..327ba93 --- /dev/null +++ b/example/workloads/todo-backend-express/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: todo-backend-express +resources: + - deployment.yaml + - service.yaml + - workload.yaml \ No newline at end of file diff --git a/example/workloads/todo-backend-express/service.yaml b/example/workloads/todo-backend-express/service.yaml new file mode 100644 index 0000000..e1e3661 --- /dev/null +++ b/example/workloads/todo-backend-express/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: todo-backend-express + labels: + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-express \ No newline at end of file diff --git a/example/workloads/todo-backend-express/workload.yaml b/example/workloads/todo-backend-express/workload.yaml new file mode 100644 index 0000000..d1d1d06 --- /dev/null +++ b/example/workloads/todo-backend-express/workload.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: majin-todo-backend-express +spec: + template: + spec: + containers: + - name: majin + image: valencenet/majin:0.2.0 + args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-express.default/ + restartPolicy: OnFailure diff --git 
a/example/workloads/todo-backend-golang/deployment.yaml b/example/workloads/todo-backend-golang/deployment.yaml new file mode 100644 index 0000000..41b5d05 --- /dev/null +++ b/example/workloads/todo-backend-golang/deployment.yaml @@ -0,0 +1,56 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-golang + labels: + app: todo-backend-golang + slo: slo-microservices + prometheus: valence + annotations: + valence.io/optimizer.configure: "true" +spec: + revisionHistoryLimit: 1 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-golang + slo: slo-microservices + spec: + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + args: + - start + resources: + requests: + cpu: 100m + - image: manifoldco/todo-backend-golang:latest + imagePullPolicy: IfNotPresent + name: todo-backend-golang + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + env: + - name: PORT + value: "8080" + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /todos + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 30 \ No newline at end of file diff --git a/example/workloads/todo-backend-golang/kustomization.yaml b/example/workloads/todo-backend-golang/kustomization.yaml new file mode 100644 index 0000000..f6b7d90 --- /dev/null +++ b/example/workloads/todo-backend-golang/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: todo-backend-golang +resources: + - deployment.yaml + - service.yaml + - workload.yaml \ No newline at end of file diff --git a/example/workloads/todo-backend-golang/service.yaml b/example/workloads/todo-backend-golang/service.yaml new file mode 100644 index 0000000..6eb4f3a --- /dev/null +++ b/example/workloads/todo-backend-golang/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service 
+metadata: + name: todo-backend-golang + labels: + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-golang \ No newline at end of file diff --git a/example/workloads/todo-backend-golang/workload.yaml b/example/workloads/todo-backend-golang/workload.yaml new file mode 100644 index 0000000..996a667 --- /dev/null +++ b/example/workloads/todo-backend-golang/workload.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: majin-todo-backend-golang +spec: + template: + spec: + containers: + - name: majin + image: valencenet/majin:0.2.0 + args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: http://todo-backend-golang.default/todos + restartPolicy: OnFailure diff --git a/example/workloads/todo-backend-java/deployment.yaml b/example/workloads/todo-backend-java/deployment.yaml new file mode 100644 index 0000000..3f68c19 --- /dev/null +++ b/example/workloads/todo-backend-java/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: todo-backend-java + labels: + app: todo-backend-java + slo: slo-webapps + annotations: + valence.io/optimizer.configure: "true" +spec: + revisionHistoryLimit: 1 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: todo-backend-java + slo: slo-webapps + spec: + containers: + - name: prometheus-proxy + image: valencenet/prometheus-proxy:0.2.0 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_ADDRESS + value: "http://127.0.0.1:80" + args: + - start + resources: + requests: + cpu: 100m + - image: manifoldco/todo-backend-java:latest + imagePullPolicy: IfNotPresent + name: 
todo-backend-java + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + env: + - name: PORT + value: "80" + ports: + - containerPort: 80 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /todos + port: 80 + initialDelaySeconds: 5 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 30 diff --git a/example/workloads/todo-backend-java/kustomization.yaml b/example/workloads/todo-backend-java/kustomization.yaml new file mode 100644 index 0000000..73a4ec0 --- /dev/null +++ b/example/workloads/todo-backend-java/kustomization.yaml @@ -0,0 +1,6 @@ +commonLabels: + app: todo-backend-java +resources: + - deployment.yaml + - service.yaml + - workload.yaml \ No newline at end of file diff --git a/example/workloads/todo-backend-java/service.yaml b/example/workloads/todo-backend-java/service.yaml new file mode 100644 index 0000000..4065629 --- /dev/null +++ b/example/workloads/todo-backend-java/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: todo-backend-java + labels: + app.kubernetes.io/managed-by: valence +spec: + type: NodePort + ports: + - name: headless + port: 80 + targetPort: 8081 + - name: prometheus + port: 8181 + targetPort: 8181 + selector: + app: todo-backend-java \ No newline at end of file diff --git a/example/workloads/todo-backend-java/workload.yaml b/example/workloads/todo-backend-java/workload.yaml new file mode 100644 index 0000000..2465f87 --- /dev/null +++ b/example/workloads/todo-backend-java/workload.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: majin-todo-backend-java +spec: + template: + spec: + containers: + - name: majin + image: valencenet/majin:0.2.0 + args: + - attack + - --random + - "true" + - --base-load + - "50" + - --period + - "3600" + - --noise + - "100" + - --duration + - "3600" + - --burst-frequency + - "100" + - --burst-index + - "10" + - --burst-duration + - "600" + env: + - name: TARGET + value: 
http://todo-backend-java.default/todos + restartPolicy: OnFailure diff --git a/makefile b/makefile new file mode 100644 index 0000000..8e41b10 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +install-kustomize: + go get sigs.k8s.io/kustomize + +valence: + kustomize build ./manifests > valence.yaml + +example-workloads: + kustomize build ./example > example-workloads.yaml diff --git a/manifests/kustomization.yaml b/manifests/kustomization.yaml new file mode 100644 index 0000000..67d8ccc --- /dev/null +++ b/manifests/kustomization.yaml @@ -0,0 +1,2 @@ +bases: + - ./valence diff --git a/manifests/valence/grafana/configMap.yaml b/manifests/valence/grafana/configMap.yaml new file mode 100644 index 0000000..4cd9350 --- /dev/null +++ b/manifests/valence/grafana/configMap.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources +data: + prometheus.yaml: | + apiVersion: 1 + datasources: + - name: DS_PROM_VALENCE + type: prometheus + access: proxy + url: http://prometheus-valence.valence-system:9090 + editable: false + version: 1 + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-providers +data: + default.yaml: | + apiVersion: 1 + providers: + - name: 'default' + org_id: 1 + folder: '' + type: 'file' + options: + path: '/var/lib/grafana/dashboards' diff --git a/manifests/valence/grafana/dashboard-valence.yaml b/manifests/valence/grafana/dashboard-valence.yaml new file mode 100644 index 0000000..ac4b044 --- /dev/null +++ b/manifests/valence/grafana/dashboard-valence.yaml @@ -0,0 +1,632 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards-valence +data: + valence.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 1, + "iteration": 1551793537194, + 
"links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "-- Mixed --", + "description": "Recommendations of Memory requests and limits to set for $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(container_memory_working_set_bytes{container_name=\"$deployment\"})", + "format": "time_series", + "instant": false, + "interval": "5s", + "intervalFactor": 1, + "legendFormat": "Observed Memory Value", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_memory_requests{container_name=\"$deployment\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended Memory Request", + "refId": "B" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_memory_limits{container_name=\"$deployment\"})", + "format": "time_series", + "hide": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended Memory Limit", + "refId": "C" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_requests_memory_bytes{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "D" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_limits_memory_bytes{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "Limits", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory recommendations: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "-- Mixed --", + "description": "Recommendations of CPU requests and limits to set for $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 4, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": "avg(rate(container_cpu_usage_seconds_total{container_name=\"$deployment\"}[5m]))", + "format": "time_series", + "instant": false, + "interval": "5s", + "intervalFactor": 1, + "legendFormat": "Observed CPU Value", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_cpu_requests{container_name=\"$deployment\"} / 1000)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended CPU Request", + "refId": "B" + }, + { + "datasource": "DS_PROM_VALENCE", + 
"expr": "max(valence_recommendations_cpu_limits{container_name=\"$deployment\"} / 1000)", + "format": "time_series", + "hide": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended CPU Limit", + "refId": "C" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_requests_cpu_cores{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "D" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_limits_cpu_cores{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limits", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cpu recommendations: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DS_PROM_VALENCE", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 6, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(increase(promproxy_metric_handler_detailed_requests_count{service=\"$deployment\"}[1m])) / 60", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "HTTP Queries Per Second", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Count: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "QpS", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DS_PROM_VALENCE", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 7, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(rate(promproxy_metric_handler_detailed_requests{service=\"$deployment\", quantile=\"$LatencyPercentile\", code!=\"502\"}[5m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "HTTP Request Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Latency: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "-- Mixed --", + "description": "Number of replicas observed and recommended for: $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 9, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": "valence_recommendations_replicas{name=\"$deployment\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Recommended Replicas", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Current Replicas", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "todo-backend-java", + "value": "todo-backend-java" + }, + "datasource": "DS_PROM_VALENCE", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment", + "options": [], + "query": "label_values(promproxy_metric_handler_detailed_requests, service)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "0.95", + "value": "0.95" + }, + "datasource": "DS_PROM_VALENCE", + "hide": 0, + "includeAll": false, + "label": "Latency Percentile", + "multi": false, + "name": "LatencyPercentile", + "options": [], + "query": "label_values(promproxy_metric_handler_detailed_requests, quantile)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "1s", + "5s", + "30s", + "1m" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Valence", + "uid": "9ri9X0Qiz", + "version": 1 + } \ No newline at end of file diff --git a/manifests/valence/grafana/deployment.yaml b/manifests/valence/grafana/deployment.yaml new file mode 100644 index 0000000..61237ea --- /dev/null +++ b/manifests/valence/grafana/deployment.yaml @@ -0,0 +1,52 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + 
template: + spec: + containers: + - image: grafana/grafana:0.0.0 + name: grafana + ports: + - containerPort: 3000 + protocol: TCP + env: + - name: GF_SERVER_ROOT_URL + value: /api/v1/namespaces/valence-system/services/grafana/proxy/ + resources: + limits: + cpu: 500m + memory: 2500Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - mountPath: /var/lib/grafana + name: data + - mountPath: /etc/grafana/provisioning/dashboards + name: providers + - mountPath: /etc/grafana/provisioning/datasources + name: datasources + - mountPath: /var/lib/grafana/dashboards/capacity-planning.json + name: dashboards-valence + subPath: valence.json + restartPolicy: Always + volumes: + - emptyDir: {} + name: data + - name: providers + configMap: + name: grafana-providers + - name: datasources + configMap: + name: grafana-datasources + - name: dashboards-valence + configMap: + name: grafana-dashboards-valence diff --git a/manifests/valence/grafana/kustomization.yaml b/manifests/valence/grafana/kustomization.yaml new file mode 100644 index 0000000..6cf4cac --- /dev/null +++ b/manifests/valence/grafana/kustomization.yaml @@ -0,0 +1,10 @@ +commonLabels: + app.kubernetes.io/name: grafana +resources: + - service.yaml + - deployment.yaml + - configMap.yaml + - dashboard-valence.yaml +imageTags: + - name: grafana/grafana + newTag: 5.2.4 diff --git a/manifests/valence/grafana/service.yaml b/manifests/valence/grafana/service.yaml new file mode 100644 index 0000000..f72a961 --- /dev/null +++ b/manifests/valence/grafana/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - port: 3000 + protocol: TCP + targetPort: 3000 + type: NodePort diff --git a/manifests/valence/kustomization.yaml b/manifests/valence/kustomization.yaml new file mode 100644 index 0000000..c7026cb --- /dev/null +++ b/manifests/valence/kustomization.yaml @@ -0,0 +1,8 @@ +commonLabels: + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 +namespace: 
valence-system +bases: + - ./grafana + - ./prometheus + - ./operator diff --git a/manifests/valence/operator/crds.yaml b/manifests/valence/operator/crds.yaml new file mode 100644 index 0000000..242f355 --- /dev/null +++ b/manifests/valence/operator/crds.yaml @@ -0,0 +1,11 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: servicelevelobjectives.optimizer.valence.io +spec: + group: optimizer.valence.io + version: v1alpha1 + scope: Namespaced + names: + plural: servicelevelobjectives + kind: ServiceLevelObjective diff --git a/manifests/valence/operator/deployment.yaml b/manifests/valence/operator/deployment.yaml new file mode 100644 index 0000000..7bf7f4a --- /dev/null +++ b/manifests/valence/operator/deployment.yaml @@ -0,0 +1,27 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: optimization-operator +spec: + replicas: 1 + template: + spec: + serviceAccountName: valence-operator + containers: + - name: optimization-operator + image: valencenet/valence:0.0.0 + imagePullPolicy: Always + args: + - operator + env: + - name: MIN_SAMPLE_SIZE + value: "20" + - name: PROMETHEUS_URL + value: http://prometheus-valence.valence-system.svc:9090 + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M diff --git a/manifests/valence/operator/kustomization.yaml b/manifests/valence/operator/kustomization.yaml new file mode 100644 index 0000000..d63c9a6 --- /dev/null +++ b/manifests/valence/operator/kustomization.yaml @@ -0,0 +1,12 @@ +commonLabels: + app.kubernetes.io/name: valence + app.kubernetes.io/component: operator +resources: + - crds.yaml + - rbac.yaml + - deployment.yaml + - service.yaml + - namespace.yaml +imageTags: + - name: valencenet/valence + newTag: 0.2.0 diff --git a/manifests/valence/operator/namespace.yaml b/manifests/valence/operator/namespace.yaml new file mode 100644 index 0000000..2214958 --- /dev/null +++ b/manifests/valence/operator/namespace.yaml @@ -0,0 +1,4 
@@ +apiVersion: v1 +kind: Namespace +metadata: + name: valence-system diff --git a/manifests/valence/operator/rbac.yaml b/manifests/valence/operator/rbac.yaml new file mode 100644 index 0000000..e0a9f5b --- /dev/null +++ b/manifests/valence/operator/rbac.yaml @@ -0,0 +1,46 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: valence:optimization-operator +rules: + - apiGroups: ["extensions"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["metrics.k8s.io"] + resources: ["pods"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: + - pods + - events + - nodes + verbs: ["get", "list", "watch"] + - apiGroups: ["optimizer.valence.io"] + resources: ["servicelevelobjectives"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["*"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: valence:optimization-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: valence:optimization-operator +subjects: + - name: valence-operator + namespace: valence-system + kind: ServiceAccount + +--- + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: valence-operator + namespace: valence-system diff --git a/manifests/valence/operator/service.yaml b/manifests/valence/operator/service.yaml new file mode 100644 index 0000000..8186f92 --- /dev/null +++ b/manifests/valence/operator/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: optimization-operator +spec: + type: NodePort + ports: + - name: prometheus + port: 8080 + targetPort: 8080 + selector: + app.kubernetes.io/name: valence diff --git a/manifests/valence/prometheus/config-map.yaml b/manifests/valence/prometheus/config-map.yaml new file mode 100644 index 0000000..a5b4c27 --- /dev/null +++ b/manifests/valence/prometheus/config-map.yaml @@ -0,0 
+1,310 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-valence +data: + prometheus.yaml: | + global: + scrape_interval: 10s + scrape_timeout: 10s + evaluation_interval: 10s + rule_files: + - "/etc/prometheus-rules/*.rules" + scrape_configs: + - job_name: kube-state-metrics/0 + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: kube-state-metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: (.+) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: metrics + action: replace + - job_name: kube-state-metrics/1 + scrape_interval: 1m + scrape_timeout: 10s + 
metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - valence-system + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: kube-state-metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: telemetry + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: (.+) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: telemetry + action: replace + - job_name: kubernetes-nodes + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - api_server: null + role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + relabel_configs: + 
- separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + action: replace + - job_name: kubernetes-nodes-cadvisor + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - api_server: null + role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + - job_name: prometheus-valence + scrape_interval: 5s + scrape_timeout: 5s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + separator: ; + regex: valence + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: prometheus + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: 
replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: prometheus + action: replace + - job_name: valence + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - valence-system + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_component] + separator: ; + regex: operator + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: prometheus + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: prometheus + 
action: replace \ No newline at end of file diff --git a/manifests/valence/prometheus/kustomization.yaml b/manifests/valence/prometheus/kustomization.yaml new file mode 100644 index 0000000..72ed1a4 --- /dev/null +++ b/manifests/valence/prometheus/kustomization.yaml @@ -0,0 +1,8 @@ +commonLabels: + app.kubernetes.io/name: prometheus-valence +resources: + - config-map.yaml + - prometheus-service-accounts.yaml + - service.yaml + - stateful-set.yaml + diff --git a/manifests/valence/prometheus/prometheus-service-accounts.yaml b/manifests/valence/prometheus/prometheus-service-accounts.yaml new file mode 100644 index 0000000..596b2cc --- /dev/null +++ b/manifests/valence/prometheus/prometheus-service-accounts.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-valence + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus-valence +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus-valence +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-valence +subjects: +- kind: ServiceAccount + name: prometheus-valence + namespace: valence-system + diff --git a/manifests/valence/prometheus/service.yaml b/manifests/valence/prometheus/service.yaml new file mode 100644 index 0000000..596953c --- /dev/null +++ b/manifests/valence/prometheus/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-valence +spec: + type: NodePort + ports: + - name: web + port: 9090 + protocol: TCP + targetPort: web + selector: + app.kubernetes.io/name: prometheus-valence diff --git a/manifests/valence/prometheus/stateful-set.yaml 
b/manifests/valence/prometheus/stateful-set.yaml new file mode 100644 index 0000000..53abfe4 --- /dev/null +++ b/manifests/valence/prometheus/stateful-set.yaml @@ -0,0 +1,50 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus-valence + namespace: valence-system +spec: + volumeClaimTemplates: [] + replicas: 1 + serviceName: prometheus-valence + template: + spec: + serviceAccountName: prometheus-valence + containers: + - name: prometheus + args: + - --web.console.templates=/etc/prometheus/consoles + - --web.console.libraries=/etc/prometheus/console_libraries + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + - --storage.tsdb.no-lockfile + - --web.route-prefix=/ + - --storage.tsdb.retention.time=6h + image: quay.io/prometheus/prometheus:v2.7.1 + ports: + - name: web + containerPort: 9090 + resources: + limits: + cpu: 250m + memory: 2.5Gi + requests: + cpu: 100m + memory: 2Gi + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: rules-volume + mountPath: /etc/prometheus-rules + - mountPath: /prometheus + name: prometheus-valence-db + volumes: + - name: config-volume + configMap: + name: prometheus-valence + - name: rules-volume + emptyDir: {} + - emptyDir: {} + name: prometheus-valence-db + diff --git a/valence.yaml b/valence.yaml new file mode 100644 index 0000000..9aedb75 --- /dev/null +++ b/valence.yaml @@ -0,0 +1,1403 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: valence-system +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: servicelevelobjectives.optimizer.valence.io +spec: + group: 
optimizer.valence.io + names: + kind: ServiceLevelObjective + plural: servicelevelobjectives + scope: Namespaced + version: v1alpha1 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence + namespace: valence-system +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: valence-operator + namespace: valence-system +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: valence:optimization-operator +rules: +- apiGroups: + - extensions + resources: + - deployments + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - metrics.k8s.io + resources: + - pods + verbs: + - get + - list +- apiGroups: + - "" + resources: + - pods + - events + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - optimizer.valence.io + resources: + - servicelevelobjectives + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 
+kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-valence +subjects: +- kind: ServiceAccount + name: prometheus-valence + namespace: valence-system +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: valence:optimization-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: valence:optimization-operator +subjects: +- kind: ServiceAccount + name: valence-operator + namespace: valence-system +--- +apiVersion: v1 +data: + valence.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 1, + "iteration": 1551793537194, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "-- Mixed --", + "description": "Recommendations of Memory requests and limits to set for $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": 
"max(container_memory_working_set_bytes{container_name=\"$deployment\"})", + "format": "time_series", + "instant": false, + "interval": "5s", + "intervalFactor": 1, + "legendFormat": "Observed Memory Value", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_memory_requests{container_name=\"$deployment\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended Memory Request", + "refId": "B" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_memory_limits{container_name=\"$deployment\"})", + "format": "time_series", + "hide": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended Memory Limit", + "refId": "C" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_requests_memory_bytes{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "D" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_limits_memory_bytes{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limits", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory recommendations: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "-- Mixed --", + "description": "Recommendations of CPU requests and limits to set for $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 4, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": "avg(rate(container_cpu_usage_seconds_total{container_name=\"$deployment\"}[5m]))", + "format": "time_series", + "instant": false, + "interval": "5s", + "intervalFactor": 1, + "legendFormat": "Observed CPU Value", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_cpu_requests{container_name=\"$deployment\"} / 1000)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended CPU Request", + "refId": "B" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(valence_recommendations_cpu_limits{container_name=\"$deployment\"} / 1000)", + "format": "time_series", + "hide": false, + "interval": "60s", + "intervalFactor": 1, + "legendFormat": "Recommended CPU Limit", + "refId": "C" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_requests_cpu_cores{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "D" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "max(kube_pod_container_resource_limits_cpu_cores{container=\"$deployment\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limits", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + 
"title": "Cpu recommendations: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DS_PROM_VALENCE", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 6, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(promproxy_metric_handler_detailed_requests_count{service=\"$deployment\"}[1m])) / 60", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "HTTP Queries Per Second", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Count: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "QpS", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + 
"align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DS_PROM_VALENCE", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 7, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(rate(promproxy_metric_handler_detailed_requests{service=\"$deployment\", quantile=\"$LatencyPercentile\", code!=\"502\"}[5m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "HTTP Request Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Latency: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "-- Mixed --", + "description": "Number of replicas observed and recommended for: $deployment", + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 9, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "DS_PROM_VALENCE", + "expr": "valence_recommendations_replicas{name=\"$deployment\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Recommended Replicas", + "refId": "A" + }, + { + "datasource": "DS_PROM_VALENCE", + "expr": "kube_deployment_status_replicas_available{deployment=\"$deployment\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Current Replicas", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replicas: $deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "todo-backend-java", + "value": "todo-backend-java" + }, + "datasource": "DS_PROM_VALENCE", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment", + "options": [], + "query": "label_values(promproxy_metric_handler_detailed_requests, service)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": 
"0.95", + "value": "0.95" + }, + "datasource": "DS_PROM_VALENCE", + "hide": 0, + "includeAll": false, + "label": "Latency Percentile", + "multi": false, + "name": "LatencyPercentile", + "options": [], + "query": "label_values(promproxy_metric_handler_detailed_requests, quantile)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "1s", + "5s", + "30s", + "1m" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Valence", + "uid": "9ri9X0Qiz", + "version": 1 + } +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: grafana-dashboards-valence + namespace: valence-system +--- +apiVersion: v1 +data: + prometheus.yaml: | + apiVersion: 1 + datasources: + - name: DS_PROM_VALENCE + type: prometheus + access: proxy + url: http://prometheus-valence.valence-system:9090 + editable: false + version: 1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: grafana-datasources + namespace: valence-system +--- +apiVersion: v1 +data: + default.yaml: | + apiVersion: 1 + providers: + - name: 'default' + org_id: 1 + folder: '' + type: 'file' + options: + path: '/var/lib/grafana/dashboards' +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: grafana-providers + namespace: valence-system +--- +apiVersion: v1 +data: + prometheus.yaml: | + global: + scrape_interval: 10s + scrape_timeout: 10s + evaluation_interval: 10s + rule_files: + - "/etc/prometheus-rules/*.rules" + scrape_configs: + - job_name: 
kube-state-metrics/0 + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: kube-state-metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: (.+) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: metrics + action: replace + - job_name: kube-state-metrics/1 + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - valence-system + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: 
[__meta_kubernetes_service_label_app] + separator: ; + regex: kube-state-metrics + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: telemetry + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_label_app] + separator: ; + regex: (.+) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: telemetry + action: replace + - job_name: kubernetes-nodes + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - api_server: null + role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + 
regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + action: replace + - job_name: kubernetes-nodes-cadvisor + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - api_server: null + role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + - job_name: prometheus-valence + scrape_interval: 5s + scrape_timeout: 5s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + separator: ; + regex: valence + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: prometheus + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + 
regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: prometheus + action: replace + - job_name: valence + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - valence-system + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_component] + separator: ; + regex: operator + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: prometheus + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: ${1} + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: prometheus + action: replace +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence + namespace: valence-system +--- +apiVersion: v1 +kind: Service +metadata: + labels: 
+ app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: grafana + namespace: valence-system +spec: + ports: + - port: 3000 + protocol: TCP + targetPort: 3000 + selector: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: optimization-operator + namespace: valence-system +spec: + ports: + - name: prometheus + port: 8080 + targetPort: 8080 + selector: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + type: NodePort +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence + namespace: valence-system +spec: + ports: + - name: web + port: 9090 + protocol: TCP + targetPort: web + selector: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + type: NodePort +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: grafana + namespace: valence-system +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + spec: + containers: + - env: + - name: GF_SERVER_ROOT_URL + value: 
/api/v1/namespaces/valence-system/services/grafana/proxy/ + image: grafana/grafana:5.2.4 + name: grafana + ports: + - containerPort: 3000 + protocol: TCP + resources: + limits: + cpu: 500m + memory: 2500Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - mountPath: /var/lib/grafana + name: data + - mountPath: /etc/grafana/provisioning/dashboards + name: providers + - mountPath: /etc/grafana/provisioning/datasources + name: datasources + - mountPath: /var/lib/grafana/dashboards/capacity-planning.json + name: dashboards-valence + subPath: valence.json + restartPolicy: Always + volumes: + - emptyDir: {} + name: data + - configMap: + name: grafana-providers + name: providers + - configMap: + name: grafana-datasources + name: datasources + - configMap: + name: grafana-dashboards-valence + name: dashboards-valence +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: optimization-operator + namespace: valence-system +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + template: + metadata: + labels: + app.kubernetes.io/component: operator + app.kubernetes.io/name: valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + spec: + containers: + - args: + - operator + env: + - name: MIN_SAMPLE_SIZE + value: "20" + - name: PROMETHEUS_URL + value: http://prometheus-valence.valence-system.svc:9090 + image: valencenet/valence:0.2.0 + imagePullPolicy: Always + name: optimization-operator + resources: + limits: + cpu: 500m + memory: 500M + requests: + cpu: 250m + memory: 250M + serviceAccountName: valence-operator +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: prometheus-valence + 
app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + name: prometheus-valence + namespace: valence-system +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + serviceName: prometheus-valence + template: + metadata: + labels: + app.kubernetes.io/name: prometheus-valence + app.kubernetes.io/part-of: valence + app.kubernetes.io/version: 0.1.14 + spec: + containers: + - args: + - --web.console.templates=/etc/prometheus/consoles + - --web.console.libraries=/etc/prometheus/console_libraries + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + - --storage.tsdb.no-lockfile + - --web.route-prefix=/ + - --storage.tsdb.retention.time=6h + image: quay.io/prometheus/prometheus:v2.7.1 + name: prometheus + ports: + - containerPort: 9090 + name: web + resources: + limits: + cpu: 250m + memory: 2.5Gi + requests: + cpu: 100m + memory: 2Gi + volumeMounts: + - mountPath: /etc/prometheus + name: config-volume + - mountPath: /etc/prometheus-rules + name: rules-volume + - mountPath: /prometheus + name: prometheus-valence-db + serviceAccountName: prometheus-valence + volumes: + - configMap: + name: prometheus-valence + name: config-volume + - emptyDir: {} + name: rules-volume + - emptyDir: {} + name: prometheus-valence-db + volumeClaimTemplates: []