diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 1530fd4..2572655 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -29,6 +29,9 @@ k8s_prometheus_pushgateway_image: prom/pushgateway k8s_prometheus_pushgateway_image_tag: v0.4.0 k8s_prometheus_grafana_image: grafana/grafana k8s_prometheus_grafana_image_tag: 4.5.2 +k8s_prometheus_grafana_watcher_image: quay.io/coreos/grafana-watcher +k8s_prometheus_grafana_watcher_image_tag: v0.0.8 + k8s_prometheus_blackbox_image: prom/blackbox-exporter k8s_prometheus_blackbox_image_tag: v0.10.0 diff --git a/roles/prometheus/templates/grafana-dashboards/capacity-planning.json b/roles/prometheus/templates/grafana-dashboards/capacity-planning.json new file mode 100644 index 0000000..da08d01 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/capacity-planning.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100","hide":false,"intervalFactor":10,"legendFormat":"","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu usage","logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(node_load1)","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"sum(node_load5)","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"sum(node_load15)","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)","intervalFactor":2,"legendFormat":"memory usage","metric":"memo","refId":"A","step":10,"target":""},{"expr":"sum(node_memory_Buffers)","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"memo","refId":"B","step":10,"target":""},{"expr":"sum(node_memory_Cached)","interval":"","intervalFactor":2,"legendFormat":"memory cached","metric":"memo","refId":"C","step":10,"target":""},{"expr":"sum(node_memory_MemFree)","interval":"","intervalFactor":2,"legendFormat":"memory free","metric":"memo","refId":"D","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100","intervalFactor":2,"metric":"","refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":246,"panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(node_disk_bytes_read[5m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum(rate(node_disk_bytes_written[5m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum(rate(node_disk_io_time_ms[5m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":12,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))","hide":false,"intervalFactor":2,"legendFormat":"","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))","hide":false,"intervalFactor":2,"legendFormat":"","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":276,"panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","fill":1,"id":11,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(kube_pod_info)","format":"time_series","intervalFactor":2,"legendFormat":"Current number of Pods","refId":"A","step":10},{"expr":"sum(kube_node_status_capacity_pods)","format":"time_series","intervalFactor":2,"legendFormat":"Maximum capacity of pods","refId":"B","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Cluster Pod Utilization","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":60,"target":""}],"thresholds":"80,90","title":"Pod Utilization","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Kubernetes Capacity Planning","version":4},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/cluster-health.json b/roles/prometheus/templates/grafana-dashboards/cluster-health.json new file mode 100644 index 0000000..d8f5b24 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cluster-health.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"editable":true,"gnetId":null,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":254,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":1,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Control Plane Components Down","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"Everything UP and healthy","value":"null"},{"op":"=","text":"","value":""}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"3,5","title":"Alerts Firing","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"3,5","title":"Alerts Pending","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"count(increase(kube_pod_container_status_restarts[1h]) > 5)","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Crashlooping Pods","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"},{"collapse":false,"height":250,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(kube_node_status_ready{condition!=\"true\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Node Not Ready","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(kube_node_status_disk_pressure{condition=\"true\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Node Disk Pressure","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(kube_node_status_memory_pressure{condition=\"true\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Node Memory Pressure","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":8,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(kube_node_spec_unschedulable)","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Nodes Unschedulable","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"","title":"Kubernetes Cluster Health","version":9},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/cluster-monitoring.json b/roles/prometheus/templates/grafana-dashboards/cluster-monitoring.json new file mode 100644 index 0000000..8c6d58a --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cluster-monitoring.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"Compliant with Prometheus 1.5.2","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"singlestat","name":"Singlestat","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.3.0"}],"id":null,"title":"Kubernetes cluster monitoring (via Prometheus)","description":"Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.","tags":["kubernetes"],"style":"dark","timezone":"browser","editable":true,"hideControls":false,"sharedCrosshair":false,"rows":[{"collapse":false,"editable":true,"height":"200px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)","thresholdLine":false},"height":"200px","id":32,"isNew":true,"legend":{"alignAsTable":false,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":false,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Received","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"legendFormat":"Sent","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Network I/O pressure","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Network I/O pressure"},{"collapse":false,"editable":true,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":4,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster memory usage","transparent":false,"type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":6,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster CPU usage (1m avg)","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"height":"180px","id":7,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100","interval":"10s","intervalFactor":1,"legendFormat":"","metric":"","refId":"A","step":10}],"thresholds":"65, 90","title":"Cluster filesystem usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":9,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"20%","prefix":"","prefixFontSize":"20%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":10,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":11,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":12,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":" cores","postfixFontSize":"30%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":13,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Used","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"format":"bytes","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"height":"1px","id":14,"interval":null,"isNew":true,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":2,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"targets":[{"expr":"sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})","interval":"10s","intervalFactor":1,"refId":"A","step":10}],"thresholds":"","title":"Total","type":"singlestat","valueFontSize":"50%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"showTitle":false,"title":"Total usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":17,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ pod_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods CPU usage (1m avg)","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"showTitle":false,"title":"Pods CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":23,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services CPU usage (1m avg)","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services CPU usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"height":"","id":24,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":null,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ pod_name }} | {{ container_name }}","metric":"container_cpu","refId":"A","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_cpu","refId":"B","step":10},{"expr":"sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_cpu","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers CPU usage (1m avg)","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers CPU usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":3,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":20,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_cpu","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes CPU usage (1m avg)","tooltip":{"msResolution":true,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"none","label":"cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"showTitle":false,"title":"All processes CPU usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":25,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ pod_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":26,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)","interval":"10s","intervalFactor":1,"legendFormat":"{{ systemd_service_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"System services memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"System services memory usage"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":27,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (container_name, pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"pod: {{ pod_name }} | {{ container_name }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10},{"expr":"sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)","interval":"10s","intervalFactor":1,"legendFormat":"docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"container_memory_usage:sort_desc","refId":"B","step":10},{"expr":"sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)","interval":"10s","intervalFactor":1,"legendFormat":"rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"container_memory_usage:sort_desc","refId":"C","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers memory usage"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":true,"targets":[{"expr":"sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"{{ id }}","metric":"container_memory_usage:sort_desc","refId":"A","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes memory usage","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes memory usage"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ pod_name }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ pod_name }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"Pods network I/O (1m avg)","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Pods network I/O"},{"collapse":true,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":30,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> pod: {{ pod_name }} | {{ container_name }}","metric":"network","refId":"B","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- pod: {{ pod_name }} | {{ container_name }}","metric":"network","refId":"D","step":10},{"expr":"sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})","metric":"network","refId":"C","step":10},{"expr":"sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"E","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)","hide":false,"interval":"10s","intervalFactor":1,"legendFormat":"<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}","metric":"network","refId":"F","step":10}],"timeFrom":null,"timeShift":null,"title":"Containers network I/O (1m avg)","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"Containers network I/O"},{"collapse":true,"editable":true,"height":"500px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","decimals":2,"editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":29,"isNew":true,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":false,"show":true,"sideWidth":200,"sort":"current","sortDesc":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"-> {{ id }}","metric":"network","refId":"A","step":10},{"expr":"- sum (rate (container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)","interval":"10s","intervalFactor":1,"legendFormat":"<- {{ id }}","metric":"network","refId":"B","step":10}],"timeFrom":null,"timeShift":null,"title":"All processes network I/O (1m avg)","tooltip":{"msResolution":false,"shared":true,"sort":2,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"title":"All processes network I/O"}],"time":{"from":"now-5m","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"multi":false,"name":"Node","options":[],"query":"label_values(kubernetes_io_hostname)","refresh":1,"type":"query"}]},"annotations":{"list":[]},"refresh":"10s","schemaVersion":12,"version":13,"links":[],"gnetId":1621},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/cluster-status.json b/roles/prometheus/templates/grafana-dashboards/cluster-status.json new file mode 100644 index 0000000..46b7921 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cluster-status.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"editable":true,"gnetId":null,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":129,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":6,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(up{job=~\"kubernetes-apiservers|kube-scheduler|kube-controller-manager\"} == 0)","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Control Plane UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"UP","value":"null"}],"valueName":"total"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":6,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"3,5","title":"Alerts Firing","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":true,"title":"Cluster Health","titleSize":"h6"},{"collapse":false,"height":168,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":1,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kubernetes-apiservers\"} == 1) / count(up{job=\"kubernetes-apiservers\"})) * 100","format":"time_series","interval":"","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"50,80","title":"API Servers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100","format":"time_series","interval":"","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"50,80","title":"Controller Managers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100","format":"time_series","interval":"","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"50,80","title":"Schedulers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"},{"cacheTimeout":null,"colorBackground":false,"colorValue":true,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"hideTimeOverride":false,"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)","format":"time_series","interval":"","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"1,3","title":"Crashlooping Control Plane Pods","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":true,"title":"Control Plane Status","titleSize":"h6"},{"collapse":false,"height":158,"panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":8,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"sum(100 - (avg by (instance) (rate(node_cpu{component=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{component=\"node-exporter\",mode=\"idle\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"80,90","title":"CPU Utilization","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"80,90","title":"Memory Utilization","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":9,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"80,90","title":"Filesystem Utilization","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":10,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"80,90","title":"Pod Utilization","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":true,"title":"Capacity Planing","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"","title":"Kubernetes Cluster Status","version":3},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/control-plane-status.json b/roles/prometheus/templates/grafana-dashboards/control-plane-status.json new file mode 100644 index 0000000..b6260ca --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/control-plane-status.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"editable":true,"gnetId":null,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":"250px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":1,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kubernetes-apiservers\"} == 1) / sum(up{job=\"kubernetes-apiservers\"})) * 100","format":"time_series","intervalFactor":2,"refId":"A","step":600}],"thresholds":"50,80","title":"API Servers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100","format":"time_series","intervalFactor":2,"refId":"A","step":600}],"thresholds":"50,80","title":"Controller Mangers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100","format":"time_series","intervalFactor":2,"refId":"A","step":600}],"thresholds":"50,80","title":"Schedulers UP","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"5,10","title":"API Server Request Error Rate","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"0","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"},{"collapse":false,"height":250,"panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","fill":1,"id":7,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by(verb) (rate(apiserver_request_latencies_sum[5m]) >= 0)","format":"time_series","intervalFactor":2,"legendFormat":"","refId":"A","step":30}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"API Server Request Latency","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"},{"collapse":false,"height":250,"panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","fill":1,"id":5,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"cluster:scheduler_e2e_scheduling_latency_seconds:quantile","format":"time_series","intervalFactor":2,"refId":"A","step":60}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"End to end scheduling latency","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"dtdurations","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","fill":1,"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))","format":"time_series","intervalFactor":2,"legendFormat":"Error Rate","refId":"A","step":60},{"expr":"sum by(instance) (rate(apiserver_request_count[5m]))","format":"time_series","intervalFactor":2,"legendFormat":"Request Rate","refId":"B","step":60}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"API Server Request Rates","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Dashboard Row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"","title":"Kubernetes Control Plane Status","version":3},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/deployment.json b/roles/prometheus/templates/grafana-dashboards/deployment.json new file mode 100644 index 0000000..1a2ab84 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/deployment.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"editable":true,"gnetId":null,"graphTooltip":1,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":"200px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":8,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"cores","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"tableColumn":"","targets":[{"expr":"sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ","intervalFactor":2,"refId":"A","step":600}],"thresholds":"","title":"CPU","type":"singlestat","valueFontSize":"110%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":9,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"GB","postfixFontSize":"50%","prefix":"","prefixFontSize":"80%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"tableColumn":"","targets":[{"expr":"sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3","intervalFactor":2,"refId":"A","step":600}],"thresholds":"","title":"Memory","type":"singlestat","valueFontSize":"110%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"Bps","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":false},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":4,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"tableColumn":"","targets":[{"expr":"sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) ","intervalFactor":2,"refId":"A","step":600}],"thresholds":"","title":"Network","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Row","titleSize":"h6"},{"collapse":false,"height":"100px","panels":[{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":false},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"metric":"kube_deployment_spec_replicas","refId":"A","step":600}],"thresholds":"","title":"Desired Replicas","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":6,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"refId":"A","step":600}],"thresholds":"","title":"Available Replicas","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":3,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"","title":"Observed Generation","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"none","gauge":{"maxValue":100,"minValue":0,"show":false,"thresholdLabels":false,"thresholdMarkers":true},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"","refId":"A","step":600}],"thresholds":"","title":"Metadata Generation","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"350px","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":1,"legend":{"avg":false,"current":false,"hideZero":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"current replicas","refId":"A","step":30},{"expr":"min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"available","refId":"B","step":30},{"expr":"max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"unavailable","refId":"C","step":30},{"expr":"min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"updated","refId":"D","step":30},{"expr":"max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)","intervalFactor":2,"legendFormat":"desired","refId":"E","step":30}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Replicas","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"transparent":false,"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"none","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Namespace","multi":false,"name":"deployment_namespace","options":[],"query":"label_values(kube_deployment_metadata_generation, namespace)","refresh":1,"regex":"","sort":0,"tagValuesQuery":null,"tags":[],"tagsQuery":"","type":"query","useTags":false},{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Deployment","multi":false,"name":"deployment_name","options":[],"query":"label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"deployment","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Deployment","version":1},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/nodes.json b/roles/prometheus/templates/grafana-dashboards/nodes.json new file mode 100644 index 0000000..8baef30 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/nodes.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"Dashboard to get an overview of one server","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)","hide":false,"intervalFactor":10,"legendFormat":"{{cpu}}","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu usage","logBase":1,"max":100,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"node_load1{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"node_load5{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"node_load15{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}","hide":false,"interval":"","intervalFactor":2,"legendFormat":"memory used","metric":"","refId":"C","step":10},{"expr":"node_memory_Buffers{instance=\"$server\"}","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"","refId":"E","step":10},{"expr":"node_memory_Cached{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory cached","metric":"","refId":"F","step":10},{"expr":"node_memory_MemFree{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory free","metric":"","refId":"D","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":null,"multi":false,"name":"server","options":[],"query":"label_values(node_boot_time, instance)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Nodes","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/pods.json b/roles/prometheus/templates/grafana-dashboards/pods.json new file mode 100644 index 0000000..bcdcc49 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/pods.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"annotations":{"list":[]},"editable":true,"gnetId":null,"graphTooltip":1,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":"250px","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":1,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})","interval":"10s","intervalFactor":1,"legendFormat":"Current: {{ container_name }}","metric":"container_memory_usage_bytes","refId":"A","step":15},{"expr":"kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}","interval":"10s","intervalFactor":2,"legendFormat":"Requested: {{ container }}","metric":"kube_pod_container_resource_requests_memory_bytes","refId":"B","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory Usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":2,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m] ) )","intervalFactor":2,"legendFormat":"{{ container_name }}","refId":"A","step":30}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"CPU Usage","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"alignAsTable":true,"avg":true,"current":true,"max":false,"min":false,"rightSide":true,"show":true,"total":false,"values":true},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":12,"stack":false,"steppedLine":false,"targets":[{"expr":"sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{pod_name=\"$pod\"}[1m]) ))","intervalFactor":2,"legendFormat":"{{ pod_name }}","refId":"A","step":30}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network I/O","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Namespace","multi":false,"name":"namespace","options":[],"query":"label_values(kube_pod_info, namespace)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false},{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Pod","multi":false,"name":"pod","options":[],"query":"label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Container","multi":false,"name":"container","options":[],"query":"label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Pods","version":1},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/resource-requests.json b/roles/prometheus/templates/grafana-dashboards/resource-requests.json new file mode 100644 index 0000000..08ed977 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/resource-requests.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"","editable":true,"gnetId":null,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"rows":[{"collapse":false,"height":"300","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","description":"This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.","fill":1,"id":1,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"min(sum(kube_node_status_allocatable_cpu_cores) by (instance))","hide":false,"intervalFactor":2,"legendFormat":"Allocatable CPU Cores","refId":"A","step":20},{"expr":"max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))","intervalFactor":2,"legendFormat":"Requested CPU Cores","refId":"B","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"CPU Cores","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"short","label":"CPU Cores","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":2,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"tableColumn":"","targets":[{"expr":"max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100","intervalFactor":2,"legendFormat":"","refId":"A","step":240}],"thresholds":"80, 90","title":"CPU Cores","type":"singlestat","valueFontSize":"110%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"CPU Cores","titleSize":"h6"},{"collapse":false,"height":"300","panels":[{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","description":"This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.","fill":1,"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"links":[],"nullPointMode":"null","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"min(sum(kube_node_status_allocatable_memory_bytes) by (instance))","hide":false,"intervalFactor":2,"legendFormat":"Allocatable Memory","refId":"A","step":20},{"expr":"max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))","intervalFactor":2,"legendFormat":"Requested Memory","refId":"B","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":"Memory","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","decimals":null,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":4,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":true},"tableColumn":"","targets":[{"expr":"max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100","intervalFactor":2,"legendFormat":"","refId":"A","step":240}],"thresholds":"80, 90","title":"Memory","type":"singlestat","valueFontSize":"110%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"Memory","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[]},"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Kubernetes Resource Requests","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} diff --git a/roles/prometheus/templates/grafana.yaml b/roles/prometheus/templates/grafana.yaml index 56391f0..32e4281 100644 --- a/roles/prometheus/templates/grafana.yaml +++ b/roles/prometheus/templates/grafana.yaml @@ -1,5 +1,52 @@ --- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app: prometheus + component: "grafana" + namespace: {{ k8s_prometheus_namespace }} + name: prometheus-grafana +data: + capacity-planning-dashboard.json: |- +{% include 'grafana-dashboards/capacity-planning.json' %} + + cluster-health-dashboard.json: |- +{% include 'grafana-dashboards/cluster-health.json' %} + + cluster-monitoring-dashboard.json: |- +{% include 'grafana-dashboards/cluster-monitoring.json' %} + + cluster-status-dashboard.json: |- +{% include 'grafana-dashboards/cluster-status.json' %} + + control-plane-status-dashboard.json: |- +{% include 'grafana-dashboards/control-plane-status.json' %} + + deployment-dashboard.json: |- +{% include 'grafana-dashboards/deployment.json' %} + + nodes-dashboard.json: |- +{% include 'grafana-dashboards/nodes.json' %} + + pods-dashboard.json: |- +{% include 'grafana-dashboards/pods.json' %} + + resource-requests-dashboard.json: |- +{% include 'grafana-dashboards/resource-requests.json' %} + + prometheus-datasource.json: |- + { + "access": "proxy", + "basicAuth": false, + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus-server" + } + +--- + apiVersion: v1 kind: Service metadata: @@ -64,9 +111,27 @@ spec: requests: cpu: 100m memory: 100Mi + - name: grafana-watcher + image: {{ k8s_prometheus_grafana_watcher_image }}:{{ k8s_prometheus_grafana_watcher_image_tag }} + args: + - '--watch-dir=/var/grafana-dashboards' + - '--grafana-url=http://localhost:3000' + resources: + requests: + memory: "16Mi" + cpu: "50m" + limits: + memory: "32Mi" + cpu: "100m" + volumeMounts: + - name: grafana-dashboards + mountPath: /var/grafana-dashboards volumes: - - name: grafana-data - emptyDir: {} + - name: grafana-data + emptyDir: {} + - name: grafana-dashboards + configMap: + name: prometheus-grafana {% if k8s_prometheus_grafana_name is defined and k8s_prometheus_grafana_name != '' %} ---