Splitting Telegraf MTR and Speedtest-cli into CronJobs
To avoid overloading the original Telegraf DaemonSet, I split the MTR and Speedtest processes into separate CronJobs, each running its own copy of the Telegraf process. Each CronJob takes a key from the ConfigMap to load its own configuration and, using Telegraf's `--once` flag, runs a single collection every 10 minutes.

We have also tightened the memory and CPU limits and the SecurityContexts.
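
The CronJob manifests themselves live in `kubernetes/ISP-Checker-deploy.yaml`, whose diff is too large to render below. As a rough sketch only, one of these CronJobs could look like the following, assuming a `telegraf-mtr` name, a stock `telegraf` image tag, and placeholder resource values (none of these are taken from the actual manifest):

apiVersion: batch/v1beta1        # CronJob API version current in late 2020
kind: CronJob
metadata:
  name: telegraf-mtr             # hypothetical name
  namespace: monitoring
spec:
  schedule: "*/10 * * * *"       # one run every 10 minutes
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: telegraf
              image: telegraf:1.16   # placeholder tag
              # --once performs a single collect/flush cycle and exits,
              # so the CronJob schedule, not the agent interval, drives sampling.
              args: ["--config", "/etc/telegraf/telegraf.conf", "--once"]
              # The $INFLUXDB_* variables referenced in the config would come
              # from env/envFrom entries, omitted here for brevity.
              resources:
                requests:
                  cpu: 50m          # placeholder values
                  memory: 64Mi
                limits:
                  cpu: 200m
                  memory: 128Mi
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  add: ["NET_RAW"]  # mtr needs raw ICMP sockets
              volumeMounts:
                - name: config
                  mountPath: /etc/telegraf
          volumes:
            - name: config
              configMap:
                name: telegraf-config
                items:
                  # mount only the key this job needs, renamed to telegraf.conf
                  - key: telegraf-mtr.conf
                    path: telegraf.conf

The speedtest job would be identical apart from the ConfigMap key (`telegraf-speedtest.conf`) and, if desired, a different schedule.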
fmdlc committed Oct 23, 2020
1 parent de2d87e commit 76b5211
Showing 10 changed files with 20,634 additions and 7,002 deletions.
13,920 changes: 10,368 additions & 3,552 deletions kubernetes/ISP-Checker-deploy.yaml

Large diffs are not rendered by default.

261 changes: 154 additions & 107 deletions kubernetes/yaml/configmap.yaml
@@ -3,123 +3,118 @@
apiVersion: v1
kind: ConfigMap
metadata:
namespace: monitoring
name: grafana-dashboard-provisioner
labels:
app.kubernetes.io/app: grafana
app.kubernetes.io/project: isp-checker
data:
default.json: |-
{
"homeDashboardId": 1,
"theme": "dark",
"timezone": "browser"
}
---
apiVersion: v1
kind: ConfigMap
metadata:
namespace: monitoring
name: network-dashboard-provisioner
data:
dashboards.yaml: |-
apiVersion: 1
providers:
- name: 'Dashboards'
orgId: 1
folder: ''
type: file
disableDeletion: true
editable: false
options:
path: /var/lib/grafana/dashboards/
foldersFromFilesStructure: true
homeDashboardId: 1
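
For this provider to do anything, the Grafana pod has to mount the provisioner ConfigMap where Grafana scans for provisioning files, and the dashboard JSON files at the `path` declared above. A minimal pod-spec fragment could look like this, assuming a hypothetical `grafana-dashboards` ConfigMap holding the JSON files (names and image tag are illustrative):

containers:
  - name: grafana
    image: grafana/grafana:7.2.1
    volumeMounts:
      - name: dashboard-provider
        mountPath: /etc/grafana/provisioning/dashboards  # Grafana scans this directory
      - name: dashboards
        mountPath: /var/lib/grafana/dashboards           # must match the provider 'path'
volumes:
  - name: dashboard-provider
    configMap:
      name: network-dashboard-provisioner
  - name: dashboards
    configMap:
      name: grafana-dashboards   # hypothetical ConfigMap with the dashboard JSON files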
---
apiVersion: v1
kind: ConfigMap
metadata:
namespace: monitoring
name: telegraf-config
labels:
app.kubernetes.io/app: grafana
app.kubernetes.io/project: isp-checker
data:
telegraf.conf: |-
[global_tags]
[agent]
interval = "60s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"
precision = ""
hostname = "$HOSTNAME"
omit_hostname = false
debug = false
quiet = true
interval = "60s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"
precision = ""
hostname = "$HOSTNAME"
omit_hostname = false
debug = false
quiet = true
[[outputs.influxdb]]
urls = ["http://influxdb-svc.monitoring.svc.cluster.local:8086/"]
database = "$INFLUXDB_DB"
timeout = "60s"
username = "$INFLUXDB_ADMIN_USER"
password = "$INFLUXDB_ADMIN_PASSWORD"
user_agent = "telegraf"
skip_database_creation = false
urls = ["http://influxdb-svc.monitoring.svc.cluster.local:8086/"]
database = "$INFLUXDB_DB"
timeout = "60s"
username = "$INFLUXDB_ADMIN_USER"
password = "$INFLUXDB_ADMIN_PASSWORD"
user_agent = "telegraf"
skip_database_creation = false
[[inputs.net]]
[[inputs.netstat]]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.linux_sysctl_fs]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.cpu]]
percpu = true
totalcpu = true
collect_cpu_time = false
report_active = false
[[inputs.disk]]
ignore_fs = ["tmpfs",
"devtmpfs",
"devfs",
"iso9660",
"overlay",
"aufs",
"squashfs"
]
ignore_fs = ["tmpfs",
"devtmpfs",
"devfs",
"iso9660",
"overlay",
"aufs",
"squashfs"
]
[[inputs.dns_query]]
servers = ["4.2.2.1", "8.8.8.8"]
domains = ["www.google.com",
"www.twitter.com",
"www.amazon.com",
"www.wikipedia.org"
]
record_type = "A"
port = 53
timeout = 3
[[inputs.http_response]]
urls = [
"http://www.google.com",
"http://www.twitter.com",
"http://www.amazon.com",
"http://www.yahoo.com"
]
response_timeout = "5s"
method = "GET"
follow_redirects = true
[[inputs.internal]]
servers = ["4.2.2.1", "8.8.8.8"]
domains = ["www.google.com",
"www.twitter.com",
"www.amazon.com",
"www.wikipedia.org"
]
record_type = "A"
port = 53
timeout = 3
[[inputs.http_response]]
urls = [
"http://www.google.com",
"http://www.twitter.com",
"http://www.amazon.com",
"http://www.yahoo.com"
]
response_timeout = "5s"
method = "GET"
follow_redirects = true
[[inputs.internal]]
collect_memstats = true
[[inputs.ping]]
urls = [
"google.com",
"twitter.com",
"amazon.com",
"yahoo.com"
]
"google.com",
"twitter.com",
"amazon.com",
"yahoo.com"
]
method = "exec"
count = 1
ping_interval = 1.0
@@ -128,35 +123,87 @@ data:
binary = "ping"
ipv6 = false
[[inputs.file]]
files = ["/sys/class/thermal/thermal_zone0/temp"]
name_override = "cpu_temperature"
data_format = "value"
data_type = "integer"
[[inputs.exec]]
commands = ["cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"]
name_override = "cpu_frequency"
data_format = "value"
data_type = "integer"
telegraf-mtr.conf: |-
[global_tags]
[agent]
interval = "60s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"
precision = ""
hostname = "$HOSTNAME"
omit_hostname = false
debug = false
quiet = true
[[outputs.influxdb]]
urls = ["http://influxdb-svc.monitoring.svc.cluster.local:8086/"]
database = "$INFLUXDB_DB"
timeout = "60s"
username = "$INFLUXDB_ADMIN_USER"
password = "$INFLUXDB_ADMIN_PASSWORD"
user_agent = "telegraf"
skip_database_creation = false
[[inputs.exec]]
commands = ["/usr/bin/speedtest-cli --csv --bytes"]
name_override = "speedtest"
timeout = "3m"
interval = "15m"
data_format = "csv"
csv_column_names = ['Server ID',
'Sponsor',
'Server Name',
'Timestamp',
'Distance',
'Ping',
'Download',
'Upload',
'Share',
'IP Address'
]
csv_timestamp_column = "Timestamp"
csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
commands=["mtr -C google.com s3-website.ap-northeast-2.amazonaws.com s3-website.eu-central-1.amazonaws.com s3-website.af-south-1.amazonaws.com"]
timeout = "3m"
interval = "7m"
data_format = "csv"
csv_skip_rows = 1
csv_column_names=[ "", "", "status","dest","hop","ip","loss","snt","", "","avg","best","worst","stdev"]
name_override = "mtr"
csv_tag_columns = ["dest", "hop", "ip"]
[[inputs.docker]]
endpoint = "unix:///var/run/docker.sock"
telegraf-speedtest.conf: |-
[global_tags]
[agent]
interval = "60s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"
precision = ""
hostname = "$HOSTNAME"
omit_hostname = false
debug = false
quiet = true
[[outputs.influxdb]]
urls = ["http://influxdb-svc.monitoring.svc.cluster.local:8086/"]
database = "$INFLUXDB_DB"
timeout = "60s"
username = "$INFLUXDB_ADMIN_USER"
password = "$INFLUXDB_ADMIN_PASSWORD"
user_agent = "telegraf"
skip_database_creation = false
[[inputs.exec]]
commands = ["/usr/bin/speedtest-cli --csv --bytes"]
name_override = "speedtest"
timeout = "3m"
interval = "15m"
data_format = "csv"
csv_column_names = ['Server ID', 'Sponsor', 'Server Name', 'Timestamp', 'Distance', 'Ping', 'Download', 'Upload', 'Share', 'IP Address']
csv_timestamp_column = "Timestamp"
csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
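
Note that several of the inputs above read from the node rather than from the network: the `[[inputs.file]]` and CPU-frequency `[[inputs.exec]]` blocks read under `/sys`, and `[[inputs.docker]]` talks to the Docker socket. For those to work inside a pod, the corresponding host paths have to be mounted; a sketch with illustrative volume names:

volumeMounts:
  - name: sys
    mountPath: /sys
    readOnly: true
  - name: docker-sock
    mountPath: /var/run/docker.sock
volumes:
  - name: sys
    hostPath:
      path: /sys
  - name: docker-sock
    hostPath:
      path: /var/run/docker.sock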