From d1bcefc43826e8f474c909842e2ba18f1f9534d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= <939888+Abuelodelanada@users.noreply.github.com>
Date: Mon, 27 Jun 2022 23:17:59 -0300
Subject: [PATCH] Adding self-monitoring, alert rules and grafana dashboard
 (#70)

* Add self-metrics-endpoint relation, and alert rules
* grafana-dashboard relation added
* Linting...
* Fixing static analysis checks
* Fix broken integration tests
* Rename itest and test new relation
* Timeouts in test_upgrade_charm.py increased
* More timeouts increased
* More timeouts...
* unpin pytest-operator
* Deploy prom with --trust
* Simplifying num_peers

Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>

* Missing trust=True in rerelate_app itest
* Specify relation name in the other side of the relation

Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>
---
 .../grafana_k8s/v0/grafana_dashboard.py       |  1544 +++
 .../prometheus_k8s/v0/prometheus_scrape.py    |  2304 ++++
 metadata.yaml                                 |     4 +
 pyproject.toml                                |     4 +-
 src/charm.py                                  |    16 +-
 .../alertmanager_rev4.json.tmpl               | 11162 ++++++++++++++++
 ...tmanager_configuration_reload_failure.rule |    12 +
 .../alertmanager_missing.rule                 |    12 +
 .../alertmanager_notifications_failed.rule    |    11 +
 ...alertmanager_dispatch_metrics_endpoint.py} |    33 +-
 .../test_update_status_pressure.py            |     8 +-
 tests/integration/test_upgrade_charm.py       |    10 +-
 tests/unit/test_charm.py                      |     4 +-
 tox.ini                                       |     4 +-
 14 files changed, 15107 insertions(+), 21 deletions(-)
 create mode 100644 lib/charms/grafana_k8s/v0/grafana_dashboard.py
 create mode 100644 lib/charms/prometheus_k8s/v0/prometheus_scrape.py
 create mode 100644 src/grafana_dashboards/alertmanager_rev4.json.tmpl
 create mode 100644 src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule
 create mode 100644 src/prometheus_alert_rules/alertmanager_missing.rule
 create mode 100644 src/prometheus_alert_rules/alertmanager_notifications_failed.rule
 rename tests/integration/{test_rerelate_alertmanager_dispatch.py => test_rerelate_alertmanager_dispatch_metrics_endpoint.py} (64%)

diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
new file mode 100644
index 00000000..bbf15ed7
--- /dev/null
+++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py
@@ -0,0 +1,1544 @@
+# Copyright 2021 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""## Overview.
+
+This document explains how to integrate with the Grafana charm
+for the purpose of providing a dashboard which can be used by
+end users. It also explains the structure of the data
+expected by the `grafana-dashboard` interface, and serves as a
+reference for building a compatible interface or library by
+giving a definitive guide to the structure of relation data
+shared between the Grafana charm and any charm providing
+dashboard information.
+
+## Provider Library Usage
+
+The Grafana charm interacts with its dashboards using its charm
+library. The goal of this library is to be as simple to use as
+possible, and instantiation of the class with or without changing
+the default arguments provides a complete use case. For the simplest
+use case of a charm which bundles dashboards and provides a
+`provides: grafana-dashboard` interface,
+
+    provides:
+      grafana-dashboard:
+        interface: grafana_dashboard
+
+creation of a `GrafanaDashboardProvider` object with the default arguments is
+sufficient.
+ +:class:`GrafanaDashboardProvider` expects that bundled dashboards should +be included in your charm with a default path of: + + path/to/charm.py + path/to/src/grafana_dashboards/*.{json|json.tmpl|.tmpl} + +Where the files are Grafana dashboard JSON data either from the +Grafana marketplace, or directly exported from a Grafana instance. +Refer to the [official docs](https://grafana.com/tutorials/provision-dashboards-and-data-sources/) +for more information. + +When constructing a dashboard that is intended to be consumed by COS, make sure to use variables +for your datasources, and name them "prometheusds" and "lokids". You can also use the following +juju topology variables in your dashboards: $juju_model, $juju_model_uuid, $juju_application +and $juju_unit. Note, however, that if metrics are coming via peripheral charms (scrape-config +or cos-config) then topology labels would not exist. + +The default constructor arguments are: + + `charm`: `self` from the charm instantiating this library + `relation_name`: grafana-dashboard + `dashboards_path`: "/src/grafana_dashboards" + +If your configuration requires any changes from these defaults, they +may be set from the class constructor. It may be instantiated as +follows: + + from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider + + class FooCharm: + def __init__(self, *args): + super().__init__(*args, **kwargs) + ... + self.grafana_dashboard_provider = GrafanaDashboardProvider(self) + ... + +The first argument (`self`) should be a reference to the parent (providing +dashboards), as this charm's lifecycle events will be used to re-submit +dashboard information if a charm is upgraded, the pod is restarted, or other. + +An instantiated `GrafanaDashboardProvider` validates that the path specified +in the constructor (or the default) exists, reads the file contents, then +compresses them with LZMA and adds them to the application relation data +when a relation is established with Grafana. + +Provided dashboards will be checked by Grafana, and a series of dropdown menus +providing the ability to select query targets by Juju Model, application instance, +and unit will be added if they do not exist. + +To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation +and rendering occurs on the other side of the relation, and relation data in +the form of: + + { + "event": { + "valid": `true|false`, + "errors": [], + } + } + +Will be returned if rendering or validation fails. In this case, the +`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event +of the type :class:`GrafanaDashboardEvent`, which will contain information +about the validation error. + +This information is added to the relation data for the charms as serialized JSON +from a dict, with a structure of: +``` +{ + "application": { + "dashboards": { + "uuid": a uuid generated to ensure a relation event triggers, + "templates": { + "file:{hash}": { + "content": `{compressed_template_data}`, + "charm": `charm.meta.name`, + "juju_topology": { + "model": `charm.model.name`, + "model_uuid": `charm.model.uuid`, + "application": `charm.app.name`, + "unit": `charm.unit.name`, + } + }, + "file:{other_file_hash}": { + ... + }, + }, + }, + }, +} +``` + +This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration. + +The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to +add dashboards which are not bundled with charms. 
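As a rough illustration of the mechanism described above (and implemented further down in this library by `_encode_dashboard_content`, `_decode_dashboard_content` and `_upset_dashboards_on_relation`), the sketch below builds a single `templates` entry the way a provider side would. It is not part of the library API; the charm name, topology values and file id are placeholder assumptions:

```
import base64
import json
import lzma
import uuid


def encode_dashboard(content: str) -> str:
    # Dashboards are LZMA-compressed and base64-encoded so they fit safely
    # inside Juju relation data, which is string/JSON based.
    return base64.b64encode(lzma.compress(content.encode("utf-8"))).decode("utf-8")


def decode_dashboard(encoded: str) -> str:
    # The consumer side reverses the same transformation.
    return lzma.decompress(base64.b64decode(encoded.encode("utf-8"))).decode()


# A hypothetical dashboard template; a real one would use the ${prometheusds}
# datasource variable and, optionally, the juju topology variables.
dashboard_json = json.dumps({"title": "Example", "panels": [], "templating": {"list": []}})
encoded = encode_dashboard(dashboard_json)

# Shape of the application relation data described above. All values here
# ("example-charm", the model/app/unit names, the file id) are made up.
relation_payload = {
    "templates": {
        "file:example_dashboard": {
            "content": encoded,
            "charm": "example-charm",
            "juju_topology": {
                "model": "example-model",
                "model_uuid": "00000000-0000-0000-0000-000000000000",
                "application": "example-app",
                "unit": "example-app/0",
            },
        },
    },
    # Pseudo-random value so that writing the data always triggers a
    # relation-changed event on the Grafana side.
    "uuid": str(uuid.uuid4()),
}

serialized = json.dumps(relation_payload)  # stored under the "dashboards" relation data key
assert decode_dashboard(encoded) == dashboard_json
```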
+ +## Consumer Library Usage + +The `GrafanaDashboardConsumer` object may be used by Grafana +charms to manage relations with available dashboards. For this +purpose, a charm consuming Grafana dashboard information should do +the following things: + +1. Instantiate the `GrafanaDashboardConsumer` object by providing it a +reference to the parent (Grafana) charm and, optionally, the name of +the relation that the Grafana charm uses to interact with dashboards. +This relation must confirm to the `grafana-dashboard` interface. + +For example a Grafana charm may instantiate the +`GrafanaDashboardConsumer` in its constructor as follows + + from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer + + def __init__(self, *args): + super().__init__(*args) + ... + self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self) + ... + +2. A Grafana charm also needs to listen to the +`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer` +by adding itself as an observer for these events: + + self.framework.observe( + self.grafana_source_consumer.on.sources_changed, + self._on_dashboards_changed, + ) + +Dashboards can be retrieved the :meth:`dashboards`: + +It will be returned in the format of: + +``` +[ + { + "id": unique_id, + "relation_id": relation_id, + "charm": the name of the charm which provided the dashboard, + "content": compressed_template_data + }, +] +``` + +The consuming charm should decompress the dashboard. +""" + +import base64 +import json +import logging +import lzma +import os +import re +import uuid +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from ops.charm import ( + CharmBase, + HookEvent, + RelationBrokenEvent, + RelationChangedEvent, + RelationCreatedEvent, + RelationEvent, + RelationRole, +) +from ops.framework import ( + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 12 + +logger = logging.getLogger(__name__) + + +DEFAULT_RELATION_NAME = "grafana-dashboard" +DEFAULT_PEER_NAME = "grafana" +RELATION_INTERFACE_NAME = "grafana_dashboard" + +TEMPLATE_DROPDOWNS = [ + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": "label_values(up,juju_model)", + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju model", + "multi": False, + "name": "juju_model", + "query": { + "query": "label_values(up,juju_model)", + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju model uuid", + "multi": False, + "name": "juju_model_uuid", + "query": { + "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": 
"", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju application", + "multi": False, + "name": "juju_application", + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "Juju unit", + "multi": False, + "name": "juju_unit", + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, + { + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": None, + "multi": False, + "name": "prometheusds", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "type": "datasource", + }, +] + +REACTIVE_CONVERTER = { # type: ignore + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": "hosts", + "multi": True, + "name": "host", + "options": [], + "query": { + "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, +} + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as " + "interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different direction.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: 
RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +class InvalidDirectoryPathError(Exception): + """Raised if the grafana dashboards folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + grafana_dashboards_absolute_path: str, + message: str, + ): + self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path + self.message = message + + super().__init__(self.message) + + +def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: + """Resolve the provided path items against the directory of the main file. + + Look up the directory of the charmed operator file being executed. This is normally + going to be the charm.py file of the charm including this library. Then, resolve + the provided path elements and return its absolute path. + + Raises: + InvalidDirectoryPathError if the resolved path does not exist or it is not a directory + + """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not dir_path.exists(): + raise InvalidDirectoryPathError(str(dir_path), "directory does not exist") + if not dir_path.is_dir(): + raise InvalidDirectoryPathError(str(dir_path), "is not a directory") + + return str(dir_path) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +) -> None: + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + named like the value of the `relation_name` argument. + RelationInterfaceMismatchError: If the relation interface of the + relation named as the provided `relation_name` argument does not + match the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation named as the provided `relation_name` + argument has a different role than what is specified by the + `expected_relation_role` argument. 
+ """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +def _encode_dashboard_content(content: Union[str, bytes]) -> str: + if isinstance(content, str): + content = bytes(content, "utf-8") + + return base64.b64encode(lzma.compress(content)).decode("utf-8") + + +def _decode_dashboard_content(encoded_content: str) -> str: + return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() + + +def _convert_dashboard_fields(content: str) -> str: + """Make sure values are present for Juju topology. + + Inserts Juju topology variables and selectors into the template, as well as + a variable for Prometheus. + """ + dict_content = json.loads(content) + datasources = {} + existing_templates = False + + # If no existing template variables exist, just insert our own + if "templating" not in dict_content: + dict_content["templating"] = {"list": [d for d in TEMPLATE_DROPDOWNS]} + else: + # Otherwise, set a flag so we can go back later + existing_templates = True + for maybe in dict_content["templating"]["list"]: + # Build a list of `datasource_name`: `datasource_type` mappings + # The "query" field is actually "prometheus", "loki", "influxdb", etc + if "type" in maybe and maybe["type"] == "datasource": + datasources[maybe["name"]] = maybe["query"] + + # Put our own variables in the template + for d in TEMPLATE_DROPDOWNS: + if d not in dict_content["templating"]["list"]: + dict_content["templating"]["list"].insert(0, d) + + dict_content = _replace_template_fields(dict_content, datasources, existing_templates) + + return json.dumps(dict_content) + + +def _replace_template_fields( # noqa: C901 + dict_content: dict, datasources: dict, existing_templates: bool +) -> dict: + """Make templated fields get cleaned up afterwards. + + If existing datasource variables are present, try to substitute them, otherwise + assume they are all for Prometheus and put the prometheus variable there. + """ + replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} + used_replacements = [] + + # If any existing datasources match types we know, or we didn't find + # any templating variables at all, template them. + if datasources or not existing_templates: + panels = dict_content["panels"] + + # Go through all of the panels. If they have a datasource set, AND it's one + # that we can convert to ${lokids} or ${prometheusds}, by stripping off the + # ${} templating and comparing the name to the list we built, replace it, + # otherwise, leave it alone. + # + # COS only knows about Prometheus and Loki. 
+ for panel in panels: + if "datasource" not in panel or not panel.get("datasource", ""): + continue + if not existing_templates: + panel["datasource"] = "${prometheusds}" + else: + if panel["datasource"] in replacements.values(): + # Already a known template variable + continue + if not panel["datasource"]: + # Don't worry about null values + continue + # Strip out variable characters and maybe braces + ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) + replacement = replacements.get(datasources[ds], "") + if replacement: + used_replacements.append(ds) + panel["datasource"] = replacement or panel["datasource"] + + # Put our substitutions back + dict_content["panels"] = panels + + # Finally, go back and pop off the templates we stubbed out + deletions = [] + for tmpl in dict_content["templating"]["list"]: + if tmpl["name"] and tmpl["name"] in used_replacements: + deletions.append(tmpl) + + for d in deletions: + dict_content["templating"]["list"].remove(d) + + return dict_content + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + elif isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + else: + return obj + + +class GrafanaDashboardsChanged(EventBase): + """Event emitted when Grafana dashboards change.""" + + def __init__(self, handle, data=None): + super().__init__(handle) + self.data = data + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return {"data": self.data} + + def restore(self, snapshot): + """Restore grafana source information.""" + self.data = snapshot["data"] + + +class GrafanaDashboardEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboards_changed = EventSource(GrafanaDashboardsChanged) + + +class GrafanaDashboardEvent(EventBase): + """Event emitted when Grafana dashboards cannot be resolved. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: List[Dict[str, str]] = [], valid: bool = False): + super().__init__(handle) + self.errors = errors + self.error_message = "; ".join([error["error"] for error in errors if "error" in error]) + self.valid = valid + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return { + "error_message": self.error_message, + "valid": self.valid, + "errors": json.dumps(self.errors), + } + + def restore(self, snapshot): + """Restore grafana source information.""" + self.error_message = snapshot["error_message"] + self.valid = snapshot["valid"] + self.errors = json.loads(snapshot["errors"]) + + +class GrafanaProviderEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboard_status_changed = EventSource(GrafanaDashboardEvent) + + +class GrafanaDashboardProvider(Object): + """An API to provide Grafana dashboards to a Grafana charm.""" + + _stored = StoredState() + on = GrafanaProviderEvents() + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dashboards_path: str = "src/grafana_dashboards", + ) -> None: + """API to provide Grafana dashboard to a Grafana charmed operator. + + The :class:`GrafanaDashboardProvider` object provides an API + to upload dashboards to a Grafana charm. 
In its most streamlined + usage, the :class:`GrafanaDashboardProvider` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardProvider(self) + + The :class:`GrafanaDashboardProvider` will look for dashboard + templates in the `/grafana_dashboards` folder. + Additionally, dashboard templates can be uploaded programmatically + via the :method:`GrafanaDashboardProvider.add_dashboard` method. + + To use the :class:`GrafanaDashboardProvider` API, you need a relation + defined in your charm operator's metadata.yaml as follows: + + provides: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use relation name other than `grafana-dashboard`, + you will need to specify the relation name via the `relation_name` + argument when instantiating the :class:`GrafanaDashboardProvider` object. + However, it is strongly advised to keep the the default relation name, + so that people deploying your charm will have a consistent experience + with all other charms that provide Grafana dashboards. + + It is possible to provide a different file path for the Grafana dashboards + to be automatically managed by the :class:`GrafanaDashboardProvider` object + via the `dashboards_path` argument. This may be necessary when the directory + structure of your charmed operator repository is not the "usual" one as + generated by `charmcraft init`, for example when adding the charmed operator + in a Java repository managed by Maven or Gradle. However, unless there are + such constraints with other tooling, it is strongly advised to store the + Grafana dashboards in the default `/grafana_dashboards` + folder, in order to provide a consistent experience for other charmed operator + authors. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard". + dashboards_path: a filesystem path relative to the charm root + where dashboard templates can be located. By default, the library + expects dashboard files to be in the `/grafana_dashboards` + directory. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path) + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + + self._charm = charm + self._relation_name = relation_name + self._dashboards_path = dashboards_path + + # No peer relation bucket we can rely on providers, keep StoredState here, too + self._stored.set_default(dashboard_templates={}) + + self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) + self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) + + self.framework.observe( + self._charm.on[self._relation_name].relation_created, + self._on_grafana_dashboard_relation_created, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + + def add_dashboard(self, content: str) -> None: + """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`. + + Args: + content: a string representing a Jinja template. 
Currently, no + global variables are added to the Jinja template evaluation + context. + """ + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates = self._stored.dashboard_templates + + encoded_dashboard = _encode_dashboard_content(content) + + # Use as id the first chars of the encoded dashboard, so that + # it is predictable across units. + id = "prog:{}".format(encoded_dashboard[-24:-16]) + stored_dashboard_templates[id] = self._content_to_dashboard_object(encoded_dashboard) + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def remove_non_builtin_dashboards(self) -> None: + """Remove all dashboards to the relation added via :method:`add_dashboard`.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("prog:"): + del stored_dashboard_templates[dashboard_id] + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def update_dashboards(self) -> None: + """Trigger the re-evaluation of the data on all relations.""" + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None: + """Scans the built-in dashboards and updates relations with changes.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + + # Ensure we do not leave outdated dashboards by removing from stored all + # the encoded dashboards that start with "file/". + if self._dashboards_path: + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("file:"): + del stored_dashboard_templates[dashboard_id] + + # Path.glob uses fnmatch on the backend, which is pretty limited, so use a + # custom function for the filter + def _is_dashbaord(p: Path) -> bool: + return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + + for path in filter(_is_dashbaord, Path(self._dashboards_path).glob("*")): + # path = Path(path) + id = "file:{}".format(path.stem) + stored_dashboard_templates[id] = self._content_to_dashboard_object( + _encode_dashboard_content(path.read_bytes()) + ) + + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _reinitialize_dashboard_data(self) -> None: + """Triggers a reload of dashboard outside of an eventing workflow. + + This will destroy any existing relation data. 
+ """ + try: + _resolve_dir_against_charm_path(self._charm, self._dashboards_path) + self._update_all_dashboards_from_dir() + + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + stored_dashboard_templates = self._stored.dashboard_templates + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("file:"): + del stored_dashboard_templates[dashboard_id] + self._stored.dashboard_templates = stored_dashboard_templates + + # With all of the file-based dashboards cleared out, force a refresh + # of relation data + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None: + """Watch for a relation being created and automatically send dashboards. + + Args: + event: The :class:`RelationJoinedEvent` sent when a + `grafana_dashboaard` relationship is joined + """ + if self._charm.unit.is_leader(): + self._upset_dashboards_on_relation(event.relation) + + def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: + """Watch for changes so we know if there's an error to signal back to the parent charm. + + Args: + event: The `RelationChangedEvent` that triggered this handler. + """ + if self._charm.unit.is_leader(): + data = json.loads(event.relation.data[event.app].get("event", "{}")) + + if not data: + return + + valid = bool(data.get("valid", True)) + errors = data.get("errors", []) + if valid and not errors: + self.on.dashboard_status_changed.emit(valid=valid) + else: + self.on.dashboard_status_changed.emit(valid=valid, errors=errors) + + def _upset_dashboards_on_relation(self, relation: Relation) -> None: + """Update the dashboards in the relation data bucket.""" + # It's completely ridiculous to add a UUID, but if we don't have some + # pseudo-random value, this never makes it across 'juju set-state' + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), + "uuid": str(uuid.uuid4()), + } + + relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + def _content_to_dashboard_object(self, content: str) -> Dict: + return { + "charm": self._charm.meta.name, + "content": content, + "juju_topology": self._juju_topology, + } + + # This is not actually used in the dashboards, but is present to provide a secondary + # salt to ensure uniqueness in the dict keys in case individual charm units provide + # dashboards + @property + def _juju_topology(self) -> Dict: + return { + "model": self._charm.model.name, + "model_uuid": self._charm.model.uuid, + "application": self._charm.app.name, + "unit": self._charm.unit.name, + } + + @property + def dashboard_templates(self) -> List: + """Return a list of the known dashboard templates.""" + return [v for v in self._stored.dashboard_templates.values()] + + +class GrafanaDashboardConsumer(Object): + """A consumer object for working with Grafana Dashboards.""" + + on = GrafanaDashboardEvents() + _stored = StoredState() + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + ) -> None: + """API to receive Grafana dashboards from charmed operators. 
+ + The :class:`GrafanaDashboardConsumer` object provides an API + to consume dashboards provided by a charmed operator using the + :class:`GrafanaDashboardProvider` library. The + :class:`GrafanaDashboardConsumer` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardConsumer(self) + + To use this library, you need a relation defined as follows in + your charm operator's metadata.yaml: + + requires: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use a different relation name than + `grafana-dashboard`, you need to specify the relation name via the + `relation_name` argument. However, it is strongly advised not to + change the default, so that people deploying your charm will have + a consistent experience with all other charms that consume Grafana + dashboards. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard". + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + + self._stored.set_default(dashboards=dict()) + + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_broken, + self._on_grafana_dashboard_relation_broken, + ) + self.framework.observe( + self._charm.on[DEFAULT_PEER_NAME].relation_changed, + self._on_grafana_peer_changed, + ) + + def get_dashboards_from_relation(self, relation_id: int) -> List: + """Get a list of known dashboards for one instance of the monitored relation. + + Args: + relation_id: the identifier of the relation instance, as returned by + :method:`ops.model.Relation.id`. + + Returns: a list of known dashboards coming from the provided relation instance. + """ + return [ + self._to_external_object(relation_id, dashboard) + for dashboard in self._get_stored_dashboards(relation_id) + ] + + def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: + """Handle relation changes in related providers. + + If there are changes in relations between Grafana dashboard consumers + and providers, this event handler (if the unit is the leader) will + get data for an incoming grafana-dashboard relation through a + :class:`GrafanaDashboardsChanged` event, and make the relation data + available in the app's datastore object. The Grafana charm can + then respond to the event to update its configuration. + """ + changes = False + if self._charm.unit.is_leader(): + changes = self._render_dashboards_and_signal_changed(event.relation) + + if changes: + self.on.dashboards_changed.emit() + + def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: + """Emit dashboard events on peer events so secondary charm data updates.""" + if self._charm.unit.is_leader(): + return + self.on.dashboards_changed.emit() + + def update_dashboards(self, relation: Optional[Relation] = None) -> None: + """Re-establish dashboards on one or more relations. + + If something changes between this library and a datasource, try to re-establish + invalid dashboards and invalidate active ones. 
+ + Args: + relation: a specific relation for which the dashboards have to be + updated. If not specified, all relations managed by this + :class:`GrafanaDashboardConsumer` will be updated. + """ + changes = False + if self._charm.unit.is_leader(): + relations = ( + [relation] if relation else self._charm.model.relations[self._relation_name] + ) + + for relation in relations: + self._render_dashboards_and_signal_changed(relation) # type: ignore + + if changes: + self.on.dashboards_changed.emit() + + def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None: + """Update job config when providers depart. + + When a Grafana dashboard provider departs, the configuration + for that provider is removed from the list of dashboards + """ + if not self._charm.unit.is_leader(): + return + + self._remove_all_dashboards_for_relation(event.relation) + + def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # type: ignore + """Validate a given dashboard. + + Verify that the passed dashboard data is able to be found in our list + of datasources and will render. If they do, let the charm know by + emitting an event. + + Args: + relation: Relation; The relation the dashboard is associated with. + + Returns: + a boolean indicating whether an event should be emitted + """ + other_app = relation.app + + raw_data = relation.data[other_app].get("dashboards", {}) + + if not raw_data: + logger.warning( + "No dashboard data found in the %s:%s relation", + self._relation_name, + str(relation.id), + ) + return False + + data = json.loads(raw_data) + + # The only piece of data needed on this side of the relations is "templates" + templates = data.pop("templates") + + # Import only if a charmed operator uses the consumer, we don't impose these + # dependencies on the client + from jinja2 import Template # type: ignore + from jinja2.exceptions import TemplateSyntaxError # type: ignore + + # The dashboards are WAY too big since this ultimately calls out to Juju to + # set the relation data, and it overflows the maximum argument length for + # subprocess, so we have to use b64, annoyingly. + # Worse, Python3 expects absolutely everything to be a byte, and a plain + # `base64.b64encode()` is still too large, so we have to go through hoops + # of encoding to byte, compressing with lzma, converting to base64 so it + # can be converted to JSON, then all the way back. 
+ + rendered_dashboards = [] + relation_has_invalid_dashboards = False + + for _, (fname, template) in enumerate(templates.items()): + decoded_content = None + content = None + error = None + try: + decoded_content = _decode_dashboard_content(template["content"]) + content = Template(decoded_content).render() + content = _encode_dashboard_content(_convert_dashboard_fields(content)) + except lzma.LZMAError as e: + error = str(e) + relation_has_invalid_dashboards = True + except json.JSONDecodeError as e: + error = str(e.msg) + logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) + continue + except TemplateSyntaxError as e: + error = str(e) + relation_has_invalid_dashboards = True + + # Prepend the relation name and ID to the dashboard ID to avoid clashes with + # multiple relations with apps from the same charm, or having dashboards with + # the same ids inside their charm operators + rendered_dashboards.append( + { + "id": "{}:{}/{}".format(relation.name, relation.id, fname), + "original_id": fname, + "content": content if content else None, + "template": template, + "valid": (error is None), + "error": error, + } + ) + + if relation_has_invalid_dashboards: + self._remove_all_dashboards_for_relation(relation) + + invalid_templates = [ + data["original_id"] for data in rendered_dashboards if not data["valid"] + ] + + logger.warning( + "Cannot add one or more Grafana dashboards from relation '{}:{}': the following " + "templates are invalid: {}".format( + relation.name, + relation.id, + invalid_templates, + ) + ) + + relation.data[self._charm.app]["event"] = json.dumps( + { + "errors": [ + { + "dashboard_id": rendered_dashboard["original_id"], + "error": rendered_dashboard["error"], + } + for rendered_dashboard in rendered_dashboards + if rendered_dashboard["error"] + ] + } + ) + + # Dropping dashboards for a relation needs to be signalled + return True + else: + stored_data = rendered_dashboards + currently_stored_data = self._get_stored_dashboards(relation.id) + + coerced_data = ( + _type_convert_stored(currently_stored_data) if currently_stored_data else {} + ) + + if not coerced_data == stored_data: + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards[relation.id] = stored_data + self.set_peer_data("dashboards", stored_dashboards) + return True + + def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: + """If an errored dashboard is in stored data, remove it and trigger a deletion.""" + if self._get_stored_dashboards(relation.id): + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards.pop(str(relation.id)) + self.set_peer_data("dashboards", stored_dashboards) + self.on.dashboards_changed.emit() + + def _to_external_object(self, relation_id, dashboard): + return { + "id": dashboard["original_id"], + "relation_id": relation_id, + "charm": dashboard["template"]["charm"], + "content": _decode_dashboard_content(dashboard["content"]), + } + + @property + def dashboards(self) -> List[Dict]: + """Get a list of known dashboards across all instances of the monitored relation. + + Returns: a list of known dashboards. The JSON of each of the dashboards is available + in the `content` field of the corresponding `dict`. 
+ """ + dashboards = [] + + for _, (relation_id, dashboards_for_relation) in enumerate( + self.get_peer_data("dashboards").items() + ): + for dashboard in dashboards_for_relation: + dashboards.append(self._to_external_object(relation_id, dashboard)) + + return dashboards + + def _get_stored_dashboards(self, relation_id: int) -> list: + """Pull stored dashboards out of the peer data bucket.""" + return self.get_peer_data("dashboards").get(str(relation_id), {}) + + def _set_default_data(self) -> None: + """Set defaults if they are not in peer relation data.""" + data = {"dashboards": {}} # type: ignore + for k, v in data.items(): + if not self.get_peer_data(k): + self.set_peer_data(k, v) + + def set_peer_data(self, key: str, data: Any) -> None: + """Put information into the peer data bucket instead of `StoredState`.""" + self._charm.peers.data[self._charm.app][key] = json.dumps(data) # type: ignore + + def get_peer_data(self, key: str) -> Any: + """Retrieve information from the peer data bucket instead of `StoredState`.""" + data = self._charm.peers.data[self._charm.app].get(key, "") # type: ignore + return json.loads(data) if data else {} + + +class GrafanaDashboardAggregator(Object): + """API to retrieve Grafana dashboards from machine dashboards. + + The :class:`GrafanaDashboardAggregator` object provides a way to + collate and aggregate Grafana dashboards from reactive/machine charms + and transport them into Charmed Operators, using Juju topology. + + For detailed usage instructions, see the documentation for + :module:`lma-proxy-operator`, as this class is intended for use as a + single point of intersection rather than use in individual charms. + + Since :class:`GrafanaDashboardAggregator` serves as a bridge between + Canonical Observability Stack Charmed Operators and Reactive Charms, + deployed in a Reactive Juju model, both a target relation which is + used to collect events from Reactive charms and a `grafana_relation` + which is used to send the collected data back to the Canonical + Observability Stack are required. + + In its most streamlined usage, :class:`GrafanaDashboardAggregator` is + integrated in a charmed operator as follows: + + self.grafana = GrafanaDashboardAggregator(self) + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + target_relation: a :string: name of a relation managed by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with reactive/machine charms it defaults to "dashboards". + grafana_relation: a :string: name of a relation used by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with charmed grafana. It defaults to "downstream-grafana-dashboard" + """ + + _stored = StoredState() + on = GrafanaProviderEvents() + + def __init__( + self, + charm: CharmBase, + target_relation: str = "dashboards", + grafana_relation: str = "downstream-grafana-dashboard", + ): + super().__init__(charm, grafana_relation) + + # Reactive charms may be RPC-ish and not leave reliable data around. 
Keep + # StoredState here + self._stored.set_default( + dashboard_templates={}, + id_mappings={}, + ) + + self._charm = charm + self._target_relation = target_relation + self._grafana_relation = grafana_relation + + self.framework.observe( + self._charm.on[self._grafana_relation].relation_joined, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._grafana_relation].relation_changed, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_changed, + self.update_dashboards, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_broken, + self.remove_dashboards, + ) + + def update_dashboards(self, event: RelationEvent) -> None: + """If we get a dashboard from a reactive charm, parse it out and update.""" + if self._charm.unit.is_leader(): + self._upset_dashboards_on_event(event) + + def _upset_dashboards_on_event(self, event: RelationEvent) -> None: + """Update the dashboards in the relation data bucket.""" + dashboards = self._handle_reactive_dashboards(event) + + if not dashboards: + logger.warning( + "Could not find dashboard data after a relation change for {}".format(event.app) + ) + return + + for id in dashboards: + self._stored.dashboard_templates[id] = self._content_to_dashboard_object( + dashboards[id], event + ) + + self._stored.id_mappings[event.app.name] = dashboards + self._update_remote_grafana(event) + + def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: + """Push dashboards to the downstream Grafana relation.""" + # It's still ridiculous to add a UUID here, but needed + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), + "uuid": str(uuid.uuid4()), + } + + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + def remove_dashboards(self, event: RelationBrokenEvent) -> None: + """Remove a dashboard if the relation is broken.""" + app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) + + del self._stored.id_mappings[event.app.name] + for id in app_ids: + del self._stored.dashboard_templates[id] + + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), + "uuid": str(uuid.uuid4()), + } + + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + # Yes, this has a fair amount of branching. It's not that complex, though + def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 + """Remove existing reactive charm datasource templating out. + + This method iterates through *known* places where reactive charms may set + data in contributed dashboards and removes them. + + `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from + the Grafana UI. It is not present in earlier Grafana versions, and can be disabled + in 5.3.4 and above (optionally). If set, any values present will be substituted on + import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors + for datasources, and leaving this present results in "default" datasource values + which are broken. + + Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to + set a `host` variable for use in dashboards which is not meaningful in the context + of Juju topology and will yield broken dashboards. 
+ + Further properties may be discovered. + """ + dash = template["dashboard"] + try: + if "list" in dash["templating"]: + for i in range(len(dash["templating"]["list"])): + if ( + "datasource" in dash["templating"]["list"][i] + and "Juju" in dash["templating"]["list"][i]["datasource"] + ): + dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" + if ( + "name" in dash["templating"]["list"][i] + and dash["templating"]["list"][i]["name"] == "host" + ): + dash["templating"]["list"][i] = REACTIVE_CONVERTER + except KeyError: + logger.debug("No existing templating data in dashboard") + + if "__inputs" in dash: + inputs = dash + for i in range(len(dash["__inputs"])): + if dash["__inputs"][i]["pluginName"] == "Prometheus": + del inputs["__inputs"][i] + if inputs: + dash["__inputs"] = inputs["__inputs"] + else: + del dash["__inputs"] + + template["dashboard"] = dash + return template + + def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: + """Look for a dashboard in relation data (during a reactive hook) or builtin by name.""" + templates = [] + id = "" + + # Reactive data can reliably be pulled out of events. In theory, if we got an event, + # it's on the bucket, but using event explicitly keeps the mental model in + # place for reactive + for k in event.relation.data[event.unit].keys(): + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) + + for k in event.relation.data[event.app].keys(): + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) + + builtins = self._maybe_get_builtin_dashboards(event) + + if not templates and not builtins: + return {} + + dashboards = {} + for t in templates: + # Replace values with LMA-style templating + t = self._strip_existing_datasources(t) + + # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON + # in the bucket back out to the actual "dashboard" we _need_, this is the way + # This is not a mistake -- there's a double nesting in reactive charms, and + # Grafana won't load it. We have to unbox: + # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], + # and the final unboxing is below. + dash = json.dumps(t["dashboard"]) + + # Replace the old-style datasource templates + dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) + dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + + from jinja2 import Template + + content = _encode_dashboard_content( + Template(dash).render(host=event.unit.name, datasource="prometheus") + ) + id = "prog:{}".format(content[-24:-16]) + + dashboards[id] = content + return {**builtins, **dashboards} + + def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: + """Tries to match the event with an included dashboard. + + Scans dashboards packed with the charm instantiating this class, and tries to match + one with the event. There is no guarantee that any given event will match a builtin, + since each charm instantiating this class may include a different set of dashboards, + or none. 
+ """ + builtins = {} + dashboards_path = None + + try: + dashboards_path = _resolve_dir_against_charm_path( + self._charm, "src/grafana_dashboards" + ) + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + + if dashboards_path: + + def _is_dashbaord(p: Path) -> bool: + return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + + for path in filter(_is_dashbaord, Path(dashboards_path).glob("*")): + # path = Path(path) + if event.app.name in path.name: + id = "file:{}".format(path.stem) + builtins[id] = self._content_to_dashboard_object( + _encode_dashboard_content(path.read_bytes()), event + ) + + return builtins + + def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict: + return { + "charm": event.app.name, + "content": content, + "juju_topology": self._juju_topology(event), + } + + # This is not actually used in the dashboards, but is present to provide a secondary + # salt to ensure uniqueness in the dict keys in case individual charm units provide + # dashboards + def _juju_topology(self, event: RelationEvent) -> Dict: + return { + "model": self._charm.model.name, + "model_uuid": self._charm.model.uuid, + "application": event.app.name, + "unit": event.unit.name, + } diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py new file mode 100644 index 00000000..135971a6 --- /dev/null +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -0,0 +1,2304 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +"""## Overview. + +This document explains how to integrate with the Prometheus charm +for the purpose of providing a metrics endpoint to Prometheus. It +also explains how alternative implementations of the Prometheus charms +may maintain the same interface and be backward compatible with all +currently integrated charms. Finally this document is the +authoritative reference on the structure of relation data that is +shared between Prometheus charms and any other charm that intends to +provide a scrape target for Prometheus. + +## Provider Library Usage + +This Prometheus charm interacts with its scrape targets using its +charm library. Charms seeking to expose metric endpoints for the +Prometheus charm, must do so using the `MetricsEndpointProvider` +object from this charm library. For the simplest use cases, using the +`MetricsEndpointProvider` object only requires instantiating it, +typically in the constructor of your charm (the one which exposes a +metrics endpoint). The `MetricsEndpointProvider` constructor requires +the name of the relation over which a scrape target (metrics endpoint) +is exposed to the Prometheus charm. This relation must use the +`prometheus_scrape` interface. By default address of the metrics +endpoint is set to the unit IP address, by each unit of the +`MetricsEndpointProvider` charm. These units set their address in +response to the `PebbleReady` event of each container in the unit, +since container restarts of Kubernetes charms can result in change of +IP addresses. The default name for the metrics endpoint relation is +`metrics-endpoint`. It is strongly recommended to use the same +relation name for consistency across charms and doing so obviates the +need for an additional constructor argument. 
The +`MetricsEndpointProvider` object may be instantiated as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_endpoint = MetricsEndpointProvider(self) + ... + +Note that the first argument (`self`) to `MetricsEndpointProvider` is +always a reference to the parent (scrape target) charm. + +An instantiated `MetricsEndpointProvider` object will ensure that each +unit of its parent charm, is a scrape target for the +`MetricsEndpointConsumer` (Prometheus) charm. By default +`MetricsEndpointProvider` assumes each unit of the consumer charm +exports its metrics at a path given by `/metrics` on port 80. These +defaults may be changed by providing the `MetricsEndpointProvider` +constructor an optional argument (`jobs`) that represents a +Prometheus scrape job specification using Python standard data +structures. This job specification is a subset of Prometheus' own +[scrape +configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) +format but represented using Python data structures. More than one job +may be provided using the `jobs` argument. Hence `jobs` accepts a list +of dictionaries where each dictionary represents one `` +object as described in the Prometheus documentation. The currently +supported configuration subset is: `job_name`, `metrics_path`, +`static_configs` + +Suppose it is required to change the port on which scraped metrics are +exposed to 8000. This may be done by providing the following data +structure as the value of `jobs`. + +``` +[ + { + "static_configs": [ + { + "targets": ["*:8000"] + } + ] + } +] +``` + +The wildcard ("*") host specification implies that the scrape targets +will automatically be set to the host addresses advertised by each +unit of the consumer charm. + +It is also possible to change the metrics path and scrape multiple +ports, for example + +``` +[ + { + "metrics_path": "/my-metrics-path", + "static_configs": [ + { + "targets": ["*:8000", "*:8081"], + } + ] + } +] +``` + +More complex scrape configurations are possible. For example + +``` +[ + { + "static_configs": [ + { + "targets": ["10.1.32.215:7000", "*:8000"], + "labels": { + "some-key": "some-value" + } + } + ] + } +] +``` + +This example scrapes the target "10.1.32.215" at port 7000 in addition +to scraping each unit at port 8000. There is however one difference +between wildcard targets (specified using "*") and fully qualified +targets (such as "10.1.32.215"). The Prometheus charm automatically +associates labels with metrics generated by each target. These labels +localise the source of metrics within the Juju topology by specifying +its "model name", "model UUID", "application name" and "unit +name". However unit name is associated only with wildcard targets but +not with fully qualified targets. + +Multiple jobs with different metrics paths and labels are allowed, but +each job must be given a unique name: + +``` +[ + { + "job_name": "my-first-job", + "metrics_path": "one-path", + "static_configs": [ + { + "targets": ["*:7000"], + "labels": { + "some-key": "some-value" + } + } + ] + }, + { + "job_name": "my-second-job", + "metrics_path": "another-path", + "static_configs": [ + { + "targets": ["*:8000"], + "labels": { + "some-other-key": "some-other-value" + } + } + ] + } +] +``` + +**Important:** `job_name` should be a fixed string (e.g. hardcoded literal). 
+For instance, if you include variable elements, like your `unit.name`, it may break +the continuity of the metrics time series gathered by Prometheus when the leader unit +changes (e.g. on upgrade or rescale). + +Additionally, it is also technically possible, but **strongly discouraged**, to +configure the following scrape-related settings, which behave as described by the +[Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config): + +- `static_configs` +- `scrape_interval` +- `scrape_timeout` +- `proxy_url` +- `relabel_configs` +- `metrics_relabel_configs` +- `sample_limit` +- `label_limit` +- `label_name_length_limit` +- `label_value_length_limit` + +The settings above are supported by the `prometheus_scrape` library only for the sake of +specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s) +charm. Virtually no charms should use these settings, and charmers definitely **should not** +expose them to the Juju administrator via configuration options. + +## Consumer Library Usage + +The `MetricsEndpointConsumer` object may be used by Prometheus +charms to manage relations with their scrape targets. For this +purposes a Prometheus charm needs to do two things + +1. Instantiate the `MetricsEndpointConsumer` object by providing it a +reference to the parent (Prometheus) charm and optionally the name of +the relation that the Prometheus charm uses to interact with scrape +targets. This relation must confirm to the `prometheus_scrape` +interface and it is strongly recommended that this relation be named +`metrics-endpoint` which is its default value. + +For example a Prometheus charm may instantiate the +`MetricsEndpointConsumer` in its constructor as follows + + from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer + + def __init__(self, *args): + super().__init__(*args) + ... + self.metrics_consumer = MetricsEndpointConsumer(self) + ... + +2. A Prometheus charm also needs to respond to the +`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as +an observer for these events, as in + + self.framework.observe( + self.metrics_consumer.on.targets_changed, + self._on_scrape_targets_changed, + ) + +In responding to the `TargetsChangedEvent` event the Prometheus +charm must update the Prometheus configuration so that any new scrape +targets are added and/or old ones removed from the list of scraped +endpoints. For this purpose the `MetricsEndpointConsumer` object +exposes a `jobs()` method that returns a list of scrape jobs. Each +element of this list is the Prometheus scrape configuration for that +job. In order to update the Prometheus configuration, the Prometheus +charm needs to replace the current list of jobs with the list provided +by `jobs()` as follows + + def _on_scrape_targets_changed(self, event): + ... + scrape_jobs = self.metrics_consumer.jobs() + for job in scrape_jobs: + prometheus_scrape_config.append(job) + ... + +## Alerting Rules + +This charm library also supports gathering alerting rules from all +related `MetricsEndpointProvider` charms and enabling corresponding alerts within the +Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider` +charms when using this library, from a directory conventionally named +`prometheus_alert_rules`. This directory must reside at the top level +in the `src` folder of the consumer charm. 
Each file in this directory +is assumed to be in one of two formats: +- the official prometheus alert rule format, conforming to the +[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +- a single rule format, which is a simplified subset of the official format, +comprising a single alert rule per file, using the same YAML fields. + +The file name must have the `.rule` extension. + +An example of the contents of such a file in the custom single rule +format is shown below. + +``` +alert: HighRequestLatency +expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5 +for: 10m +labels: + severity: Medium + type: HighLatency +annotations: + summary: High request latency for {{ $labels.instance }}. +``` + +The `MetricsEndpointProvider` will read all available alert rules and +also inject "filtering labels" into the alert expressions. The +filtering labels ensure that alert rules are localised to the metrics +provider charm's Juju topology (application, model and its UUID). Such +a topology filter is essential to ensure that alert rules submitted by +one provider charm generates alerts only for that same charm. When +alert rules are embedded in a charm, and the charm is deployed as a +Juju application, the alert rules from that application have their +expressions automatically updated to filter for metrics coming from +the units of that application alone. This remove risk of spurious +evaluation, e.g., when you have multiple deployments of the same charm +monitored by the same Prometheus. + +Not all alerts one may want to specify can be embedded in a +charm. Some alert rules will be specific to a user's use case. This is +the case, for example, of alert rules that are based on business +constraints, like expecting a certain amount of requests to a specific +API every five minutes. Such alert rules can be specified via the +[COS Config Charm](https://charmhub.io/cos-configuration-k8s), +which allows importing alert rules and other settings like dashboards +from a Git repository. + +Gathering alert rules and generating rule files within the Prometheus +charm is easily done using the `alerts()` method of +`MetricsEndpointConsumer`. Alerts generated by Prometheus will +automatically include Juju topology labels in the alerts. These labels +indicate the source of the alert. The following labels are +automatically included with each alert + +- `juju_model` +- `juju_model_uuid` +- `juju_application` + +## Relation Data + +The Prometheus charm uses both application and unit relation data to +obtain information regarding its scrape jobs, alert rules and scrape +targets. This relation data is in JSON format and it closely resembles +the YAML structure of Prometheus [scrape configuration] +(https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). + +Units of Metrics provider charms advertise their names and addresses +over unit relation data using the `prometheus_scrape_unit_name` and +`prometheus_scrape_unit_address` keys. While the `scrape_metadata`, +`scrape_jobs` and `alert_rules` keys in application relation data +of Metrics provider charms hold eponymous information. 
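+
+For example (all values here are illustrative), a metrics provider unit may
+set the following unit relation data
+
+    prometheus_scrape_unit_name: "my-provider/0"
+    prometheus_scrape_unit_address: "10.1.32.215"
+
+while the leader unit of the same provider may set application relation data
+along the lines of
+
+    scrape_metadata: '{"model": "my-model", "model_uuid": "...",
+                       "application": "my-provider",
+                       "unit": "my-provider/0",
+                       "charm_name": "my-provider-k8s"}'
+    scrape_jobs: '[{"metrics_path": "/metrics",
+                    "static_configs": [{"targets": ["*:80"]}]}]'
+    alert_rules: '{"groups": [...]}'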
+ +""" # noqa: W505 + +import ipaddress +import json +import logging +import os +import platform +import socket +import subprocess +from collections import OrderedDict +from pathlib import Path +from typing import Dict, List, Optional, Union + +import yaml +from ops.charm import CharmBase, RelationRole +from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents + +# The unique Charmhub library identifier, never change it +LIBID = "bc84295fef5f4049878f07b131968ee2" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 20 + +logger = logging.getLogger(__name__) + + +ALLOWED_KEYS = { + "job_name", + "metrics_path", + "static_configs", + "scrape_interval", + "scrape_timeout", + "proxy_url", + "relabel_configs", + "metrics_relabel_configs", + "sample_limit", + "label_limit", + "label_name_length_limit", + "label_value_lenght_limit", +} +DEFAULT_JOB = { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], +} + + +DEFAULT_RELATION_NAME = "metrics-endpoint" +RELATION_INTERFACE_NAME = "prometheus_scrape" + +DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name is found.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different role.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +): + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. 
+ expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the same relation interface + as specified via the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the same role as specified + via the `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +def _sanitize_scrape_configuration(job) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. + """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + +class JujuTopology: + """Class for storing and formatting juju topology information.""" + + STUB = "%%juju_topology%%" + + def __new__(cls, *args, **kwargs): + """Reject instantiation of a base JujuTopology class. Children only.""" + if cls is JujuTopology: + raise TypeError("only children of '{}' may be instantiated".format(cls.__name__)) + return object.__new__(cls) + + def __init__( + self, + model: str, + model_uuid: str, + application: str, + unit: Optional[str] = "", + charm_name: Optional[str] = "", + ): + """Build a JujuTopology object. + + A `JujuTopology` object is used for storing and transforming + Juju Topology information. This information is used to + annotate Prometheus scrape jobs and alert rules. Such + annotation when applied to scrape jobs helps in identifying + the source of the scrapped metrics. On the other hand when + applied to alert rules topology information ensures that + evaluation of alert expressions is restricted to the source + (charm) from which the alert rules were obtained. 
+ + Args: + model: a string name of the Juju model + model_uuid: a globally unique string identifier for the Juju model + application: an application name as a string + unit: a unit name as a string + charm_name: name of charm as a string + + Note: + `JujuTopology` should not be constructed directly by charm code. Please + use `ProviderTopology` or `AggregatorTopology`. + """ + self.model = model + self.model_uuid = model_uuid + self.application = application + self.charm_name = charm_name + self.unit = unit + + @classmethod + def from_charm(cls, charm): + """Factory method for creating `JujuTopology` children from a given charm. + + Args: + charm: a `CharmBase` object for which the `JujuTopology` has to be constructed + + Returns: + a `JujuTopology` object. + """ + return cls( + model=charm.model.name, + model_uuid=charm.model.uuid, + application=charm.model.app.name, + unit=charm.model.unit.name, + charm_name=charm.meta.name, + ) + + @classmethod + def from_relation_data(cls, data: dict): + """Factory method for creating `JujuTopology` children from a dictionary. + + Args: + data: a dictionary with four keys providing topology information. The keys are + - "model" + - "model_uuid" + - "application" + - "unit" + - "charm_name" + + `unit` and `charm_name` may be empty, but will result in more limited + labels. However, this allows us to support payload-only charms. + + Returns: + a `JujuTopology` object. + """ + return cls( + model=data["model"], + model_uuid=data["model_uuid"], + application=data["application"], + unit=data.get("unit", ""), + charm_name=data.get("charm_name", ""), + ) + + @property + def identifier(self) -> str: + """Format the topology information into a terse string.""" + # This is odd, but may have `None` as a model key + return "_".join([str(val) for val in self.as_promql_label_dict().values()]).replace( + "/", "_" + ) + + @property + def promql_labels(self) -> str: + """Format the topology information into a verbose string.""" + return ", ".join( + ['{}="{}"'.format(key, value) for key, value in self.as_promql_label_dict().items()] + ) + + def as_dict(self, rename_keys: Optional[Dict[str, str]] = None) -> OrderedDict: + """Format the topology information into a dict. + + Use an OrderedDict so we can rely on the insertion order on Python 3.5 (and 3.6, + which still does not guarantee it). + + Args: + rename_keys: A dictionary mapping old key names to new key names, which will + be substituted when invoked. 
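+
+        For example (values are illustrative), renaming the "charm_name" key
+        the way `as_promql_label_dict` does might yield:
+
+            topology.as_dict(rename_keys={"charm_name": "charm"})
+            # OrderedDict([("model", "my-model"),
+            #              ("model_uuid", "..."),
+            #              ("application", "my-app"),
+            #              ("unit", "my-app/0"),
+            #              ("charm", "my-app-k8s")])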
+ """ + ret = OrderedDict( + [ + ("model", self.model), + ("model_uuid", self.model_uuid), + ("application", self.application), + ("unit", self.unit), + ("charm_name", self.charm_name), + ] + ) + + ret["unit"] or ret.pop("unit") + ret["charm_name"] or ret.pop("charm_name") + + # If a key exists in `rename_keys`, replace the value + if rename_keys: + ret = OrderedDict( + (rename_keys.get(k), v) if rename_keys.get(k) else (k, v) for k, v in ret.items() # type: ignore + ) + + return ret + + def as_promql_label_dict(self): + """Format the topology information into a dict with keys having 'juju_' as prefix.""" + vals = { + "juju_{}".format(key): val + for key, val in self.as_dict(rename_keys={"charm_name": "charm"}).items() + } + # The leader is the only unit that sets alert rules, if "juju_unit" is present, + # then the rules will only be evaluated for that unit + if "juju_unit" in vals: + vals.pop("juju_unit") + + return vals + + def render(self, template: str): + """Render a juju-topology template string with topology info.""" + return template.replace(JujuTopology.STUB, self.promql_labels) + + +class AggregatorTopology(JujuTopology): + """Class for initializing topology information for MetricsEndpointAggregator.""" + + @classmethod + def create(cls, model: str, model_uuid: str, application: str, unit: str): + """Factory method for creating the `AggregatorTopology` dataclass from a given charm. + + Args: + model: a string representing the model + model_uuid: the model UUID as a string + application: the application name + unit: the unit name + + Returns: + a `AggregatorTopology` object. + """ + return cls( + model=model, + model_uuid=model_uuid, + application=application, + unit=unit, + ) + + def as_promql_label_dict(self): + """Format the topology information into a dict with keys having 'juju_' as prefix.""" + vals = {"juju_{}".format(key): val for key, val in self.as_dict().items()} + + # FIXME: Why is this different? I have no idea. The uuid length should be the same + vals["juju_model_uuid"] = vals["juju_model_uuid"][:7] + + return vals + + +class ProviderTopology(JujuTopology): + """Class for initializing topology information for MetricsEndpointProvider.""" + + @property + def scrape_identifier(self): + """Format the topology information into a scrape identifier.""" + # This is used only by Metrics[Consumer|Provider] and does not need a + # unit name, so only check for the charm name + return "juju_{}_prometheus_scrape".format(self.identifier) + + +class InvalidAlertRulePathError(Exception): + """Raised if the alert rules folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + alert_rules_absolute_path: Path, + message: str, + ): + self.alert_rules_absolute_path = alert_rules_absolute_path + self.message = message + + super().__init__(self.message) + + +def _is_official_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in the upstream format as supported by Prometheus. + + Alert rules in dictionary format are in "official" form if they + contain a "groups" key, since this implies they contain a list of + alert rule groups. + + Args: + rules_dict: a set of alert rules in Python dictionary format + + Returns: + True if alert rules are in official Prometheus file format. + """ + return "groups" in rules_dict + + +def _is_single_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in single rule format. + + The Prometheus charm library supports reading of alert rules in a + custom format that consists of a single alert rule per file. 
This + does not conform to the official Prometheus alert rule file format + which requires that each alert rules file consists of a list of + alert rule groups and each group consists of a list of alert + rules. + + Alert rules in dictionary form are considered to be in single rule + format if in the least it contains two keys corresponding to the + alert rule name and alert expression. + + Returns: + True if alert rule is in single rule file format. + """ + # one alert rule per file + return set(rules_dict) >= {"alert", "expr"} + + +class AlertRules: + """Utility class for amalgamating prometheus alert rule files and injecting juju topology. + + An `AlertRules` object supports aggregating alert rules from files and directories in both + official and single rule file formats using the `add_path()` method. All the alert rules + read are annotated with Juju topology labels and amalgamated into a single data structure + in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be + easily dumped into JSON format and exchanged over relation data. The dictionary can also + be dumped into YAML format and written directly into an alert rules file that is read by + Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, + since Prometheus allows only a single list of alert rule groups per alert rules file. + + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). + The custom single rule format is a subsection of the official YAML, having a single alert + rule, effectively "one alert per file". + """ + + # This class uses the following terminology for the various parts of a rule file: + # - alert rules file: the entire groups[] yaml, including the "groups:" key. + # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list + # of dictionaries that have the "name" and "rules" keys. + # - alert group (singular): a single dictionary that has the "name" and "rules" keys. + # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with + # the "alert" and "expr" keys. + # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. + + def __init__(self, topology: Optional[JujuTopology] = None): + """Build and alert rule object. + + Args: + topology: an optional `JujuTopology` instance that is used to annotate all alert rules. + """ + self.topology = topology + self.alert_groups = [] # type: List[dict] + + def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: + """Read a rules file from path, injecting juju topology. + + Args: + root_path: full path to the root rules folder (used only for generating group name) + file_path: full path to a *.rule file. + + Returns: + A list of dictionaries representing the rules file, if file is valid (the structure is + formed by `yaml.safe_load` of the file); an empty list otherwise. 
+ """ + with file_path.open() as rf: + # Load a list of rules from file then add labels and filters + try: + rule_file = yaml.safe_load(rf) + + except Exception as e: + logger.error("Failed to read alert rules from %s: %s", file_path.name, e) + return [] + + if _is_official_alert_rule_format(rule_file): + alert_groups = rule_file["groups"] + elif _is_single_alert_rule_format(rule_file): + # convert to list of alert groups + # group name is made up from the file name + alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] + else: + # invalid/unsupported + logger.error("Invalid rules file: %s", file_path.name) + return [] + + # update rules with additional metadata + for alert_group in alert_groups: + # update group name with topology and sub-path + alert_group["name"] = self._group_name( + str(root_path), + str(file_path), + alert_group["name"], + ) + + # add "juju_" topology labels + for alert_rule in alert_group["rules"]: + if "labels" not in alert_rule: + alert_rule["labels"] = {} + + if self.topology: + alert_rule["labels"].update(self.topology.as_promql_label_dict()) + # insert juju topology filters into a prometheus alert rule + alert_rule["expr"] = self.topology.render(alert_rule["expr"]) + + return alert_groups + + def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: + """Generate group name from path and topology. + + The group name is made up of the relative path between the root dir_path, the file path, + and topology identifier. + + Args: + root_path: path to the root rules dir. + file_path: path to rule file. + group_name: original group name to keep as part of the new augmented group name + + Returns: + New group name, augmented by juju topology and relative path. + """ + rel_path = os.path.relpath(os.path.dirname(file_path), root_path) + rel_path = "" if rel_path == "." else rel_path.replace(os.path.sep, "_") + + # Generate group name: + # - name, from juju topology + # - suffix, from the relative path of the rule file; + group_name_parts = [self.topology.identifier] if self.topology else [] + group_name_parts.extend([rel_path, group_name, "alerts"]) + # filter to remove empty strings + return "_".join(filter(None, group_name_parts)) + + @classmethod + def _multi_suffix_glob( + cls, dir_path: Path, suffixes: List[str], recursive: bool = True + ) -> list: + """Helper function for getting all files in a directory that have a matching suffix. + + Args: + dir_path: path to the directory to glob from. + suffixes: list of suffixes to include in the glob (items should begin with a period). + recursive: a flag indicating whether a glob is recursive (nested) or not. + + Returns: + List of files in `dir_path` that have one of the suffixes specified in `suffixes`. + """ + all_files_in_dir = dir_path.glob("**/*" if recursive else "*") + return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) + + def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: + """Read all rule files in a directory. + + All rules from files for the same directory are loaded into a single + group. The generated name of this group includes juju topology. + By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. + + Args: + dir_path: directory containing *.rule files (alert rules without groups). + recursive: flag indicating whether to scan for rule files recursively. 
+ + Returns: + a list of dictionaries representing prometheus alert rule groups, each dictionary + representing an alert group (structure determined by `yaml.safe_load`). + """ + alert_groups = [] # type: List[dict] + + # Gather all alerts into a list of groups + for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): + alert_groups_from_file = self._from_file(dir_path, file_path) + if alert_groups_from_file: + logger.debug("Reading alert rule from %s", file_path) + alert_groups.extend(alert_groups_from_file) + + return alert_groups + + def add_path(self, path: str, *, recursive: bool = False) -> None: + """Add rules from a dir path. + + All rules from files are aggregated into a data structure representing a single rule file. + All group names are augmented with juju topology. + + Args: + path: either a rules file or a dir of rules files. + recursive: whether to read files recursively or not (no impact if `path` is a file). + + Returns: + True if path was added else False. + """ + path = Path(path) # type: Path + if path.is_dir(): + self.alert_groups.extend(self._from_dir(path, recursive)) + elif path.is_file(): + self.alert_groups.extend(self._from_file(path.parent, path)) + else: + logger.debug("Alert rules path does not exist: %s", path) + + def as_dict(self) -> dict: + """Return standard alert rules file in dict representation. + + Returns: + a dictionary containing a single list of alert rule groups. + The list of alert rule groups is provided as value of the + "groups" dictionary key. + """ + return {"groups": self.alert_groups} if self.alert_groups else {} + + +class TargetsChangedEvent(EventBase): + """Event emitted when Prometheus scrape targets change.""" + + def __init__(self, handle, relation_id): + super().__init__(handle) + self.relation_id = relation_id + + def snapshot(self): + """Save scrape target relation information.""" + return {"relation_id": self.relation_id} + + def restore(self, snapshot): + """Restore scrape target relation information.""" + self.relation_id = snapshot["relation_id"] + + +class MonitoringEvents(ObjectEvents): + """Event descriptor for events raised by `MetricsEndpointConsumer`.""" + + targets_changed = EventSource(TargetsChangedEvent) + + +class MetricsEndpointConsumer(Object): + """A Prometheus based Monitoring service.""" + + on = MonitoringEvents() + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """A Prometheus based Monitoring service. + + Args: + charm: a `CharmBase` instance that manages this + instance of the Prometheus service. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that consume metrics endpoints. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. 
+ """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._transformer = PromqlTransformer(self._charm) + events = self._charm.on[relation_name] + self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) + self.framework.observe( + events.relation_departed, self._on_metrics_provider_relation_departed + ) + + def _on_metrics_provider_relation_changed(self, event): + """Handle changes with related metrics providers. + + Anytime there are changes in relations between Prometheus + and metrics provider charms the Prometheus charm is informed, + through a `TargetsChangedEvent` event. The Prometheus charm can + then choose to update its scrape configuration. + + Args: + event: a `CharmEvent` in response to which the Prometheus + charm must update its scrape configuration. + """ + rel_id = event.relation.id + + self.on.targets_changed.emit(relation_id=rel_id) + + def _on_metrics_provider_relation_departed(self, event): + """Update job config when a metrics provider departs. + + When a metrics provider departs the Prometheus charm is informed + through a `TargetsChangedEvent` event so that it can update its + scrape configuration to ensure that the departed metrics provider + is removed from the list of scrape jobs and + + Args: + event: a `CharmEvent` that indicates a metrics provider + unit has departed. + """ + rel_id = event.relation.id + self.on.targets_changed.emit(relation_id=rel_id) + + def jobs(self) -> list: + """Fetch the list of scrape jobs. + + Returns: + A list consisting of all the static scrape configurations + for each related `MetricsEndpointProvider` that has specified + its scrape targets. + """ + scrape_jobs = [] + + for relation in self._charm.model.relations[self._relation_name]: + static_scrape_jobs = self._static_scrape_config(relation) + if static_scrape_jobs: + scrape_jobs.extend(static_scrape_jobs) + + return scrape_jobs + + def alerts(self) -> dict: + """Fetch alerts for all relations. + + A Prometheus alert rules file consists of a list of "groups". Each + group consists of a list of alerts (`rules`) that are sequentially + executed. This method returns all the alert rules provided by each + related metrics provider charm. These rules may be used to generate a + separate alert rules file for each relation since the returned list + of alert groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + alert rule related metadata such as the Juju model, model UUID and the + application name from where the alert rule originates. Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of alert rule groups are + written. For each relation, the structure of data returned is a dictionary + representation of a standard prometheus rules file: + + {"groups": [{"name": ...}, ...]} + + per official prometheus documentation + https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + + The value of the `groups` key is such that it may be used to generate + a Prometheus alert rules file directly using `yaml.dump` but the + `groups` key itself must be included as this is required by Prometheus. 
+ + For example the list of alert rule groups returned by this method may + be written into files consumed by Prometheus as follows + + ``` + for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): + filename = "juju_" + topology_identifier + ".rules" + path = os.path.join(PROMETHEUS_RULES_DIR, filename) + rules = yaml.dump(alert_rule_groups) + container.push(path, rules, make_dirs=True) + ``` + + Returns: + A dictionary mapping the Juju topology identifier of the source charm to + its list of alert rule groups. + """ + alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units or not relation.app: + continue + + alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) + if not alert_rules: + continue + + identifier = None + try: + scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) + identifier = ProviderTopology.from_relation_data(scrape_metadata).identifier + alerts[identifier] = self._transformer.apply_label_matchers(alert_rules) + + except KeyError as e: + logger.debug( + "Relation %s has no 'scrape_metadata': %s", + relation.id, + e, + ) + identifier = self._get_identifier_by_alert_rules(alert_rules) + + if not identifier: + logger.error( + "Alert rules were found but no usable group or identifier was present" + ) + continue + alerts[identifier] = alert_rules + + return alerts + + def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: + """Determine an appropriate dict key for alert rules. + + The key is used as the filename when writing alerts to disk, so the structure + and uniqueness is important. + + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + logger.debug("No alert groups were found in relation data") + return None + + # Construct an ID based on what's in the alert rules if they have labels + for group in rules["groups"]: + try: + labels = group["rules"][0]["labels"] + identifier = "{}_{}_{}".format( + labels["juju_model"], + labels["juju_model_uuid"], + labels["juju_application"], + ) + return identifier + except KeyError: + logger.debug("Alert rules were found but no usable labels were present") + continue + + logger.warning( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." + ) + try: + for group in rules["groups"]: + return group["name"] + except KeyError: + logger.debug("No group name was found to use as identifier") + + return None + + def _static_scrape_config(self, relation) -> list: + """Generate the static scrape configuration for a single relation. + + If the relation data includes `scrape_metadata` then the value + of this key is used to annotate the scrape jobs with Juju + Topology labels before returning them. + + Args: + relation: an `ops.model.Relation` object whose static + scrape configuration is required. + + Returns: + A list (possibly empty) of scrape jobs. Each job is a + valid Prometheus scrape configuration for that job, + represented as a Python dictionary. 
+ """ + if not relation.units: + return [] + + scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + + if not scrape_jobs: + return [] + + scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) + + if not scrape_metadata: + return scrape_jobs + + job_name_prefix = ProviderTopology.from_relation_data(scrape_metadata).scrape_identifier + + hosts = self._relation_hosts(relation) + + labeled_job_configs = [] + for job in scrape_jobs: + config = self._labeled_static_job_config( + _sanitize_scrape_configuration(job), + job_name_prefix, + hosts, + scrape_metadata, + ) + labeled_job_configs.append(config) + + return labeled_job_configs + + def _relation_hosts(self, relation) -> dict: + """Fetch unit names and address of all metrics provider units for a single relation. + + Args: + relation: An `ops.model.Relation` object for which the unit name to + address mapping is required. + + Returns: + A dictionary that maps unit names to unit addresses for + the specified relation. + """ + hosts = {} + for unit in relation.units: + # TODO deprecate and remove unit.name + unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name + # TODO deprecate and remove "prometheus_scrape_host" + unit_address = relation.data[unit].get( + "prometheus_scrape_unit_address" + ) or relation.data[unit].get("prometheus_scrape_host") + if unit_name and unit_address: + hosts.update({unit_name: unit_address}) + + return hosts + + def _labeled_static_job_config(self, job, job_name_prefix, hosts, scrape_metadata) -> dict: + """Construct labeled job configuration for a single job. + + Args: + + job: a dictionary representing the job configuration as obtained from + `MetricsEndpointProvider` over relation data. + job_name_prefix: a string that may either be used as the + job name if the job has no associated name or used as a prefix for + the job if it does have a job name. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. + scrape_metadata: scrape configuration metadata obtained + from `MetricsEndpointProvider` from the same relation for + which this job configuration is being constructed. + + Returns: + A dictionary representing a Prometheus job configuration + for a single job. 
+ """ + name = job.get("job_name") + job_name = "{}_{}".format(job_name_prefix, name) if name else job_name_prefix + + labeled_job = job.copy() + labeled_job["job_name"] = job_name + + static_configs = job.get("static_configs") + labeled_job["static_configs"] = [] + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + instance_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + # label all static configs in the Prometheus job + # labeling inserts Juju topology information and + # sets a relable config for instance labels + for static_config in static_configs: + labels = static_config.get("labels", {}) if static_configs else {} + all_targets = static_config.get("targets", []) + + # split all targets into those which will have unit labels + # and those which will not + ports = [] + unitless_targets = [] + for target in all_targets: + host, port = target.split(":") + if host.strip() == "*": + ports.append(port.strip()) + else: + unitless_targets.append(target) + + # label scrape targets that do not have unit labels + if unitless_targets: + unitless_config = self._labeled_unitless_config( + unitless_targets, labels, scrape_metadata + ) + labeled_job["static_configs"].append(unitless_config) + + # label scrape targets that do have unit labels + for host_name, host_address in hosts.items(): + static_config = self._labeled_unit_config( + host_name, host_address, ports, labels, scrape_metadata + ) + labeled_job["static_configs"].append(static_config) + if "juju_unit" not in instance_relabel_config["source_labels"]: + instance_relabel_config["source_labels"].append("juju_unit") # type: ignore + + # ensure topology relabeling of instance label is last in order of relabelings + relabel_configs = job.get("relabel_configs", []) + relabel_configs.append(instance_relabel_config) + labeled_job["relabel_configs"] = relabel_configs + + return labeled_job + + def _set_juju_labels(self, labels, scrape_metadata) -> dict: + """Create a copy of metric labels with Juju topology information. + + Args: + labels: a dictionary containing Prometheus metric labels. + scrape_metadata: scrape related metadata provided by + `MetricsEndpointProvider`. + + Returns: + a copy of the `labels` dictionary augmented with Juju + topology information with the exception of unit name. + """ + juju_labels = labels.copy() # deep copy not needed + juju_labels.update( + ProviderTopology.from_relation_data(scrape_metadata).as_promql_label_dict() + ) + + return juju_labels + + def _labeled_unitless_config(self, targets, labels, scrape_metadata) -> dict: + """Static scrape configuration for fully qualified host addresses. + + Fully qualified hosts are those scrape targets for which the + address are specified by the `MetricsEndpointProvider` as part + of the scrape job specification set in application relation data. + The address specified need not belong to any unit of the + `MetricsEndpointProvider` charm. As a result there is no reliable + way to determine the name (Juju topology unit name) for such a + target. + + Args: + targets: a list of addresses of fully qualified hosts. + labels: labels specified by `MetricsEndpointProvider` clients + which are associated with `targets`. + scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. 
+ + Returns: + A dictionary containing the static scrape configuration + for a list of fully qualified hosts. + """ + juju_labels = self._set_juju_labels(labels, scrape_metadata) + unitless_config = {"targets": targets, "labels": juju_labels} + return unitless_config + + def _labeled_unit_config( + self, unit_name, host_address, ports, labels, scrape_metadata + ) -> dict: + """Static scrape configuration for a wildcard host. + + Wildcard hosts are those scrape targets whose name (Juju unit + name) and address (unit IP address) is set into unit relation + data by the `MetricsEndpointProvider` charm, which sets this + data for ALL its units. + + Args: + unit_name: a string representing the unit name of the wildcard host. + host_address: a string representing the address of the wildcard host. + ports: list of ports on which this wildcard host exposes its metrics. + labels: a dictionary of labels provided by + `MetricsEndpointProvider` intended to be associated with + this wildcard host. + scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. + + Returns: + A dictionary containing the static scrape configuration + for a single wildcard host. + """ + juju_labels = self._set_juju_labels(labels, scrape_metadata) + + juju_labels["juju_unit"] = unit_name + + static_config = {"labels": juju_labels} + + if ports: + targets = [] + for port in ports: + targets.append("{}:{}".format(host_address, port)) + static_config["targets"] = targets # type: ignore + else: + static_config["targets"] = [host_address] # type: ignore + + return static_config + + +def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: + """Resolve the provided path items against the directory of the main file. + + Look up the directory of the `main.py` file being executed. This is normally + going to be the charm.py file of the charm including this library. Then, resolve + the provided path elements and, if the result path exists and is a directory, + return its absolute path; otherwise, raise en exception. + + Raises: + InvalidAlertRulePathError, if the path does not exist or is not a directory. + """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not alerts_dir_path.exists(): + raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist") + if not alerts_dir_path.is_dir(): + raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory") + + return str(alerts_dir_path) + + +class MetricsEndpointProvider(Object): + """A metrics endpoint for Prometheus.""" + + def __init__( + self, + charm, + relation_name: str = DEFAULT_RELATION_NAME, + jobs=None, + alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, + ): + """Construct a metrics provider for a Prometheus charm. + + If your charm exposes a Prometheus metrics endpoint, the + `MetricsEndpointProvider` object enables your charm to easily + communicate how to reach that metrics endpoint. 
+ + By default, a charm instantiating this object has the metrics + endpoints of each of its units scraped by the related Prometheus + charms. The scraped metrics are automatically tagged by the + Prometheus charms with Juju topology data via the + `juju_model_name`, `juju_model_uuid`, `juju_application_name` + and `juju_unit` labels. To support such tagging `MetricsEndpointProvider` + automatically forwards scrape metadata to a `MetricsEndpointConsumer` + (Prometheus charm). + + Scrape targets provided by `MetricsEndpointProvider` can be + customized when instantiating this object. For example in the + case of a charm exposing the metrics endpoint for each of its + units on port 8080 and the `/metrics` path, the + `MetricsEndpointProvider` can be instantiated as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "static_configs": [{"targets": ["*:8080"]}], + }]) + + The notation `*:` means "scrape each unit of this charm on port + ``. + + In case the metrics endpoints are not on the standard `/metrics` path, + a custom path can be specified as follows: + + self.metrics_endpoint_provider = MetricsEndpointProvider( + self, + jobs=[{ + "metrics_path": "/my/strange/metrics/path", + "static_configs": [{"targets": ["*:8080"]}], + }]) + + Note how the `jobs` argument is a list: this allows you to expose multiple + combinations of paths "metrics_path" and "static_configs" in case your charm + exposes multiple endpoints, which could happen, for example, when you have + multiple workload containers, with applications in each needing to be scraped. + The structure of the objects in the `jobs` list is one-to-one with the + `scrape_config` configuration item of Prometheus' own configuration (see + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + ), but with only a subset of the fields allowed. The permitted fields are + listed in `ALLOWED_KEYS` object in this charm library module. + + It is also possible to specify alert rules. By default, this library will look + into the `/prometheus_alert_rules`, which in a standard charm + layouts resolves to `src/prometheus_alert_rules`. Each alert rule goes into a + separate `*.rule` file. If the syntax of a rule is invalid, + the `MetricsEndpointProvider` logs an error and does not load the particular + rule. + + To avoid false positives and negatives in the evaluation of alert rules, + all ingested alert rule expressions are automatically qualified using Juju + Topology filters. This ensures that alert rules provided by your charm, trigger + alerts based only on data scrapped from your charm. For example an alert rule + such as the following + + alert: UnitUnavailable + expr: up < 1 + for: 0m + + will be automatically transformed into something along the lines of the following + + alert: UnitUnavailable + expr: up{juju_model=, juju_model_uuid=, juju_application=} < 1 + for: 0m + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointProvider` object. Typically this is + `self` in the instantiating class. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that provide metrics endpoints. + jobs: an optional list of dictionaries where each + dictionary represents the Prometheus scrape + configuration for a single job. 
When not provided, a + default scrape configuration is provided for the + `/metrics` endpoint polling all units of the charm on port `80` + using the `MetricsEndpointProvider` object. + alert_rules_path: an optional path for the location of alert rules + files. Defaults to "./prometheus_alert_rules", + resolved relative to the directory hosting the charm entry file. + The alert rules are automatically updated on charm upgrade. + refresh_event: an optional bound event or list of bound events which + will be observed to re-set scrape job data (IP address and others) + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.provides` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + self.topology = ProviderTopology.from_charm(charm) + + self._charm = charm + self._alert_rules_path = alert_rules_path + self._relation_name = relation_name + # sanitize job configurations to the supported subset of parameters + jobs = [] if jobs is None else jobs + self._jobs = [_sanitize_scrape_configuration(job) for job in jobs] + + events = self._charm.on[self._relation_name] + self.framework.observe(events.relation_joined, self._set_scrape_job_spec) + self.framework.observe(events.relation_changed, self._set_scrape_job_spec) + + if not refresh_event: + if len(self._charm.meta.containers) == 1: + if "kubernetes" in self._charm.meta.series: + # This is a podspec charm + refresh_event = [self._charm.on.update_status] + else: + # This is a sidecar/pebble charm + container = list(self._charm.meta.containers.values())[0] + refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] + else: + logger.warning( + "%d containers are present in metadata.yaml and " + "refresh_event was not specified. Defaulting to update_status. " + "Metrics IP may not be set in a timely fashion.", + len(self._charm.meta.containers), + ) + refresh_event = [self._charm.on.update_status] + + else: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + for ev in refresh_event: + self.framework.observe(ev, self._set_unit_ip) + + self.framework.observe(self._charm.on.upgrade_charm, self._set_scrape_job_spec) + + # If there is no leader during relation_joined we will still need to set alert rules. + self.framework.observe(self._charm.on.leader_elected, self._set_scrape_job_spec) + + def _set_scrape_job_spec(self, event): + """Ensure scrape target information is made available to prometheus. + + When a metrics provider charm is related to a prometheus charm, the + metrics provider sets specification and metadata related to its own + scrape configuration. This information is set using Juju application + data. In addition each of the consumer units also sets its own + host address in Juju unit relation data. 
+ """ + self._set_unit_ip(event) + + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(topology=self.topology) + alert_rules.add_path(self._alert_rules_path, recursive=True) + alert_rules_as_dict = alert_rules.as_dict() + + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) + + if alert_rules_as_dict: + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + + def _set_unit_ip(self, _): + """Set unit host address. + + Each time a metrics provider charm container is restarted it updates its own + host address in the unit relation data for the prometheus charm. + + The only argument specified is an event and it ignored. this is for expediency + to be able to use this method as an event handler, although no access to the + event is actually needed. + """ + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = socket.getfqdn() + relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( + self._charm.model.unit.name + ) + + def _is_valid_unit_address(self, address: str) -> bool: + """Validate a unit address. + + At present only IP address validation is supported, but + this may be extended to DNS addresses also, as needed. + + Args: + address: a string representing a unit address + """ + try: + _ = ipaddress.ip_address(address) + except ValueError: + return False + + return True + + @property + def _scrape_jobs(self) -> list: + """Fetch list of scrape jobs. + + Returns: + A list of dictionaries, where each dictionary specifies a + single scrape job for Prometheus. + """ + return self._jobs if self._jobs else [DEFAULT_JOB] + + @property + def _scrape_metadata(self) -> dict: + """Generate scrape metadata. + + Returns: + Scrape configuration metadata for this metrics provider charm. + """ + return self.topology.as_dict() + + +class PrometheusRulesProvider(Object): + """Forward rules to Prometheus. + + This object may be used to forward rules to Prometheus. At present it only supports + forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which + is used for forwarding both scrape targets and associated alert rules. This object + is typically used when there is a desire to forward rules that apply globally (across + all deployed charms and units) rather than to a single charm. All rule files are + forwarded using the same 'prometheus_scrape' interface that is also used by + `MetricsEndpointProvider`. + + Args: + charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. + relation_name: Name of the relation in `metadata.yaml` that + has the `prometheus_scrape` interface. + dir_path: Root directory for the collection of rule files. + recursive: Whether or not to scan for rule files recursively. 
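+
+    A minimal usage sketch (the relation and directory names below are
+    illustrative), assuming the charm declares a `prometheus_scrape`
+    relation named "metrics-endpoint" in its metadata:
+
+        self.rules_provider = PrometheusRulesProvider(
+            self,
+            relation_name="metrics-endpoint",
+            dir_path="src/prometheus_alert_rules",
+            recursive=True,
+        )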
+ """ + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recursive=True, + ): + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self.topology = ProviderTopology.from_charm(charm) + self._recursive = recursive + + try: + dir_path = _resolve_dir_against_charm_path(charm, dir_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + self.dir_path = dir_path + + events = self._charm.on[self._relation_name] + event_sources = [ + events.relation_joined, + events.relation_changed, + self._charm.on.leader_elected, + self._charm.on.upgrade_charm, + ] + + for event_source in event_sources: + self.framework.observe(event_source, self._update_relation_data) + + def _reinitialize_alert_rules(self): + """Reloads alert rules and updates all relations.""" + self._update_relation_data(None) + + def _update_relation_data(self, _): + """Update application relation data with alert rules for all relations.""" + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules() + alert_rules.add_path(self.dir_path, recursive=self._recursive) + alert_rules_as_dict = alert_rules.as_dict() + + logger.info("Updating relation data with rule files from disk") + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["alert_rules"] = json.dumps( + alert_rules_as_dict, + sort_keys=True, # sort, to prevent unnecessary relation_changed events + ) + + +class MetricsEndpointAggregator(Object): + """Aggregate metrics from multiple scrape targets. + + `MetricsEndpointAggregator` collects scrape target information from one + or more related charms and forwards this to a `MetricsEndpointConsumer` + charm, which may be in a different Juju model. However it is + essential that `MetricsEndpointAggregator` itself resides in the same + model as its scrape targets, as this is currently the only way to + ensure in Juju that the `MetricsEndpointAggregator` will be able to + determine the model name and uuid of the scrape targets. + + `MetricsEndpointAggregator` should be used in place of + `MetricsEndpointProvider` in the following two use cases: + + 1. Integrating one or more scrape targets that do not support the + `prometheus_scrape` interface. + + 2. Integrating one or more scrape targets through cross model + relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s) + may also be used for the purpose of supporting cross model + relations. + + Using `MetricsEndpointAggregator` to build a Prometheus charm client + only requires instantiating it. Instantiating + `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except + that it requires specifying the names of three relations: the + relation with scrape targets, the relation for alert rules, and + that with the Prometheus charms. For example + + ```python + self._aggregator = MetricsEndpointAggregator( + self, + { + "prometheus": "monitoring", + "scrape_target": "prometheus-target", + "alert_rules": "prometheus-rules" + } + ) + ``` + + `MetricsEndpointAggregator` assumes that each unit of a scrape target + sets in its unit-level relation data two entries with keys + "hostname" and "port". 
If it is required to integrate with charms
+    that do not honor these assumptions, it is always possible to
+    derive from `MetricsEndpointAggregator`, overriding the `_get_targets()`
+    method, which is responsible for aggregating the unit name, host
+    address ("hostname") and port of the scrape target.
+
+    `MetricsEndpointAggregator` also assumes that each unit of a
+    scrape target sets in its unit-level relation data a key named
+    "groups". The value of this key is expected to be the string
+    representation of a list of Prometheus alert rules in YAML format.
+    An example of a single such alert rule is
+
+    ```yaml
+    - alert: HighRequestLatency
+      expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
+      for: 10m
+      labels:
+        severity: page
+      annotations:
+        summary: High request latency
+    ```
+
+    Once again, if it is required to integrate with charms that do not
+    honour these assumptions about alert rules, then an object derived
+    from `MetricsEndpointAggregator` may be used by overriding the
+    `_get_alert_rules()` method.
+
+    `MetricsEndpointAggregator` ensures that Prometheus scrape job
+    specifications and alert rules are annotated with Juju topology
+    information, just like `MetricsEndpointProvider` and
+    `MetricsEndpointConsumer` do.
+
+    By default `MetricsEndpointAggregator` ensures that Prometheus
+    "instance" labels refer to Juju topology. This ensures that
+    instance labels are stable over unit recreation. While it is not
+    advisable to change this option, if required it can be done by
+    setting the "relabel_instance" keyword argument to `False` when
+    constructing an aggregator object.
+    """
+
+    def __init__(self, charm, relation_names, relabel_instance=True):
+        """Construct a `MetricsEndpointAggregator`.
+
+        Args:
+            charm: a `CharmBase` object that manages this
+                `MetricsEndpointAggregator` object. Typically this is
+                `self` in the instantiating class.
+            relation_names: a dictionary with three keys. The values
+                of the "scrape_target" and "alert_rules" keys are
+                the relation names over which scrape job and alert rule
+                information is gathered by this `MetricsEndpointAggregator`,
+                and the value of the "prometheus" key is the name of
+                the relation with a `MetricsEndpointConsumer` such as
+                the Prometheus charm.
+            relabel_instance: A boolean flag indicating if Prometheus
+                scrape job "instance" labels must refer to Juju Topology.
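+
+        For example, to opt out of instance relabeling (the relation names
+        here simply mirror the example in the class docstring):
+
+            self._aggregator = MetricsEndpointAggregator(
+                self,
+                {
+                    "prometheus": "monitoring",
+                    "scrape_target": "prometheus-target",
+                    "alert_rules": "prometheus-rules",
+                },
+                relabel_instance=False,
+            )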
+        """
+        super().__init__(charm, relation_names["prometheus"])
+
+        self._charm = charm
+        self._target_relation = relation_names["scrape_target"]
+        self._prometheus_relation = relation_names["prometheus"]
+        self._alert_rules_relation = relation_names["alert_rules"]
+        self._relabel_instance = relabel_instance
+
+        # manage Prometheus charm relation events
+        prometheus_events = self._charm.on[self._prometheus_relation]
+        self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data)
+
+        # manage list of Prometheus scrape jobs from related scrape targets
+        target_events = self._charm.on[self._target_relation]
+        self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs)
+        self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs)
+
+        # manage alert rules for Prometheus from related scrape targets
+        alert_rule_events = self._charm.on[self._alert_rules_relation]
+        self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules)
+        self.framework.observe(alert_rule_events.relation_departed, self._remove_alert_rules)
+
+    def _set_prometheus_data(self, event):
+        """Ensure every new Prometheus instance is updated.
+
+        Any time a new Prometheus unit joins the relation with
+        `MetricsEndpointAggregator`, that Prometheus unit is provided
+        with the complete set of existing scrape jobs and alert rules.
+        """
+        jobs = []  # list of scrape jobs, one per relation
+        for relation in self.model.relations[self._target_relation]:
+            targets = self._get_targets(relation)
+            if targets and relation.app:
+                jobs.append(self._static_scrape_job(targets, relation.app.name))
+
+        groups = []  # list of alert rule groups, one group per relation
+        for relation in self.model.relations[self._alert_rules_relation]:
+            unit_rules = self._get_alert_rules(relation)
+            if unit_rules and relation.app:
+                appname = relation.app.name
+                rules = self._label_alert_rules(unit_rules, appname)
+                group = {"name": self._group_name(appname), "rules": rules}
+                groups.append(group)
+
+        event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+        event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})
+
+    def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated. Additionally, if this method is called manually, it
+        does the same.
+
+        Args:
+            targets: a `dict` containing target information
+            app_name: a `str` identifying the application
+        """
+        # new scrape job for the relation that has changed
+        updated_job = self._static_scrape_job(targets, app_name, **kwargs)
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
+            # list of scrape jobs that have not changed
+            jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]]
+            jobs.append(updated_job)
+            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+
+    def _update_prometheus_jobs(self, event):
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated.
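+
+        For example, a scrape target unit that publishes something like
+        (values are illustrative only)
+
+            {"hostname": "10.1.2.3", "port": "9100"}
+
+        in its unit relation data will have the corresponding static config
+        in its application's scrape job refreshed by this handler.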
+ """ + targets = self._get_targets(event.relation) + if not targets: + return + + # new scrape job for the relation that has changed + updated_job = self._static_scrape_job(targets, event.relation.app.name) + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + # list of scrape jobs that have not changed + jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] + jobs.append(updated_job) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + def _remove_prometheus_jobs(self, event): + """Remove scrape jobs when a target departs. + + Any time a scrape target departs, any Prometheus scrape job + associated with that specific scrape target is removed. + """ + job_name = self._job_name(event.relation.app.name) + unit_name = event.unit.name + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + if not jobs: + continue + + changed_job = [j for j in jobs if j.get("job_name") == job_name] + if not changed_job: + continue + changed_job = changed_job[0] + + # list of scrape jobs that have not changed + jobs = [job for job in jobs if job.get("job_name") != job_name] + + # list of scrape jobs for units of the same application that still exist + configs_kept = [ + config + for config in changed_job["static_configs"] # type: ignore + if config.get("labels", {}).get("juju_unit") != unit_name + ] + + if configs_kept: + changed_job["static_configs"] = configs_kept # type: ignore + jobs.append(changed_job) + + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + def _update_alert_rules(self, event): + """Update alert rules in response to scrape target changes. + + When there is any change in alert rule relation data for any + scrape target, the list of alert rules for that specific + target is updated. + """ + unit_rules = self._get_alert_rules(event.relation) + if not unit_rules: + return + + appname = event.relation.app.name + rules = self._label_alert_rules(unit_rules, appname) + # the alert rule group that has changed + updated_group = {"name": self._group_name(appname), "rules": rules} + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + # list of alert rule groups that have not changed + groups = [group for group in groups if updated_group["name"] != group["name"]] + groups.append(updated_group) + relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + + def _remove_alert_rules(self, event): + """Remove alert rules for departed targets. + + Any time a scrape target departs any alert rules associated + with that specific scrape target is removed. 
+        """
+        group_name = self._group_name(event.relation.app.name)
+        unit_name = event.unit.name
+
+        for relation in self.model.relations[self._prometheus_relation]:
+            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
+            if not alert_rules:
+                continue
+
+            groups = alert_rules.get("groups", [])
+            if not groups:
+                continue
+
+            changed_group = [group for group in groups if group["name"] == group_name]
+            if not changed_group:
+                continue
+            changed_group = changed_group[0]
+
+            # list of alert rule groups that have not changed
+            groups = [group for group in groups if group["name"] != group_name]
+
+            # list of alert rules not associated with departing unit
+            rules_kept = [
+                rule
+                for rule in changed_group.get("rules")  # type: ignore
+                if rule.get("labels").get("juju_unit") != unit_name
+            ]
+
+            if rules_kept:
+                changed_group["rules"] = rules_kept  # type: ignore
+                groups.append(changed_group)
+
+            relation.data[self._charm.app]["alert_rules"] = (
+                json.dumps({"groups": groups}) if groups else "{}"
+            )
+
+    def _get_targets(self, relation) -> dict:
+        """Fetch scrape targets for a relation.
+
+        Scrape target information is returned for each unit in the
+        relation. This information contains the unit name, the network
+        hostname (or address) for that unit, and the port on which a
+        metrics endpoint is exposed in that unit.
+
+        Args:
+            relation: an `ops.model.Relation` object for which scrape
+                targets are required.
+
+        Returns:
+            a dictionary whose keys are names of the units in the
+            relation. The value associated with each key is itself
+            a dictionary of the form
+            ```
+            {"hostname": hostname, "port": port}
+            ```
+        """
+        targets = {}
+        for unit in relation.units:
+            port = relation.data[unit].get("port", 80)
+            hostname = relation.data[unit].get("hostname")
+            if hostname:
+                targets.update({unit.name: {"hostname": hostname, "port": port}})
+
+        return targets
+
+    def _get_alert_rules(self, relation) -> dict:
+        """Fetch alert rules for a relation.
+
+        Each unit of the related scrape target may have its own
+        associated alert rules. Alert rules for all units are returned
+        indexed by unit name.
+
+        Args:
+            relation: an `ops.model.Relation` object for which alert
+                rules are required.
+
+        Returns:
+            a dictionary whose keys are names of the units in the
+            relation. The value associated with each key is a list
+            of alert rules. Each rule is in dictionary format, and
+            each such "rule dictionary" corresponds to a single
+            Prometheus alert rule.
+        """
+        rules = {}
+        for unit in relation.units:
+            unit_rules = yaml.safe_load(relation.data[unit].get("groups", ""))
+            if unit_rules:
+                rules.update({unit.name: unit_rules})
+
+        return rules
+
+    def _job_name(self, appname) -> str:
+        """Construct a scrape job name.
+
+        Each relation has its own unique scrape job name. All units in
+        the relation are scraped as part of the same scrape job.
+
+        Args:
+            appname: string name of a related application.
+
+        Returns:
+            a string Prometheus scrape job name for the application.
+        """
+        return "juju_{}_{}_{}_prometheus_scrape".format(
+            self.model.name, self.model.uuid[:7], appname
+        )
+
+    def _group_name(self, appname) -> str:
+        """Construct name for an alert rule group.
+
+        Each unit in a relation may define its own alert rules. All
+        rules, for all units in a relation, are grouped together and
+        given a single alert rule group name.
+
+        Args:
+            appname: string name of a related application.
+
+        Returns:
+            a string Prometheus alert rules group name for the application.
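+
+        For example (with purely illustrative names), a model "lma" whose
+        uuid starts with "1234567" and an application "alertmanager" would
+        yield the group name "juju_lma_1234567_alertmanager_alert_rules",
+        per the format string below.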
+        """
+        return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname)
+
+    def _label_alert_rules(self, unit_rules, appname) -> list:
+        """Apply juju topology labels to alert rules.
+
+        Args:
+            unit_rules: a dictionary mapping unit names to lists of alert
+                rules, where each rule is in dictionary format.
+            appname: a string name of the application to which the
+                alert rules belong.
+
+        Returns:
+            a list of alert rules with Juju topology labels.
+        """
+        labeled_rules = []
+        for unit_name, rules in unit_rules.items():
+            for rule in rules:
+                rule["labels"].update(
+                    AggregatorTopology.create(
+                        self.model.name, self.model.uuid, appname, unit_name
+                    ).as_promql_label_dict()
+                )
+                labeled_rules.append(rule)
+
+        return labeled_rules
+
+    def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
+        """Construct a static scrape job for an application.
+
+        Args:
+            targets: a dictionary providing hostname and port for all
+                scrape targets. The keys of this dictionary are unit
+                names. The value corresponding to each key is itself
+                a dictionary with keys "hostname" and "port".
+            application_name: a string name of the application for
+                which this static scrape job is being constructed.
+
+        Returns:
+            A dictionary corresponding to a Prometheus static scrape
+            job configuration for one application. The returned
+            dictionary may be transformed into YAML and appended to
+            any existing list of Prometheus static configs.
+        """
+        juju_model = self.model.name
+        juju_model_uuid = self.model.uuid
+        job = {
+            "job_name": self._job_name(application_name),
+            "static_configs": [
+                {
+                    "targets": ["{}:{}".format(target["hostname"], target["port"])],
+                    "labels": {
+                        "juju_model": juju_model,
+                        "juju_model_uuid": juju_model_uuid,
+                        "juju_application": application_name,
+                        "juju_unit": unit_name,
+                        "host": target["hostname"],
+                    },
+                }
+                for unit_name, target in targets.items()
+            ],
+            "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []),
+        }
+        job.update(kwargs.get("updates", {}))
+
+        return job
+
+    @property
+    def _relabel_configs(self) -> list:
+        """Create Juju topology relabeling configuration.
+
+        Using Juju topology for instance labels ensures that these
+        labels are stable across unit recreation.
+
+        Returns:
+            a list of Prometheus relabeling configurations. Each item in
+            this list is one relabel configuration.
+ """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + +class PromqlTransformer: + """Uses promql-transform to inject label matchers into alert rule expressions.""" + + _path = None + _disabled = False + + @property + def path(self): + """Lazy lookup of the path of promql-transform.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_transformer_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def __init__(self, charm): + self._charm = charm + + def apply_label_matchers(self, rules): + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self._apply_label_matcher(rule["expr"], topology) + return rules + + def _apply_label_matcher(self, expression, topology): + if not topology: + return expression + if not self.path: + logger.debug( + "`promql-transform` unavailable. leaving expression unchanged: %s", expression + ) + return expression + args = [str(self.path)] + args.extend( + ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] + ) + + args.extend(["{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except Exception as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_transformer_path(self) -> Optional[Path]: + arch = platform.processor() + arch = "amd64" if arch == "x86_64" else arch + res = "promql-transform-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate promql transform at: "{}"'.format(res)) + return None + + def _exec(self, cmd): + result = subprocess.run(cmd, check=False, stdout=subprocess.PIPE) + output = result.stdout.decode("utf-8").strip() + return output diff --git a/metadata.yaml b/metadata.yaml index 29a0d065..1d0098f9 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -39,6 +39,10 @@ provides: # assumed network type: private karma-dashboard: interface: karma_dashboard + self-metrics-endpoint: + interface: prometheus_scrape + grafana-dashboard: + interface: grafana_dashboard peers: replicas: diff --git a/pyproject.toml b/pyproject.toml index 931abeb2..5a61178b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,9 +56,11 @@ module = ["ops.*", "lightkube.*", "git.*", "pytest_operator.*", "validators.*"] ignore_missing_imports = true [[tool.mypy.overrides]] -module = ["charms.observability_libs.*"] +module = ["charms.grafana_k8s.*", "charms.observability_libs.*"] +follow_imports = "silent" warn_unused_ignores = false [tool.pytest.ini_options] minversion = "6.0" log_cli_level = "INFO" +asyncio_mode = "auto" \ No newline at end of 
file diff --git a/src/charm.py b/src/charm.py index f4b5996f..aa82dd9b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,12 +7,14 @@ import hashlib import logging import socket -from typing import List, cast +from typing import List, Optional, cast import yaml from charms.alertmanager_k8s.v0.alertmanager_dispatch import AlertmanagerProvider +from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider from charms.karma_k8s.v0.karma_dashboard import KarmaProvider from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from ops.charm import ActionEvent, CharmBase from ops.framework import StoredState from ops.main import main @@ -81,6 +83,14 @@ def __init__(self, *args): ], ) + # Self-monitoring + self._scraping = MetricsEndpointProvider( + self, + relation_name="self-metrics-endpoint", + jobs=[{"static_configs": [{"targets": [f"*:{self._api_port}"]}]}], + ) + self.grafana_dashboard_provider = GrafanaDashboardProvider(charm=self) + self.container = self.unit.get_container(self._container_name) # Core lifecycle events @@ -120,7 +130,7 @@ def api_port(self) -> int: return self._api_port @property - def peer_relation(self) -> Relation: + def peer_relation(self) -> Optional["Relation"]: """Helper function for obtaining the peer relation object. Returns: peer relation object @@ -366,7 +376,7 @@ def _common_exit_hook(self) -> None: service := self.container.get_service(self._service_name) ) and service.is_running() - num_peers = len(self.peer_relation.units) + num_peers = len(rel.units) if (rel := self.peer_relation) else 0 if layer_changed and ( not service_running or (num_peers > 0 and not self._stored.launched_with_peers) diff --git a/src/grafana_dashboards/alertmanager_rev4.json.tmpl b/src/grafana_dashboards/alertmanager_rev4.json.tmpl new file mode 100644 index 00000000..194e8173 --- /dev/null +++ b/src/grafana_dashboards/alertmanager_rev4.json.tmpl @@ -0,0 +1,11162 @@ +{ + "annotations": { + "list": [ + ] + }, + "description": "Dashboard for the Alertmanager Charmed Operator, powered by Juju.", + "editable": true, + "gnetId": 9578, + "graphTooltip": 1, + "id": null, + "iteration": 1555334727804, + "links": [ + { + "icon": "doc", + "tags": [], + "targetBlank": true, + "title": "Docs", + "tooltip": "Official documentation of Alertmanager", + "type": "link", + "url": "https://prometheus.io/docs/alerting/alertmanager/" + }, + { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "tooltip": "Alertmanager sources on GitHub", + "type": "link", + "url": "https://github.com/prometheus/alertmanager" + }, + { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Twitter", + "tooltip": "Twitter account with prometheus related info", + "type": "link", + "url": "https://twitter.com/PrometheusIO" + }, + { + "icon": "question", + "tags": [], + "targetBlank": true, + "title": "Mailing list", + "tooltip": "Prometheus users mailing list", + "type": "link", + "url": "https://groups.google.com/forum/#!forum/prometheus-users" + }, + { + "icon": "question", + "tags": [], + "targetBlank": true, + "title": "IRC", + "tooltip": "Join IRC using Riot", + "type": "link", + "url": "https://riot.im/app/#/room/#prometheus:matrix.org" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 36, + "panels": [], + "title": "General info", + "type": "row" + }, + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "decimals": 0, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(244, 213, 152, 0.12)", + "full": false, + "lineColor": "#f4d598", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(alertmanager_build_info{instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Number of instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "columns": [], + "datasource": "${prometheusds}", + "description": "Table containing list of Alertmanager instances showing it's version, up time, last reload time and if it was successful.", + "fontSize": "90%", + "gridPos": { + "h": 5, + "w": 9, + "x": 3, + "y": 1 + }, + "id": 26, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 13, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Instance", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "Version", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "version", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "Up time", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [], + "type": "number", + "unit": "s" + }, + { + "alias": "Last reload", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [], + "type": "number", + "unit": "s" + }, + { + "alias": "Last reload successful", + "colorMode": "cell", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [ + "0", + "1" + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + 
"rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "expr": "time() - (alertmanager_build_info{instance=~\"$instance\"} * on (instance, cluster) group_left process_start_time_seconds{instance=~\"$instance\"})", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + }, + { + "expr": "time() - alertmanager_config_last_reload_success_timestamp_seconds{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "B" + }, + { + "expr": "alertmanager_config_last_reload_successful{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "C" + } + ], + "title": "Instance versions and up time", + "transform": "table", + "type": "table" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "decimals": 0, + "description": "Number of peers in the Alertmanager cluster.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 207, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(191, 27, 0, 0.08)", + "full": false, + "lineColor": "#e5ac0e", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(alertmanager_cluster_members{instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Cluster size", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Current number of active alerts.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(191, 27, 0, 0.08)", + "full": false, + "lineColor": "#bf1b00", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(alertmanager_alerts{state=\"active\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + 
], + "thresholds": "", + "title": "Number of active alerts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Current number of suppressed alerts.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(249, 226, 210, 0.18)", + "full": false, + "lineColor": "#f9e2d2", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(alertmanager_alerts{state=\"suppressed\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Number of suppressed alerts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Current number of active silences.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 121, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(249, 226, 210, 0.18)", + "full": false, + "lineColor": "#f9e2d2", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(alertmanager_silences{state=\"active\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Number of active silences", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 113, + "panels": [], + "title": "Notifications", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Number of sent notifications to distinct integrations such as PagerDuty, Slack and so on. 
On negative axis are displayed failed notifications.", + "fill": 1, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 118, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "total", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/Failed.*/", + "color": "#99440a", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_notifications_total{instance=~\"$instance\"}[$__interval])) by (integration)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ integration}}", + "refId": "B" + }, + { + "expr": "sum(increase(alertmanager_notifications_failed_total{instance=~\"$instance\"}[$__interval])) by (integration)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Failed {{ integration }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Notifications sent from $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Duration of notification sends in 0.99 and 0.9 quantiles per integration.", + "fill": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 12 + }, + "id": 115, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/0.99.*/", + "linewidth": 1 + }, + { + "alias": "/0.5 .*/", + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(histogram_quantile(0.9,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "0.9q {{ integration }}", + "refId": "B" + }, + { + "expr": "sum(histogram_quantile(0.99,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "0.99q {{ integration }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": 
null, + "title": "Notification durations per integration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 18, + "panels": [], + "title": "Alerts", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Number of alerts by state such as `active`, `suppressed` etc.", + "fill": 4, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 19 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "active", + "color": "#bf1b00" + }, + { + "alias": "suppressed", + "color": "#2f575e" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(alertmanager_alerts{instance=~\"$instance\"}) by (state)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active alerts in $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Number of received alerts from Prometheus by status `firing` on positive axis and `resolved` on negative axis.", + "fill": 1, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "resolved", + "color": "#7eb26d", + "transform": "negative-Y" + }, + { + "alias": "firing", + "color": "#99440a" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_alerts_received_total{instance=~\"$instance\"}[$__interval])) by (status)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "thresholds": [], 
+ "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Received alerts by status for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 34, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Shows cluster score representing cluster health. From Hashicorps official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 57, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster health score", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Clusterhealth score for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows cluster score representing cluster health. From Hashicorps official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. 
This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 20 + }, + "id": 408, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 57, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster health score", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Clusterhealth score for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows cluster score representing cluster health. From Hashicorps official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. 
This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 20 + }, + "id": 409, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 57, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster health score", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Clusterhealth score for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows cluster score representing cluster health. From Hashicorps official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. 
This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 20 + }, + "id": 410, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 57, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster health score", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Clusterhealth score for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 38, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of cluster members", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_failed_peers{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of failed peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster members count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 411, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 38, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of cluster members", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_failed_peers{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of failed peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster members count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 412, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 38, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of cluster members", + "refId": "A" + }, + { + "expr": 
"alertmanager_cluster_failed_peers{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of failed peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster members count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 413, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 38, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of cluster members", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_failed_peers{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of failed peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster members count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 28 + }, + "id": 75, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + 
"scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster left peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster peers left/joined on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 28 + }, + "id": 414, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 75, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster left peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster peers left/joined on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 28 + }, + "id": 415, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 75, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster left peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster peers left/joined on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 28 + }, + "id": 416, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 75, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "Cluster left peers", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster peers left/joined on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "On positive axis is number of attempts to reconnect the cluster. On negative axis if number of failed attempts.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 34 + }, + "id": 68, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Failed reconnections", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Successful reconnections", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed reconnections", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster reconnections on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of attempts to reconnect the cluster. 
On negative axis if number of failed attempts.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 34 + }, + "id": 417, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 68, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Failed reconnections", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Successful reconnections", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed reconnections", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster reconnections on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of attempts to reconnect the cluster. 
On negative axis if number of failed attempts.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 34 + }, + "id": 418, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 68, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Failed reconnections", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Successful reconnections", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed reconnections", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster reconnections on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of attempts to reconnect the cluster. 
On negative axis if number of failed attempts.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 34 + }, + "id": 419, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 68, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Failed reconnections", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Successful reconnections", + "refId": "A" + }, + { + "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed reconnections", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster reconnections on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 40 + }, + "id": 48, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 40 + }, + "id": 420, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 48, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 40 + }, + "id": 421, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 48, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 40 + }, + "id": 422, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 48, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages count on $instance", + "tooltip": { + "shared": true, + "sort": 0, 
+ "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "On positive axis is size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 46 + }, + "id": 53, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 46 + }, + "id": 423, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 53, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + 
"value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 46 + }, + "id": 424, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 53, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 46 + }, + "id": 425, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 53, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 52 + }, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Pruned messaged", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pruned messaged", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Queued messages", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages queue on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 52 + }, + "id": 426, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 62, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Pruned messaged", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pruned messaged", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Queued messages", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages queue on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 52 + }, + "id": 427, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 62, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Pruned messaged", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pruned messaged", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Queued messages", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages queue on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 52 + }, + "id": 428, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 62, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Pruned messaged", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pruned messaged", + "refId": "A" + }, + { + "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Queued messages", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster messages queue on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + } + ], + "title": "Cluster members", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 284, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 314, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "/dropped/", + "color": "#cca300", + "transform": "negative-Y" + }, + { + "alias": "/failed/", + "color": "#bf1b00", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dropped {{key}}", + "refId": "B" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "failed {{key}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Count of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 441, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 314, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "/dropped/", + "color": "#cca300", + "transform": "negative-Y" + }, + { + "alias": "/failed/", + "color": "#bf1b00", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dropped {{key}}", + "refId": "B" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "failed {{key}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Count of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 442, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 314, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "/dropped/", + "color": "#cca300", + "transform": "negative-Y" + }, + { + "alias": "/failed/", + "color": "#bf1b00", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dropped {{key}}", + "refId": "B" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "failed {{key}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Count of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": 
true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 443, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 314, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "/dropped/", + "color": "#cca300", + "transform": "negative-Y" + }, + { + "alias": "/failed/", + "color": "#bf1b00", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dropped {{key}}", + "refId": "B" + }, + { + "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "failed {{key}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Count of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 307, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 444, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 307, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 445, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 307, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 446, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 307, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 22 + }, + "id": 303, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + 
], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "silences", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "nf_log", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of propagated gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 22 + }, + "id": 447, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 303, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "silences", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "nf_log", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of propagated gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 22 + }, + "id": 448, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 303, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "silences", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "nf_log", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of propagated gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 22 + }, + "id": 449, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 303, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "silences", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "nf_log", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of propagated gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { 
+ "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Gossip messages", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 84, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 94, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query errors", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log queries count for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 450, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 94, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query errors", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log queries count for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 11 + }, + "id": 451, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 94, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query errors", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log queries count for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 452, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 94, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": 
"negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query errors", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log queries count for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 17 + }, + "id": 106, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log query duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 17 + }, + "id": 453, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 106, + "scopedVars": { + "instance": { + 
"selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log query duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 17 + }, + "id": 454, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 106, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log query duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 17 + }, + "id": 455, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + 
"repeatPanelId": 106, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log query duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 23 + }, + "id": 97, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot size for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 23 + }, + "id": 456, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + 
"repeatPanelId": 97, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot size for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 23 + }, + "id": 457, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 97, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot size for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 23 + }, + "id": 458, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 97, + "scopedVars": { + 
"instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot size for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 29 + }, + "id": 101, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 29 + }, + "id": 459, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + 
"repeatIteration": 1555332813958, + "repeatPanelId": 101, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 29 + }, + "id": 460, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 101, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 29 + }, + "id": 461, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 101, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log snapshot duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 35 + }, + "id": 92, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log Go GC time for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 35 + }, + "id": 462, + "legend": { + "avg": false, + 
"current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 92, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log Go GC time for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 35 + }, + "id": 463, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 92, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log Go GC time for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 35 + }, + "id": 464, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": 
false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 92, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nf log Go GC time for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Nflog", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 123, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 12 + }, + "id": 129, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences count by state on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 12 + }, + "id": 465, + "legend": { + 
"avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 129, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences count by state on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 12 + }, + "id": 466, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 129, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences count by state on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 12 + }, + "id": 467, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": 
false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 129, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences count by state on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 18 + }, + "id": 134, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Silecnces query fails", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Silecnces query fails", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 18 + }, + "id": 468, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 134, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Silecnces query fails", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Silecnces query fails", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 18 + }, + "id": 469, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 134, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Silecnces query fails", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Silecnces query fails", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 18 + }, + "id": 470, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 134, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Silecnces query fails", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query count", + "refId": "A" + }, + { + "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Silecnces query fails", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 138, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + 
"intervalFactor": 1, + "legendFormat": "Silecnces query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 471, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 138, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 472, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 138, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 473, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 138, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences query duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 149, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 30 + }, + "id": 474, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 149, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 30 + }, + "id": 475, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 149, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + 
], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 30 + }, + "id": 476, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 149, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 36 + }, + "id": 143, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 36 + }, + "id": 477, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 143, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 36 + }, + "id": 478, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 143, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query 
errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 36 + }, + "id": 479, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 143, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences snapshot duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 42 + }, + "id": 131, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + 
"seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces GC duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences GC duraton on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 42 + }, + "id": 480, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 131, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces GC duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences GC duraton on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 42 + }, + "id": 481, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 131, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + 
"seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces GC duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences GC duraton on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 42 + }, + "id": 482, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 131, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces GC duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Silences GC duraton on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Silences", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 173, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 13 + }, + "id": 175, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + 
"scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{pod}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU usage/s for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 13 + }, + "id": 483, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 175, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{pod}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": 
"CPU usage/s for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 13 + }, + "id": 484, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 175, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{pod}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU usage/s for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 13 + }, + "id": 485, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 175, + "scopedVars": { + "instance": { + 
"selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{pod}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU usage/s for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 177, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-54b9fdf9bf-gqxs4", + "value": "alertmanager-54b9fdf9bf-gqxs4" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "E" + }, + { + "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory usage for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 20 + }, + "id": 486, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 177, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-7f5c977766-kwr2s", + "value": "alertmanager-7f5c977766-kwr2s" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "E" + }, + { + "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory usage for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 20 + }, + "id": 487, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 177, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-5b9858bf54-r8skx", + "value": "alertmanager-sre-5b9858bf54-r8skx" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "E" + }, + { + "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": 
"Memory usage for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 20 + }, + "id": 488, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1555332813958, + "repeatPanelId": 177, + "scopedVars": { + "instance": { + "selected": false, + "text": "alertmanager-sre-8455c65f47-mmwlq", + "value": "alertmanager-sre-8455c65f47-mmwlq" + } + }, + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "E" + }, + { + "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory usage for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Resources", + "type": "row" + } + ], + "refresh": "5m", + "schemaVersion": 18, + "style": "dark", + "tags": [ + "alertmanager", + "prometheus", + "alerting" + ], + "templating": { + "list": [ + { + "current": { + "tags": [], + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Prometheus datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": "", + "current": {}, + "datasource": "${prometheusds}", + "definition": "query_result(alertmanager_build_info)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": "query_result(alertmanager_build_info)", + "refresh": 2, + "regex": "/.*instance=\"([^\"]+)\".*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "label_values(alertmanager_build_info, instance)", + "tags": [], + "tagsQuery": 
"label_values(alertmanager_build_info, namespace)", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Alertmanager", + "uid": "eea-9_sik", + "version": 27 +} diff --git a/src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule b/src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule new file mode 100644 index 00000000..a2918be2 --- /dev/null +++ b/src/prometheus_alert_rules/alertmanager_configuration_reload_failure.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: AlertmanagerConfigurationReloadFailure +expr: alertmanager_config_last_reload_successful{} != 1 +for: 0m +labels: + severity: warning +annotations: + summary: Alertmanager configuration reload failure (instance {{ $labels.instance }}) + description: | + Alertmanager configuration reload error + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/alertmanager_missing.rule b/src/prometheus_alert_rules/alertmanager_missing.rule new file mode 100644 index 00000000..697b829c --- /dev/null +++ b/src/prometheus_alert_rules/alertmanager_missing.rule @@ -0,0 +1,12 @@ +# Based on https://awesome-prometheus-alerts.grep.to/rules.html#prometheus-self-monitoring-1 +alert: AlertmanagerJobMissing +expr: absent(up{}) +for: 0m +labels: + severity: warning +annotations: + summary: Alertmanager job missing (instance {{ $labels.instance }}) + description: | + A Alertmanager job has disappeared + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/src/prometheus_alert_rules/alertmanager_notifications_failed.rule b/src/prometheus_alert_rules/alertmanager_notifications_failed.rule new file mode 100644 index 00000000..71d77504 --- /dev/null +++ b/src/prometheus_alert_rules/alertmanager_notifications_failed.rule @@ -0,0 +1,11 @@ +alert: AlertmanagerNotificationsFailed +expr: alertmanager_notifications_failed_total{integration=~".*"} != 0 +for: 0m +labels: + severity: warning +annotations: + summary: Alertmanager notifications failure (instance {{ $labels.instance }}) + description: | + Alertmanager notifications failure + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/tests/integration/test_rerelate_alertmanager_dispatch.py b/tests/integration/test_rerelate_alertmanager_dispatch_metrics_endpoint.py similarity index 64% rename from tests/integration/test_rerelate_alertmanager_dispatch.py rename to tests/integration/test_rerelate_alertmanager_dispatch_metrics_endpoint.py index 45dab7d7..da3c6eb1 100644 --- a/tests/integration/test_rerelate_alertmanager_dispatch.py +++ b/tests/integration/test_rerelate_alertmanager_dispatch_metrics_endpoint.py @@ -4,7 +4,8 @@ """This test module tests alertmanager response to related apps being removed and re-related. -1. Deploy the charm under test and a related app, relate them and wait for them to become idle. +1. Deploy the charm under test and a related app (Promethes) relate them using + `alertmanager_dispatch` and `prometheus_scrape` interfaces and wait for them to become idle. 2. Remove the relation. 3. Re-add the relation. 4. Remove the related application. 
@@ -35,10 +36,17 @@ async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): ops_test.model.deploy( charm_under_test, resources=resources, application_name=app_name, num_units=2 ), - ops_test.model.deploy("ch:prometheus-k8s", application_name=related_app, channel="edge"), + ops_test.model.deploy( + "ch:prometheus-k8s", application_name=related_app, channel="edge", trust=True + ), ) - await ops_test.model.add_relation(app_name, related_app) + await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") + await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=2500) + + assert await is_alertmanager_up(ops_test, app_name) + + await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) assert await is_alertmanager_up(ops_test, app_name) @@ -47,13 +55,20 @@ async def test_build_and_deploy(ops_test: OpsTest, charm_under_test): @pytest.mark.abort_on_fail async def test_remove_relation(ops_test: OpsTest): await ops_test.model.applications[app_name].remove_relation("alerting", related_app) + await ops_test.model.applications[app_name].remove_relation( + "self-metrics-endpoint", related_app + ) await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) assert await is_alertmanager_up(ops_test, app_name) @pytest.mark.abort_on_fail async def test_rerelate(ops_test: OpsTest): - await ops_test.model.add_relation(app_name, related_app) + await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") + await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) + assert await is_alertmanager_up(ops_test, app_name) + + await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) assert await is_alertmanager_up(ops_test, app_name) @@ -70,7 +85,13 @@ async def test_remove_related_app(ops_test: OpsTest): @pytest.mark.abort_on_fail async def test_rerelate_app(ops_test: OpsTest): - await ops_test.model.deploy("ch:prometheus-k8s", application_name=related_app, channel="edge") - await ops_test.model.add_relation(app_name, related_app) + await ops_test.model.deploy( + "ch:prometheus-k8s", application_name=related_app, channel="edge", trust=True + ) + await ops_test.model.add_relation(app_name, f"{related_app}:alertmanager") + await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) + assert await is_alertmanager_up(ops_test, app_name) + + await ops_test.model.add_relation(app_name, f"{related_app}:metrics-endpoint") await ops_test.model.wait_for_idle(apps=[app_name, related_app], status="active", timeout=1000) assert await is_alertmanager_up(ops_test, app_name) diff --git a/tests/integration/test_update_status_pressure.py b/tests/integration/test_update_status_pressure.py index 14a29260..83ebc6c9 100644 --- a/tests/integration/test_update_status_pressure.py +++ b/tests/integration/test_update_status_pressure.py @@ -42,12 +42,14 @@ async def test_deploy_multiple_units(ops_test: OpsTest, charm_under_test): ops_test.model.deploy( charm_under_test, application_name=app_name, resources=resources, num_units=2 ), - ops_test.model.deploy("ch:prometheus-k8s", application_name="prom", channel="edge"), + ops_test.model.deploy( + "ch:prometheus-k8s", application_name="prom", channel="edge", trust=True + ), ) await asyncio.gather( - 
ops_test.model.add_relation(app_name, "prom"), - ops_test.model.wait_for_idle(status="active", timeout=300), + ops_test.model.add_relation(f"{app_name}:alerting", "prom"), + ops_test.model.wait_for_idle(status="active", timeout=2500), ) assert await is_alertmanager_up(ops_test, app_name) diff --git a/tests/integration/test_upgrade_charm.py b/tests/integration/test_upgrade_charm.py index 71c976d5..84655811 100644 --- a/tests/integration/test_upgrade_charm.py +++ b/tests/integration/test_upgrade_charm.py @@ -44,20 +44,22 @@ async def test_upgrade_edge_with_local_in_isolation(ops_test: OpsTest, charm_und async def test_upgrade_local_with_local_with_relations(ops_test: OpsTest, charm_under_test): # Deploy related apps await asyncio.gather( - ops_test.model.deploy("ch:prometheus-k8s", application_name="prom", channel="edge"), + ops_test.model.deploy( + "ch:prometheus-k8s", application_name="prom", channel="edge", trust=True + ), ops_test.model.deploy("ch:karma-k8s", application_name="karma", channel="edge"), ) # Relate apps await asyncio.gather( - ops_test.model.add_relation(app_name, "prom"), + ops_test.model.add_relation(app_name, "prom:alertmanager"), ops_test.model.add_relation(app_name, "karma"), ) # Refresh from path await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) await ops_test.model.wait_for_idle( - apps=[app_name, "prom", "karma"], status="active", timeout=1000 + apps=[app_name, "prom", "karma"], status="active", timeout=2500 ) assert await is_alertmanager_up(ops_test, app_name) @@ -73,6 +75,6 @@ async def test_upgrade_with_multiple_units(ops_test: OpsTest, charm_under_test): # Refresh from path await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) await ops_test.model.wait_for_idle( - apps=[app_name, "prom", "karma"], status="active", timeout=1000 + apps=[app_name, "prom", "karma"], status="active", timeout=2500 ) assert await is_alertmanager_up(ops_test, app_name) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 322c540b..1810e76d 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -31,7 +31,7 @@ def setUp(self, *unused): self.harness.begin_with_initial_hooks() def test_num_peers(self): - self.assertEqual(0, len(self.harness.charm.peer_relation.units)) + self.assertEqual(0, len(self.harness.charm.peer_relation.units)) # type: ignore def test_pebble_layer_added(self, *unused): self.harness.container_pebble_ready(self.container_name) @@ -62,7 +62,7 @@ def test_relation_data_provides_public_address(self): rel = model.get_relation("alerting", self.relation_id) expected_address = "fqdn:{}".format(self.harness.charm.alertmanager_provider.api_port) - self.assertEqual({"public_address": expected_address}, rel.data[self.harness.charm.unit]) + self.assertEqual({"public_address": expected_address}, rel.data[self.harness.charm.unit]) # type: ignore def test_topology_added_if_user_provided_config_without_group_by(self, *unused): self.harness.container_pebble_ready(self.container_name) diff --git a/tox.ini b/tox.ini index 47ebeb74..7dc133cd 100644 --- a/tox.ini +++ b/tox.ini @@ -100,10 +100,10 @@ commands = description = Run integration tests deps = #git+https://github.com/juju/python-libjuju.git - juju==2.9.7 + juju pytest #git+https://github.com/charmed-kubernetes/pytest-operator.git - pytest-operator==0.15.0 + pytest-operator commands = pytest -v --tb native --log-cli-level=INFO -s {posargs} {toxinidir}/tests/integration