diff --git a/docs/spec/v1beta3/providers.md b/docs/spec/v1beta3/providers.md
index 080b82e43..ef59ce7e3 100644
--- a/docs/spec/v1beta3/providers.md
+++ b/docs/spec/v1beta3/providers.md
@@ -878,9 +878,10 @@ an [Event](events.md#event-structure) to the provided Prometheus Alertmanager
 [Address](#address).
 
 The Event will be formatted into a `firing` [Prometheus Alertmanager
-alert](https://prometheus.io/docs/alerting/latest/notifications/#alert),
-with the metadata added to the `labels` fields, and the `message` (and optional
-`.metadata.summary`) added as annotations.
+alert](https://prometheus.io/docs/alerting/latest/notifications/#alert), with
+the metadata added to the `labels` fields, and the `message` (and optional
+`.metadata.summary`) added as annotations. The Event timestamp will be used to
+set the alert start time (`.StartsAt`).
 
 In addition to the metadata from the Event, the following labels will be added:
 
@@ -888,12 +889,26 @@ In addition to the metadata from the Event, the following labels will be added:
 |-----------|------------------------------------------------------------------------------------------------------|
 | alertname | The string Flux followed by the Kind and the reason for the event e.g `FluxKustomizationProgressing` |
 | severity  | The severity of the event (`error` or `info`) |
-| timestamp | The timestamp of the event |
 | reason    | The machine readable reason for the objects transition into the current status |
 | kind      | The kind of the involved object associated with the event |
 | name      | The name of the involved object associated with the event |
 | namespace | The namespace of the involved object associated with the event |
 
+Note that due to the way other Flux controllers currently emit events,
+notification-controller cannot determine when an event ends (and thus cannot
+set `.EndsAt`) without a Kubernetes API roundtrip; a reasonable estimate would
+be double the reconciliation interval of the resource involved, but that
+interval is not available from the event alone. A possible workaround is
+setting [`global.resolve_timeout`][am_config_global] to an interval large
+enough for events to reoccur:
+
+[am_config_global]: https://prometheus.io/docs/alerting/latest/configuration/#file-layout-and-global-settings
+
+```yaml
+global:
+  resolve_timeout: 1h
+```
+
 This Provider type does support the configuration of a [proxy URL](#https-proxy)
 and [TLS certificates](#tls-certificates).
diff --git a/internal/notifier/alertmanager.go b/internal/notifier/alertmanager.go
index 3bfa352d0..b1cb741bb 100644
--- a/internal/notifier/alertmanager.go
+++ b/internal/notifier/alertmanager.go
@@ -19,8 +19,10 @@ package notifier
 import (
 	"context"
 	"crypto/x509"
+	"encoding/json"
 	"fmt"
 	"net/url"
+	"time"
 
 	"golang.org/x/text/cases"
 	"golang.org/x/text/language"
@@ -38,6 +40,36 @@ type AlertManagerAlert struct {
 	Status      string            `json:"status"`
 	Labels      map[string]string `json:"labels"`
 	Annotations map[string]string `json:"annotations"`
+
+	StartsAt AlertManagerTime `json:"startsAt"`
+	EndsAt   AlertManagerTime `json:"endsAt,omitempty"`
+}
+
+// AlertManagerTime takes care of representing time.Time as RFC3339.
+// See https://prometheus.io/docs/alerting/0.27/clients/
+type AlertManagerTime time.Time
+
+func (a AlertManagerTime) String() string {
+	return time.Time(a).Format(time.RFC3339)
+}
+
+func (a AlertManagerTime) MarshalJSON() ([]byte, error) {
+	return json.Marshal(a.String())
+}
+
+func (a *AlertManagerTime) UnmarshalJSON(jsonRepr []byte) error {
+	var serializedTime string
+	if err := json.Unmarshal(jsonRepr, &serializedTime); err != nil {
+		return err
+	}
+
+	t, err := time.Parse(time.RFC3339, serializedTime)
+	if err != nil {
+		return err
+	}
+
+	*a = AlertManagerTime(t)
+	return nil
 }
 
 func NewAlertmanager(hookURL string, proxyURL string, certPool *x509.CertPool) (*Alertmanager, error) {
@@ -75,18 +107,30 @@ func (s *Alertmanager) Post(ctx context.Context, event eventv1.Event) error {
 	labels["alertname"] = "Flux" + event.InvolvedObject.Kind + cases.Title(language.Und).String(event.Reason)
 	labels["severity"] = event.Severity
 	labels["reason"] = event.Reason
-	labels["timestamp"] = event.Timestamp.String()
 	labels["kind"] = event.InvolvedObject.Kind
 	labels["name"] = event.InvolvedObject.Name
 	labels["namespace"] = event.InvolvedObject.Namespace
 	labels["reportingcontroller"] = event.ReportingController
 
+	// The best reasonable `endsAt` value would be the InvolvedObject's
+	// reconciliation interval multiplied by 2 and added to `startsAt`
+	// (the next successful reconciliation would then make sure the alert
+	// is cleared after the timeout). However, event.InvolvedObject only
+	// carries the object reference (namely the GVKNN), so the best we can
+	// do is leave `endsAt` unset and defer to Alertmanager's default
+	// `resolve_timeout`.
+	//
+	// https://prometheus.io/docs/alerting/0.27/configuration/#file-layout-and-global-settings
+	startsAt := AlertManagerTime(event.Timestamp.Time)
+
 	payload := []AlertManagerAlert{
 		{
 			Labels:      labels,
 			Annotations: annotations,
 			Status:      "firing",
+
+			StartsAt: startsAt,
 		},
 	}
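
For reviewers who want to eyeball the resulting wire format, here is a minimal standalone sketch (not part of the change) that mirrors the `AlertManagerTime` and `AlertManagerAlert` types added above and prints the JSON payload `Post` would now send; the label and annotation values are made up. It also surfaces one caveat worth noting: because `AlertManagerTime` has a struct underlying type, `omitempty` never treats its zero value as empty, so the payload still carries `endsAt` as the zero time; if true omission is intended, a pointer field might be worth considering.

```go
// Standalone sketch; mirrors (but is not) internal/notifier/alertmanager.go.
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// AlertManagerTime renders time.Time as RFC3339, as in the diff above.
type AlertManagerTime time.Time

func (a AlertManagerTime) MarshalJSON() ([]byte, error) {
	return json.Marshal(time.Time(a).Format(time.RFC3339))
}

// AlertManagerAlert mirrors the payload struct from the diff.
type AlertManagerAlert struct {
	Status      string            `json:"status"`
	Labels      map[string]string `json:"labels"`
	Annotations map[string]string `json:"annotations"`

	StartsAt AlertManagerTime `json:"startsAt"`
	EndsAt   AlertManagerTime `json:"endsAt,omitempty"`
}

func main() {
	// Stand-in for event.Timestamp.Time.
	eventTime := time.Date(2024, 5, 1, 12, 0, 0, 0, time.UTC)

	payload := []AlertManagerAlert{{
		Status:      "firing",
		Labels:      map[string]string{"alertname": "FluxKustomizationProgressing", "severity": "info"},
		Annotations: map[string]string{"message": "Health check passed"},
		StartsAt:    AlertManagerTime(eventTime),
	}}

	out, _ := json.Marshal(payload)
	fmt.Println(string(out))
	// startsAt comes out as "2024-05-01T12:00:00Z"; endsAt is still emitted
	// as "0001-01-01T00:00:00Z" because omitempty does not consider struct
	// values empty.
}
```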
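The `endsAt` heuristic mentioned in the `Post` comment (twice the reconciliation interval, added to the event timestamp) can be illustrated with a small sketch. This is hypothetical only: the `reconcileInterval` input is made up, since the controller does not have that value without an extra API roundtrip, which is exactly why the field stays unset in this change.

```go
// Hypothetical illustration; none of this is in the diff.
package main

import (
	"fmt"
	"time"
)

// endsAtFor derives the heuristic endsAt described in the Post comment:
// twice the object's reconciliation interval past the event timestamp, long
// enough for the next reconciliation to re-fire the alert if the condition
// persists, otherwise the alert simply expires.
func endsAtFor(startsAt time.Time, reconcileInterval time.Duration) time.Time {
	return startsAt.Add(2 * reconcileInterval)
}

func main() {
	start := time.Date(2024, 5, 1, 12, 0, 0, 0, time.UTC)
	fmt.Println(endsAtFor(start, 10*time.Minute).Format(time.RFC3339))
	// Output: 2024-05-01T12:20:00Z
}
```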