Skip to content

Commit

Permalink
[SLO] Exclude stale slos from healthy count on overview (#201027)
Browse files Browse the repository at this point in the history
## Summary

Resolves #198911.

The result is achieved by nesting a new filter agg inside the existing
`HEALTHY` agg to remove any stale SLOs from the ultimate result.

This required a modification of the parsing code on the ES response to
include a new `not_stale` key. The original `healthy` total is preserved
in the `doc_count` of the outer `healthy` agg, but is no longer referenced.

The filter for the `not_stale` agg I have added is the logical inverse
of the filter we're using to determine stale SLOs:

```json
{
  "range": {
    "summaryUpdatedAt": {
      "gte": "now-48h"
    }
  }
}
```

_Reviewer note: I also renamed the `OverViewItem` UI component to
`OverviewItem`; this should be a completely transparent change._

## Example

### Before

This is my local running on `main`:

<img width="1116" alt="image"
src="https://github.com/user-attachments/assets/80f86426-c7f1-4847-830f-a311c865a225">


### After

This is my local running on this PR branch:

<img width="1120" alt="image"
src="https://github.com/user-attachments/assets/2c4c4f26-2407-41ca-bf01-9ca730bbfab2">


### Proof query works

You can replicate these results by including a similar agg on a query
against SLO data. I added a terms agg to the `stale` agg to determine
how many SLOs I need to remove. The number of `HEALTHY` SLOs showing up
in `stale` should match the difference between the total `doc_count`
from `healthy` and the `doc_count` in the `not_stale` sub-aggregation.

#### Query

You can run these example aggs:

```json
{
  "aggs": {
    "stale": {
      "filter": {
        "range": {
          "summaryUpdatedAt": {
            "lt": "now-48h"
          }
        }
      },
      "aggs": {
        "by_status": {
          "terms": {
            "field": "status"
          }
        }
      }
    },
    "healthy": {
      "filter": {
        "term": {
          "status": "HEALTHY"
        }
      },
      "aggs": {
        "not_stale": {
          "filter": {
            "range": {
              "summaryUpdatedAt": {
                "gte": "now-48h"
              }
            }
          }
        }
      }
    }
  }
}
```

#### Relevant output

Here's a subset of my example query output. You can see that
`stale.by_status.buckets[1]` contains a total of 2 docs, which is the
difference between `healthy.doc_count` and
`healthy.not_stale.doc_count`.

```json
{
  "stale": {
    "doc_count": 7,
    "by_status": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "VIOLATED",
          "doc_count": 5
        },
        {
          "key": "HEALTHY",
          "doc_count": 2
        }
      ]
    }
  },
  "healthy": {
    "doc_count": 9,
    "not_stale": {
      "doc_count": 7
    }
  }
}
```
  • Loading branch information
justinkambic authored Nov 26, 2024
1 parent 430cc27 commit a92103b
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@ const getOverviewResponseSchema = t.type({
degrading: t.number,
stale: t.number,
healthy: t.number,
worst: t.type({
value: t.number,
id: t.string,
}),
noData: t.number,
burnRateRules: t.number,
burnRateActiveAlerts: t.number,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import { EuiFlexItem, EuiStat, EuiToolTip } from '@elastic/eui';
import React from 'react';
import { useUrlSearchState } from '../../hooks/use_url_search_state';

export function OverViewItem({
export function OverviewItem({
title,
description,
titleColor,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import { GetOverviewResponse } from '@kbn/slo-schema/src/rest_specs/routes/get_o
import { rulesLocatorID, RulesParams } from '@kbn/observability-plugin/public';
import { useAlertsUrl } from '../../../../hooks/use_alerts_url';
import { useKibana } from '../../../../hooks/use_kibana';
import { OverViewItem } from './overview_item';
import { OverviewItem } from './overview_item';

export function SLOOverviewAlerts({
data,
Expand Down Expand Up @@ -55,7 +55,7 @@ export function SLOOverviewAlerts({

<EuiSpacer size="xs" />
<EuiFlexGroup justifyContent="spaceBetween">
<OverViewItem
<OverviewItem
title={data?.burnRateActiveAlerts}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.burnRateActiveAlerts', {
defaultMessage: 'Active alerts',
Expand All @@ -66,7 +66,7 @@ export function SLOOverviewAlerts({
application.navigateToUrl(getAlertsUrl('active'));
}}
/>
<OverViewItem
<OverviewItem
title={data?.burnRateRecoveredAlerts}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.burnRateRecoveredAlerts', {
defaultMessage: 'Recovered alerts',
Expand All @@ -77,7 +77,7 @@ export function SLOOverviewAlerts({
application.navigateToUrl(getAlertsUrl('recovered'));
}}
/>
<OverViewItem
<OverviewItem
title={data?.burnRateRules}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.burnRateRules', {
defaultMessage: 'Rules',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { SLOOverviewAlerts } from './slo_overview_alerts';
import { useGetSettings } from '../../../slo_settings/hooks/use_get_settings';
import { useFetchSLOsOverview } from '../../hooks/use_fetch_slos_overview';
import { useUrlSearchState } from '../../hooks/use_url_search_state';
import { OverViewItem } from './overview_item';
import { OverviewItem } from './overview_item';

export function SLOsOverview() {
const { state } = useUrlSearchState();
Expand Down Expand Up @@ -50,7 +50,7 @@ export function SLOsOverview() {
</EuiTitle>
<EuiSpacer size="xs" />
<EuiFlexGroup gutterSize="xl" justifyContent="spaceBetween">
<OverViewItem
<OverviewItem
title={data?.healthy}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.healthyLabel', {
defaultMessage: 'Healthy',
Expand All @@ -62,7 +62,7 @@ export function SLOsOverview() {
defaultMessage: 'Click to filter SLOs by Healthy status.',
})}
/>
<OverViewItem
<OverviewItem
title={data?.violated}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.violatedLabel', {
defaultMessage: 'Violated',
Expand All @@ -74,7 +74,7 @@ export function SLOsOverview() {
defaultMessage: 'Click to filter SLOs by Violated status.',
})}
/>
<OverViewItem
<OverviewItem
title={data?.noData}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.noDataLabel', {
defaultMessage: 'No data',
Expand All @@ -86,7 +86,7 @@ export function SLOsOverview() {
defaultMessage: 'Click to filter SLOs by no data status.',
})}
/>
<OverViewItem
<OverviewItem
title={data?.degrading}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.degradingLabel', {
defaultMessage: 'Degrading',
Expand All @@ -98,7 +98,7 @@ export function SLOsOverview() {
})}
titleColor={theme.colors.warningText}
/>
<OverViewItem
<OverviewItem
title={data?.stale}
description={i18n.translate('xpack.slo.sLOsOverview.euiStat.staleLabel', {
defaultMessage: 'Stale',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,6 @@ export class GetSLOsOverview {
},
body: {
aggs: {
worst: {
top_hits: {
sort: {
errorBudgetRemaining: {
order: 'asc',
},
},
_source: {
includes: ['sliValue', 'status', 'slo.id', 'slo.instanceId', 'slo.name'],
},
size: 1,
},
},
stale: {
filter: {
range: {
Expand All @@ -75,31 +62,42 @@ export class GetSLOsOverview {
},
},
},
violated: {
not_stale: {
filter: {
term: {
status: 'VIOLATED',
range: {
summaryUpdatedAt: {
gte: `now-${settings.staleThresholdInHours}h`,
},
},
},
},
healthy: {
filter: {
term: {
status: 'HEALTHY',
aggs: {
violated: {
filter: {
term: {
status: 'VIOLATED',
},
},
},
},
},
degrading: {
filter: {
term: {
status: 'DEGRADING',
healthy: {
filter: {
term: {
status: 'HEALTHY',
},
},
},
},
},
noData: {
filter: {
term: {
status: 'NO_DATA',
degrading: {
filter: {
term: {
status: 'DEGRADING',
},
},
},
noData: {
filter: {
term: {
status: 'NO_DATA',
},
},
},
},
},
Expand Down Expand Up @@ -131,15 +129,11 @@ export class GetSLOsOverview {
const aggs = response.aggregations;

return {
violated: aggs?.violated.doc_count ?? 0,
degrading: aggs?.degrading.doc_count ?? 0,
healthy: aggs?.healthy.doc_count ?? 0,
noData: aggs?.noData.doc_count ?? 0,
violated: aggs?.not_stale?.violated.doc_count ?? 0,
degrading: aggs?.not_stale?.degrading.doc_count ?? 0,
healthy: aggs?.not_stale?.healthy?.doc_count ?? 0,
noData: aggs?.not_stale?.noData.doc_count ?? 0,
stale: aggs?.stale.doc_count ?? 0,
worst: {
value: 0,
id: 'id',
},
burnRateRules: rules.total,
burnRateActiveAlerts: alerts.activeAlertCount,
burnRateRecoveredAlerts: alerts.recoveredAlertCount,
Expand Down

0 comments on commit a92103b

Please sign in to comment.