Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ResponseOps][MW] Add telemetry for the maintenance window #192483

Merged
merged 26 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
763160c
total count MW telemetry
guskovaue Sep 4, 2024
3b01b9e
first method count total MW
guskovaue Sep 9, 2024
f4a882e
add positive and negative unit tests for total MW count
guskovaue Sep 10, 2024
146ecea
add telemetry for toggles and fix tests for them
guskovaue Sep 10, 2024
52438fd
fix type
guskovaue Sep 10, 2024
b292b33
checking right SO property for recurring check
guskovaue Sep 11, 2024
8819661
Merge branch 'main' into MX-184088-add-telemetry-for-mw
guskovaue Sep 11, 2024
c289c84
refresh telemetry mappings
guskovaue Sep 11, 2024
9408886
Merge branch 'MX-184088-add-telemetry-for-mw' of github.com:guskovaue…
guskovaue Sep 11, 2024
ceeb6ca
nit
guskovaue Sep 11, 2024
b6df6bd
add integrational test for MW telemetry
guskovaue Sep 12, 2024
3c365a2
fix integration test
guskovaue Sep 13, 2024
2806a04
fix await issue in find in tile generator
guskovaue Sep 13, 2024
72e7c54
fix types
guskovaue Sep 13, 2024
ce8eb77
Merge branch 'main' into MX-184088-add-telemetry-for-mw
elasticmachine Sep 13, 2024
6a62381
Merge branch 'main' into MX-184088-add-telemetry-for-mw
elasticmachine Sep 16, 2024
732374d
changes after code review
guskovaue Sep 16, 2024
c69db28
Merge branch 'MX-184088-add-telemetry-for-mw' of github.com:guskovaue…
guskovaue Sep 16, 2024
2e4440d
fix max amount
guskovaue Sep 17, 2024
2051618
fix max amount
guskovaue Sep 17, 2024
44165b9
Merge branch 'MX-184088-add-telemetry-for-mw' of github.com:guskovaue…
guskovaue Sep 17, 2024
a13b6e1
max limit
guskovaue Sep 17, 2024
4c852ba
changes after code review
guskovaue Sep 18, 2024
cfc3d0f
Merge branch 'main' into MX-184088-add-telemetry-for-mw
elasticmachine Sep 18, 2024
95ac96b
fix unit tests after last changes
guskovaue Sep 18, 2024
493290a
Merge branch 'main' into MX-184088-add-telemetry-for-mw
guskovaue Sep 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ export function createAlertingUsageCollector(
count_rules_with_tags: 0,
count_rules_snoozed: 0,
count_rules_muted: 0,
count_mw_total: 0,
count_mw_with_repeat_toggle_on: 0,
count_mw_with_filter_alert_toggle_on: 0,
count_rules_with_muted_alerts: 0,
count_connector_types_by_consumers: {},
count_rules_by_execution_status_per_day: {},
Expand Down Expand Up @@ -289,6 +292,9 @@ export function createAlertingUsageCollector(
count_rules_by_notify_when: byNotifyWhenSchema,
count_rules_snoozed: { type: 'long' },
count_rules_muted: { type: 'long' },
count_mw_total: { type: 'long' },
count_mw_with_repeat_toggle_on: { type: 'long' },
count_mw_with_filter_alert_toggle_on: { type: 'long' },
count_rules_with_muted_alerts: { type: 'long' },
count_connector_types_by_consumers: { DYNAMIC_KEY: { DYNAMIC_KEY: { type: 'long' } } },
count_rules_by_execution_status_per_day: byStatusPerDaySchema,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,97 @@
*/

import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana';
import {
getTotalCountAggregations,
getTotalCountInUse,
getMWTelemetry,
} from './get_telemetry_from_kibana';
import { savedObjectsClientMock } from '@kbn/core/server/mocks';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { ISavedObjectsRepository } from '@kbn/core/server';

// Shared mocks for every test in this file.
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
// Cast so the mock can be handed to getMWTelemetry, which is typed against ISavedObjectsRepository.
const savedObjectsClient = savedObjectsClientMock.create() as unknown as ISavedObjectsRepository;
// Error thrown by the stubbed finder in the getMWTelemetry failure test.
const thrownError = new Error('Fail');

// A single finder page with three maintenance window saved objects. getMWTelemetry
// counts every object, counts "Filter alerts" when `scopedQuery` is truthy, and
// counts "Repeat" when `rRule` has an own `interval` property, so this page is
// expected to produce total = 3, repeat = 2, filter = 1.
const mockedResponse = {
saved_objects: [
// MW 1: no `interval` and a null `scopedQuery` -> contributes to the total only.
{
id: '1',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_1',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 0,
count: 1,
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: null,
},
},
// MW 2: has `interval` and a non-null `scopedQuery` -> contributes to both toggle counters.
{
id: '2',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_2',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 3,
interval: 1,
byweekday: ['SU'],
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: {
filters: [],
kql: 'kibana.alert.job_errors_results.job_id : * ',
dsl: '{"bool":{"must":[],"filter":[{"bool":{"should":[{"exists":{"field":"kibana.alert.job_errors_results.job_id"}}],"minimum_should_match":1}}],"should":[],"must_not":[]}}',
},
},
},
// MW 3: has `interval` but a null `scopedQuery` -> contributes to the repeat counter only.
{
id: '3',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_3',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 3,
interval: 1,
byweekday: ['TU'],
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: null,
},
},
],
};

describe('kibana index telemetry', () => {
beforeEach(() => {
Expand Down Expand Up @@ -420,4 +506,65 @@ describe('kibana index telemetry', () => {
});
});
});

describe('getMWTelemetry', () => {
  test('should return MW telemetry', async () => {
    // Finder stub whose async generator yields a single page with the three fixtures.
    const finderStub = {
      close: jest.fn(),
      find: jest.fn().mockImplementation(async function* () {
        yield mockedResponse;
      }),
    };
    savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue(finderStub);

    const result = await getMWTelemetry({ savedObjectsClient, logger });

    // The finder must be opened for the MW type across all namespaces.
    const expectedFinderOptions = {
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      namespaces: ['*'],
      perPage: 100,
    };
    expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith(expectedFinderOptions);

    // 3 fixtures in total, 2 with rRule.interval, 1 with a non-null scopedQuery.
    const expectedTelemetry = {
      count_mw_total: 3,
      count_mw_with_repeat_toggle_on: 2,
      count_mw_with_filter_alert_toggle_on: 1,
      hasErrors: false,
    };
    expect(result).toStrictEqual(expectedTelemetry);
  });
});

test('should throw the error', async () => {
  // Finder stub whose async generator fails on the first iteration.
  const failingFinder = {
    close: jest.fn(),
    find: jest.fn().mockImplementation(async function* () {
      throw thrownError;
    }),
  };
  savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue(failingFinder);

  const result = await getMWTelemetry({ savedObjectsClient, logger });

  const expectedFinderOptions = {
    type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
    namespaces: ['*'],
    perPage: 100,
  };
  expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith(expectedFinderOptions);

  // The collector swallows the error and reports zeroed counters plus the message.
  const expectedTelemetry = {
    count_mw_total: 0,
    count_mw_with_repeat_toggle_on: 0,
    count_mw_with_filter_alert_toggle_on: 0,
    hasErrors: true,
    errorMessage: 'Fail',
  };
  expect(result).toStrictEqual(expectedTelemetry);

  // The failure is logged once as a warning with telemetry tags and a stack trace.
  expect(logger.warn).toHaveBeenCalled();
  const [warnMessage, warnMeta] = logger.warn.mock.calls[0];
  expect(warnMessage).toBe('Error executing alerting telemetry task: getTotalMWCount - {}');
  expect(warnMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
  expect(warnMeta?.error?.stack_trace).toBeDefined();
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import type {
AggregationsTermsAggregateBase,
AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { ElasticsearchClient, Logger, ISavedObjectsRepository } from '@kbn/core/server';

import {
ConnectorsByConsumersBucket,
Expand All @@ -23,13 +23,20 @@ import { AlertingUsage } from '../types';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { groupRulesBySearchType } from './group_rules_by_search_type';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { MaintenanceWindowAttributes } from '../../data/maintenance_window/types';

/**
 * Options bag carrying an Elasticsearch client, an index name and a logger.
 * NOTE(review): presumably consumed by the Elasticsearch-backed collectors in
 * this file (e.g. getTotalCountAggregations / getTotalCountInUse) — their
 * signatures are not fully visible here, confirm.
 */
interface Opts {
esClient: ElasticsearchClient;
alertIndex: string;
logger: Logger;
}

/**
 * Options accepted by getMWTelemetry: the saved objects repository used to page
 * through maintenance windows, and the logger that receives failure warnings.
 */
interface MWOpts {
savedObjectsClient: ISavedObjectsRepository;
logger: Logger;
}

type GetTotalCountsResults = Pick<
AlertingUsage,
| 'count_total'
Expand All @@ -48,6 +55,14 @@ type GetTotalCountsResults = Pick<
| 'connectors_per_alert'
> & { errorMessage?: string; hasErrors: boolean };

/**
 * Result of getMWTelemetry: the three maintenance window counters picked from
 * AlertingUsage, plus error reporting fields. `hasErrors` is always present;
 * `errorMessage` is only set when collection failed.
 */
type GetMWTelemetryResults = Pick<
AlertingUsage,
'count_mw_total' | 'count_mw_with_repeat_toggle_on' | 'count_mw_with_filter_alert_toggle_on'
> & {
errorMessage?: string;
hasErrors: boolean;
};

interface GetTotalCountInUseResults {
countTotal: number;
countByType: Record<string, number>;
Expand Down Expand Up @@ -490,3 +505,57 @@ export async function getTotalCountInUse({
};
}
}

/**
 * Collects maintenance window (MW) telemetry by paging through every MW saved
 * object across all namespaces with a point-in-time finder.
 *
 * @param opts.savedObjectsClient - repository used to create the point-in-time finder.
 * @param opts.logger - receives a warning if collection fails.
 * @returns The total number of MWs and how many have the "Repeat" and
 * "Filter alerts" toggles on. This function never rejects: on failure it logs a
 * warning and resolves with all counters at 0, `hasErrors: true` and the
 * `errorMessage`.
 */
export async function getMWTelemetry({
  savedObjectsClient,
  logger,
}: MWOpts): Promise<GetMWTelemetryResults> {
  try {
    // NOTE(review): a reviewer suggested also passing the `fields` option here to
    // limit the response to the attributes that are read below, since the `events`
    // attribute can be large. Not applied here; the finder options are unchanged.
    const mwFinder = savedObjectsClient.createPointInTimeFinder<MaintenanceWindowAttributes>({
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      namespaces: ['*'],
      perPage: 100,
    });

    let countMWTotal = 0;
    let countMWWithRepeatToggleON = 0;
    let countMWWithFilterAlertToggleON = 0;
    try {
      for await (const response of mwFinder.find()) {
        for (const mwSavedObject of response.saved_objects) {
          countMWTotal += 1;
          // `scopedQuery` is null when the "Filter alerts" toggle is off.
          if (mwSavedObject.attributes.scopedQuery) {
            countMWWithFilterAlertToggleON += 1;
          }
          // `rRule.interval` is absent when the "Repeat" toggle is off.
          if (Object.hasOwn(mwSavedObject.attributes.rRule, 'interval')) {
            countMWWithRepeatToggleON += 1;
          }
        }
      }
    } finally {
      // Close the finder on every path, including when iteration throws,
      // so the explicit close is not skipped on errors.
      await mwFinder.close();
    }

    return {
      hasErrors: false,
      count_mw_total: countMWTotal,
      count_mw_with_repeat_toggle_on: countMWWithRepeatToggleON,
      count_mw_with_filter_alert_toggle_on: countMWWithFilterAlertToggleON,
    };
  } catch (err) {
    const errorMessage = err?.message ? err.message : err.toString();
    logger.warn(
      `Error executing alerting telemetry task: getTotalMWCount - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err?.stack },
      }
    );
    // Report the failure through the result instead of rethrowing.
    return {
      hasErrors: true,
      errorMessage,
      count_mw_total: 0,
      count_mw_with_repeat_toggle_on: 0,
      count_mw_with_filter_alert_toggle_on: 0,
    };
  }
}
39 changes: 30 additions & 9 deletions x-pack/plugins/alerting/server/usage/task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,19 @@ import {
TaskManagerStartContract,
IntervalSchedule,
} from '@kbn/task-manager-plugin/server';

import { getFailedAndUnrecognizedTasksPerDay } from './lib/get_telemetry_from_task_manager';
import { getTotalCountAggregations, getTotalCountInUse } from './lib/get_telemetry_from_kibana';
import {
getTotalCountAggregations,
getTotalCountInUse,
getMWTelemetry,
} from './lib/get_telemetry_from_kibana';
import {
getExecutionsPerDayCount,
getExecutionTimeoutsPerDayCount,
} from './lib/get_telemetry_from_event_log';
import { stateSchemaByVersion, emptyState, type LatestTaskStateSchema } from './task_state';
import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../common';

export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';

Expand All @@ -36,12 +40,6 @@ export function initializeAlertingTelemetry(
registerAlertingTelemetryTask(logger, core, taskManager, eventLogIndex);
}

/**
 * Kicks off scheduling of the alerting telemetry task when a task manager is
 * available; does nothing otherwise. Scheduling is fire-and-forget.
 */
export function scheduleAlertingTelemetry(logger: Logger, taskManager?: TaskManagerStartContract) {
  if (!taskManager) {
    return;
  }
  // scheduleTasks is not expected to reject; the empty handler is a safety net.
  scheduleTasks(logger, taskManager).catch(() => {});
}

function registerAlertingTelemetryTask(
logger: Logger,
core: CoreSetup,
Expand All @@ -58,6 +56,12 @@ function registerAlertingTelemetryTask(
});
}

/**
 * Kicks off scheduling of the alerting telemetry task when a task manager is
 * available; does nothing otherwise. Scheduling is fire-and-forget.
 */
export function scheduleAlertingTelemetry(logger: Logger, taskManager?: TaskManagerStartContract) {
  if (!taskManager) {
    return;
  }
  // scheduleTasks is not expected to reject; the empty handler is a safety net.
  scheduleTasks(logger, taskManager).catch(() => {});
}

async function scheduleTasks(logger: Logger, taskManager: TaskManagerStartContract) {
try {
await taskManager.ensureScheduled({
Expand Down Expand Up @@ -93,16 +97,26 @@ export function telemetryTaskRunner(
.getStartServices()
.then(([coreStart]) => coreStart.savedObjects.getIndexForType(RULE_SAVED_OBJECT_TYPE));

const getSavedObjectClient = () =>
core
.getStartServices()
.then(([coreStart]) =>
coreStart.savedObjects.createInternalRepository([MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE])
);

return {
async run() {
const esClient = await getEsClient();
const alertIndex = await getAlertIndex();
const savedObjectsClient = await getSavedObjectClient();

return Promise.all([
getTotalCountAggregations({ esClient, alertIndex, logger }),
getTotalCountInUse({ esClient, alertIndex, logger }),
getExecutionsPerDayCount({ esClient, eventLogIndex, logger }),
getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }),
getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
getMWTelemetry({ logger, savedObjectsClient }),
])
.then(
([
Expand All @@ -111,20 +125,23 @@ export function telemetryTaskRunner(
dailyExecutionCounts,
dailyExecutionTimeoutCounts,
dailyFailedAndUnrecognizedTasks,
MWTelemetry,
]) => {
const hasErrors =
totalCountAggregations.hasErrors ||
totalInUse.hasErrors ||
dailyExecutionCounts.hasErrors ||
dailyExecutionTimeoutCounts.hasErrors ||
dailyFailedAndUnrecognizedTasks.hasErrors;
dailyFailedAndUnrecognizedTasks.hasErrors ||
MWTelemetry.hasErrors;

const errorMessages = [
totalCountAggregations.errorMessage,
totalInUse.errorMessage,
dailyExecutionCounts.errorMessage,
dailyExecutionTimeoutCounts.errorMessage,
dailyFailedAndUnrecognizedTasks.errorMessage,
MWTelemetry.errorMessage,
].filter((message) => message !== undefined);

const updatedState: LatestTaskStateSchema = {
Expand All @@ -147,6 +164,10 @@ export function telemetryTaskRunner(
count_rules_by_notify_when: totalCountAggregations.count_rules_by_notify_when,
count_rules_snoozed: totalCountAggregations.count_rules_snoozed,
count_rules_muted: totalCountAggregations.count_rules_muted,
count_mw_total: MWTelemetry.count_mw_total,
count_mw_with_repeat_toggle_on: MWTelemetry.count_mw_with_repeat_toggle_on,
count_mw_with_filter_alert_toggle_on:
MWTelemetry.count_mw_with_filter_alert_toggle_on,
count_rules_with_muted_alerts: totalCountAggregations.count_rules_with_muted_alerts,
count_connector_types_by_consumers:
totalCountAggregations.count_connector_types_by_consumers,
Expand Down
Loading