Skip to content

Commit

Permalink
[Response Ops][Task Manager] Return HealthStatus.Warning instead of error when task manager stats are not yet available. (#169447)
Browse files: browse the repository at this point in the history
  • Loading branch information
ymao1 authored Oct 26, 2023
1 parent 8924218 commit dc3a140
Show file tree
Hide file tree
Showing 2 changed files with 351 additions and 1 deletion.
345 changes: 345 additions & 0 deletions x-pack/plugins/task_manager/server/lib/calculate_health_status.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,345 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { set } from '@kbn/safer-lodash-set';
import { HealthStatus, RawMonitoringStats } from '../monitoring';
import { loggingSystemMock } from '@kbn/core/server/mocks';
import { calculateHealthStatus } from './calculate_health_status';
import { cloneDeep } from 'lodash';

// Fixed "current time" used by every test in this file; calculateHealthStatus
// reads the clock through Date.now(), so fixture timestamps are judged
// against this instant.
const now = '2023-05-09T13:00:00.000Z';
// Date.now() is specified to return epoch milliseconds (a number), so the mock
// returns a number too instead of a Date object that only works via coercion.
Date.now = jest.fn().mockReturnValue(new Date(now).getTime());

// Mock logger handed to calculateHealthStatus in every test case.
const logger = loggingSystemMock.create().get();

// Task manager configuration handed to calculateHealthStatus in every test case.
const config = {
  enabled: true,
  max_workers: 10,
  index: 'foo',
  max_attempts: 9,
  poll_interval: 3000,
  version_conflict_threshold: 80,
  request_capacity: 1000,
  allow_reading_invalid_state: false,

  // Monitoring / health-stat settings.
  monitored_aggregated_stats_refresh_rate: 5000,
  monitored_stats_health_verbose_log: {
    enabled: false,
    level: 'debug' as const,
    warn_delayed_task_start_in_seconds: 60,
  },
  // calculateHealthStatus multiplies this value by 3 to obtain the allowed
  // staleness of the "hot" stats.
  monitored_stats_required_freshness: 5000,
  monitored_stats_running_average_window: 50,
  monitored_task_execution_thresholds: {
    default: {
      error_threshold: 90,
      warn_threshold: 80,
    },
    custom: {},
  },

  ephemeral_tasks: {
    enabled: false,
    request_capacity: 10,
  },
  unsafe: {
    exclude_task_types: [],
    authenticate_background_task_utilization: true,
  },
  event_loop_delay: {
    monitor: true,
    warn_threshold: 5000,
  },
  worker_utilization_running_average_window: 5,
  requeue_invalid_tasks: {
    enabled: false,
    delay: 3000,
    max_attempts: 20,
  },
  metrics_reset_interval: 3000,
};

/**
 * Builds a fully populated monitoring stats fixture in which every section
 * reports HealthStatus.OK.
 *
 * @param timestamp - used for `last_update`, each section's `timestamp` and
 *   `last_polling_delay`; defaults to '2023-05-09T12:59:57.000Z'.
 * @param hotTimestamp - used for `runtime.value.polling.last_successful_poll`;
 *   defaults to the resolved `timestamp`.
 * @returns a new stats object on every call.
 */
const getStatsWithTimestamp = ({
  timestamp,
  hotTimestamp,
}: {
  timestamp?: string;
  hotTimestamp?: string;
} = {}): RawMonitoringStats => {
  const coldTimestamp = timestamp ?? '2023-05-09T12:59:57.000Z';
  const lastSuccessfulPoll = hotTimestamp ?? coldTimestamp;

  // Percentile summary whose four percentiles all share one value.
  const flat = (value: number) => ({ p50: value, p90: value, p95: value, p99: value });

  return {
    last_update: coldTimestamp,
    stats: {
      configuration: {
        timestamp: coldTimestamp,
        value: {
          request_capacity: 1000,
          monitored_aggregated_stats_refresh_rate: 5000,
          monitored_stats_running_average_window: 50,
          monitored_task_execution_thresholds: {
            custom: {},
            default: {
              error_threshold: 90,
              warn_threshold: 80,
            },
          },
          poll_interval: 3000,
          max_workers: 10,
        },
        status: HealthStatus.OK,
      },
      runtime: {
        timestamp: coldTimestamp,
        value: {
          polling: {
            last_successful_poll: lastSuccessfulPoll,
            last_polling_delay: coldTimestamp,
            claim_duration: { p50: 15, p90: 152, p95: 175.99999999999972, p99: 1025 },
            duration: { p50: 135, p90: 303.8, p95: 547.3999999999978, p99: 1099 },
            claim_conflicts: flat(0),
            claim_mismatches: flat(0),
            result_frequency_percent_as_number: {
              Failed: 0,
              NoAvailableWorkers: 0,
              NoTasksClaimed: 83,
              RanOutOfCapacity: 4,
              RunningAtCapacity: 4,
              PoolFilled: 8,
            },
            persistence: {
              recurring: 95,
              non_recurring: 5,
            },
          },
          drift: { p50: 3110.5, p90: 5871, p95: 8058.400000000001, p99: 8167 },
          drift_by_type: {
            taskType1: flat(2944),
            taskType2: flat(2949),
          },
          load: { p50: 10, p90: 100, p95: 100, p99: 100 },
          execution: {
            duration: {
              taskType1: flat(49),
              taskType2: flat(68),
            },
            duration_by_persistence: {
              recurring: { p50: 53, p90: 871.4999999999999, p95: 1050.399999999999, p99: 1915 },
              non_recurring: { p50: 441.5, p90: 876, p95: 876, p99: 876 },
            },
            persistence: {
              recurring: 95,
              non_recurring: 5,
              ephemeral: 0,
            },
            result_frequency_percent_as_number: {
              taskType1: {
                Success: 100,
                RetryScheduled: 0,
                Failed: 0,
                status: HealthStatus.OK,
              },
              taskType2: {
                Success: 100,
                RetryScheduled: 0,
                Failed: 0,
                status: HealthStatus.OK,
              },
            },
          },
        },
        status: HealthStatus.OK,
      },
      workload: {
        timestamp: coldTimestamp,
        value: {
          count: 2,
          task_types: {
            taskType1: {
              count: 1,
              status: { idle: 1 },
            },
            taskType2: {
              count: 1,
              status: { idle: 1 },
            },
          },
          non_recurring: 2,
          owner_ids: 0,
          schedule: [['5m', 2]],
          overdue: 0,
          overdue_non_recurring: 0,
          estimated_schedule_density: [
            0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          ],
          capacity_requirements: {
            per_minute: 21,
            per_hour: 47,
            per_day: 33,
          },
        },
        status: HealthStatus.OK,
      },
    },
  };
};

// Unit tests for calculateHealthStatus. The clock is pinned via the Date.now
// mock at the top of the file, so "fresh" and "expired" are relative to `now`.
describe('calculateHealthStatus', () => {
  test('should return OK status when stats are fresh', () => {
    expect(calculateHealthStatus(getStatsWithTimestamp(), config, true, logger)).toEqual({
      status: HealthStatus.OK,
    });
  });

  // Empty stats yield a warning (not OK, not an error); the title states that.
  test('should return warning status when stats are not yet populated', () => {
    expect(
      calculateHealthStatus(
        {
          last_update: '2023-05-09T12:59:57.000Z',
          stats: {},
        },
        config,
        true,
        logger
      )
    ).toEqual({ status: HealthStatus.Warning, reason: `no health stats available` });
  });

  test('should return error status if any stat has status error', () => {
    const errorReason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200)`;
    const stats = getStatsWithTimestamp();
    set(stats, 'stats.capacity_estimation.reason', errorReason);

    // Flip each section to Error in turn, on a fresh copy so cases stay independent.
    ['configuration', 'runtime', 'workload'].forEach((key: string) => {
      expect(
        calculateHealthStatus(
          set(cloneDeep(stats), `stats.${key}.status`, HealthStatus.Error),
          config,
          true,
          logger
        )
      ).toEqual({ status: HealthStatus.Error, reason: errorReason });
    });
  });

  test('should return warning status if any stat has status warning', () => {
    // Fixture reason for the warning case; labelled as a Warning so it is not
    // mistaken for the error-case message above.
    const warningReason = `setting HealthStatus.Warning because assumedRequiredThroughputPerMinutePerKibana (222.85972222222222) < capacityPerMinutePerKibana (200)`;
    const stats = getStatsWithTimestamp();
    set(stats, 'stats.capacity_estimation.reason', warningReason);

    // Flip each section to Warning in turn, on a fresh copy so cases stay independent.
    ['configuration', 'runtime', 'workload'].forEach((key: string) => {
      expect(
        calculateHealthStatus(
          set(cloneDeep(stats), `stats.${key}.status`, HealthStatus.Warning),
          config,
          true,
          logger
        )
      ).toEqual({ status: HealthStatus.Warning, reason: warningReason });
    });
  });

  test('should return error if hot timestamps are expired and shouldRunTasks is true', () => {
    expect(
      calculateHealthStatus(
        getStatsWithTimestamp({ hotTimestamp: '2023-05-08T12:59:57.000Z' }),
        config,
        true,
        logger
      )
    ).toEqual({
      status: HealthStatus.Error,
      reason: 'setting HealthStatus.Error because of expired hot timestamps',
    });
  });

  test('should return ok if hot timestamps are expired but shouldRunTasks is false', () => {
    expect(
      calculateHealthStatus(
        getStatsWithTimestamp({ hotTimestamp: '2023-05-08T12:59:57.000Z' }),
        config,
        false,
        logger
      )
    ).toEqual({ status: HealthStatus.OK });
  });

  // NOTE(review): getStatsWithTimestamp defaults hotTimestamp to timestamp, so
  // this fixture also carries a stale last_successful_poll. The expected reason
  // names hot timestamps, presumably because that check fires first; confirm
  // whether a cold-only scenario needs its own case.
  test('should return error if cold timestamps are expired', () => {
    expect(
      calculateHealthStatus(
        getStatsWithTimestamp({ timestamp: '2023-05-08T12:59:57.000Z' }),
        config,
        true,
        logger
      )
    ).toEqual({
      status: HealthStatus.Error,
      reason: 'setting HealthStatus.Error because of expired hot timestamps',
    });
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 2.0.
*/

import { isString } from 'lodash';
import { isEmpty, isString } from 'lodash';
import { JsonValue } from '@kbn/utility-types';
import { Logger } from '@kbn/core/server';
import { HealthStatus, RawMonitoringStats } from '../monitoring';
Expand All @@ -19,6 +19,11 @@ export function calculateHealthStatus(
): { status: HealthStatus; reason?: string } {
const now = Date.now();

// if stats are empty, return a warning
if (isEmpty(summarizedStats.stats)) {
return { status: HealthStatus.Warning, reason: `no health stats available` };
}

// if "hot" health stats are any more stale than monitored_stats_required_freshness
// times a multiplier, consider the system unhealthy
const requiredHotStatsFreshness: number = config.monitored_stats_required_freshness * 3;
Expand Down

0 comments on commit dc3a140

Please sign in to comment.