-
Notifications
You must be signed in to change notification settings - Fork 8.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handles ns to ms conversion for event loop delay metrics #118447
Changes from 8 commits
a8f94bf
5d7e2b7
150c75e
5ce71d4
f0ec458
fca6022
15cd397
82b5206
55a613a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,15 +9,17 @@ import moment from 'moment'; | |
import type { EventLoopDelaysMonitor } from './event_loop_delays_monitor'; | ||
import type { IntervalHistogram } from '../types'; | ||
|
||
function createMockHistogram(overwrites: Partial<IntervalHistogram> = {}): IntervalHistogram { | ||
function createMockRawNsDataHistogram( | ||
overwrites: Partial<IntervalHistogram> = {} | ||
): IntervalHistogram { | ||
const now = Date.now(); | ||
|
||
return { | ||
const mockedRawCollectedDataInNs = { | ||
min: 9093120, | ||
max: 53247999, | ||
mean: 11993238.600747818, | ||
mean: 11993238, | ||
exceeds: 0, | ||
stddev: 1168191.9357543814, | ||
stddev: 1168191, | ||
fromTimestamp: moment(now).toISOString(), | ||
lastUpdatedAt: moment(now).toISOString(), | ||
percentiles: { | ||
|
@@ -28,6 +30,31 @@ function createMockHistogram(overwrites: Partial<IntervalHistogram> = {}): Inter | |
}, | ||
...overwrites, | ||
}; | ||
return mockedRawCollectedDataInNs; | ||
} | ||
|
||
function createMockMonitorDataMsHistogram( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Create mock data in ms returned from the monitor |
||
overwrites: Partial<IntervalHistogram> = {} | ||
): IntervalHistogram { | ||
const now = Date.now(); | ||
|
||
const mockedRawCollectedDataInMs = { | ||
min: 9.09312, | ||
max: 53.247999, | ||
mean: 11.993238, | ||
exceeds: 0, | ||
stddev: 1.168191, | ||
fromTimestamp: moment(now).toISOString(), | ||
lastUpdatedAt: moment(now).toISOString(), | ||
percentiles: { | ||
'50': 12.607487, | ||
'75': 12.615679, | ||
'95': 12.648447, | ||
'99': 12.713983, | ||
}, | ||
...overwrites, | ||
}; | ||
return mockedRawCollectedDataInMs; | ||
} | ||
|
||
function createMockEventLoopDelaysMonitor() { | ||
|
@@ -40,12 +67,12 @@ function createMockEventLoopDelaysMonitor() { | |
stop: jest.fn(), | ||
}); | ||
|
||
mockCollect.mockReturnValue(createMockHistogram()); | ||
mockCollect.mockReturnValue(createMockMonitorDataMsHistogram()); // this must mock the return value of the public collect method from this monitor. | ||
|
||
return new MockEventLoopDelaysMonitor(); | ||
} | ||
|
||
export const mocked = { | ||
createHistogram: createMockHistogram, | ||
createHistogram: createMockRawNsDataHistogram, // raw data as received from Node.js perf_hooks.monitorEventLoopDelay([options]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
createEventLoopDelaysMonitor: createMockEventLoopDelaysMonitor, | ||
}; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ | |
/* eslint-disable dot-notation */ | ||
jest.mock('perf_hooks'); | ||
import { monitorEventLoopDelay } from 'perf_hooks'; | ||
import { EventLoopDelaysMonitor } from './event_loop_delays_monitor'; | ||
import { EventLoopDelaysMonitor, nsToMs } from './event_loop_delays_monitor'; | ||
import { mocked } from './event_loop_delays_monitor.mocks'; | ||
|
||
describe('EventLoopDelaysMonitor', () => { | ||
|
@@ -39,10 +39,20 @@ describe('EventLoopDelaysMonitor', () => { | |
expect(eventLoopDelaysMonitor['loopMonitor'].percentile).toHaveBeenNthCalledWith(3, 95); | ||
expect(eventLoopDelaysMonitor['loopMonitor'].percentile).toHaveBeenNthCalledWith(4, 99); | ||
|
||
// mocked perf_hook returns `mocked.createHistogram()`. | ||
// mocked perf_hook returns `mocked.createNsHistogram()` that returns data in ns. | ||
// The `collect` function returns the data in ms. | ||
// This ensures that the wiring of the `collect` function is correct. | ||
const mockedHistogram = mocked.createHistogram(); | ||
expect(histogramData).toEqual(mockedHistogram); | ||
|
||
expect(histogramData.min).toEqual(nsToMs(mockedHistogram.min)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I couldn't come up with an elegant way to test all of this that was still explicit enough for us to glance over and realize there's a unit conversion taking place. Since the units already threw us once, I'm erring on the side of caution here. |
||
expect(histogramData.max).toEqual(nsToMs(mockedHistogram.max)); | ||
expect(histogramData.mean).toEqual(nsToMs(mockedHistogram.mean)); | ||
expect(histogramData.stddev).toEqual(nsToMs(mockedHistogram.stddev)); | ||
expect(histogramData.exceeds).toEqual(nsToMs(mockedHistogram.exceeds)); | ||
expect(histogramData.percentiles['50']).toEqual(nsToMs(mockedHistogram.percentiles['50'])); | ||
expect(histogramData.percentiles['75']).toEqual(nsToMs(mockedHistogram.percentiles['75'])); | ||
expect(histogramData.percentiles['95']).toEqual(nsToMs(mockedHistogram.percentiles['95'])); | ||
expect(histogramData.percentiles['99']).toEqual(nsToMs(mockedHistogram.percentiles['99'])); | ||
}); | ||
|
||
test('#reset resets histogram data', () => { | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,10 +6,23 @@ | |||||
* Side Public License, v 1. | ||||||
*/ | ||||||
|
||||||
import moment from 'moment'; | ||||||
import type { IntervalHistogram as PerfIntervalHistogram } from 'perf_hooks'; | ||||||
import { monitorEventLoopDelay } from 'perf_hooks'; | ||||||
import type { IntervalHistogram } from '../types'; | ||||||
|
||||||
/** | ||||||
* Nanosecond to milisecond conversion unit | ||||||
*/ | ||||||
export const ONE_MILLISECOND_AS_NANOSECONDS = 1000000; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely easier to parse by eye. Nice suggestion! |
||||||
|
||||||
/** | ||||||
* Converts time metric from ns to ms | ||||||
**/ | ||||||
export function nsToMs(metric: number) { | ||||||
return moment.duration(metric / ONE_MILLISECOND_AS_NANOSECONDS).asMilliseconds(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm torn between using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's use the native API whenever possible. I'd remove |
||||||
} | ||||||
|
||||||
export class EventLoopDelaysMonitor { | ||||||
private readonly loopMonitor: PerfIntervalHistogram; | ||||||
private fromTimestamp: Date; | ||||||
|
@@ -28,31 +41,41 @@ export class EventLoopDelaysMonitor { | |||||
* Collect gathers event loop delays metrics from nodejs perf_hooks.monitorEventLoopDelay | ||||||
* the histogram calculations start from the last time `reset` was called or this | ||||||
* EventLoopDelaysMonitor instance was created. | ||||||
* | ||||||
* Returns metrics in milliseconds. | ||||||
|
||||||
* @returns {IntervalHistogram} | ||||||
*/ | ||||||
|
||||||
public collect(): IntervalHistogram { | ||||||
const lastUpdated = new Date(); | ||||||
this.loopMonitor.disable(); | ||||||
const { min, max, mean, exceeds, stddev } = this.loopMonitor; | ||||||
const { | ||||||
min: minNs, | ||||||
max: maxNs, | ||||||
mean: meanNs, | ||||||
exceeds: exceedsNs, | ||||||
stddev: stddevNs, | ||||||
} = this.loopMonitor; | ||||||
|
||||||
const collectedData: IntervalHistogram = { | ||||||
min, | ||||||
max, | ||||||
mean, | ||||||
exceeds, | ||||||
stddev, | ||||||
const collectedDataMs: IntervalHistogram = { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation ended up being the easiest to read. |
||||||
min: nsToMs(minNs), | ||||||
max: nsToMs(maxNs), | ||||||
mean: nsToMs(meanNs), | ||||||
exceeds: nsToMs(exceedsNs), | ||||||
stddev: nsToMs(stddevNs), | ||||||
fromTimestamp: this.fromTimestamp.toISOString(), | ||||||
lastUpdatedAt: lastUpdated.toISOString(), | ||||||
percentiles: { | ||||||
50: this.loopMonitor.percentile(50), | ||||||
75: this.loopMonitor.percentile(75), | ||||||
95: this.loopMonitor.percentile(95), | ||||||
99: this.loopMonitor.percentile(99), | ||||||
50: nsToMs(this.loopMonitor.percentile(50)), | ||||||
75: nsToMs(this.loopMonitor.percentile(75)), | ||||||
95: nsToMs(this.loopMonitor.percentile(95)), | ||||||
99: nsToMs(this.loopMonitor.percentile(99)), | ||||||
}, | ||||||
}; | ||||||
|
||||||
this.loopMonitor.enable(); | ||||||
return collectedData; | ||||||
return collectedDataMs; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: not all data in the returned object are in milliseconds. |
||||||
} | ||||||
|
||||||
/** | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,7 @@ export const MONITOR_EVENT_LOOP_DELAYS_RESET = 24 * 60 * 60 * 1000; | |
export const MONITOR_EVENT_LOOP_DELAYS_START = 1 * 60 * 1000; | ||
|
||
/** | ||
* Mean event loop delay threshold for logging a warning. | ||
* Mean event loop delay threshold in ms for logging a warning. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added the unit here as it wasn't explicit. |
||
*/ | ||
export const MONITOR_EVENT_LOOP_WARN_THRESHOLD = 350; | ||
|
||
|
@@ -45,8 +45,3 @@ export const MONITOR_EVENT_LOOP_THRESHOLD_START = 1 * 60 * 1000; | |
* Check the event loop utilization every 30 seconds | ||
*/ | ||
export const MONITOR_EVENT_LOOP_THRESHOLD_INTERVAL = 30 * 1000; | ||
|
||
/** | ||
* Nanosecond to milisecond conversion unit | ||
*/ | ||
export const ONE_MILLISECOND_AS_NANOSECONDS = 1000000; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,19 +59,19 @@ export const eventLoopDelaysUsageSchema: MakeSchemaFrom<EventLoopDelaysUsageRepo | |
min: { | ||
type: 'long', | ||
_meta: { | ||
description: 'The minimum recorded event loop delay.', | ||
description: 'The minimum recorded event loop delay in ms.', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding units to telemetry field descriptions helps! |
||
}, | ||
}, | ||
max: { | ||
type: 'long', | ||
_meta: { | ||
description: 'The maximum recorded event loop delay.', | ||
description: 'The maximum recorded event loop delay in ms.', | ||
}, | ||
}, | ||
mean: { | ||
type: 'long', | ||
_meta: { | ||
description: 'The mean of the recorded event loop delays.', | ||
description: 'The mean of the recorded event loop delays in ms.', | ||
}, | ||
}, | ||
exceeds: { | ||
|
@@ -84,32 +84,32 @@ export const eventLoopDelaysUsageSchema: MakeSchemaFrom<EventLoopDelaysUsageRepo | |
stddev: { | ||
type: 'long', | ||
_meta: { | ||
description: 'The standard deviation of the recorded event loop delays.', | ||
description: 'The standard deviation of the recorded event loop delays in ms.', | ||
}, | ||
}, | ||
percentiles: { | ||
'50': { | ||
type: 'long', | ||
_meta: { | ||
description: 'The 50th accumulated percentile distribution', | ||
description: 'The 50th accumulated percentile distribution in ms', | ||
}, | ||
}, | ||
'75': { | ||
type: 'long', | ||
_meta: { | ||
description: 'The 75th accumulated percentile distribution', | ||
description: 'The 75th accumulated percentile distribution in ms', | ||
}, | ||
}, | ||
'95': { | ||
type: 'long', | ||
_meta: { | ||
description: 'The 95th accumulated percentile distribution', | ||
description: 'The 95th accumulated percentile distribution in ms', | ||
}, | ||
}, | ||
'99': { | ||
type: 'long', | ||
_meta: { | ||
description: 'The 99th accumulated percentile distribution', | ||
description: 'The 99th accumulated percentile distribution in ms', | ||
}, | ||
}, | ||
}, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,13 +9,11 @@ | |
import { takeUntil, finalize } from 'rxjs/operators'; | ||
import { Observable, timer } from 'rxjs'; | ||
import type { Logger } from 'kibana/server'; | ||
import moment from 'moment'; | ||
import type { UsageCounter } from '../../../../usage_collection/server'; | ||
import { | ||
MONITOR_EVENT_LOOP_THRESHOLD_START, | ||
MONITOR_EVENT_LOOP_THRESHOLD_INTERVAL, | ||
MONITOR_EVENT_LOOP_WARN_THRESHOLD, | ||
ONE_MILLISECOND_AS_NANOSECONDS, | ||
} from './constants'; | ||
import type { EventLoopDelaysMonitor } from '../../../../../core/server'; | ||
|
||
|
@@ -48,14 +46,11 @@ export function startTrackingEventLoopDelaysThreshold( | |
finalize(() => eventLoopDelaysMonitor.stop()) | ||
) | ||
.subscribe(async () => { | ||
const { mean } = eventLoopDelaysMonitor.collect(); | ||
const meanDurationMs = moment | ||
.duration(mean / ONE_MILLISECOND_AS_NANOSECONDS) | ||
.asMilliseconds(); | ||
const { mean: meanMS } = eventLoopDelaysMonitor.collect(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The data's already in |
||
|
||
if (meanDurationMs > warnThreshold) { | ||
if (meanMS > warnThreshold) { | ||
logger.warn( | ||
`Average event loop delay threshold exceeded ${warnThreshold}ms. Received ${meanDurationMs}ms. ` + | ||
`Average event loop delay threshold exceeded ${warnThreshold}ms. Received ${meanMS}ms. ` + | ||
`See https://ela.st/kibana-scaling-considerations for more information about scaling Kibana.` | ||
); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mocks the raw data returned from Node's native API in nanoseconds.