Skip to content

Commit

Permalink
[Response Ops][Alerting] Log ES query when query fails (elastic#187785)
Browse files Browse the repository at this point in the history
Resolves elastic#185889

## Summary

Logs ES query at `warn` level when query throws an error. This aids in
debugging query errors during rule execution without having to turn
debug logging on.

Also added a wrapper to the core logger for use within the alerting task
runner that will include the rule ID and rule type ID as tags in any
message logged during rule execution. This should allow us to filter for
log messages for a specific rule.

## To verify

1. Create a rule where the query will fail and let it run (I created an
ES query rule with bad DSL)
2. Logs should show the query that was sent:

Note the error log already existed. The warning log was added in this
PR.
```
[2024-07-08T14:58:39.442-04:00][WARN ][plugins.alerting.es-query] executing query for rule .es-query:7240c70d-00c6-4f44-9ef2-5fd496bfe46b in space default - {"allow_no_indices":true,"index":[".kibana-event-log*"],"size":100,"ignore_unavailable":true,"track_total_hits":true,"body":{"docvalue_fields":[{"field":"@timestamp","format":"strict_date_optional_time"}],"query":{"bool":{"filter":[{"boop":{}},{"bool":{"filter":[{"range":{"@timestamp":{"lte":"2024-07-08T18:58:39.440Z","gte":"2024-07-08T18:53:39.440Z","format":"strict_date_optional_time"}}}]}}]}},"aggs":{},"sort":[{"@timestamp":{"order":"desc","format":"strict_date_optional_time||epoch_millis"}}]}} - with options {"meta":true} and 300000ms requestTimeout
[2024-07-08T14:58:39.461-04:00][ERROR][plugins.alerting.es-query] Executing Rule default:.es-query:7240c70d-00c6-4f44-9ef2-5fd496bfe46b has resulted in Error: x_content_parse_exception
	Caused by:
		parsing_exception: unknown query [boop] did you mean [bool]?
	Root causes:
		parsing_exception: unknown query [boop] did you mean [bool]?, caused by: "unknown query [boop] did you mean [bool]?,[1:117] unknown field [boop]" - ResponseError: x_content_parse_exception
	Caused by:
		parsing_exception: unknown query [boop] did you mean [bool]?
	Root causes:
		parsing_exception: unknown query [boop] did you mean [bool]?
    at KibanaTransport.request (/Users/ying/Code/kibana/node_modules/@elastic/transport/src/Transport.ts:564:17)
    at processTicksAndRejections (node:internal/process/task_queues:95:5)
```

---------

Co-authored-by: Elastic Machine <[email protected]>
  • Loading branch information
ymao1 and elasticmachine authored Jul 15, 2024
1 parent 7d6c18a commit de13fca
Show file tree
Hide file tree
Showing 9 changed files with 436 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {} and 5000ms requestTimeout`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses asCurrentUser when specified', async () => {
Expand All @@ -100,6 +101,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {} and 5000ms requestTimeout`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses search options when specified', async () => {
Expand All @@ -126,9 +128,14 @@ describe('wrapScopedClusterClient', () => {
});
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {\"ignore\":[404],\"requestTimeout\":10000} and 5000ms requestTimeout`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('re-throws error when an error is thrown', async () => {
test('re-throws error and logs query when an error is thrown', async () => {
const { abortController, scopedClusterClient, childClient } = getMockClusterClients();

childClient.search.mockRejectedValueOnce(new Error('something went wrong!'));
Expand All @@ -141,8 +148,15 @@ describe('wrapScopedClusterClient', () => {
}).client();

await expect(
wrappedSearchClient.asInternalUser.search
wrappedSearchClient.asInternalUser.search(esQuery)
).rejects.toThrowErrorMatchingInlineSnapshot(`"something went wrong!"`);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {}`
);
expect(logger.warn).toHaveBeenCalledWith(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {}`
);
});

test('handles empty search result object', async () => {
Expand All @@ -169,6 +183,11 @@ describe('wrapScopedClusterClient', () => {
const stats = wrappedSearchClientFactory.getMetrics();
expect(stats.numSearches).toEqual(1);
expect(stats.esSearchDurationMs).toEqual(0);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('keeps track of number of queries', async () => {
Expand Down Expand Up @@ -200,6 +219,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('throws error when search throws abort error', async () => {
Expand All @@ -216,10 +236,15 @@ describe('wrapScopedClusterClient', () => {
}).client();

await expect(
abortableSearchClient.asInternalUser.search
abortableSearchClient.asInternalUser.search(esQuery)
).rejects.toThrowErrorMatchingInlineSnapshot(
`"Search has been aborted due to cancelled execution"`
);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing query for rule .test-rule-type:abcdefg in space my-space - {\"body\":{\"query\":{\"bool\":{\"filter\":{\"range\":{\"@timestamp\":{\"gte\":0}}}}}}} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});
});

Expand Down Expand Up @@ -248,6 +273,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing eql query for rule .test-rule-type:abcdefg in space my-space - {"index":"foo","query":"process where process.name == \\"regsvr32.exe\\""} - with options {} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses asCurrentUser when specified', async () => {
Expand All @@ -273,6 +299,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing eql query for rule .test-rule-type:abcdefg in space my-space - {"index":"foo","query":"process where process.name == \\"regsvr32.exe\\""} - with options {} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses search options when specified', async () => {
Expand All @@ -299,9 +326,14 @@ describe('wrapScopedClusterClient', () => {
});
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing eql query for rule .test-rule-type:abcdefg in space my-space - {"index":"foo","query":"process where process.name == \\"regsvr32.exe\\""} - with options {"ignore":[404],"requestTimeout":10000} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('re-throws error when an error is thrown', async () => {
test('re-throws error and logs when an error is thrown', async () => {
const { abortController, scopedClusterClient, childClient } = getMockClusterClients();

childClient.eql.search.mockRejectedValueOnce(new Error('something went wrong!'));
Expand All @@ -314,8 +346,15 @@ describe('wrapScopedClusterClient', () => {
}).client();

await expect(
wrappedSearchClient.asInternalUser.eql.search
wrappedSearchClient.asInternalUser.eql.search(eqlQuery)
).rejects.toThrowErrorMatchingInlineSnapshot(`"something went wrong!"`);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing eql query for rule .test-rule-type:abcdefg in space my-space - {"index":"foo","query":"process where process.name == \\"regsvr32.exe\\""} - with options {}'
);
expect(logger.warn).toHaveBeenCalledWith(
'executing eql query for rule .test-rule-type:abcdefg in space my-space - {"index":"foo","query":"process where process.name == \\"regsvr32.exe\\""} - with options {}'
);
});

test('keeps track of number of queries', async () => {
Expand Down Expand Up @@ -347,6 +386,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing eql query for rule .test-rule-type:abcdefg in space my-space - {\"index\":\"foo\",\"query\":\"process where process.name == \\\"regsvr32.exe\\\"\"} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('throws error when eql search throws abort error', async () => {
Expand All @@ -365,10 +405,15 @@ describe('wrapScopedClusterClient', () => {
}).client();

await expect(
abortableSearchClient.asInternalUser.eql.search
abortableSearchClient.asInternalUser.eql.search(eqlQuery)
).rejects.toThrowErrorMatchingInlineSnapshot(
`"EQL search has been aborted due to cancelled execution"`
);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing eql query for rule .test-rule-type:abcdefg in space my-space - {\"index\":\"foo\",\"query\":\"process where process.name == \\\"regsvr32.exe\\\"\"} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});
});

Expand All @@ -395,9 +440,11 @@ describe('wrapScopedClusterClient', () => {
});
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {"method":"POST","path":"/_query","body":{"query":"from .kibana_task_manager"}} - with options {} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses asCurrentUser when specified', async () => {
Expand All @@ -423,6 +470,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {"method":"POST","path":"/_query","body":{"query":"from .kibana_task_manager"}} - with options {} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses search options when specified', async () => {
Expand All @@ -449,9 +497,14 @@ describe('wrapScopedClusterClient', () => {
});
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {"method":"POST","path":"/_query","body":{"query":"from .kibana_task_manager"}} - with options {"ignore":[404],"requestTimeout":10000} and 5000ms requestTimeout'
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('re-throws error when an error is thrown', async () => {
test('re-throws error and logs when an error is thrown', async () => {
const { abortController, scopedClusterClient, childClient } = getMockClusterClients();

childClient.transport.request.mockRejectedValueOnce(new Error('something went wrong!'));
Expand All @@ -466,6 +519,13 @@ describe('wrapScopedClusterClient', () => {
await expect(
wrappedSearchClient.asInternalUser.transport.request({ method: 'POST', path: '/_query' })
).rejects.toThrowErrorMatchingInlineSnapshot(`"something went wrong!"`);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
'executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {"method":"POST","path":"/_query"} - with options {}'
);
expect(logger.warn).toHaveBeenCalledWith(
'executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {"method":"POST","path":"/_query"} - with options {}'
);
});

test('keeps track of number of queries', async () => {
Expand Down Expand Up @@ -497,6 +557,7 @@ describe('wrapScopedClusterClient', () => {
expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {\"method\":\"POST\",\"path\":\"/_query\",\"body\":{\"query\":\"from .kibana_task_manager\"}} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});

test('throws error when es|ql search throws abort error', async () => {
Expand All @@ -522,6 +583,11 @@ describe('wrapScopedClusterClient', () => {
).rejects.toThrowErrorMatchingInlineSnapshot(
`"ES|QL search has been aborted due to cancelled execution"`
);

expect(loggingSystemMock.collect(logger).debug[0][0]).toEqual(
`executing ES|QL query for rule .test-rule-type:abcdefg in space my-space - {\"method\":\"POST\",\"path\":\"/_query\"} - with options {}`
);
expect(logger.warn).not.toHaveBeenCalled();
});
});

Expand All @@ -542,12 +608,13 @@ describe('wrapScopedClusterClient', () => {

expect(asInternalUserWrappedSearchFn).toHaveBeenCalledWith(
{ method: '', path: '' },
{
requestTimeout: 5000,
}
{ requestTimeout: 5000 }
);
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug).toEqual([]);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses asCurrentUser when specified', async () => {
Expand All @@ -572,6 +639,9 @@ describe('wrapScopedClusterClient', () => {
);
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug).toEqual([]);
expect(logger.warn).not.toHaveBeenCalled();
});

test('uses search options when specified', async () => {
Expand Down Expand Up @@ -603,6 +673,9 @@ describe('wrapScopedClusterClient', () => {
);
expect(scopedClusterClient.asInternalUser.search).not.toHaveBeenCalled();
expect(scopedClusterClient.asCurrentUser.search).not.toHaveBeenCalled();

expect(loggingSystemMock.collect(logger).debug).toEqual([]);
expect(logger.warn).not.toHaveBeenCalled();
});

test('re-throws error when an error is thrown', async () => {
Expand All @@ -620,6 +693,9 @@ describe('wrapScopedClusterClient', () => {
await expect(
wrappedSearchClient.asInternalUser.transport.request({ method: '', path: '' })
).rejects.toThrowErrorMatchingInlineSnapshot(`"something went wrong!"`);

expect(loggingSystemMock.collect(logger).debug).toEqual([]);
expect(logger.warn).not.toHaveBeenCalled();
});

test(`doesn't throw error when non es|ql request throws an error`, async () => {
Expand All @@ -641,6 +717,9 @@ describe('wrapScopedClusterClient', () => {
path: '/_cat/indices',
})
).rejects.toThrowErrorMatchingInlineSnapshot(`"Some other error"`);

expect(loggingSystemMock.collect(logger).debug).toEqual([]);
expect(logger.warn).not.toHaveBeenCalled();
});
});
});
Expand Down
33 changes: 30 additions & 3 deletions x-pack/plugins/alerting/server/lib/wrap_scoped_cluster_client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ function getWrappedTransportRequestFn(opts: WrapEsClientOpts) {
): Promise<TResponse | TransportResult<TResponse, TContext>> {
// Wrap ES|QL requests with an abort signal
if (params.method === 'POST' && params.path === '/_query') {
let requestOptions: TransportRequestOptions = {};
try {
const requestOptions = options ?? {};
requestOptions = options ?? {};
const start = Date.now();
opts.logger.debug(
() =>
Expand Down Expand Up @@ -193,6 +194,14 @@ function getWrappedTransportRequestFn(opts: WrapEsClientOpts) {
if (opts.abortController.signal.aborted) {
throw new Error('ES|QL search has been aborted due to cancelled execution');
}

opts.logger.warn(
`executing ES|QL query for rule ${opts.rule.alertTypeId}:${opts.rule.id} in space ${
opts.rule.spaceId
} - ${JSON.stringify(params)} - with options ${JSON.stringify(requestOptions)}${
requestTimeout ? ` and ${requestTimeout}ms requestTimeout` : ''
}`
);
throw e;
}
}
Expand Down Expand Up @@ -232,8 +241,9 @@ function getWrappedEqlSearchFn(opts: WrapEsClientOpts) {
params: EqlSearchRequest | EqlSearchRequestWithBody,
options?: TransportRequestOptions
): Promise<EqlSearchResponse<TEvent> | TransportResult<EqlSearchResponse<TEvent>, unknown>> {
let searchOptions: TransportRequestOptions = {};
try {
const searchOptions = options ?? {};
searchOptions = options ?? {};
const start = Date.now();
opts.logger.debug(
() =>
Expand Down Expand Up @@ -271,6 +281,14 @@ function getWrappedEqlSearchFn(opts: WrapEsClientOpts) {
if (opts.abortController.signal.aborted) {
throw new Error('EQL search has been aborted due to cancelled execution');
}

opts.logger.warn(
`executing eql query for rule ${opts.rule.alertTypeId}:${opts.rule.id} in space ${
opts.rule.spaceId
} - ${JSON.stringify(params)} - with options ${JSON.stringify(searchOptions)}${
requestTimeout ? ` and ${requestTimeout}ms requestTimeout` : ''
}`
);
throw e;
}
}
Expand Down Expand Up @@ -314,8 +332,9 @@ function getWrappedSearchFn(opts: WrapEsClientOpts) {
| TransportResult<SearchResponse<TDocument, TAggregations>, unknown>
| SearchResponse<TDocument, TAggregations>
> {
let searchOptions: TransportRequestOptions = {};
try {
const searchOptions = options ?? {};
searchOptions = options ?? {};
const start = Date.now();
opts.logger.debug(
() =>
Expand Down Expand Up @@ -356,6 +375,14 @@ function getWrappedSearchFn(opts: WrapEsClientOpts) {
if (opts.abortController.signal.aborted) {
throw new Error('Search has been aborted due to cancelled execution');
}

opts.logger.warn(
`executing query for rule ${opts.rule.alertTypeId}:${opts.rule.id} in space ${
opts.rule.spaceId
} - ${JSON.stringify(params)} - with options ${JSON.stringify(searchOptions)}${
requestTimeout ? ` and ${requestTimeout}ms requestTimeout` : ''
}`
);
throw e;
}
}
Expand Down
1 change: 1 addition & 0 deletions x-pack/plugins/alerting/server/task_runner/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
export { partiallyUpdateAdHocRun } from './partially_update_ad_hoc_run';
export { processRunResults } from './process_run_result';
export { withAlertingSpan } from './with_alerting_span';
export { createTaskRunnerLogger } from './task_runner_logger';
Loading

0 comments on commit de13fca

Please sign in to comment.