[Logs Overview] Improve analyzer by filtering unsuitable tokens (elastic#197868)

This improves the analyzer configuration used by the logs overview
categorization feature so that it is less susceptible to numeric and
hexadecimal values that would otherwise overwhelm the rest of the content
after tokenization.

---------

Co-authored-by: Elastic Machine <[email protected]>
weltenwort and elasticmachine authored Nov 5, 2024
1 parent c83e6db commit ae4209e
Showing 2 changed files with 64 additions and 14 deletions.
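
To illustrate the problem the commit message describes: the fake log messages generated in the first file below now embed UUIDs and hexadecimal values, and a plain `standard` tokenizer turns each of those values into near-unique tokens that dominate the `categorize_text` results. A rough sketch of that effect (the sample message and the naive tokenization are illustrative only, not the analyzer's actual output):

// Illustrative only: a UUID-bearing message similar to the ones produced by
// the updated generators below. Splitting on non-alphanumeric characters
// (roughly what the `standard` tokenizer does here) breaks the UUID into
// hexadecimal segments that are effectively unique per message.
const sampleMessage = 'handler: primary backup 4f2a9c0e-7b1d-4c55-9a7e-2f0d8c3b6a11 executed';
const tokens = sampleMessage.split(/[^a-zA-Z0-9]+/).filter(Boolean);
console.log(tokens);
// [ 'handler', 'primary', 'backup', '4f2a9c0e', '7b1d', '4c55', '9a7e', '2f0d8c3b6a11', 'executed' ]
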
@@ -33,7 +33,7 @@ export const unstructuredLogMessageGenerators = {
])} successfully ${f.number.int({ max: 100000 })} times`,
],
taskStatusSuccess: (f: Faker) => [
- `${f.hacker.noun()}: ${f.word.words()} ${f.helpers.arrayElement([
+ `${f.hacker.noun()}: ${f.word.words()} ${f.string.uuid()} ${f.helpers.arrayElement([
'triggered',
'executed',
'processed',
@@ -46,7 +46,7 @@ export const unstructuredLogMessageGenerators = {
'execution',
'processing',
'handling',
- ])} of ${f.word.words()} failed at ${f.date.recent().toISOString()}`,
+ ])} of ${f.string.uuid()} failed at ${f.date.recent().toISOString()}`,
],
error: (f: Faker) => [
`${f.helpers.arrayElement([
@@ -58,7 +58,7 @@ export const unstructuredLogMessageGenerators = {
'Issue',
])}: ${f.hacker.phrase()}`,
`Stopping ${f.number.int(42)} background tasks...`,
- 'Shutting down process...',
+ `Shutting down process ${f.string.hexadecimal({ length: 16, prefix: '' })}...`,
],
restart: (f: Faker) => {
const service = f.database.engine();
@@ -72,13 +72,27 @@ export const unstructuredLogMessageGenerators = {
])}`,
];
},
- userAuthentication: (f: Faker) => [
- `User ${f.internet.userName()} ${f.helpers.arrayElement([
- 'logged in',
- 'logged out',
- 'failed to login',
- ])}`,
- ],
+ userAuthentication: (f: Faker) =>
+ f.helpers.arrayElements(
+ [
+ `User ${f.internet.userName()} (id ${f.string.uuid()}) ${f.helpers.arrayElement([
+ 'logged in',
+ 'logged out',
+ ])} at ${f.date.recent().toISOString()} from ${f.internet.ip()}:${f.internet.port()}`,
+ `Created new user ${f.internet.userName()} (id ${f.string.uuid()})`,
+ `Disabled user ${f.internet.userName()} (id ${f.string.uuid()}) due to level ${f.number.int(
+ { max: 10 }
+ )} ${f.helpers.arrayElement([
+ 'suspicious activity',
+ 'security concerns',
+ 'policy violation',
+ ])}`,
+ `Login ${f.internet.userName()} (id ${f.string.uuid()}) incorrect ${f.number.int({
+ max: 100,
+ })} times from ${f.internet.ipv6()}.`,
+ ],
+ { min: 1, max: 3 }
+ ),
networkEvent: (f: Faker) => [
`Network ${f.helpers.arrayElement([
'connection',
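
As a quick, hypothetical sanity check of the generator style above (the import and the helper name here are assumptions; only the faker calls mirror the diff):

import { faker, type Faker } from '@faker-js/faker';

// Hypothetical stand-in for one of the generators above, showing how a
// hexadecimal process id ends up embedded in the message text.
const shutdownMessage = (f: Faker) =>
  `Shutting down process ${f.string.hexadecimal({ length: 16, prefix: '' })}...`;

console.log(shutdownMessage(faker));
// e.g. "Shutting down process 3fA9c04e7B1d2C55..."
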
@@ -5,7 +5,10 @@
* 2.0.
*/

- import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
+ import {
+ AggregationsCategorizeTextAnalyzer,
+ QueryDslQueryContainer,
+ } from '@elastic/elasticsearch/lib/api/types';
import { calculateAuto } from '@kbn/calculate-auto';
import { RandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';
import moment from 'moment';
@@ -109,9 +112,7 @@ export const createCategorizationRequestParams = ({
categorize_text: {
field: messageField,
size: maxCategoriesCount,
- categorization_analyzer: {
- tokenizer: 'standard',
- },
+ categorization_analyzer: categorizationAnalyzerConfig,
...(minDocsPerCategory > 0 ? { min_doc_count: minDocsPerCategory } : {}),
},
aggs: {
@@ -149,3 +150,38 @@ export const createCategoryQuery
},
},
});

+ // This emulates the behavior of the `ml_standard` tokenizer in the ML plugin in
+ // regard to the hexadecimal and numeric tokens. The other parts pertaining to
+ // infix punctuation and file paths are not easily emulated this way.
+ // https://github.com/elastic/elasticsearch/blob/becd08da24df2af93eee28053d32929298cdccbd/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlStandardTokenizer.java#L35-L146
+ // We don't use the `ml_standard` tokenizer directly because it produces tokens
+ // that are different from the ones produced by the `standard` tokenizer upon
+ // indexing.
+ const categorizationAnalyzerConfig: AggregationsCategorizeTextAnalyzer = {
+ tokenizer: 'standard',
+ char_filter: [
+ 'first_line_with_letters',
+ // This ignores tokens that are hexadecimal numbers
+ // @ts-expect-error the official types don't support inline char filters
+ {
+ type: 'pattern_replace',
+ pattern: '\\b[a-fA-F][a-fA-F0-9]+\\b',
+ replacement: '',
+ },
+ // This ignores tokens that start with a digit
+ // @ts-expect-error the official types don't support inline char filters
+ {
+ type: 'pattern_replace',
+ pattern: '\\b\\d\\w*\\b',
+ replacement: '',
+ },
+ ],
+ filter: [
+ // @ts-expect-error the official types don't support inline token filters
+ {
+ type: 'limit',
+ max_token_count: '100',
+ },
+ ],
+ };
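
To make the effect of the two pattern_replace char filters above concrete, here is a rough approximation using JavaScript regular expressions (Elasticsearch evaluates the patterns as Java regexes, but these particular patterns behave equivalently in both engines):

// Approximation only: apply the two char filter patterns from the config above
// to a sample message. Hexadecimal runs that start with a letter are caught by
// the first pattern, tokens that start with a digit by the second, so both are
// blanked out before the standard tokenizer and categorization see them.
const hexPattern = /\b[a-fA-F][a-fA-F0-9]+\b/g;
const digitLedPattern = /\b\d\w*\b/g;

const sample = 'Shutting down process a1b2c3d4e5f60718 after 42 retries';
const filtered = sample.replace(hexPattern, '').replace(digitLedPattern, '');
console.log(filtered); // "Shutting down process  after  retries"
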
