[Logs Overview] Improve analyzer by filtering unsuitable tokens (elastic#197868)

This improves the analyzer configuration used by the logs overview
categorization feature so that it is less susceptible to numeric and
hexadecimal values that would otherwise overwhelm the rest of the content
after tokenization.

---------

Co-authored-by: Elastic Machine <[email protected]>
weltenwort and elasticmachine authored Nov 5, 2024
1 parent c83e6db commit ae4209e
Showing 2 changed files with 64 additions and 14 deletions.
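
To illustrate the problem the commit message describes: the fake log messages generated in the first file below now embed UUIDs and hexadecimal values, and a plain `standard` tokenizer turns each of those values into near-unique tokens that dominate the `categorize_text` results. A rough sketch of that effect (the sample message and the naive tokenization are illustrative only, not the analyzer's actual output):

// Illustrative only: a UUID-bearing message similar to the ones produced by
// the updated generators below. Splitting on non-alphanumeric characters
// (roughly what the `standard` tokenizer does here) breaks the UUID into
// hexadecimal segments that are effectively unique per message.
const sampleMessage = 'handler: primary backup 4f2a9c0e-7b1d-4c55-9a7e-2f0d8c3b6a11 executed';
const tokens = sampleMessage.split(/[^a-zA-Z0-9]+/).filter(Boolean);
console.log(tokens);
// [ 'handler', 'primary', 'backup', '4f2a9c0e', '7b1d', '4c55', '9a7e', '2f0d8c3b6a11', 'executed' ]
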
@@ -33,7 +33,7 @@ export const unstructuredLogMessageGenerators = {
])} successfully ${f.number.int({ max: 100000 })} times`,
],
taskStatusSuccess: (f: Faker) => [
- `${f.hacker.noun()}: ${f.word.words()} ${f.helpers.arrayElement([
+ `${f.hacker.noun()}: ${f.word.words()} ${f.string.uuid()} ${f.helpers.arrayElement([
'triggered',
'executed',
'processed',
@@ -46,7 +46,7 @@ export const unstructuredLogMessageGenerators = {
'execution',
'processing',
'handling',
- ])} of ${f.word.words()} failed at ${f.date.recent().toISOString()}`,
+ ])} of ${f.string.uuid()} failed at ${f.date.recent().toISOString()}`,
],
error: (f: Faker) => [
`${f.helpers.arrayElement([
@@ -58,7 +58,7 @@ export const unstructuredLogMessageGenerators = {
'Issue',
])}: ${f.hacker.phrase()}`,
`Stopping ${f.number.int(42)} background tasks...`,
- 'Shutting down process...',
+ `Shutting down process ${f.string.hexadecimal({ length: 16, prefix: '' })}...`,
],
restart: (f: Faker) => {
const service = f.database.engine();
@@ -72,13 +72,27 @@ export const unstructuredLogMessageGenerators = {
])}`,
];
},
- userAuthentication: (f: Faker) => [
- `User ${f.internet.userName()} ${f.helpers.arrayElement([
- 'logged in',
- 'logged out',
- 'failed to login',
- ])}`,
- ],
+ userAuthentication: (f: Faker) =>
+ f.helpers.arrayElements(
+ [
+ `User ${f.internet.userName()} (id ${f.string.uuid()}) ${f.helpers.arrayElement([
+ 'logged in',
+ 'logged out',
+ ])} at ${f.date.recent().toISOString()} from ${f.internet.ip()}:${f.internet.port()}`,
+ `Created new user ${f.internet.userName()} (id ${f.string.uuid()})`,
+ `Disabled user ${f.internet.userName()} (id ${f.string.uuid()}) due to level ${f.number.int(
+ { max: 10 }
+ )} ${f.helpers.arrayElement([
+ 'suspicious activity',
+ 'security concerns',
+ 'policy violation',
+ ])}`,
+ `Login ${f.internet.userName()} (id ${f.string.uuid()}) incorrect ${f.number.int({
+ max: 100,
+ })} times from ${f.internet.ipv6()}.`,
+ ],
+ { min: 1, max: 3 }
+ ),
networkEvent: (f: Faker) => [
`Network ${f.helpers.arrayElement([
'connection',
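
As a quick, hypothetical sanity check of the generator style above (the import and the helper name here are assumptions; only the faker calls mirror the diff):

import { faker, type Faker } from '@faker-js/faker';

// Hypothetical stand-in for one of the generators above, showing how a
// hexadecimal process id ends up embedded in the message text.
const shutdownMessage = (f: Faker) =>
  `Shutting down process ${f.string.hexadecimal({ length: 16, prefix: '' })}...`;

console.log(shutdownMessage(faker));
// e.g. "Shutting down process 3fA9c04e7B1d2C55..."
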
@@ -5,7 +5,10 @@
* 2.0.
*/

- import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
+ import {
+ AggregationsCategorizeTextAnalyzer,
+ QueryDslQueryContainer,
+ } from '@elastic/elasticsearch/lib/api/types';
import { calculateAuto } from '@kbn/calculate-auto';
import { RandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';
import moment from 'moment';
@@ -109,9 +112,7 @@ export const createCategorizationRequestParams = ({
categorize_text: {
field: messageField,
size: maxCategoriesCount,
- categorization_analyzer: {
- tokenizer: 'standard',
- },
+ categorization_analyzer: categorizationAnalyzerConfig,
...(minDocsPerCategory > 0 ? { min_doc_count: minDocsPerCategory } : {}),
},
aggs: {
@@ -149,3 +150,38 @@ export const createCategoryQuery
},
},
});

+ // This emulates the behavior of the `ml_standard` tokenizer in the ML plugin in
+ // regard to the hexadecimal and numeric tokens. The other parts pertaining to
+ // infix punctuation and file paths are not easily emulated this way.
+ // https://github.com/elastic/elasticsearch/blob/becd08da24df2af93eee28053d32929298cdccbd/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlStandardTokenizer.java#L35-L146
+ // We don't use the `ml_standard` tokenizer directly because it produces tokens
+ // that are different from the ones produced by the `standard` tokenizer upon
+ // indexing.
+ const categorizationAnalyzerConfig: AggregationsCategorizeTextAnalyzer = {
+ tokenizer: 'standard',
+ char_filter: [
+ 'first_line_with_letters',
+ // This ignores tokens that are hexadecimal numbers
+ // @ts-expect-error the official types don't support inline char filters
+ {
+ type: 'pattern_replace',
+ pattern: '\\b[a-fA-F][a-fA-F0-9]+\\b',
+ replacement: '',
+ },
+ // This ignores tokens that start with a digit
+ // @ts-expect-error the official types don't support inline char filters
+ {
+ type: 'pattern_replace',
+ pattern: '\\b\\d\\w*\\b',
+ replacement: '',
+ },
+ ],
+ filter: [
+ // @ts-expect-error the official types don't support inline token filters
+ {
+ type: 'limit',
+ max_token_count: '100',
+ },
+ ],
+ };
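
To make the effect of the two pattern_replace char filters above concrete, here is a rough approximation using JavaScript regular expressions (Elasticsearch evaluates the patterns as Java regexes, but these particular patterns behave equivalently in both engines):

// Approximation only: apply the two char filter patterns from the config above
// to a sample message. Hexadecimal runs that start with a letter are caught by
// the first pattern, tokens that start with a digit by the second, so both are
// blanked out before the standard tokenizer and categorization see them.
const hexPattern = /\b[a-fA-F][a-fA-F0-9]+\b/g;
const digitLedPattern = /\b\d\w*\b/g;

const sample = 'Shutting down process a1b2c3d4e5f60718 after 42 retries';
const filtered = sample.replace(hexPattern, '').replace(digitLedPattern, '');
console.log(filtered); // "Shutting down process  after  retries"
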
