Commit

[8.x] [Auto Import] Use larger number of samples on the backend (#196233) (#196386)

# Backport

This will backport the following commits from `main` to `8.x`:
- [[Auto Import] Use larger number of samples on the backend (#196233)](https://github.com/elastic/kibana/pull/196233)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

## Release Notes

Automatic Import now analyses a larger number of samples to generate an integration.

## Summary

Closes https://github.com/elastic/security-team/issues/9844

**Added: Backend Sampling**

We pass 100 rows (these numeric values are adjustable) to the backend[^1].

[^1]: As before, deterministically selected on the frontend, see https://github.com//pull/191598

The Categorization chain now processes the samples in batches, performing a number of review cycles after the initial categorization (but not more than 5, tuned so that we stay under the 2-minute limit for a single API call).

To decide when to stop processing, we keep a list of _stable_ samples as follows:

1. The list is initially empty.
2. For each review we select a random subset of 40 samples, preferring the not-yet-stable samples.
3. After each review – when the LLM potentially gives us new processors or changes the old ones – we compare the new pipeline results with the old pipeline results.
4. Reviewed samples whose categorization did not change are added to the stable list.
5. Any samples whose categorization changed are removed from the stable list.
6. If all samples are stable, we finish processing.

**Removed: User Notification**

Using 100 samples strikes a balance between the expected complexity and the time budget we work with. We might change it in the future, possibly dynamically, making the specific number of no importance to the user. Thus we remove the truncation notification.

**Unchanged:**

- No batching is done in the related chain: it seems to work as-is.

**Refactored:**

- We centralize the sizing constants in the `x-pack/plugins/integration_assistant/common/constants.ts` file.
- We remove the unused state key `formattedSamples` and combine `modelJSONInput` back into `modelInput`.

> [!NOTE]
> I had difficulty generating new graph diagrams, so they remain unchanged.
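The stable-list bookkeeping in steps 1–6 above can be sketched as follows. This is an illustrative sketch, not the actual Kibana implementation: the function name, the serialized-result representation, and the return shape are assumptions.

```typescript
// Sketch of the stable-sample stopping rule described in the PR summary.
// Names and data shapes are illustrative assumptions.
function updateStableSamples(
  stable: Set<number>, // indices considered stable so far
  reviewed: number[], // indices re-checked in this review cycle
  oldResults: string[], // serialized categorization before the review
  newResults: string[] // serialized categorization after the review
): { stable: Set<number>; finished: boolean } {
  const next = new Set(stable);
  for (const i of reviewed) {
    if (oldResults[i] === newResults[i]) {
      next.add(i); // unchanged categorization: sample becomes stable
    } else {
      next.delete(i); // changed categorization: sample is no longer stable
    }
  }
  // Processing finishes once every sample is stable.
  return { stable: next, finished: next.size === newResults.length };
}
```

Note that a sample can leave the stable list again, so the loop only terminates early once an entire review pass produces no categorization changes.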

Co-authored-by: Ilya Nikokoshev <[email protected]>
kibanamachine and ilyannn authored Oct 15, 2024
1 parent 51b8359 commit a4938bc
Showing 31 changed files with 534 additions and 190 deletions.
@@ -162,7 +162,6 @@ export const testPipelineInvalidEcs: { pipelineResults: object[]; errors: object
export const categorizationTestState = {
rawSamples: ['{"test1": "test1"}'],
samples: ['{ "test1": "test1" }'],
formattedSamples: '{"test1": "test1"}',
ecsTypes: 'testtypes',
ecsCategories: 'testcategories',
exAnswer: 'testanswer',
@@ -173,9 +172,8 @@ export const categorizationTestState = {
previousError: 'testprevious',
previousInvalidCategorization: 'testinvalid',
pipelineResults: [{ test: 'testresult' }],
finalized: false,
hasTriedOnce: false,
reviewed: false,
previousPipelineResults: [{ test: 'testresult' }],
lastReviewedSamples: [],
currentPipeline: { test: 'testpipeline' },
currentProcessors: [
{
@@ -193,6 +191,9 @@ export const categorizationTestState = {
initialPipeline: categorizationInitialPipeline,
results: { test: 'testresults' },
samplesFormat: { name: SamplesFormatName.Values.json },
stableSamples: [],
reviewCount: 0,
finalized: false,
};

export const categorizationMockProcessors = [
@@ -140,7 +140,6 @@ export const testPipelineValidResult: { pipelineResults: object[]; errors: objec
export const relatedTestState = {
rawSamples: ['{"test1": "test1"}'],
samples: ['{ "test1": "test1" }'],
formattedSamples: '{"test1": "test1"}',
ecs: 'testtypes',
exAnswer: 'testanswer',
packageName: 'testpackage',
8 changes: 8 additions & 0 deletions x-pack/plugins/integration_assistant/common/constants.ts
@@ -36,3 +36,11 @@ export enum GenerationErrorCode {
UNSUPPORTED_LOG_SAMPLES_FORMAT = 'unsupported-log-samples-format',
UNPARSEABLE_CSV_DATA = 'unparseable-csv-data',
}

// Size limits
export const FRONTEND_SAMPLE_ROWS = 100;
export const LOG_FORMAT_DETECTION_SAMPLE_ROWS = 5;
export const CATEGORIZATION_INITIAL_BATCH_SIZE = 60;
export const CATEROGIZATION_REVIEW_BATCH_SIZE = 40;
export const CATEGORIZATION_REVIEW_MAX_CYCLES = 5;
export const CATEGORIZATION_RECURSION_LIMIT = 50;
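Taken together, the sizing constants above imply a simple stopping condition for the categorization review loop, which can be sketched as below (illustrative only; the real control flow lives in the categorization graph nodes):

```typescript
// Review-cycle budget from constants.ts, tuned to stay under the
// 2-minute limit for a single API call.
const CATEGORIZATION_REVIEW_MAX_CYCLES = 5;

// Sketch: stop reviewing when every sample is stable, or when the
// cycle budget is exhausted.
function shouldStopReviewing(
  stableCount: number,
  totalSamples: number,
  reviewCount: number
): boolean {
  return stableCount === totalSamples || reviewCount >= CATEGORIZATION_REVIEW_MAX_CYCLES;
}
```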
2 changes: 2 additions & 0 deletions x-pack/plugins/integration_assistant/common/index.ts
@@ -21,6 +21,8 @@ export {
} from './api/analyze_logs/analyze_logs_route.gen';
export { CelInputRequestBody, CelInputResponse } from './api/cel/cel_input_route.gen';

export { partialShuffleArray } from './utils';

export type {
DataStream,
InputType,
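The newly exported `partialShuffleArray` helper lets the frontend take a bounded sample without shuffling the whole array. One possible shape, assuming a partial Fisher–Yates pass (a sketch only; the real helper may differ, e.g. by using a seeded generator so the frontend selection is deterministic):

```typescript
// Partial Fisher–Yates shuffle (sketch): after the call, positions
// [start, end) hold a uniform random sample of the whole array, while
// positions before `start` are left untouched.
function partialShuffleArray<T>(array: T[], start: number, end: number): void {
  for (let i = start; i < end; i++) {
    // Swap in a random element from the not-yet-fixed tail.
    const j = i + Math.floor(Math.random() * (array.length - i));
    [array[i], array[j]] = [array[j], array[i]];
  }
}
```

Stopping the pass at `end` instead of the array's end is what keeps the cost proportional to the sample size rather than the file size.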
@@ -11,7 +11,6 @@ import { TestProvider } from '../../../../../mocks/test_provider';
import { parseNDJSON, parseJSONArray, SampleLogsInput } from './sample_logs_input';
import { ActionsProvider } from '../../state';
import { mockActions } from '../../mocks/state';
import { mockServices } from '../../../../../services/mocks/services';

const wrapper: React.FC<React.PropsWithChildren<{}>> = ({ children }) => (
<TestProvider>
@@ -165,25 +164,6 @@ describe('SampleLogsInput', () => {
samplesFormat: { name: 'json', json_path: [] },
});
});

describe('when the file has too many rows', () => {
const tooLargeLogsSample = Array(6).fill(logsSampleRaw).join(','); // 12 entries
beforeEach(async () => {
await changeFile(input, new File([`[${tooLargeLogsSample}]`], 'test.json', { type }));
});

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split(',').slice(0, 2),
samplesFormat: { name: 'json', json_path: [] },
});
});
it('should add a notification toast', () => {
expect(mockServices.notifications.toasts.addInfo).toBeCalledWith(
`The logs sample has been truncated to 10 rows.`
);
});
});
});

describe('when the file is a json array under a key', () => {
@@ -236,25 +216,6 @@ describe('SampleLogsInput', () => {
samplesFormat: { name: 'ndjson', multiline: false },
});
});

describe('when the file has too many rows', () => {
const tooLargeLogsSample = Array(6).fill(simpleNDJSON).join('\n'); // 12 entries
beforeEach(async () => {
await changeFile(input, new File([tooLargeLogsSample], 'test.json', { type }));
});

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logSamples: tooLargeLogsSample.split('\n').slice(0, 2),
samplesFormat: { name: 'ndjson', multiline: false },
});
});
it('should add a notification toast', () => {
expect(mockServices.notifications.toasts.addInfo).toBeCalledWith(
`The logs sample has been truncated to 10 rows.`
);
});
});
});

describe('when the file is a an ndjson with a single record', () => {
@@ -8,14 +8,12 @@
import React, { useCallback, useState } from 'react';
import { EuiCallOut, EuiFilePicker, EuiFormRow, EuiSpacer, EuiText } from '@elastic/eui';
import { isPlainObject } from 'lodash/fp';
import { useKibana } from '@kbn/kibana-react-plugin/public';
import type { IntegrationSettings } from '../../types';
import * as i18n from './translations';
import { useActions } from '../../state';
import type { SamplesFormat } from '../../../../../../common';
import { partialShuffleArray } from './utils';

const MaxLogsSampleRows = 10;
import { partialShuffleArray } from '../../../../../../common';
import { FRONTEND_SAMPLE_ROWS } from '../../../../../../common/constants';

/**
* Parse the logs sample file content as newline-delimited JSON (NDJSON).
@@ -83,8 +81,8 @@ export const parseJSONArray = (
* @returns Whether the array was truncated.
*/
function trimShuffleLogsSample<T>(array: T[]): boolean {
const willTruncate = array.length > MaxLogsSampleRows;
const numElements = willTruncate ? MaxLogsSampleRows : array.length;
const willTruncate = array.length > FRONTEND_SAMPLE_ROWS;
const numElements = willTruncate ? FRONTEND_SAMPLE_ROWS : array.length;

partialShuffleArray(array, 1, numElements);

@@ -215,7 +213,6 @@ interface SampleLogsInputProps {
}

export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSettings }) => {
const { notifications } = useKibana().services;
const { setIntegrationSettings } = useActions();
const [isParsing, setIsParsing] = useState(false);
const [sampleFileError, setSampleFileError] = useState<string>();
@@ -266,11 +263,7 @@ export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSe
return;
}

const { samplesFormat, logSamples, isTruncated } = prepareResult;

if (isTruncated) {
notifications?.toasts.addInfo(i18n.LOGS_SAMPLE_TRUNCATED(MaxLogsSampleRows));
}
const { samplesFormat, logSamples } = prepareResult;

setIntegrationSettings({
...integrationSettings,
@@ -293,7 +286,7 @@ export const SampleLogsInput = React.memo<SampleLogsInputProps>(({ integrationSe

reader.readAsText(logsSampleFile);
},
[integrationSettings, setIntegrationSettings, notifications?.toasts, setIsParsing]
[integrationSettings, setIntegrationSettings, setIsParsing]
);
return (
<EuiFormRow
@@ -110,11 +110,6 @@ export const LOGS_SAMPLE_DESCRIPTION = i18n.translate(
defaultMessage: 'Drag and drop a file or Browse files.',
}
);
export const LOGS_SAMPLE_TRUNCATED = (maxRows: number) =>
i18n.translate('xpack.integrationAssistant.step.dataStream.logsSample.truncatedWarning', {
values: { maxRows },
defaultMessage: `The logs sample has been truncated to {maxRows} rows.`,
});
export const LOGS_SAMPLE_ERROR = {
CAN_NOT_READ: i18n.translate(
'xpack.integrationAssistant.step.dataStream.logsSample.errorCanNotRead',
@@ -11,6 +11,8 @@ import { combineProcessors } from '../../util/processors';
import { CATEGORIZATION_EXAMPLE_PROCESSORS } from './constants';
import { CATEGORIZATION_MAIN_PROMPT } from './prompts';
import type { CategorizationNodeParams } from './types';
import { selectResults } from './util';
import { CATEGORIZATION_INITIAL_BATCH_SIZE } from '../../../common/constants';

export async function handleCategorization({
state,
@@ -19,8 +21,15 @@
const categorizationMainPrompt = CATEGORIZATION_MAIN_PROMPT;
const outputParser = new JsonOutputParser();
const categorizationMainGraph = categorizationMainPrompt.pipe(model).pipe(outputParser);

const [pipelineResults, _] = selectResults(
state.pipelineResults,
CATEGORIZATION_INITIAL_BATCH_SIZE,
new Set(state.stableSamples)
);

const currentProcessors = (await categorizationMainGraph.invoke({
pipeline_results: JSON.stringify(state.pipelineResults, null, 2),
pipeline_results: JSON.stringify(pipelineResults, null, 2),
example_processors: CATEGORIZATION_EXAMPLE_PROCESSORS,
ex_answer: state?.exAnswer,
ecs_categories: state?.ecsCategories,
@@ -36,7 +45,7 @@
return {
currentPipeline,
currentProcessors,
hasTriedOnce: true,
lastReviewedSamples: [],
lastExecutedChain: 'categorization',
};
}
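The `selectResults` call above picks a bounded batch of pipeline results, preferring samples that are not yet stable. A minimal sketch of that selection follows; the PR says the real helper picks a random subset, while this version takes candidates in index order for clarity, so the name and exact behavior are assumptions:

```typescript
// Hypothetical sketch of batch selection that prefers not-yet-stable
// samples; the actual selectResults in Kibana may differ (e.g. randomized).
function selectResults<T>(
  results: T[],
  batchSize: number,
  stableSamples: Set<number>
): [T[], number[]] {
  const indices = results.map((_, i) => i);
  // Unstable samples first, then stable ones as filler up to batchSize.
  const preferred = [
    ...indices.filter((i) => !stableSamples.has(i)),
    ...indices.filter((i) => stableSamples.has(i)),
  ].slice(0, batchSize);
  return [preferred.map((i) => results[i]), preferred];
}
```

Returning the chosen indices alongside the values lets the caller later compare old and new pipeline results for exactly the reviewed samples.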
@@ -4,6 +4,7 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export const ECS_CATEGORIES = {
api: 'Covers events from API calls, including those from OS and network protocols. Allowed event.type combinations: access, admin, allowed, change, creation, deletion, denied, end, info, start, user',
authentication:
Expand Down
@@ -39,7 +39,6 @@ export async function handleErrors({
return {
currentPipeline,
currentProcessors,
reviewed: false,
lastExecutedChain: 'error',
};
}
@@ -25,6 +25,7 @@ import { handleReview } from './review';
import { handleCategorization } from './categorization';
import { handleErrors } from './errors';
import { handleInvalidCategorization } from './invalid';
import { handleUpdateStableSamples } from './stable';
import { testPipeline, combineProcessors } from '../../util';
import {
ActionsClientChatOpenAI,
Expand All @@ -39,6 +40,7 @@ jest.mock('./errors');
jest.mock('./review');
jest.mock('./categorization');
jest.mock('./invalid');
jest.mock('./stable');

jest.mock('../../util/pipeline', () => ({
testPipeline: jest.fn(),
@@ -74,7 +76,8 @@ describe('runCategorizationGraph', () => {
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'categorization',
};
@@ -90,7 +93,8 @@
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'error',
};
@@ -106,7 +110,8 @@
return {
currentPipeline,
currentProcessors,
reviewed: false,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'invalidCategorization',
};
@@ -122,11 +127,29 @@
return {
currentProcessors,
currentPipeline,
reviewed: true,
stableSamples: [],
reviewCount: 0,
finalized: false,
lastExecutedChain: 'review',
};
});
// After the review it should route to modelOutput and finish.
(handleUpdateStableSamples as jest.Mock)
.mockResolvedValueOnce({
stableSamples: [],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
})
.mockResolvedValueOnce({
stableSamples: [],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
})
.mockResolvedValueOnce({
stableSamples: [0],
finalized: false,
lastExecutedChain: 'handleUpdateStableSamples',
});
});

it('Ensures that the graph compiles', async () => {
