diff --git a/package.json b/package.json index da6934393d6da..a4dc51d2f4fbd 100644 --- a/package.json +++ b/package.json @@ -1463,6 +1463,7 @@ "@kbn/picomatcher": "link:packages/kbn-picomatcher", "@kbn/plugin-generator": "link:packages/kbn-plugin-generator", "@kbn/plugin-helpers": "link:packages/kbn-plugin-helpers", + "@kbn/product-doc-artifact-builder": "link:x-pack/packages/ai-infra/product-doc-artifact-builder", "@kbn/repo-file-maps": "link:packages/kbn-repo-file-maps", "@kbn/repo-linter": "link:packages/kbn-repo-linter", "@kbn/repo-path": "link:packages/kbn-repo-path", diff --git a/scripts/build_product_doc_artifacts.js b/scripts/build_product_doc_artifacts.js new file mode 100644 index 0000000000000..0d6bea2a6e775 --- /dev/null +++ b/scripts/build_product_doc_artifacts.js @@ -0,0 +1,11 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +require('../src/setup_node_env'); +require('@kbn/product-doc-artifact-builder').runScript(); diff --git a/src/dev/precommit_hook/casing_check_config.js b/src/dev/precommit_hook/casing_check_config.js index 37ef5ebe6c233..52d7b1c848628 100644 --- a/src/dev/precommit_hook/casing_check_config.js +++ b/src/dev/precommit_hook/casing_check_config.js @@ -116,6 +116,7 @@ export const IGNORE_DIRECTORY_GLOBS = [ 'src/babel-*', 'packages/*', 'packages/core/*/*', + 'x-pack/packages/ai-infra/*', 'packages/kbn-pm/src/utils/__fixtures__/*', 'packages/kbn-check-prod-native-modules-cli/integration_tests/__fixtures__/*/node_modules/*', 'x-pack/dev-tools', diff --git a/tsconfig.base.json b/tsconfig.base.json index 2e11f197f0ffe..12df74345a444 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -1366,6 +1366,8 @@ "@kbn/presentation-publishing/*": ["packages/presentation/presentation_publishing/*"], "@kbn/presentation-util-plugin": ["src/plugins/presentation_util"], "@kbn/presentation-util-plugin/*": ["src/plugins/presentation_util/*"], + "@kbn/product-doc-artifact-builder": ["x-pack/packages/ai-infra/product-doc-artifact-builder"], + "@kbn/product-doc-artifact-builder/*": ["x-pack/packages/ai-infra/product-doc-artifact-builder/*"], "@kbn/profiling-data-access-plugin": ["x-pack/plugins/observability_solution/profiling_data_access"], "@kbn/profiling-data-access-plugin/*": ["x-pack/plugins/observability_solution/profiling_data_access/*"], "@kbn/profiling-plugin": ["x-pack/plugins/observability_solution/profiling"], @@ -2084,4 +2086,4 @@ "@kbn/ambient-storybook-types" ] } -} \ No newline at end of file +} diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/README.md b/x-pack/packages/ai-infra/product-doc-artifact-builder/README.md new file mode 100644 index 0000000000000..eb64d53b5b8f7 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/README.md @@ -0,0 +1,3 @@ +# @kbn/product-doc-artifact-builder + +Script to build the knowledge base 
artifacts diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/index.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/index.ts new file mode 100644 index 0000000000000..c84a0a64540e9 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/index.ts @@ -0,0 +1,8 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +export { runScript } from './src/command'; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/jest.config.js b/x-pack/packages/ai-infra/product-doc-artifact-builder/jest.config.js new file mode 100644 index 0000000000000..9ada1460ee7aa --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/jest.config.js @@ -0,0 +1,12 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +module.exports = { + preset: '@kbn/test/jest_node', + rootDir: '../../../..', + roots: ['/x-pack/packages/ai-infra/product-doc-artifact-builder'], +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/kibana.jsonc b/x-pack/packages/ai-infra/product-doc-artifact-builder/kibana.jsonc new file mode 100644 index 0000000000000..7b5a85fd5b7c8 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/kibana.jsonc @@ -0,0 +1,6 @@ +{ + "type": "shared-common", + "id": "@kbn/product-doc-artifact-builder", + "owner": "@elastic/appex-ai-infra", + "devOnly": true +} diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/package.json b/x-pack/packages/ai-infra/product-doc-artifact-builder/package.json new file mode 100644 index 0000000000000..b1a22882bb329 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/package.json @@ -0,0 +1,6 @@ +{ + "name": "@kbn/product-doc-artifact-builder", + "private": true, + "version": "1.0.0", + "license": "Elastic License 2.0" +} \ No newline at end of file diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/artifact_name.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/artifact_name.ts new file mode 100644 index 0000000000000..678b17088c7b4 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/artifact_name.ts @@ -0,0 +1,16 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +export const getArtifactName = ({ + productName, + productVersion, +}: { + productName: string; + productVersion: string; +}): string => { + return `kibana-kb-${productName}-${productVersion}.zip`.toLowerCase(); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/manifest.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/manifest.ts new file mode 100644 index 0000000000000..cbebcdc22981b --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/manifest.ts @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +export interface ArtifactManifest { + formatVersion: string; + productName: string; + productVersion: string; +} + +export const getArtifactManifest = ({ + productName, + stackVersion, +}: { + productName: string; + stackVersion: string; +}): ArtifactManifest => { + return { + formatVersion: '1.0.0', + productName, + productVersion: stackVersion, + }; +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/mappings.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/mappings.ts new file mode 100644 index 0000000000000..ae84ae60616a3 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/mappings.ts @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types'; + +export const getArtifactMappings = (inferenceEndpoint: string): MappingTypeMapping => { + return { + dynamic: 'strict', + properties: { + content_title: { type: 'text' }, + content_body: { + type: 'semantic_text', + inference_id: inferenceEndpoint, + }, + product_name: { type: 'keyword' }, + root_type: { type: 'keyword' }, + slug: { type: 'keyword' }, + url: { type: 'keyword' }, + version: { type: 'version' }, + ai_subtitle: { + type: 'semantic_text', + inference_id: inferenceEndpoint, + }, + ai_summary: { + type: 'semantic_text', + inference_id: inferenceEndpoint, + }, + ai_questions_answered: { + type: 'semantic_text', + inference_id: inferenceEndpoint, + }, + ai_tags: { type: 'keyword' }, + }, + }; +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/product_name.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/product_name.ts new file mode 100644 index 0000000000000..cfcc141323f4f --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/artifact/product_name.ts @@ -0,0 +1,11 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * The allowed product names, as found in the source's cluster + */ +export const sourceProductNames = ['Kibana', 'Elasticsearch', 'Security', 'Observability']; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts new file mode 100644 index 0000000000000..bbde3310f8e3a --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts @@ -0,0 +1,161 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import Path from 'path'; +import { Client } from '@elastic/elasticsearch'; +import { ToolingLog } from '@kbn/tooling-log'; +import { + // checkConnectivity, + createTargetIndex, + extractDocumentation, + indexDocuments, + installElser, + createChunkFiles, + createArtifact, + cleanupFolders, + deleteIndex, +} from './tasks'; +import type { TaskConfig } from './types'; + +const getSourceClient = (config: TaskConfig) => { + return new Client({ + compression: true, + nodes: [config.sourceClusterUrl], + sniffOnStart: false, + auth: { + username: config.sourceClusterUsername, + password: config.sourceClusterPassword, + }, + }); +}; + +const getEmbeddingClient = (config: TaskConfig) => { + return new Client({ + compression: true, + nodes: [config.embeddingClusterUrl], + auth: { + username: config.embeddingClusterUsername, + password: config.embeddingClusterPassword, + }, + // generating embeddings takes time + requestTimeout: 10 * 60 * 1000, + }); +}; + +export const buildArtifacts = async (config: TaskConfig) => { + const log = new ToolingLog({ + level: 'info', + writeTo: process.stdout, + }); + + log.info( + `Starting building artifacts for version=[${ + config.stackVersion + }] and products=[${config.productNames.join(',')}]` + ); + + const sourceClient = getSourceClient(config); + const embeddingClient = getEmbeddingClient(config); + + // log.info('Checking connectivity against clusters'); + // await checkConnectivity({ sourceClient, embeddingClient }); + + await cleanupFolders({ folders: [config.buildFolder] }); + + log.info('Ensuring ELSER is installed on the embedding cluster'); + await installElser({ client: embeddingClient }); + + for (const productName of config.productNames) { + await buildArtifact({ + productName, + stackVersion: 
config.stackVersion, + buildFolder: config.buildFolder, + targetFolder: config.targetFolder, + sourceClient, + embeddingClient, + log, + }); + } + + await cleanupFolders({ folders: [config.buildFolder] }); +}; + +const buildArtifact = async ({ + productName, + stackVersion, + buildFolder, + targetFolder, + embeddingClient, + sourceClient, + log, +}: { + productName: string; + stackVersion: string; + buildFolder: string; + targetFolder: string; + sourceClient: Client; + embeddingClient: Client; + log: ToolingLog; +}) => { + log.info(`Starting building artifact for product [${productName}] and version [${stackVersion}]`); + + const targetIndex = getTargetIndexName({ productName, stackVersion }); + + const documents = await extractDocumentation({ + client: sourceClient, + index: 'search-docs-1', + log, + productName, + stackVersion, + }); + + await createTargetIndex({ + client: embeddingClient, + indexName: targetIndex, + }); + + await indexDocuments({ + client: embeddingClient, + index: targetIndex, + documents, + log, + }); + + await createChunkFiles({ + index: targetIndex, + client: embeddingClient, + productName, + destFolder: Path.join(buildFolder, productName), + log, + }); + + await createArtifact({ + buildFolder: Path.join(buildFolder, productName), + targetFolder, + productName, + stackVersion, + log, + }); + + await deleteIndex({ + indexName: targetIndex, + client: embeddingClient, + log, + }); + + log.info(`Finished building artifact for product [${productName}] and version [${stackVersion}]`); +}; + +const getTargetIndexName = ({ + productName, + stackVersion, +}: { + productName: string; + stackVersion: string; +}) => { + return `kb-artifact-builder-${productName}-${stackVersion}`.toLowerCase(); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/command.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/command.ts new file mode 100644 index 0000000000000..49af1d158db83 --- /dev/null +++ 
b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/command.ts @@ -0,0 +1,97 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import Path from 'path'; +import { REPO_ROOT } from '@kbn/repo-info'; +import yargs from 'yargs'; +import type { TaskConfig } from './types'; +import { buildArtifacts } from './build_artifacts'; +import { sourceProductNames } from './artifact/product_name'; + +function options(y: yargs.Argv) { + return y + .option('productName', { + describe: 'name of products to generate documentation for', + array: true, + choices: sourceProductNames, + default: ['Kibana'], + }) + .option('stackVersion', { + describe: 'The stack version to generate documentation for', + string: true, + default: '8.16', // TODO: master is on 9.0 now, not sure we can default to version in package.json? 
+ }) + .option('targetFolder', { + describe: 'The folder to generate the artifacts in', + string: true, + default: Path.join(REPO_ROOT, 'build', 'kb-artifacts'), + }) + .option('buildFolder', { + describe: 'The folder to use for temporary files', + string: true, + default: Path.join(REPO_ROOT, 'build', 'temp-kb-artifacts'), + }) + .option('sourceClusterUrl', { + describe: 'The source cluster url', + string: true, + demandOption: true, + default: process.env.KIBANA_SOURCE_CLUSTER_URL, + }) + .option('sourceClusterUsername', { + describe: 'The source cluster username', + string: true, + demandOption: true, + default: process.env.KIBANA_SOURCE_CLUSTER_USERNAME, + }) + .option('sourceClusterPassword', { + describe: 'The source cluster password', + string: true, + demandOption: true, + default: process.env.KIBANA_SOURCE_CLUSTER_PASSWORD, + }) + .option('embeddingClusterUrl', { + describe: 'The embedding cluster url', + string: true, + demandOption: true, + default: process.env.KIBANA_EMBEDDING_CLUSTER_URL, + }) + .option('embeddingClusterUsername', { + describe: 'The embedding cluster username', + string: true, + demandOption: true, + default: process.env.KIBANA_EMBEDDING_CLUSTER_USERNAME, + }) + .option('embeddingClusterPassword', { + describe: 'The embedding cluster password', + string: true, + demandOption: true, + default: process.env.KIBANA_EMBEDDING_CLUSTER_PASSWORD, + }) + .locale('en'); +} + +export function runScript() { + yargs(process.argv.slice(2)) + .command('*', 'Build knowledge base artifacts', options, async (argv) => { + // argv contains additional entries - let's keep our input clear + const taskConfig: TaskConfig = { + productNames: argv.productName, + stackVersion: argv.stackVersion, + buildFolder: argv.buildFolder, + targetFolder: argv.targetFolder, + sourceClusterUrl: argv.sourceClusterUrl!, + sourceClusterUsername: argv.sourceClusterUsername!, + sourceClusterPassword: argv.sourceClusterPassword!, + embeddingClusterUrl: argv.embeddingClusterUrl!, + 
embeddingClusterUsername: argv.embeddingClusterUsername!, + embeddingClusterPassword: argv.embeddingClusterPassword!, + }; + + return buildArtifacts(taskConfig); + }) + .parse(); +} diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/check_connectivity.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/check_connectivity.ts new file mode 100644 index 0000000000000..aaf2be6bf12e5 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/check_connectivity.ts @@ -0,0 +1,18 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Client } from '@elastic/elasticsearch'; + +export const checkConnectivity = async ({ + sourceClient, + embeddingClient, +}: { + sourceClient: Client; + embeddingClient: Client; +}) => { + await Promise.all([sourceClient.ping(), embeddingClient.ping()]); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/cleanup_folders.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/cleanup_folders.ts new file mode 100644 index 0000000000000..3ae364d61f20e --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/cleanup_folders.ts @@ -0,0 +1,12 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { rm } from 'fs/promises'; + +export const cleanupFolders = async ({ folders }: { folders: string[] }) => { + await Promise.all(folders.map((folder) => rm(folder, { recursive: true, force: true }))); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_artifact.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_artifact.ts new file mode 100644 index 0000000000000..343099876585a --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_artifact.ts @@ -0,0 +1,51 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import Path from 'path'; +import AdmZip from 'adm-zip'; +import type { ToolingLog } from '@kbn/tooling-log'; +import { getArtifactMappings } from '../artifact/mappings'; +import { getArtifactManifest } from '../artifact/manifest'; +import { getArtifactName } from '../artifact/artifact_name'; + +export const createArtifact = async ({ + productName, + stackVersion, + buildFolder, + targetFolder, + log, +}: { + buildFolder: string; + targetFolder: string; + productName: string; + stackVersion: string; + log: ToolingLog; +}) => { + log.info( + `Starting to create artifact from build folder [${buildFolder}] into target [${targetFolder}]` + ); + + const zip = new AdmZip(); + + const mappings = getArtifactMappings('.default-elser'); + const mappingFileContent = JSON.stringify(mappings, undefined, 2); + zip.addFile('mappings.json', Buffer.from(mappingFileContent, 'utf-8')); + + const manifest = getArtifactManifest({ productName, stackVersion }); + const manifestFileContent = JSON.stringify(manifest, undefined, 2); + zip.addFile('manifest.json', Buffer.from(manifestFileContent, 'utf-8')); + + zip.addLocalFolder(buildFolder, 'content'); + + const 
artifactName = getArtifactName({ + productName, + productVersion: stackVersion, + }); + zip.writeZip(Path.join(targetFolder, artifactName)); + + log.info(`Finished creating artifact [${artifactName}]`); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_chunk_files.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_chunk_files.ts new file mode 100644 index 0000000000000..8b0e7323c2886 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_chunk_files.ts @@ -0,0 +1,68 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import Path from 'path'; +import Fs from 'fs/promises'; +import type { Client } from '@elastic/elasticsearch'; +import type { ToolingLog } from '@kbn/tooling-log'; + +const fileSizeLimit = 250_000; + +export const createChunkFiles = async ({ + index, + productName, + destFolder, + client, + log, +}: { + index: string; + productName: string; + destFolder: string; + client: Client; + log: ToolingLog; +}) => { + log.info(`Starting to create chunk files in directory [${destFolder}]`); + + const searchRes = await client.search({ + index, + size: 10000, + query: { + bool: { + must: [{ term: { product_name: productName } }], + }, + }, + }); + + await Fs.mkdir(destFolder, { recursive: true }); + + let chunkNumber = 1; + let chunkDocCount = 0; + let chunkContent: string = ''; + + const writeCurrentChunk = async () => { + const chunkFileName = `content-${chunkNumber}.ndjson`; + log.info(`Writing chunk file ${chunkFileName} containing ${chunkDocCount} docs`); + await Fs.writeFile(Path.join(destFolder, chunkFileName), chunkContent); + chunkContent = ''; + chunkDocCount = 0; + chunkNumber++; + }; + + for (let i = 0; i < searchRes.hits.hits.length; i++) { + 
const hit = searchRes.hits.hits[i]; + chunkContent += JSON.stringify(hit._source) + '\n'; + chunkDocCount++; + if ( + Buffer.byteLength(chunkContent, 'utf8') > fileSizeLimit || + i === searchRes.hits.hits.length - 1 + ) { + await writeCurrentChunk(); + } + } + + log.info(`Finished creating chunk files`); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts new file mode 100644 index 0000000000000..e4f24725883ab --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts @@ -0,0 +1,51 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Client } from '@elastic/elasticsearch'; +import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types'; + +const mappings: MappingTypeMapping = { + dynamic: 'strict', + properties: { + content_title: { type: 'text' }, + content_body: { + type: 'semantic_text', + inference_id: 'kibana-elser2', + }, + product_name: { type: 'keyword' }, + root_type: { type: 'keyword' }, + slug: { type: 'keyword' }, + url: { type: 'keyword' }, + version: { type: 'version' }, + ai_subtitle: { + type: 'semantic_text', + inference_id: 'kibana-elser2', + }, + ai_summary: { + type: 'semantic_text', + inference_id: 'kibana-elser2', + }, + ai_questions_answered: { + type: 'semantic_text', + inference_id: 'kibana-elser2', + }, + ai_tags: { type: 'keyword' }, + }, +}; + +export const createTargetIndex = async ({ + indexName, + client, +}: { + indexName: string; + client: Client; +}) => { + await client.indices.create({ + index: indexName, + mappings, + }); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/delete_index.ts 
b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/delete_index.ts new file mode 100644 index 0000000000000..6daacae1f92de --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/delete_index.ts @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { Client } from '@elastic/elasticsearch'; +import type { ToolingLog } from '@kbn/tooling-log'; + +export const deleteIndex = async ({ + indexName, + client, + log, +}: { + indexName: string; + client: Client; + log: ToolingLog; +}) => { + log.info(`Deleting index ${indexName}`); + await client.indices.delete( + { + index: indexName, + }, + { ignore: [404] } + ); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/extract_documentation.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/extract_documentation.ts new file mode 100644 index 0000000000000..f1dd051394bbd --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/extract_documentation.ts @@ -0,0 +1,102 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import type { Client } from '@elastic/elasticsearch'; +import type { SearchHit } from '@elastic/elasticsearch/lib/api/types'; +import type { ToolingLog } from '@kbn/tooling-log'; + +/** the list of fields to import from the source cluster */ +const fields = [ + 'content_title', + 'content_body', + 'product_name', // "Kibana", "Elasticsearch" + 'category', // "documentation" + 'slug', + 'url', + 'version', + 'ai_fields.ai_subtitle', + 'ai_fields.ai_summary', + 'ai_fields.ai_questions_answered', + 'ai_fields.ai_tags', +]; + +export interface ExtractedDocument { + content_title: string; + content_body: string; + product_name: string; + root_type: string; + slug: string; + url: string; + version: string; + ai_subtitle: string; + ai_summary: string; + ai_questions_answered: string[]; + ai_tags: string[]; +} + +const convertHit = (hit: SearchHit): ExtractedDocument => { + const source = hit._source; + return { + content_title: source.content_title, + content_body: source.content_body, + product_name: source.product_name, + root_type: 'documentation', + slug: source.slug, + url: source.url, + version: source.version, + ai_subtitle: source.ai_fields.ai_subtitle, + ai_summary: source.ai_fields.ai_summary, + ai_questions_answered: source.ai_fields.ai_questions_answered, + ai_tags: source.ai_fields.ai_tags, + }; +}; + +export const extractDocumentation = async ({ + client, + index, + stackVersion, + productName, + log, +}: { + client: Client; + index: string; + stackVersion: string; + productName: string; + log: ToolingLog; +}) => { + log.info(`Starting to extract documents from source cluster`); + + const response = await client.search({ + index, + size: 10000, + query: { + bool: { + must: [ + { term: { product_name: productName } }, + { term: { version: stackVersion } }, + { exists: { field: 'ai_fields.ai_summary' } }, + ], + }, + }, + fields, + }); + + const totalHits = + typeof response.hits.total === 'number' + ? 
response.hits.total // This format is to be removed in 8.0 + : response.hits.total?.value ?? response.hits.hits.length; + + if (totalHits > 10_000) { + throw new Error('Found more than 10k documents to extract - aborting'); + } + + log.info( + `Finished extracting documents from source. ${response.hits.hits.length} documents were extracted` + ); + + return response.hits.hits.map(convertHit); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts new file mode 100644 index 0000000000000..0c63431362329 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts @@ -0,0 +1,17 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +export { extractDocumentation } from './extract_documentation'; +export { indexDocuments } from './index_documents'; +export { createTargetIndex } from './create_index'; +export { installElser } from './install_elser'; +export { createChunkFiles } from './create_chunk_files'; +export { performSemanticSearch } from './perform_semantic_search'; +export { checkConnectivity } from './check_connectivity'; +export { createArtifact } from './create_artifact'; +export { cleanupFolders } from './cleanup_folders'; +export { deleteIndex } from './delete_index'; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index_documents.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index_documents.ts new file mode 100644 index 0000000000000..120d71e3f2947 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index_documents.ts @@ -0,0 +1,50 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { chunk as toChunks } from 'lodash'; +import type { Client } from '@elastic/elasticsearch'; +import type { BulkRequest } from '@elastic/elasticsearch/lib/api/types'; +import type { ToolingLog } from '@kbn/tooling-log'; +import type { ExtractedDocument } from './extract_documentation'; + +const indexingChunkSize = 10; + +export const indexDocuments = async ({ + index, + client, + documents, + log, +}: { + index: string; + documents: ExtractedDocument[]; + client: Client; + log: ToolingLog; +}) => { + const chunks = toChunks(documents, indexingChunkSize); + + log.info(`Starting indexing process`); + + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const before = Date.now(); + await client.bulk( + { + refresh: 'wait_for', + operations: chunk.reduce((operations, document) => { + operations!.push(...[{ index: { _index: index } }, document]); + return operations; + }, [] as BulkRequest['operations']), + }, + { requestTimeout: 10 * 60 * 1000 } + ); + + const duration = Date.now() - before; + log.info(`Indexed ${i + 1} of ${chunks.length} chunks (took ${duration}ms)`); + } + + log.info(`Finished indexing process`); +}; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts new file mode 100644 index 0000000000000..037a9e809d1e1 --- /dev/null +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts @@ -0,0 +1,74 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */
+
+import type { Client } from '@elastic/elasticsearch';
+
+const inferenceEndpointId = 'kibana-elser2';
+const elserModelId = '.elser_model_2';
+
+/**
+ * Makes sure the `inferenceEndpointId` sparse-embedding inference endpoint exists, creating
+ * it on the ELSER service when it is missing, then waits for `elserModelId` to be deployed.
+ */
+export const installElser = async ({ client }: { client: Client }) => {
+  // 404 is ignored so that a missing endpoint yields a response without `endpoints`.
+  const getInferenceRes = await client.inference.get(
+    { task_type: 'sparse_embedding', inference_id: inferenceEndpointId },
+    { ignore: [404] }
+  );
+  const installed = (getInferenceRes.endpoints ?? []).some(
+    (endpoint) => endpoint.inference_id === inferenceEndpointId
+  );
+
+  if (!installed) {
+    await client.inference.put({
+      task_type: 'sparse_embedding',
+      inference_id: inferenceEndpointId,
+      inference_config: {
+        service: 'elser',
+        service_settings: { num_allocations: 1, num_threads: 1, model_id: elserModelId },
+        task_settings: {},
+      },
+    });
+  }
+
+  await waitUntilDeployed({ modelId: elserModelId, client });
+};
+
+/**
+ * Polls the trained-model stats of `modelId`, sleeping `delay` ms after each unsuccessful
+ * attempt, and resolves as soon as its deployment stats list at least one node.
+ *
+ * @throws Error when no node is listed after `maxRetries` attempts.
+ */
+const waitUntilDeployed = async ({
+  modelId,
+  client,
+  maxRetries = 20,
+  delay = 2000,
+}: {
+  modelId: string;
+  client: Client;
+  maxRetries?: number;
+  delay?: number;
+}) => {
+  for (let i = 0; i < maxRetries; i++) {
+    const statsRes = await client.ml.getTrainedModelsStats({ model_id: modelId });
+    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
+    // @ts-expect-error deploymentStats.nodes not defined as array even if it is.
+    if (!deploymentStats || deploymentStats.nodes.length === 0) {
+      await sleep(delay);
+      continue;
+    }
+    return;
+  }
+
+  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
+};
+
+/** Resolves after `ms` milliseconds. */
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/perform_semantic_search.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/perform_semantic_search.ts
new file mode 100644
index 0000000000000..373a6b8755429
--- /dev/null
+++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/perform_semantic_search.ts
@@ -0,0 +1,79 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Client } from '@elastic/elasticsearch';
+
+// https://search-labs.elastic.co/search-labs/blog/elser-rag-search-for-relevance
+
+// Fields queried with a `semantic` clause, one clause per field, in this order.
+const semanticFields = ['content_body', 'ai_subtitle', 'ai_summary', 'ai_questions_answered'];
+
+/**
+ * Searches `index` for `searchQuery` and returns the hits of the response (`size: 3`).
+ *
+ * Only documents whose `version` term is `8.15` pass the filter. The `should` clauses are a
+ * `cross_fields` multi_match, a phrase multi_match (boost 3, slop 0) on the `*.stem` fields,
+ * and one `semantic` clause per entry of `semanticFields`.
+ */
+export const performSemanticSearch = async ({
+  searchQuery,
+  index,
+  client,
+}: {
+  searchQuery: string;
+  index: string;
+  client: Client;
+}) => {
+  const results = await client.search({
+    index,
+    size: 3,
+    query: {
+      bool: {
+        filter: {
+          bool: {
+            must: [{ term: { version: '8.15' } }],
+          },
+        },
+        should: [
+          {
+            multi_match: {
+              query: searchQuery,
+              minimum_should_match: '1<-1 3<49%',
+              type: 'cross_fields',
+              fields: [
+                'content_title',
+                'content_body.text',
+                'ai_subtitle.text',
+                'ai_summary.text',
+                'ai_questions_answered.text',
+                'ai_tags',
+              ],
+            },
+          },
+          {
+            multi_match: {
+              query: searchQuery,
+              type: 'phrase',
+              boost: 3,
+              slop: 0,
+              fields: [
+                'content_title.stem',
+                'content_body.stem',
+                'ai_subtitle.stem',
+                'ai_summary.stem',
+                'ai_questions_answered.stem',
+              ],
+            },
+          },
+          ...semanticFields.map((field) => ({ semantic: { field, query: searchQuery } })),
+        ],
+      },
+    },
+  });
+
+  return results.hits.hits;
+};
diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/types.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/types.ts
new file mode 100644
index 0000000000000..d2acfb5774500
--- /dev/null
+++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/types.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export interface TaskConfig {
+  productNames: string[];
+  stackVersion: string;
+  buildFolder: string;
+  targetFolder: string;
+  sourceClusterUrl: string;
+  sourceClusterUsername: string;
+  sourceClusterPassword: string;
+  embeddingClusterUrl: string;
+  embeddingClusterUsername: string;
+  embeddingClusterPassword: string;
+}
diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/tsconfig.json b/x-pack/packages/ai-infra/product-doc-artifact-builder/tsconfig.json
new file mode 100644
index 0000000000000..508d4c715d0a7
--- /dev/null
+++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/tsconfig.json
@@ -0,0 +1,20 @@
+{
+  "extends": "../../../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "target/types",
+    "types": [
+      "jest",
+      "node"
+    ]
+  },
+  "include": [
+    "**/*.ts",
+  ],
+  "exclude": [
+    "target/**/*"
+  ],
+  "kbn_references": [
+    "@kbn/tooling-log",
+    "@kbn/repo-info",
+  ]
+}
diff --git a/yarn.lock b/yarn.lock
index 7bcf24d6415c4..31c9dfc2d26a4 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -6027,6 +6027,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/product-doc-artifact-builder@link:x-pack/packages/ai-infra/product-doc-artifact-builder":
+  version
"0.0.0" + uid "" + "@kbn/profiling-data-access-plugin@link:x-pack/plugins/observability_solution/profiling_data_access": version "0.0.0" uid ""