[Obs AI Assistant] Evaluation framework (elastic#173010)

Adds an evaluation framework for the Observability AI Assistant. For more details, [see the README](https://github.com/dgieselaar/kibana/blob/obs-ai-assistant-evaluation-framework/x-pack/plugins/observability_ai_assistant/scripts/evaluation/README.md). --------- Co-authored-by: kibanamachine <[email protected]> (cherry picked from commit c36410f)
kibanamachine · Dec 13, 2023 · 31682ff · 31682ff
1 parent 55a88c8
commit 31682ff
Show file tree

Hide file tree

Showing 16 changed files with 1,106 additions and 11 deletions.
diff --git a/package.json b/package.json
@@ -930,6 +930,7 @@
     "exponential-backoff": "^3.1.1",
     "extract-zip": "^2.0.1",
     "fast-deep-equal": "^3.1.1",
+    "fast-glob": "^3.3.2",
     "fflate": "^0.6.9",
     "file-saver": "^1.3.8",
     "fnv-plus": "^1.3.1",
@@ -1630,6 +1631,7 @@
     "supertest": "^6.3.3",
     "supports-color": "^7.0.0",
     "svgo": "^2.8.0",
+    "table": "^6.8.1",
     "tape": "^5.0.1",
     "tempy": "^0.3.0",
     "terser": "^5.16.5",

diff --git a/x-pack/plugins/observability_ai_assistant/common/utils/concatenate_openai_chunks.ts b/x-pack/plugins/observability_ai_assistant/common/utils/concatenate_openai_chunks.ts
@@ -7,10 +7,23 @@
 
 import { cloneDeep } from 'lodash';
 import { type Observable, scan } from 'rxjs';
-import { CreateChatCompletionResponseChunk, MessageRole } from '../types';
+import { type CreateChatCompletionResponseChunk, MessageRole } from '../types';
 
 export const concatenateOpenAiChunks =
-  () => (source: Observable<CreateChatCompletionResponseChunk>) =>
+  () =>
+  (
+    source: Observable<CreateChatCompletionResponseChunk>
+  ): Observable<{
+    message: {
+      content: string;
+      role: MessageRole;
+      function_call: {
+        name: string;
+        arguments: string;
+        trigger: MessageRole.Assistant | MessageRole.User;
+      };
+    };
+  }> =>
     source.pipe(
       scan(
         (acc, { choices }) => {

diff --git a/x-pack/plugins/observability_ai_assistant/scripts/evaluation/README.md b/x-pack/plugins/observability_ai_assistant/scripts/evaluation/README.md
@@ -0,0 +1,37 @@
+# Observability AI Assistant Evaluation Framework
+
+## Overview
+
+This tool was developed for the team working on the Elastic Observability platform, specifically to evaluate the Observability AI Assistant. It simplifies scripting and evaluating various scenarios with the Large Language Model (LLM) integration.
+
+## Setup requirements
+
+- An Elasticsearch instance
+- A Kibana instance
+- At least one .gen-ai connector set up
+
+## Running evaluations
+
+Run the tool using:
+
+`$ node x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js`
+
+This will evaluate all existing scenarios, and write the evaluation results to the terminal.
+
+### Configuration
+
+#### Kibana and Elasticsearch
+
+By default, the tool will look for a Kibana instance running locally (at `http://localhost:5601`, which is the default address for running Kibana in development mode). It will also attempt to read the Kibana config file for the Elasticsearch address and credentials. If you want to override these settings, use `--kibana` and `--es`. Only basic auth is supported, e.g. `--kibana http://username:password@localhost:5601`. If you want to use a specific space, use `--spaceId`.
+
+#### Connector
+
+Use `--connectorId` to specify a `.gen-ai` connector to use. If none are given, it will prompt you to select a connector based on the ones that are available. If only a single `.gen-ai` connector is found, it will be used without prompting.
+
+#### Persisting conversations
+
+By default, completed conversations are not persisted. If you do want to persist them, for instance for reviewing purposes, set the `--persist` flag to store them. This will also generate a clickable link in the output of the evaluation that takes you to the conversation.
+
+If you want to clear conversations on startup, use the `--clear` flag. This only works when `--persist` is enabled. If `--spaceId` is set, only conversations for the current space will be cleared.
+
+When storing conversations, the name of the scenario is used as a title. Set the `--autoTitle` flag to have the LLM generate a title for you. Like `--clear`, this flag only works when `--persist` is enabled.
diff --git a/x-pack/plugins/observability_ai_assistant/scripts/evaluation/cli.ts b/x-pack/plugins/observability_ai_assistant/scripts/evaluation/cli.ts
@@ -0,0 +1,78 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+import { format, parse } from 'url';
+import { Argv } from 'yargs';
+import { readKibanaConfig } from './read_kibana_config';
+
+/**
+ * Registers the command line options of the evaluation script on the given yargs instance.
+ *
+ * The default for `--elasticsearch` is built from the values returned by `readKibanaConfig()`;
+ * the default for `--kibana` comes from the `KIBANA_HOST` environment variable, falling back to
+ * `http://localhost:5601`.
+ *
+ * @param y The yargs instance to extend.
+ * @returns The same yargs chain with all options and the flag-consistency check applied.
+ */
+export function options(y: Argv) {
+  const kibanaConfig = readKibanaConfig();
+
+  // Elasticsearch host from the Kibana config, with its credentials embedded as basic auth.
+  const defaultElasticsearchUrl = format({
+    ...parse(kibanaConfig['elasticsearch.hosts']),
+    auth: `${kibanaConfig['elasticsearch.username']}:${kibanaConfig['elasticsearch.password']}`,
+  });
+
+  const defaultKibanaUrl = process.env.KIBANA_HOST || 'http://localhost:5601';
+
+  return y
+    .option('files', {
+      describe: 'A file or list of files containing the scenarios to evaluate. Defaults to all',
+      string: true as const,
+      array: true,
+    })
+    .option('grep', {
+      describe: 'A string or regex to filter scenarios by',
+      string: true,
+      array: false,
+    })
+    .option('kibana', {
+      describe: 'Where Kibana is running',
+      string: true,
+      default: defaultKibanaUrl,
+    })
+    .option('spaceId', {
+      describe:
+        'The space to use. If space is set, conversations will only be cleared for that spaceId',
+      string: true,
+      array: false,
+    })
+    .option('elasticsearch', {
+      alias: 'es',
+      describe: 'Where Elasticsearch is running',
+      string: true,
+      default: defaultElasticsearchUrl,
+    })
+    .option('connectorId', {
+      describe: 'The ID of the connector',
+      string: true,
+    })
+    .option('persist', {
+      describe:
+        'Whether the conversations should be stored. Adding this will generate a link at which the conversation can be opened.',
+      boolean: true,
+      default: false,
+    })
+    .option('clear', {
+      describe: 'Clear conversations on startup',
+      boolean: true,
+      default: false,
+    })
+    .option('autoTitle', {
+      describe: 'Whether to generate titles for new conversations',
+      boolean: true,
+      default: false,
+    })
+    .option('logLevel', {
+      describe: 'Log level',
+      default: 'info',
+    })
+    .check((argv) => {
+      // Both --clear and --autoTitle are only accepted together with --persist.
+      if (!argv.persist) {
+        if (argv.clear) {
+          throw new Error('clear cannot be true if persist is false');
+        }
+        if (argv.autoTitle) {
+          throw new Error('autoTitle cannot be true if persist is false');
+        }
+      }
+      return true;
+    });
+}
diff --git a/x-pack/plugins/observability_ai_assistant/scripts/evaluation/evaluation.ts b/x-pack/plugins/observability_ai_assistant/scripts/evaluation/evaluation.ts
@@ -0,0 +1,202 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import yargs from 'yargs';
+import { run } from '@kbn/dev-cli-runner';
+import { Client } from '@elastic/elasticsearch';
+import inquirer from 'inquirer';
+import * as fastGlob from 'fast-glob';
+import Path from 'path';
+import chalk from 'chalk';
+import * as table from 'table';
+import { castArray, omit, sortBy } from 'lodash';
+import { TableUserConfig } from 'table';
+import { format, parse } from 'url';
+import { options } from './cli';
+import { getServiceUrls } from './get_service_urls';
+import { KibanaClient } from './kibana_client';
+import { EvaluationFunction } from './types';
+import { MessageRole } from '../../common';
+
+/**
+ * Entry point of the evaluation script.
+ *
+ * Parses the command line, resolves the Kibana and Elasticsearch URLs, selects a connector,
+ * loads the scenario files and then executes every exported evaluation function (sorted by
+ * file name and export name), writing one result table per evaluation to the log.
+ *
+ * Connector selection: a connector matching `--connectorId` is used as-is. Otherwise the only
+ * available connector is used, or the user is prompted when several are available.
+ */
+function runEvaluations() {
+  yargs(process.argv.slice(2))
+    .command('*', 'Run AI Assistant evaluations', options, (argv) => {
+      run(
+        async ({ log }) => {
+          const serviceUrls = await getServiceUrls({
+            log,
+            elasticsearch: argv.elasticsearch,
+            kibana: argv.kibana,
+          });
+
+          const kibanaClient = new KibanaClient(serviceUrls.kibanaUrl, argv.spaceId);
+          const esClient = new Client({
+            node: serviceUrls.esUrl,
+          });
+
+          const connectors = await kibanaClient.getConnectors();
+
+          if (!connectors.length) {
+            throw new Error('No connectors found');
+          }
+
+          let connector = connectors.find((item) => item.id === argv.connectorId);
+
+          if (!connector && argv.connectorId) {
+            log.warning(`Could not find connector ${argv.connectorId}`);
+          }
+
+          // Only fall back to auto-selection or a prompt when --connectorId did not resolve to a
+          // connector; a connector that was found by id must not be overridden by the prompt.
+          if (!connector) {
+            if (connectors.length === 1) {
+              connector = connectors[0];
+              log.debug('Using the only connector found');
+            } else {
+              const connectorChoice = await inquirer.prompt({
+                type: 'list',
+                name: 'connector',
+                message: 'Select a connector',
+                choices: connectors.map((item) => item.name),
+              });
+
+              connector = connectors.find((item) => item.name === connectorChoice.connector)!;
+            }
+          }
+
+          log.info(`Using connector ${connector.id}`);
+
+          // Path.resolve keeps absolute paths intact and resolves relative ones against the
+          // current working directory.
+          const scenarios =
+            (argv.files !== undefined &&
+              castArray(argv.files).map((file) => Path.resolve(process.cwd(), file))) ||
+            fastGlob.sync(Path.join(__dirname, './scenarios/**/*.ts'));
+
+          if (!scenarios.length) {
+            throw new Error('No scenarios to run');
+          }
+
+          if (argv.clear) {
+            log.info('Clearing conversations');
+            await esClient.deleteByQuery({
+              index: '.kibana-observability-ai-assistant-conversations',
+              query: {
+                ...(argv.spaceId ? { term: { namespace: argv.spaceId } } : { match_all: {} }),
+              },
+              refresh: true,
+            });
+          }
+
+          let evaluationFunctions: Array<{
+            name: string;
+            fileName: string;
+            fn: EvaluationFunction;
+          }> = [];
+
+          for (const fileName of scenarios) {
+            log.info(`Running scenario ${fileName}`);
+            const mod = await import(fileName);
+            Object.keys(mod).forEach((key) => {
+              // Only callable exports can be executed as evaluations; anything else a scenario
+              // file exports is skipped instead of failing later when it is invoked.
+              if (typeof mod[key] === 'function') {
+                evaluationFunctions.push({ name: key, fileName, fn: mod[key] });
+              }
+            });
+          }
+
+          if (argv.grep) {
+            const lc = argv.grep.toLowerCase();
+            evaluationFunctions = evaluationFunctions.filter((fn) =>
+              fn.name.toLowerCase().includes(lc)
+            );
+          }
+
+          const header: string[][] = [
+            [chalk.bold('Criterion'), chalk.bold('Result'), chalk.bold('Reasoning')],
+          ];
+
+          const tableConfig: TableUserConfig = {
+            singleLine: false,
+            border: {
+              topBody: `─`,
+              topJoin: `┬`,
+              topLeft: `┌`,
+              topRight: `┐`,
+
+              bottomBody: `─`,
+              bottomJoin: `┴`,
+              bottomLeft: `└`,
+              bottomRight: `┘`,
+
+              bodyLeft: `│`,
+              bodyRight: `│`,
+              bodyJoin: `│`,
+
+              joinBody: `─`,
+              joinLeft: `├`,
+              joinRight: `┤`,
+              joinJoin: `┼`,
+            },
+            // The first two rows (prompt and conversation link) span the full table width.
+            spanningCells: [
+              { row: 0, col: 0, colSpan: 3 },
+              { row: 1, col: 0, colSpan: 3 },
+            ],
+            columns: [
+              { wrapWord: true, width: 60 },
+              { wrapWord: true },
+              { wrapWord: true, width: 60 },
+            ],
+          };
+
+          // Kibana base URL without credentials. Trailing slashes are stripped because
+          // format(parse(url)) adds one when the URL has no path, which would otherwise
+          // produce "//app/..." in the conversation link.
+          const kibanaBaseUrl = format(omit(parse(serviceUrls.kibanaUrl), 'auth')).replace(
+            /\/+$/,
+            ''
+          );
+
+          const sortedEvaluationFunctions = sortBy(evaluationFunctions, 'fileName', 'name');
+
+          for (const { name, fn } of sortedEvaluationFunctions) {
+            log.debug(`Executing ${name}`);
+            const result = await fn({
+              esClient,
+              kibanaClient,
+              chatClient: kibanaClient.createChatClient({
+                connectorId: connector.id!,
+                persist: argv.persist,
+                title: argv.autoTitle ? undefined : name,
+              }),
+            });
+            log.debug(`Result:`, JSON.stringify(result));
+
+            // The first user message is shown as the table title; fall back to an empty cell
+            // when the scenario returned no user message.
+            const firstUserMessage = result.messages.find(
+              (message) => message.role === MessageRole.User
+            );
+
+            const output: string[][] = [
+              [firstUserMessage?.content ?? '', '', ''],
+              result.conversationId
+                ? [
+                    `${kibanaBaseUrl}/${
+                      argv.spaceId ? `s/${argv.spaceId}/` : ''
+                    }app/observabilityAIAssistant/conversations/${result.conversationId}`,
+                    '',
+                    '',
+                  ]
+                : ['', '', ''],
+              ...header,
+            ];
+
+            result.scores.forEach((score) => {
+              output.push([
+                score.criterion,
+                score.score === 0 ? chalk.redBright('failed') : chalk.greenBright('passed'),
+                score.reasoning,
+              ]);
+            });
+            log.write(table.table(output, tableConfig));
+          }
+        },
+        {
+          log: {
+            defaultLevel: argv.logLevel as any,
+          },
+          flags: {
+            allowUnexpected: true,
+          },
+        }
+      );
+    })
+    .parse();
+}
+
+runEvaluations();