Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.x] [inference] NL-to-ESQL: improve doc generation (#192378) #192802

Merged
merged 1 commit into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ const buildTestDefinitions = (): Section[] => {
{
title: 'Generates a query to show employees filtered by name and grouped by hire_date',
question: `From the employees index, I want to see how many employees with a "B" in their first name
where hired each month over the past 2 years.
were hired each month over the past 2 years.
Assume the following fields:
- hire_date
- first_name
Expand All @@ -208,10 +208,10 @@ const buildTestDefinitions = (): Section[] => {
(which can be read the same backward and forward), and then return their last name and first name
- last_name
- first_name`,
expected: `FROM employees
| EVAL reversed_last_name = REVERSE(last_name)
| WHERE TO_LOWER(last_name) == TO_LOWER(reversed_last_name)
| KEEP last_name, first_name`,
criteria: [
`The assistant should not provide an ES|QL query, and explicitly mention that there is no
way to check for palindromes using ES|QL.`,
],
},
{
title: 'Generates a query to show the top 10 domains by doc count',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import Fs from 'fs/promises';
import Path from 'path';
import fastGlob from 'fast-glob';
import $, { load, Cheerio, AnyNode } from 'cheerio';
import { partition } from 'lodash';
import { ToolingLog } from '@kbn/tooling-log';
import pLimit from 'p-limit';
import { ScriptInferenceClient } from '../util/kibana_client';
import { convertToMarkdownPrompt } from './prompts/convert_to_markdown';
import { bindOutput, PromptCaller } from './utils/output_executor';

/**
* The pages that will be extracted but only used as context
* for the LLM for the enhancement tasks of the documentation entries.
*/
// Filenames (relative to the built-docs html dir) of the overview/context
// articles; `processFile` stores each as a plain-text page on the output.
const contextArticles = [
  'esql.html', // stored under the name 'overview' (see processFile)
  'esql-syntax.html',
  'esql-kibana.html',
  'esql-query-api.html',
  'esql-limitations.html',
  'esql-cross-clusters.html',
  'esql-examples.html',
  'esql-metadata-fields.html',
  'esql-multi-index.html',
];

/** A whole documentation page extracted as plain text (context for the LLM). */
interface ExtractedPage {
  sourceFile: string; // source html filename, e.g. 'esql-syntax.html'
  name: string; // short page identifier derived from the filename
  content: string; // plain-text content of the page
}

/** A single ES|QL command, function, or operator entry converted to markdown. */
export interface ExtractedCommandOrFunc {
  name: string; // section title, e.g. 'STATS'
  markdownContent: string; // markdown produced by the LLM from the section html
  command: boolean; // true for commands, false for functions
}

/** Aggregated result of one extraction run over the built docs. */
export interface ExtractionOutput {
  commands: ExtractedCommandOrFunc[];
  functions: ExtractedCommandOrFunc[];
  pages: ExtractedPage[];
  // NOTE(review): singular name but holds multiple filenames — `skippedFiles`
  // would read better; renaming would break consumers, so left as-is.
  skippedFile: string[];
}

/**
 * Walks every `esql*.html` page of the built Elasticsearch reference docs and
 * extracts commands, functions/operators, and context pages into a single
 * {@link ExtractionOutput}.
 *
 * @param builtDocsDir - root directory of the built docs checkout
 * @param log - tooling logger passed through to the per-file processors
 * @param inferenceClient - client used to run the html→markdown LLM prompts
 * @throws Error when no matching html files are found under `builtDocsDir`
 */
export async function extractDocEntries({
  builtDocsDir,
  log,
  inferenceClient,
}: {
  builtDocsDir: string;
  log: ToolingLog;
  inferenceClient: ScriptInferenceClient;
}): Promise<ExtractionOutput> {
  const htmlFiles = await fastGlob(
    `${builtDocsDir}/html/en/elasticsearch/reference/master/esql*.html`
  );
  if (htmlFiles.length === 0) {
    throw new Error('No files found');
  }

  const result: ExtractionOutput = {
    commands: [],
    functions: [],
    pages: [],
    skippedFile: [],
  };

  const callPrompt = bindOutput({
    output: inferenceClient.output,
    connectorId: inferenceClient.getConnectorId(),
  });

  // Cap concurrent LLM calls at 10 across all files.
  const concurrencyLimiter = pLimit(10);

  // Files are processed in parallel; the limiter throttles the prompt calls.
  await Promise.all(
    htmlFiles.map((file) =>
      processFile({
        file,
        log,
        executePrompt: callPrompt,
        output: result,
        limiter: concurrencyLimiter,
      })
    )
  );

  return result;
}

/**
 * Dispatches a single html file to the right extractor: the commands page,
 * the functions/operators page, a known context article (stored as plain
 * text), or — for anything else — records it as skipped.
 */
async function processFile({
  file: fileFullPath,
  output,
  executePrompt,
  log,
  limiter,
}: {
  file: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const basename = Path.basename(fileFullPath);
  // Reading with an explicit encoding yields the string directly.
  const fileContent = await Fs.readFile(fileFullPath, 'utf-8');

  if (basename === 'esql-commands.html') {
    await processCommands({ fileContent, log, output, limiter, executePrompt });
    return;
  }

  if (basename === 'esql-functions-operators.html') {
    await processFunctionsAndOperators({
      fileContent,
      log,
      output,
      limiter,
      executePrompt,
    });
    return;
  }

  if (contextArticles.includes(basename)) {
    const root = load(fileContent)('*');
    // 'esql.html' becomes 'overview'; others drop the 'esql-' prefix and
    // '.html' suffix (5 chars on each side).
    const pageName = basename === 'esql.html' ? 'overview' : basename.slice(5, -5);
    output.pages.push({
      sourceFile: basename,
      name: pageName,
      content: getSimpleText(root),
    });
    return;
  }

  output.skippedFile.push(basename);
}

/**
 * Splits the functions/operators page into sections, converts each function
 * section to markdown via the LLM (throttled by `limiter`), and stores the
 * operator sections together as a single 'operators' context page.
 */
async function processFunctionsAndOperators({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const root = load(fileContent.toString())('*');
  const sections = extractSections(root);

  // Operator sections are matched either by case-insensitive title prefix…
  const operatorTitlePrefixes = [
    'Binary operators',
    'Equality',
    'Inequality',
    'Less than',
    'Less than or equal to',
    'Greater than',
    'Greater than or equal to',
    'Add +',
    'Subtract -',
    'Multiply *',
    'Divide /',
    'Modulus %',
    'Unary operators',
    'Logical operators',
    'IS NULL and IS NOT NULL',
    'Cast (::)',
  ];
  // …or by exact title.
  const exactOperatorTitles = ['IN', 'LIKE', 'RLIKE'];

  const isOperatorSection = (title: string) =>
    exactOperatorTitles.includes(title) ||
    operatorTitlePrefixes.some((prefix) =>
      title.toLowerCase().startsWith(prefix.toLowerCase())
    );

  const [operatorSections, allOtherSections] = partition(sections, (section) =>
    isOperatorSection(section.title)
  );

  // Function titles are all-caps identifiers (e.g. 'ABS', 'DATE_DIFF').
  const functionSections = allOtherSections.filter(({ title }) =>
    /^[A-Z_]+$/.test(title)
  );

  const converted = await Promise.all(
    functionSections.map((section) =>
      limiter(async () => ({
        name: section.title,
        markdownContent: await executePrompt(
          convertToMarkdownPrompt({ htmlContent: section.content })
        ),
        command: false,
      }))
    )
  );

  output.functions.push(...converted);

  // Operators are kept as one raw-text context page rather than individual
  // markdown entries.
  output.pages.push({
    sourceFile: 'esql-functions-operators.html',
    name: 'operators',
    content: operatorSections
      .map(({ title, content }) => `${title}\n${content}`)
      .join('\n'),
  });
}

/**
 * Splits the commands page into sections and converts each all-caps command
 * section (e.g. 'FROM', 'STATS') to markdown via the LLM, throttled by
 * `limiter`.
 */
async function processCommands({
  fileContent,
  output,
  executePrompt,
  log,
  limiter,
}: {
  fileContent: string;
  output: ExtractionOutput;
  executePrompt: PromptCaller;
  log: ToolingLog;
  limiter: pLimit.Limit;
}) {
  const root = load(fileContent.toString())('*');

  // Command titles are all-caps identifiers; everything else is dropped.
  const commandSections = extractSections(root).filter(({ title }) =>
    /^[A-Z_]+$/.test(title)
  );

  const converted = await Promise.all(
    commandSections.map((section) =>
      limiter(async () => ({
        name: section.title,
        markdownContent: await executePrompt(
          convertToMarkdownPrompt({ htmlContent: section.content })
        ),
        command: true,
      }))
    )
  );

  output.commands.push(...converted);
}

/**
 * Reduces a parsed docs page to plain text: strips navigation chrome,
 * rewrites `<code>` elements as backtick-wrapped text, and collapses runs of
 * blank lines in the last content section.
 */
function getSimpleText($element: Cheerio<AnyNode>) {
  // Remove navigation/editing chrome that would pollute the extracted text.
  for (const selector of ['.navfooter', '#sticky_content', '.edit_me']) {
    $element.remove(selector);
  }

  // `function` (not arrow) so cheerio binds `this` to the current element.
  $element.find('code').each(function () {
    const codeText = $(this).text();
    $(this).replaceWith('`' + codeText + '`');
  });

  const lastContentSection = $element.find('.section,section,.part').last();
  return lastContentSection.text().replaceAll(/([\n]\s*){2,}/g, '\n');
}

/**
 * Splits a docs page into titled sections: each `.position-relative` header
 * starts a section whose body is every sibling up to the next header. SVG
 * defs, copy buttons, image blocks and tables are stripped from the body
 * html before it is wrapped in a `<div><h1>…</h1>…</div>` envelope.
 */
export function extractSections(cheerio: Cheerio<AnyNode>) {
  const sections: Array<{
    title: string;
    content: string;
  }> = [];

  cheerio.find('.section .position-relative').each((index, element) => {
    const header = $(element);
    const bodyNodes = header.nextUntil('.position-relative');

    // The rendered header carries an "edit" link label; strip it.
    // NOTE(review): replace() removes only the first 'edit' occurrence,
    // anywhere in the title — presumably fine for these docs; verify.
    const title = header.text().trim().replace('edit', '');

    // Drop elements that don't translate to useful text/markdown.
    for (const selector of ['svg defs', '.console_code_copy', '.imageblock', 'table']) {
      bodyNodes.find(selector).remove();
    }

    const htmlContent = bodyNodes
      .toArray()
      .map((node) => $(node).prop('outerHTML'))
      .join('');

    sections.push({
      // Normalize the 'STATS ... BY' heading to the bare command name.
      title: title === 'STATS ... BY' ? 'STATS' : title,
      content: `<div><h1>${title}</h1> ${htmlContent}</div>`,
    });
  });

  return sections;
}

This file was deleted.

Loading