From 248afdc85bed09eef0d549294fc08937186bc2ba Mon Sep 17 00:00:00 2001 From: Gerard Clos Date: Wed, 11 Sep 2024 12:19:28 +0200 Subject: [PATCH] feature: batch evaluations backend #2 This commit implements the full cycle of a batch evaluation. Now users can run batch evaluations on top of their prompts with dataset data. --- .../:commitUuid/documents/handlers/run.ts | 14 --- .../src/actions/evaluations/runBatch.test.ts | 12 +- apps/web/src/actions/evaluations/runBatch.ts | 8 +- apps/web/src/actions/prompts/run.ts | 2 +- .../Editor/Playground/Chat/index.tsx | 16 ++- .../Editor/Playground/index.tsx | 56 +++------ .../Editor/Playground/index.tsx | 10 +- .../dashboard/_components/Actions.tsx | 51 ++++++++ .../evaluations/dashboard/layout.tsx | 16 +-- .../DocumentLogs/DocumentLogInfo/Metadata.tsx | 2 +- .../DocumentLogs/DocumentLogsTable.tsx | 2 +- .../versions/[commitUuid]/layout.tsx | 2 + .../workspaces/1/datasets/batchtest.csv | 7 ++ packages/core/src/data-access/documentLogs.ts | 40 +++++++ packages/core/src/data-access/evaluations.ts | 14 +++ packages/core/src/data-access/index.ts | 3 + packages/core/src/data-access/providerLogs.ts | 27 +++++ .../handlers/createEvaluationResultJob.ts | 33 ++++++ .../events/handlers/documentLogs/createJob.ts | 34 ++++++ packages/core/src/events/handlers/index.ts | 33 ++++++ .../getDocumentLogsWithMetadata.test.ts | 2 +- .../documentLogsRepository/index.ts | 78 ++++--------- .../core/src/schema/models/documentLogs.ts | 12 +- .../core/src/schema/models/providerLogs.ts | 2 +- packages/core/src/services/chains/run.ts | 8 ++ .../services/commits/runDocumentAtCommit.ts | 21 +++- .../documentLogs/_createDocumentLogQuery.ts | 52 +++++++++ .../computeDocumentLogWithMetadata.test.ts | 108 +++++++++++++++++ .../computeDocumentLogWithMetadata.ts | 28 +++++ .../computeDocumentLogsWithMetadata.ts | 31 +++++ .../core/src/services/documentLogs/index.ts | 2 + .../src/services/evaluationResults/create.ts | 3 +- .../core/src/services/evaluations/index.ts | 1 + packages/core/src/services/evaluations/run.ts | 80 +++++++++++++ .../core/src/services/providerLogs/create.ts | 10 +- .../providerLogs/formatForEvaluation.ts | 45 ++++++++ .../core/src/services/providerLogs/index.ts | 1 + packages/core/src/tests/factories/index.ts | 1 + .../core/src/tests/factories/providerLogs.ts | 44 +++++++ .../runBatchEvaluationJob.test.ts | 16 ++- .../batchEvaluations/runBatchEvaluationJob.ts | 109 +++++++++--------- .../batchEvaluations/runDocumentJob.test.ts | 77 +++++++++---- .../batchEvaluations/runDocumentJob.ts | 98 +++++++++------- .../batchEvaluations/runEvaluationJob.ts | 65 ++++++----- .../job-definitions/documentLogs/createJob.ts | 13 --- packages/jobs/src/job-definitions/index.ts | 5 +- packages/jobs/src/queues/index.ts | 9 +- packages/jobs/src/workers/_shared.ts | 11 +- 48 files changed, 982 insertions(+), 332 deletions(-) create mode 100644 apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/dashboard/_components/Actions.tsx create mode 100644 packages/core/public/uploads/workspaces/1/datasets/batchtest.csv create mode 100644 packages/core/src/data-access/documentLogs.ts create mode 100644 packages/core/src/data-access/evaluations.ts create mode 100644 packages/core/src/data-access/providerLogs.ts create mode 100644 packages/core/src/events/handlers/createEvaluationResultJob.ts create mode 100644 packages/core/src/events/handlers/documentLogs/createJob.ts create mode 100644 packages/core/src/services/documentLogs/_createDocumentLogQuery.ts create mode 100644 packages/core/src/services/documentLogs/computeDocumentLogWithMetadata.test.ts create mode 100644 packages/core/src/services/documentLogs/computeDocumentLogWithMetadata.ts create mode 100644 packages/core/src/services/documentLogs/computeDocumentLogsWithMetadata.ts create mode 100644 packages/core/src/services/evaluations/run.ts create mode 100644 packages/core/src/services/providerLogs/formatForEvaluation.ts create mode 100644 packages/core/src/tests/factories/providerLogs.ts delete mode 100644 packages/jobs/src/job-definitions/documentLogs/createJob.ts diff --git a/apps/gateway/src/routes/api/v1/projects/:projectId/commits/:commitUuid/documents/handlers/run.ts b/apps/gateway/src/routes/api/v1/projects/:projectId/commits/:commitUuid/documents/handlers/run.ts index 7ef0b349d..d50bb2c35 100644 --- a/apps/gateway/src/routes/api/v1/projects/:projectId/commits/:commitUuid/documents/handlers/run.ts +++ b/apps/gateway/src/routes/api/v1/projects/:projectId/commits/:commitUuid/documents/handlers/run.ts @@ -2,7 +2,6 @@ import { zValidator } from '@hono/zod-validator' import { LogSources } from '@latitude-data/core/browser' import { runDocumentAtCommit } from '@latitude-data/core/services/commits/runDocumentAtCommit' import { pipeToStream } from '$/common/pipeToStream' -import { queues } from '$/jobs' import { Factory } from 'hono/factory' import { streamSSE } from 'hono/streaming' import { z } from 'zod' @@ -21,8 +20,6 @@ export const runHandler = factory.createHandlers( zValidator('json', runSchema), async (c) => { return streamSSE(c, async (stream) => { - const startTime = Date.now() - const { projectId, commitUuid } = c.req.param() const { documentPath, parameters, source } = c.req.valid('json') @@ -44,17 +41,6 @@ export const runHandler = factory.createHandlers( }).then((r) => r.unwrap()) await pipeToStream(stream, result.stream) - - queues.defaultQueue.jobs.enqueueCreateDocumentLogJob({ - commit, - data: { - uuid: result.documentLogUuid, - documentUuid: document.documentUuid, - resolvedContent: result.resolvedContent, - parameters, - duration: Date.now() - startTime, - }, - }) }) }, ) diff --git a/apps/web/src/actions/evaluations/runBatch.test.ts b/apps/web/src/actions/evaluations/runBatch.test.ts index c4241fe6c..6a5b060cb 100644 --- a/apps/web/src/actions/evaluations/runBatch.test.ts +++ b/apps/web/src/actions/evaluations/runBatch.test.ts @@ -10,7 +10,7 @@ import { import * as factories from '@latitude-data/core/factories' import { beforeEach, describe, expect, it, vi } from 'vitest' -import { runBatchAction } from './runBatch' +import { runBatchEvaluationAction } from './runBatch' const mocks = vi.hoisted(() => ({ getSession: vi.fn(), @@ -40,7 +40,7 @@ vi.mock('@latitude-data/jobs', () => ({ describe('runBatchAction', () => { describe('unauthorized', () => { it('errors when the user is not authenticated', async () => { - const [_, error] = await runBatchAction({ + const [_, error] = await runBatchEvaluationAction({ datasetId: 1, projectId: 1, documentUuid: 'doc-uuid', @@ -95,7 +95,7 @@ describe('runBatchAction', () => { }) it('successfully enqueues a batch evaluation job', async () => { - const [result, error] = await runBatchAction({ + const [result, error] = await runBatchEvaluationAction({ datasetId: dataset.id, projectId: project.id, documentUuid: document.documentUuid, @@ -128,7 +128,7 @@ describe('runBatchAction', () => { }) it('handles optional parameters', async () => { - const [result, error] = await runBatchAction({ + const [result, error] = await runBatchEvaluationAction({ datasetId: dataset.id, projectId: project.id, documentUuid: document.documentUuid, @@ -156,7 +156,7 @@ describe('runBatchAction', () => { }) it('handles errors when resources are not found', async () => { - const [_, error] = await runBatchAction({ + const [_, error] = await runBatchEvaluationAction({ datasetId: 999999, projectId: project.id, documentUuid: document.documentUuid, @@ -176,7 +176,7 @@ describe('runBatchAction', () => { name: 'Test Evaluation 2', }) - const [result, error] = await runBatchAction({ + const [result, error] = await runBatchEvaluationAction({ datasetId: dataset.id, projectId: project.id, documentUuid: document.documentUuid, diff --git a/apps/web/src/actions/evaluations/runBatch.ts b/apps/web/src/actions/evaluations/runBatch.ts index 778b108b2..5db530095 100644 --- a/apps/web/src/actions/evaluations/runBatch.ts +++ b/apps/web/src/actions/evaluations/runBatch.ts @@ -1,3 +1,5 @@ +'use server' + import { DatasetsRepository, DocumentVersionsRepository, @@ -9,7 +11,7 @@ import { z } from 'zod' import { authProcedure } from '../procedures' -export const runBatchAction = authProcedure +export const runBatchEvaluationAction = authProcedure .createServerAction() .input( z.object({ @@ -17,8 +19,8 @@ export const runBatchAction = authProcedure projectId: z.number(), documentUuid: z.string(), commitUuid: z.string(), - fromLine: z.number(), - toLine: z.number(), + fromLine: z.number().optional(), + toLine: z.number().optional(), parameters: z.record(z.number()).optional(), evaluationIds: z.array(z.number()), }), diff --git a/apps/web/src/actions/prompts/run.ts b/apps/web/src/actions/prompts/run.ts index 0a870ecdf..36f0a0f69 100644 --- a/apps/web/src/actions/prompts/run.ts +++ b/apps/web/src/actions/prompts/run.ts @@ -14,7 +14,7 @@ export const runPromptAction = authProcedure .input( z.object({ prompt: z.string(), - parameters: z.object({ messages: z.string(), last_message: z.string() }), + parameters: z.record(z.any()), }), ) .handler(async ({ ctx, input }) => { diff --git a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx index 2cbc9df82..3879b2392 100644 --- a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx +++ b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx @@ -24,10 +24,18 @@ import { } from '$/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/Playground/Chat' import { readStreamableValue } from 'ai/rsc' -export const EVALUATION_PARAMETERS = ['messages', 'last_message'] +export const EVALUATION_PARAMETERS = [ + 'messages', + 'context', + 'response', + 'prompt', + 'parameters', + 'cost', + 'latency', + 'config', +] export type Parameters = (typeof EVALUATION_PARAMETERS)[number] -export type Inputs = { [key in Parameters]: string } export default function Chat({ clearChat, @@ -36,7 +44,7 @@ export default function Chat({ }: { clearChat: () => void evaluation: EvaluationDto - parameters: Inputs + parameters: Record }) { const [error, setError] = useState() const [tokens, setTokens] = useState(0) @@ -79,7 +87,7 @@ export default function Chat({ const [data, error] = await runPromptAction({ prompt: evaluation.metadata.prompt, - parameters: parameters as { messages: string; last_message: string }, + parameters, }) if (error) { setError(error) diff --git a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/index.tsx b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/index.tsx index 9a63b6cc7..fb3a1bf15 100644 --- a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/index.tsx +++ b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/index.tsx @@ -1,50 +1,24 @@ 'use client' import { useCallback, useEffect, useMemo, useState } from 'react' -import { capitalize } from 'lodash-es' -import { - ConversationMetadata, - Message, - MessageContent, - TextContent, -} from '@latitude-data/compiler' +import { ConversationMetadata } from '@latitude-data/compiler' import { EvaluationDto } from '@latitude-data/core/browser' -import { Badge, Icon, Text, TextArea } from '@latitude-data/web-ui' +import { + formatContext, + formatConversation, +} from '@latitude-data/core/services/providerLogs/formatForEvaluation' +import { Badge, Icon, Input, Text } from '@latitude-data/web-ui' +import { convertParams } from '$/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/Playground' import { ROUTES } from '$/services/routes' import useProviderLogs from '$/stores/providerLogs' import Link from 'next/link' import { useSearchParams } from 'next/navigation' import { Header } from '../Header' -import Chat, { EVALUATION_PARAMETERS, Inputs } from './Chat' +import Chat, { EVALUATION_PARAMETERS } from './Chat' import Preview from './Preview' -function convertMessage(message: Message) { - if (typeof message.content === 'string') { - return `${capitalize(message.role)}: \n ${message.content}` - } else { - const content = message.content[0] as MessageContent - if (content.type === 'text') { - return `${capitalize(message.role)}: \n ${(content as TextContent).text}` - } else { - return `${capitalize(message.role)}: <${content.type} message>` - } - } -} - -function convertMessages(messages: Message[]) { - return messages.map((message) => convertMessage(message)).join('\n') -} - -function convertParams(inputs: Inputs) { - return Object.fromEntries( - Object.entries(inputs).map(([key, value]) => { - return [key, value] - }), - ) -} - export default function Playground({ evaluation, metadata, @@ -53,7 +27,7 @@ export default function Playground({ metadata: ConversationMetadata }) { const [mode, setMode] = useState<'preview' | 'chat'>('preview') - const [inputs, setInputs] = useState( + const [inputs, setInputs] = useState>( Object.fromEntries( EVALUATION_PARAMETERS.map((param: string) => [param, '']), ), @@ -74,8 +48,14 @@ export default function Playground({ useEffect(() => { if (providerLog) { setInputs({ - messages: convertMessages(providerLog.messages), - last_message: `Assistant: ${providerLog.responseText}`, + messages: JSON.stringify(formatConversation(providerLog)), + context: JSON.stringify(formatContext(providerLog)), + response: providerLog.responseText, + prompt: '', + parameters: '', + config: '', + duration: '', + cost: '', }) } }, [setInput, providerLog]) @@ -106,7 +86,7 @@ export default function Playground({ > {{{param}}}
-