diff --git a/apps/web/src/actions/copilot/refinePrompt.ts b/apps/web/src/actions/copilot/refinePrompt.ts index e31f834b5..fc8019ea1 100644 --- a/apps/web/src/actions/copilot/refinePrompt.ts +++ b/apps/web/src/actions/copilot/refinePrompt.ts @@ -8,6 +8,7 @@ import { EvaluationsRepository, } from '@latitude-data/core/repositories' import { serialize as serializeEvaluationResult } from '@latitude-data/core/services/evaluationResults/serialize' +import { getEvaluationPrompt } from '@latitude-data/core/services/evaluations/index' import { env } from '@latitude-data/env' import { createSdk } from '$/app/(private)/_lib/createSdk' import { z } from 'zod' @@ -76,11 +77,16 @@ export const refinePromptAction = authProcedure projectId: env.COPILOT_PROJECT_ID, }).then((r) => r.unwrap()) + const evaluationPrompt = await getEvaluationPrompt({ + workspace: ctx.workspace, + evaluation, + }).then((r) => r.unwrap()) + const result = await sdk.run(env.COPILOT_REFINE_PROMPT_PATH, { stream: false, parameters: { prompt: document.content, - evaluation: evaluation.metadata.prompt, + evaluation: evaluationPrompt, results: serializedEvaluationResults, }, }) diff --git a/apps/web/src/app/(private)/_data-access/index.ts b/apps/web/src/app/(private)/_data-access/index.ts index a04ddc263..5f94fcbe0 100644 --- a/apps/web/src/app/(private)/_data-access/index.ts +++ b/apps/web/src/app/(private)/_data-access/index.ts @@ -212,13 +212,20 @@ export const getApiKeysCached = cache(async () => { return result.unwrap() }) -export const getProviderApiKeyCached = cache(async (name: string) => { +export const getProviderApiKeyByNameCached = cache(async (name: string) => { const { workspace } = await getCurrentUser() const scope = new ProviderApiKeysRepository(workspace.id) const result = await scope.findByName(name) return result.unwrap() }) +export const getProviderApiKeyByIdCached = cache(async (id: number) => { + const { workspace } = await getCurrentUser() + const scope = new ProviderApiKeysRepository(workspace.id) + const result = await scope.find(id) + return result.unwrap() +}) + export const getProviderApiKeysCached = cache(async () => { const { workspace } = await getCurrentUser() const scope = new ProviderApiKeysRepository(workspace.id) diff --git a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/dashboard/_components/EvaluationStats.tsx b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/dashboard/_components/EvaluationStats.tsx index 0f27f3f89..62024b612 100644 --- a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/dashboard/_components/EvaluationStats.tsx +++ b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/dashboard/_components/EvaluationStats.tsx @@ -3,7 +3,10 @@ import { useEffect, useState } from 'react' import { readMetadata } from '@latitude-data/compiler' -import { EvaluationDto } from '@latitude-data/core/browser' +import { + EvaluationDto, + EvaluationMetadataType, +} from '@latitude-data/core/browser' import { ConnectedDocumentWithMetadata } from '@latitude-data/core/repositories' import { Skeleton, Text } from '@latitude-data/web-ui' import { formatCostInMillicents } from '$/app/_lib/formatUtils' @@ -34,10 +37,14 @@ export default function EvaluationStats({ useConnectedDocuments({ evaluation }) useEffect(() => { - readMetadata({ prompt: evaluation.metadata.prompt }).then((metadata) => { - const metadataModel = (metadata.config['model'] as string) ?? 
'Unknown'
-      setModel(metadataModel)
-    })
+    if (evaluation.metadataType === EvaluationMetadataType.LlmAsJudgeAdvanced) {
+      readMetadata({ prompt: evaluation.metadata.prompt }).then((metadata) => {
+        const metadataModel = (metadata.config['model'] as string) ?? 'Unknown'
+        setModel(metadataModel)
+      })
+    } else {
+      setModel(evaluation.metadata.model)
+    }
   }, [evaluation.metadata])

   return (
diff --git a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx
index 012e7a3af..415fdbba0 100644
--- a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx
+++ b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/Playground/Chat/index.tsx
@@ -7,6 +7,7 @@ import {
 import {
   ChainEventTypes,
   EvaluationDto,
+  EvaluationMetadataLlmAsJudgeAdvanced,
   StreamEventTypes,
 } from '@latitude-data/core/browser'
 import {
@@ -52,6 +53,10 @@ export default function Chat({
   const [responseStream, setResponseStream] = useState()
   const [isStreaming, setIsStreaming] = useState(false)

+  // TODO: Only advanced evaluations are available right now. Next PR will add separate components for each evaluation type
+  const prompt = (evaluation.metadata as EvaluationMetadataLlmAsJudgeAdvanced)
+    .prompt
+
   const addMessageToConversation = useCallback(
     (message: ConversationMessage) => {
       let newConversation: Conversation
@@ -75,7 +80,7 @@
     let messagesCount = 0

     const [data, error] = await runPromptAction({
-      prompt: evaluation.metadata.prompt,
+      prompt,
       parameters,
     })
     if (error) {
diff --git a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/index.tsx b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/index.tsx
index f8aa37bca..c91004972 100644
--- a/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/index.tsx
+++ b/apps/web/src/app/(private)/evaluations/(evaluation)/[evaluationUuid]/editor/_components/EvaluationEditor/Editor/index.tsx
@@ -3,6 +3,7 @@
 import { Suspense, useCallback, useEffect, useMemo, useState } from 'react'

 import {
+  EvaluationMetadataLlmAsJudgeAdvanced,
   ProviderApiKey,
   SERIALIZED_DOCUMENT_LOG_FIELDS,
 } from '@latitude-data/core/browser'
@@ -70,6 +71,10 @@ export default function EvaluationEditor({
   if (!evaluation) return null

+  // TODO: Only advanced evaluations are available right now. Next PR will add separate components for each evaluation type
+  const prompt = (evaluation.metadata as EvaluationMetadataLlmAsJudgeAdvanced)
+    .prompt
+
   return (
@@ -82,7 +87,7 @@ export default function EvaluationEditor({ onChangePrompt={onChange} rightActions={ <> - {value !== evaluation.metadata.prompt && ( + {value !== prompt && (
diff --git a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/RefineModal/steps/2_SelectEvaluationResults/SelectableEvaluationResultsTable.tsx b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/RefineModal/steps/2_SelectEvaluationResults/SelectableEvaluationResultsTable.tsx index 208156779..6c896f0eb 100644 --- a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/RefineModal/steps/2_SelectEvaluationResults/SelectableEvaluationResultsTable.tsx +++ b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/_components/DocumentEditor/Editor/RefineModal/steps/2_SelectEvaluationResults/SelectableEvaluationResultsTable.tsx @@ -29,9 +29,7 @@ export const ResultCellContent = ({ evaluation: EvaluationDto value: unknown }) => { - if ( - evaluation.metadata.configuration.type === EvaluationResultableType.Boolean - ) { + if (evaluation.resultType === EvaluationResultableType.Boolean) { return ( {String(value)} @@ -39,11 +37,9 @@ export const ResultCellContent = ({ ) } - if ( - evaluation.metadata.configuration.type === EvaluationResultableType.Number - ) { - const minValue = evaluation.metadata.configuration.detail?.range.from ?? 0 - const maxValue = evaluation.metadata.configuration.detail?.range.to ?? 10 + if (evaluation.resultType === EvaluationResultableType.Number) { + const minValue = evaluation.resultConfiguration.minValue + const maxValue = evaluation.resultConfiguration.maxValue return ( { - if ( - evaluation.metadata.configuration.type === EvaluationResultableType.Boolean - ) { + if (evaluation.resultType === EvaluationResultableType.Boolean) { return ( {String(value)} @@ -49,11 +47,9 @@ export const ResultCellContent = ({ ) } - if ( - evaluation.metadata.configuration.type === EvaluationResultableType.Number - ) { - const minValue = evaluation.metadata.configuration.detail?.range.from ?? 0 - const maxValue = evaluation.metadata.configuration.detail?.range.to ?? 10 + if (evaluation.resultType === EvaluationResultableType.Number) { + const minValue = evaluation.resultConfiguration.minValue + const maxValue = evaluation.resultConfiguration.maxValue return ( ({ +export function BigNumberPanels({ commit, evaluation, documentUuid, - isNumeric, }: { - isNumeric: T commit: Commit evaluation: EvaluationDto documentUuid: string @@ -23,15 +25,13 @@ export function BigNumberPanels({ evaluationId={evaluation.id} /> - {isNumeric && ( + {evaluation.resultType == EvaluationResultableType.Number ? 
( - )} - - {!isNumeric && ( + ) : ( { diff --git a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/Charts/index.tsx b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/Charts/index.tsx index aeaaeb3e6..c224d924e 100644 --- a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/Charts/index.tsx +++ b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/Charts/index.tsx @@ -12,10 +12,6 @@ export function EvaluationResultsCharts({ evaluation: EvaluationDto documentUuid: string }) { - const isNumerical = - evaluation.metadata.configuration.type === EvaluationResultableType.Number - - if (!isNumerical) return null - + if (evaluation.resultType != EvaluationResultableType.Number) return null return } diff --git a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/index.tsx b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/index.tsx index 394cfdacf..77f915e49 100644 --- a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/index.tsx +++ b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/_components/MetricsSummary/index.tsx @@ -5,16 +5,14 @@ import { Commit, EvaluationDto } from '@latitude-data/core/browser' import { BigNumberPanels } from './BigNumberPanels' import { EvaluationResultsCharts } from './Charts' -export function MetricsSummary({ +export function MetricsSummary({ commit, evaluation, documentUuid, - isNumeric, }: { commit: Commit evaluation: EvaluationDto documentUuid: string - isNumeric: T }) { return (
@@ -27,7 +25,6 @@ export function MetricsSummary({ commit={commit} evaluation={evaluation} documentUuid={documentUuid} - isNumeric={isNumeric} />
diff --git a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/layout.tsx b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/layout.tsx index c5f908871..ef5cddb18 100644 --- a/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/layout.tsx +++ b/apps/web/src/app/(private)/projects/[projectId]/versions/[commitUuid]/documents/[documentUuid]/evaluations/[evaluationId]/layout.tsx @@ -1,7 +1,10 @@ import { ReactNode } from 'react' import { readMetadata } from '@latitude-data/compiler' -import { EvaluationResultableType } from '@latitude-data/core/browser' +import { + EvaluationMetadataType, + EvaluationResultableType, +} from '@latitude-data/core/browser' import { env } from '@latitude-data/env' import { Breadcrumb, @@ -14,7 +17,8 @@ import { } from '@latitude-data/web-ui' import { findCommitCached, - getProviderApiKeyCached, + getProviderApiKeyByIdCached, + getProviderApiKeyByNameCached, } from '$/app/(private)/_data-access' import BreadcrumbLink from '$/components/BreadcrumbLink' import { ROUTES } from '$/services/routes' @@ -47,11 +51,9 @@ export default async function ConnectedEvaluationLayout({ projectId: Number(params.projectId), uuid: params.commitUuid, }) - const isNumeric = - evaluation.metadata.configuration.type == EvaluationResultableType.Number let provider - if (evaluation.metadata.prompt) { + if (evaluation.metadataType == EvaluationMetadataType.LlmAsJudgeAdvanced) { const metadata = await readMetadata({ prompt: evaluation.metadata.prompt, }) @@ -61,12 +63,16 @@ export default async function ConnectedEvaluationLayout({ typeof metadata.config.provider === 'string' ) { try { - provider = await getProviderApiKeyCached(metadata.config.provider) + provider = await getProviderApiKeyByNameCached(metadata.config.provider) } catch (error) { // do nothing, it could be that the provider name does not match any // provider in the workspace } } + } else { + provider = await getProviderApiKeyByIdCached( + evaluation.metadata.providerApiKeyId, + ) } return (
@@ -92,7 +98,7 @@ export default async function ConnectedEvaluationLayout({ {evaluation.name} - {TYPE_TEXT[evaluation.metadata.configuration.type]} + {TYPE_TEXT[evaluation.resultType]} {children}
diff --git a/apps/web/src/components/EvaluationAggregatedResult/index.tsx b/apps/web/src/components/EvaluationAggregatedResult/index.tsx index df55a2d10..01731378a 100644 --- a/apps/web/src/components/EvaluationAggregatedResult/index.tsx +++ b/apps/web/src/components/EvaluationAggregatedResult/index.tsx @@ -136,9 +136,7 @@ export default function EvaluationAggregatedResult({ documentUuid: string commitUuid: string }) { - if ( - evaluation.metadata.configuration.type === EvaluationResultableType.Number - ) { + if (evaluation.resultType === EvaluationResultableType.Number) { return ( () .notNull(), templateId: bigint('template_id', { mode: 'number' }).references( diff --git a/packages/core/src/schema/types.ts b/packages/core/src/schema/types.ts index 1b40050d4..6e9f796d7 100644 --- a/packages/core/src/schema/types.ts +++ b/packages/core/src/schema/types.ts @@ -99,35 +99,35 @@ export type EvaluationDto = Evaluation & metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced metadata: EvaluationMetadataLlmAsJudgeAdvanced resultType: EvaluationResultableType.Boolean - resultConfiguration?: EvaluationConfigurationBoolean // TODO: This is nullable by default, but will be changed in the future. + resultConfiguration: EvaluationConfigurationBoolean } | { metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced metadata: EvaluationMetadataLlmAsJudgeAdvanced resultType: EvaluationResultableType.Number - resultConfiguration?: EvaluationConfigurationNumerical + resultConfiguration: EvaluationConfigurationNumerical } | { metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced metadata: EvaluationMetadataLlmAsJudgeAdvanced resultType: EvaluationResultableType.Text - resultConfiguration?: EvaluationConfigurationText + resultConfiguration: EvaluationConfigurationText } | { metadataType: EvaluationMetadataType.LlmAsJudgeSimple - metadata: EvaluationMetadataLlmAsJudgeAdvanced + metadata: EvaluationMetadataLlmAsJudgeSimple resultType: EvaluationResultableType.Boolean resultConfiguration: EvaluationConfigurationBoolean } | { metadataType: EvaluationMetadataType.LlmAsJudgeSimple - metadata: EvaluationMetadataLlmAsJudgeAdvanced + metadata: EvaluationMetadataLlmAsJudgeSimple resultType: EvaluationResultableType.Number resultConfiguration: EvaluationConfigurationNumerical } | { metadataType: EvaluationMetadataType.LlmAsJudgeSimple - metadata: EvaluationMetadataLlmAsJudgeAdvanced + metadata: EvaluationMetadataLlmAsJudgeSimple resultType: EvaluationResultableType.Text resultConfiguration: EvaluationConfigurationText } diff --git a/packages/core/src/services/evaluationResults/aggregations/meanValueQuery.ts b/packages/core/src/services/evaluationResults/aggregations/meanValueQuery.ts index 7c46eb364..4aae6a74b 100644 --- a/packages/core/src/services/evaluationResults/aggregations/meanValueQuery.ts +++ b/packages/core/src/services/evaluationResults/aggregations/meanValueQuery.ts @@ -1,7 +1,11 @@ import { and, eq, sql } from 'drizzle-orm' import { getCommitFilter } from '../_createEvaluationResultQuery' -import { Commit, EvaluationDto } from '../../../browser' +import { + Commit, + EvaluationConfigurationNumerical, + EvaluationDto, +} from '../../../browser' import { database } from '../../../client' import { EvaluationResultsRepository } from '../../../repositories' import { commits, documentLogs } from '../../../schema' @@ -43,11 +47,11 @@ export async function getEvaluationMeanValueQuery( ), ) const value = results[0] - const config = evaluation.metadata.configuration - const { from: minValue, to: maxValue } = 
config.detail!.range + const config = + evaluation.resultConfiguration as EvaluationConfigurationNumerical return { - minValue, - maxValue, + minValue: config.minValue, + maxValue: config.maxValue, meanValue: value?.meanValue ?? 0, } } diff --git a/packages/core/src/services/evaluationResults/create.ts b/packages/core/src/services/evaluationResults/create.ts index aa1405286..00176151b 100644 --- a/packages/core/src/services/evaluationResults/create.ts +++ b/packages/core/src/services/evaluationResults/create.ts @@ -51,14 +51,12 @@ export async function createEvaluationResult( }: CreateEvaluationResultProps, db = database, ) { - const resultableTable = getResultable(evaluation.metadata.configuration.type) + const resultableTable = getResultable(evaluation.resultType) let resultableId: number | undefined if (!resultableTable && result) { return Result.error( - new BadRequestError( - `Unsupported result type: ${evaluation.metadata.configuration.type}`, - ), + new BadRequestError(`Unsupported result type: ${evaluation.resultType}`), ) } @@ -80,7 +78,7 @@ export async function createEvaluationResult( evaluationId: evaluation.id, documentLogId: documentLog.id, providerLogId: providerLog?.id, - resultableType: result ? evaluation.metadata.configuration.type : null, + resultableType: result ? evaluation.resultType : null, resultableId, source: documentLog.source, }) diff --git a/packages/core/src/services/evaluations/EvaluationRunChecker/index.ts b/packages/core/src/services/evaluations/EvaluationRunChecker/index.ts index d38c3f024..74f3293ca 100644 --- a/packages/core/src/services/evaluations/EvaluationRunChecker/index.ts +++ b/packages/core/src/services/evaluations/EvaluationRunChecker/index.ts @@ -15,6 +15,7 @@ import { Result } from '../../../lib' import { ChainError } from '../../chains/ChainErrors' import { serialize } from '../../documentLogs/serialize' import { createRunError } from '../../runErrors/create' +import { getEvaluationPrompt } from '../prompt' type EvaluationRunErrorCheckerCodes = | RunErrorCodes.EvaluationRunMissingProviderLogError @@ -84,9 +85,7 @@ export class EvaluationRunChecker { } private async buildSchema() { - const resultSchema = getResultSchema( - this.evaluation.metadata.configuration.type, - ) + const resultSchema = getResultSchema(this.evaluation.resultType) if (resultSchema.error) { await this.saveError(resultSchema.error) @@ -109,9 +108,14 @@ export class EvaluationRunChecker { if (serializedLogResult.error) return serializedLogResult try { + const evaluationPrompt = await getEvaluationPrompt({ + workspace, + evaluation: this.evaluation, + }).then((r) => r.unwrap()) + return Result.ok( createChainFn({ - prompt: this.evaluation.metadata.prompt, + prompt: evaluationPrompt, parameters: { ...serializedLogResult.value, }, diff --git a/packages/core/src/services/evaluations/compiler/index.ts b/packages/core/src/services/evaluations/compiler/index.ts deleted file mode 100644 index dfa510bcc..000000000 --- a/packages/core/src/services/evaluations/compiler/index.ts +++ /dev/null @@ -1,44 +0,0 @@ -import { - Chain, - ConversationMetadata, - readMetadata, -} from '@latitude-data/compiler' - -import { DocumentLog, EvaluationDto } from '../../../browser' -import { LatitudeError, Result, TypedResult } from '../../../lib' -import { ProviderLogsRepository } from '../../../repositories' -import { PARAMETERS_FROM_LOG } from './constants' - -export async function buildEvaluationChain( - evaluation: EvaluationDto, - documentLog: DocumentLog, -): Promise> { - const providerLogScope 
= new ProviderLogsRepository(evaluation.workspaceId) - const providerLogResult = await providerLogScope.findLastByDocumentLogUuid( - documentLog.uuid, - ) - if (providerLogResult.error) return providerLogResult - const providerLog = providerLogResult.value - - const parameters = Object.fromEntries( - Object.entries(PARAMETERS_FROM_LOG).map(([name, getValueFromLog]) => { - return [name, getValueFromLog({ documentLog, providerLog })] - }), - ) - - const chain = new Chain({ prompt: evaluation.metadata.prompt, parameters }) - return Result.ok(chain) -} - -export async function readMetadataFromEvaluation( - evaluation: EvaluationDto, -): Promise> { - const metadata = await readMetadata({ - prompt: evaluation.metadata.prompt, - withParameters: Object.keys(PARAMETERS_FROM_LOG), - }) - - return Result.ok(metadata) -} - -export { PARAMETERS_FROM_LOG } diff --git a/packages/core/src/services/evaluations/create.test.ts b/packages/core/src/services/evaluations/create.test.ts index 42a7693a6..4bd7d60ca 100644 --- a/packages/core/src/services/evaluations/create.test.ts +++ b/packages/core/src/services/evaluations/create.test.ts @@ -1,6 +1,11 @@ import { beforeEach, describe, expect, it } from 'vitest' -import { ProviderApiKey, User, Workspace } from '../../browser' +import { + EvaluationMetadataLlmAsJudgeAdvanced, + ProviderApiKey, + User, + Workspace, +} from '../../browser' import { EvaluationMetadataType, EvaluationResultableType, @@ -118,7 +123,6 @@ describe('createAdvancedEvaluation', () => { expect(result.ok).toBe(true) - expect(result.value!.metadata.configuration).toBeDefined() expect(result.value!.resultConfiguration).toBeDefined() }) }) @@ -155,16 +159,6 @@ describe('createAdvancedEvaluation', () => { }) expect(result.ok).toBe(true) - - expect(result.value!.metadata.configuration).toEqual({ - type: EvaluationResultableType.Number, - detail: { - range: { - from: 0, - to: 100, - }, - }, - }) expect(result.value!.resultConfiguration).toMatchObject({ minValue: 0, maxValue: 100, @@ -253,9 +247,6 @@ ${metadata.prompt} const evaluation = await repo .find(result.value!.id) .then((r) => r.unwrap()) - expect(evaluation.metadata.configuration.type).toBe( - EvaluationResultableType.Text, - ) expect(evaluation.resultType).toBe(EvaluationResultableType.Text) }) @@ -282,9 +273,6 @@ ${metadata.prompt} .find(result.value!.id) .then((r) => r.unwrap()) - expect(evaluation.metadata.configuration.type).toBe( - EvaluationResultableType.Boolean, - ) expect(evaluation.resultType).toBe(EvaluationResultableType.Boolean) }) @@ -314,7 +302,11 @@ ${metadata.prompt} const evaluation = await repo .find(result.value!.id) .then((r) => r.unwrap()) - expect(evaluation.metadata.templateId).toBe(template.id) + + expect( + (evaluation.metadata as EvaluationMetadataLlmAsJudgeAdvanced) + .templateId, + ).toBe(template.id) }) it('does not allow to create a number type evaluation without proper configuration', async () => { @@ -543,9 +535,6 @@ describe('createEvaluation', () => { metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced, metadata: { prompt: 'Test Prompt', - configuration: { - type: EvaluationResultableType.Text, - }, templateId: null, }, resultType: EvaluationResultableType.Text, @@ -581,15 +570,6 @@ describe('createEvaluation', () => { metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced, metadata: { prompt: 'Test Prompt', - configuration: { - type: EvaluationResultableType.Number, - detail: { - range: { - from: 0, - to: 100, - }, - }, - }, templateId: null, }, resultType: EvaluationResultableType.Number, @@ -637,9 
+617,6 @@ describe('createEvaluation', () => { metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced, metadata: { prompt: 'Test Prompt', - configuration: { - type: EvaluationResultableType.Boolean, - }, templateId: null, }, resultType: EvaluationResultableType.Boolean, diff --git a/packages/core/src/services/evaluations/create.ts b/packages/core/src/services/evaluations/create.ts index 00fbc76e8..0036e24d9 100644 --- a/packages/core/src/services/evaluations/create.ts +++ b/packages/core/src/services/evaluations/create.ts @@ -62,7 +62,7 @@ export async function createEvaluation< metadata: M extends EvaluationMetadataType.LlmAsJudgeSimple ? Omit : M extends EvaluationMetadataType.LlmAsJudgeAdvanced - ? Omit + ? Omit : never resultType: R resultConfiguration: R extends EvaluationResultableType.Boolean @@ -102,10 +102,31 @@ export async function createEvaluation< ) } + const legacyConfiguration = + metadataType === EvaluationMetadataType.LlmAsJudgeAdvanced + ? { + configuration: { + type: resultType, + ...(resultType === EvaluationResultableType.Number && { + detail: { + range: { + from: ( + resultConfiguration as EvaluationConfigurationNumerical + ).minValue, + to: (resultConfiguration as EvaluationConfigurationNumerical) + .maxValue, + }, + }, + }), + }, + } + : undefined + return await Transaction.call(async (tx) => { const metadataRow = (await tx .insert(metadataTables[metadataType]) - .values([metadata]) + // @ts-expect-error — The configuration type is being manually added here, although drizzle does not like it. It will be removed in the next PR. + .values([{ ...metadata, ...legacyConfiguration }]) .returning() .then((r) => r[0]!)) as IEvaluationMetadata diff --git a/packages/core/src/services/evaluations/index.ts b/packages/core/src/services/evaluations/index.ts index 308e81283..142479bff 100644 --- a/packages/core/src/services/evaluations/index.ts +++ b/packages/core/src/services/evaluations/index.ts @@ -1,6 +1,6 @@ export * from './create' export * from './update' export * from './destroy' -export * from './compiler' export * from './connect' export * from './run' +export * from './prompt' diff --git a/packages/core/src/services/evaluations/prompt/index.test.ts b/packages/core/src/services/evaluations/prompt/index.test.ts new file mode 100644 index 000000000..98500de8b --- /dev/null +++ b/packages/core/src/services/evaluations/prompt/index.test.ts @@ -0,0 +1,100 @@ +import { readMetadata } from '@latitude-data/compiler' +import { beforeEach, describe, expect, it } from 'vitest' + +import { getEvaluationPrompt } from '.' +import { ProviderApiKey, User } from '../../../browser' +import { + EvaluationMetadataType, + EvaluationResultableType, + Providers, + SERIALIZED_DOCUMENT_LOG_FIELDS, +} from '../../../constants' +import * as factories from '../../../tests/factories' +import { createEvaluation } from '../create' + +describe('getEvaluationPrompt', () => { + let workspace: any + let user: User + let provider: ProviderApiKey + + beforeEach(async () => { + const { + workspace: _workspace, + user: _user, + providers, + } = await factories.createProject({ + providers: [{ name: 'openai', type: Providers.OpenAI }], + }) + workspace = _workspace + user = _user + provider = providers[0]! 
+ }) + + it('returns the plain prompt with advanced evaluations', async () => { + const prompt = factories.helpers.createPrompt({ provider }) + + const evaluation = await createEvaluation({ + workspace, + user, + name: 'Test evaluation', + description: 'Test description', + metadataType: EvaluationMetadataType.LlmAsJudgeAdvanced, + metadata: { + prompt, + templateId: null, + }, + resultType: EvaluationResultableType.Number, + resultConfiguration: { + minValue: 0, + minValueDescription: null, + maxValue: 10, + maxValueDescription: null, + }, + }).then((r) => r.unwrap()) + + const obtainedPrompt = await getEvaluationPrompt({ + workspace, + evaluation, + }).then((r) => r.unwrap()) + + expect(obtainedPrompt).toBe(prompt) + }) + + it('Creates a compilable prompt for a simple evaluation', async () => { + const model = 'custom-model' + + const evaluation = await createEvaluation({ + workspace, + user, + name: 'Test evaluation', + description: 'Test description', + metadataType: EvaluationMetadataType.LlmAsJudgeSimple, + metadata: { + providerApiKeyId: provider.id, + model, + objective: `This is the evaluation's objective`, + additionalInstructions: `These are the evaluation's additional instructions`, + }, + resultType: EvaluationResultableType.Number, + resultConfiguration: { + minValue: 0, + minValueDescription: 'The minimum value', + maxValue: 10, + maxValueDescription: 'The maximum value', + }, + }).then((r) => r.unwrap()) + + const evaluationPrompt = await getEvaluationPrompt({ + workspace, + evaluation, + }).then((r) => r.unwrap()) + + const metadata = await readMetadata({ + prompt: evaluationPrompt, + withParameters: SERIALIZED_DOCUMENT_LOG_FIELDS, + }) + expect(metadata).toBeDefined() + expect(metadata.errors.length).toBe(0) + expect(metadata.config.model).toBe(model) + }) +}) diff --git a/packages/core/src/services/evaluations/prompt/index.ts b/packages/core/src/services/evaluations/prompt/index.ts new file mode 100644 index 000000000..179c552a6 --- /dev/null +++ b/packages/core/src/services/evaluations/prompt/index.ts @@ -0,0 +1,110 @@ +import { + EvaluationDto, + EvaluationMetadataType, + EvaluationResultableType, + Workspace, +} from '../../../browser' +import { database } from '../../../client' +import { LatitudeError, PromisedResult, Result } from '../../../lib' +import { ProviderApiKeysRepository } from '../../../repositories' + +function valueInformation(evaluation: EvaluationDto) { + if (evaluation.resultType === EvaluationResultableType.Boolean) { + const base = 'A boolean value' + const valueDescriptions = [ + evaluation.resultConfiguration.trueValueDescription + ? `true represents "${evaluation.resultConfiguration.trueValueDescription}"` + : undefined, + evaluation.resultConfiguration.falseValueDescription + ? `false represents "${evaluation.resultConfiguration.falseValueDescription}"` + : undefined, + ].filter(Boolean) + + if (!valueDescriptions.length) return base + return `${base}, where ${valueDescriptions.join(' and ')}` + } + + if (evaluation.resultType === EvaluationResultableType.Number) { + const base = `A number between ${evaluation.resultConfiguration.minValue} and ${evaluation.resultConfiguration.maxValue}` + const valueDescriptions = [ + evaluation.resultConfiguration.minValueDescription + ? `${evaluation.resultConfiguration.minValue} represents "${evaluation.resultConfiguration.minValueDescription}"` + : undefined, + evaluation.resultConfiguration.maxValueDescription + ? 
`${evaluation.resultConfiguration.maxValue} represents "${evaluation.resultConfiguration.maxValueDescription}"`
+      : undefined,
+  ].filter(Boolean)
+
+  if (!valueDescriptions.length) return base
+  return `${base}, where ${valueDescriptions.join(' and ')}`
+  }
+
+  const base = 'A string value'
+  if (!evaluation.resultConfiguration.valueDescription) return base
+  return `${base} representing "${evaluation.resultConfiguration.valueDescription}"`
+}
+
+export async function getEvaluationPrompt(
+  {
+    workspace,
+    evaluation,
+  }: { workspace: Workspace; evaluation: EvaluationDto },
+  db = database,
+): PromisedResult<string, LatitudeError> {
+  if (evaluation.metadataType === EvaluationMetadataType.LlmAsJudgeAdvanced) {
+    return Result.ok(evaluation.metadata.prompt)
+  }
+
+  const providersRepo = new ProviderApiKeysRepository(workspace.id, db)
+  const providerResult = await providersRepo.find(
+    evaluation.metadata.providerApiKeyId,
+  )
+  if (providerResult.error) return providerResult
+  const provider = providerResult.unwrap()
+
+  const resultSchema = {
+    [EvaluationResultableType.Boolean]: 'boolean',
+    [EvaluationResultableType.Number]: 'number',
+    [EvaluationResultableType.Text]: 'string',
+  } as const
+
+  return Result.ok(
+    `
+---
+provider: ${provider.name}
+model: ${evaluation.metadata.model}
+temperature: 0
+schema:
+  type: object
+  properties:
+    value:
+      type: ${resultSchema[evaluation.resultType]}
+    reason:
+      type: string
+  required:
+    - value
+    - reason
+---
+
+You’re an expert LLM evaluator. Your objective is to evaluate the response from an LLM model based on this goal:
+
+${evaluation.metadata.objective}
+
+${evaluation.metadata.additionalInstructions ? 'Additionally, you should follow these instructions:\n\n' + evaluation.metadata.additionalInstructions : ''}
+
+Now, evaluate the assistant response for the following conversation, based on your instructions:
+
+{{ messages.all }}
+
+{{#if cost || duration }}
+  Also, here is some additional metadata about the LLM response. It may or may not be relevant to your objective.
+  {{#if cost }} - Cost: {{ cost }} cents. {{/if}}
+  {{#if duration }} - Duration: {{ duration }} milliseconds. {{/if}}
+{{/if}}
+
+You must respond with a JSON object with the following properties:
+  - value: ${valueInformation(evaluation)}
+  - reason: A string explaining your evaluation decision.
+`.trim(), + ) +} diff --git a/packages/core/src/services/evaluations/run.test.ts b/packages/core/src/services/evaluations/run.test.ts index b0dd8d8c2..bcbe4f25e 100644 --- a/packages/core/src/services/evaluations/run.test.ts +++ b/packages/core/src/services/evaluations/run.test.ts @@ -23,11 +23,10 @@ import { ErrorableEntity, LogSources, Providers } from '../../constants' import { publisher } from '../../events/publisher' import { Result } from '../../lib' import * as generateUUIDModule from '../../lib/generateUUID' -import { EvaluationsRepository } from '../../repositories' import { documentLogs, - evaluationMetadataLlmAsJudgeAdvanced, evaluationResults, + evaluations, providerApiKeys, providerLogs, runErrors, @@ -36,6 +35,7 @@ import * as factories from '../../tests/factories' import { ChainError } from '../chains/ChainErrors' import * as runChainModule from '../chains/run' import { serialize } from '../documentLogs/serialize' +import { getEvaluationPrompt } from './prompt' import { runEvaluation } from './run' const publisherSpy = vi.spyOn(publisher, 'publishLater') @@ -159,8 +159,12 @@ describe('run', () => { documentLog, }) const serializedPrompt = serializeResult.value + const evaluationPrompt = await getEvaluationPrompt({ + workspace, + evaluation, + }).then((r) => r.unwrap()) const chain = createChain({ - prompt: evaluation.metadata.prompt, + prompt: evaluationPrompt, parameters: { ...serializedPrompt }, }) expect(runChainModule.runChain).toHaveBeenCalledWith({ @@ -201,7 +205,7 @@ describe('run', () => { expect(evaluationResult).toMatchObject({ uuid: FAKE_GENERATED_UUID, - resultableType: evaluation.metadata.configuration.type, + resultableType: evaluation.resultType, source: LogSources.API, }) }) @@ -320,60 +324,20 @@ describe('run', () => { }) it('fails evaluation type is not recognized', async () => { - await database - .update(evaluationMetadataLlmAsJudgeAdvanced) - .set({ - configuration: { - ...evaluation.metadata.configuration, + let error + try { + await database + .update(evaluations) + .set({ // @ts-expect-error - intentionally setting invalid type - type: 'unknown', - }, - }) - .where( - eq(evaluationMetadataLlmAsJudgeAdvanced.id, evaluation.metadataId), - ) - - const evalRepo = new EvaluationsRepository(workspace.id) - const updatedEvaluation = await evalRepo - .find(evaluation.id) - .then((r) => r.unwrap()) - - updatedEvaluation.metadata = { - ...updatedEvaluation.metadata, - prompt: 'foo', + resultType: 'unknown', + }) + .where(eq(evaluations.id, evaluation.id)) + } catch (err) { + error = err } - const result = await runEvaluation({ - documentLog, - documentUuid, - evaluation: updatedEvaluation, - }) - const error = await findError( - RunErrorCodes.EvaluationRunUnsupportedResultTypeError, - ) - const evaluationResult = await database.query.evaluationResults.findFirst( - { - where: eq(evaluationResults.evaluationId, updatedEvaluation.id), - }, - ) - - expect(evaluationResult).toEqual( - expect.objectContaining({ - uuid: error?.errorableUuid, - documentLogId: documentLog.id, - evaluationId: updatedEvaluation.id, - resultableType: null, - resultableId: null, - source: LogSources.API, - }), - ) expect(error).toBeDefined() - expect(result.error).toEqual( - new ChainError({ - code: RunErrorCodes.EvaluationRunUnsupportedResultTypeError, - message: `Unsupported evaluation type 'unknown'`, - }), - ) }) it('fails when chain response without object', async () => { diff --git a/packages/core/src/tests/factories/evaluationResults.ts 
b/packages/core/src/tests/factories/evaluationResults.ts index 897b6af21..02d0ba9e0 100644 --- a/packages/core/src/tests/factories/evaluationResults.ts +++ b/packages/core/src/tests/factories/evaluationResults.ts @@ -14,6 +14,7 @@ import { generateUUIDIdentifier } from '../../lib' import { ProviderApiKeysRepository } from '../../repositories' import { Config } from '../../services/ai' import { createEvaluationResult as createEvaluationResultService } from '../../services/evaluationResults' +import { getEvaluationPrompt } from '../../services/evaluations' import { createProviderLog } from '../../services/providerLogs' export type IEvaluationResultData = { @@ -44,9 +45,13 @@ export async function createEvaluationResult({ ) const workspace = (await findWorkspaceFromCommit(commit))! const providerScope = new ProviderApiKeysRepository(workspace.id) + const evaluationPrompt = await getEvaluationPrompt({ + workspace, + evaluation, + }).then((r) => r.unwrap()) const chain = createChain({ - prompt: evaluation.metadata.prompt, + prompt: evaluationPrompt, parameters: {}, // TODO: Generate parameters from documentLog })