Skip to content

Commit

Permalink
refactor: comply with latest api spec
Browse files Browse the repository at this point in the history
  • Loading branch information
mdonnalley committed Dec 2, 2024
1 parent b468c27 commit aaa5ad6
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 149 deletions.
181 changes: 67 additions & 114 deletions src/agentTester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,77 @@ import { Connection, Lifecycle, PollingClient, StatusResult } from '@salesforce/
import { Duration } from '@salesforce/kit';
import { MaybeMock } from './maybe-mock';

type Format = 'human' | 'tap' | 'junit' | 'json';
type Format = 'human' | 'json';

/**
 * Lifecycle states shared by the run-level, test-case-level and
 * expectation-level `status` fields declared in this file.
 *
 * NOTE(review): the mock fixture and the test assertions in this same change use
 * "Completed" and "Failed" rather than these upper-case members — confirm the
 * casing and the full member set against the API spec.
 */
type TestStatus = 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';

type AgentTestStartResponse = {
id: string;
aiEvaluationId: string;
status: TestStatus;
};

type AgentTestStatusResponse = {
status: 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';
status: TestStatus;
startTime: string;
endTime?: string;
errorMessage?: string;
};

type AgentTestDetailsResponse = {
AiEvaluationSuiteDefinition: string;
tests: Array<{
AiEvaluationDefinition: string;
results: Array<{
test_number: number;
results: Array<{
name: string;
actual: string[];
is_pass: boolean;
execution_time_ms: number;
error?: string;
}>;
}>;
/**
 * Result of a single test case within an evaluation run, including the
 * per-expectation results that `humanFormat` renders as table rows.
 */
type TestCaseResult = {
/** Status of this individual test case. */
status: TestStatus;
/**
 * Test case number, interpolated into the "Test Case #…" table title.
 * NOTE(review): declared as string, but the mock fixture supplies a JSON number — confirm.
 */
number: string;
/** Start timestamp; the mock fixture uses ISO-8601 UTC strings. */
startTime: string;
/** End timestamp; optional in this type. */
endTime?: string;
/** Data generated for this test case. */
generatedData: {
type: 'AGENT';
actionsSequence: string[];
outcome: 'Success' | 'Failure';
topic: string;
// NOTE(review): both token counts are declared as string, but the mock fixture
// supplies JSON numbers — confirm against the API spec.
inputTokensCount: string;
outputTokensCount: string;
};
/** One entry per expectation evaluated for this test case. */
expectationResults: Array<{
name: string;
actualValue: string;
expectedValue: string;
score: number;
/** Mapped to 'Pass' / 'Fail' by `humanFormat`. */
result: 'Passed' | 'Failed';
metricLabel: 'Accuracy' | 'Precision';
metricExplainability: string;
status: TestStatus;
/** Parsed with `new Date(...)` by `humanFormat` to compute the runtime column. */
startTime: string;
endTime?: string;
// NOTE(review): the mock fixture sends `null` for errorCode/errorMessage, which
// these optional-string declarations do not model — confirm.
errorCode?: string;
errorMessage?: string;
}>;
};

/**
 * Response body for the details request issued via `maybeMock.request('GET', url)`;
 * it is passed to `humanFormat` / `jsonFormat` to produce the formatted output.
 */
type AgentTestDetailsResponse = {
/** Overall status of the run. */
status: TestStatus;
/** Run start timestamp. */
startTime: string;
/** Run end timestamp; optional in this type. */
endTime?: string;
/** Optional run-level error description. */
errorMessage?: string;
/** Per-test-case results; `humanFormat` emits one table per entry. */
testCases: TestCaseResult[];
};

export class AgentTester {
private maybeMock: MaybeMock;
public constructor(connection: Connection) {
this.maybeMock = new MaybeMock(connection);
}

public async start(suiteId: string): Promise<{ id: string }> {
/**
* Starts an AI evaluation run based on the provided name or ID.
*
* @param nameOrId - The name or ID of the AI evaluation definition.
* @param type - Specifies whether the provided identifier is a 'name' or 'id'. Defaults to 'name'. If 'name' is provided, nameOrId is treated as the name of the AiEvaluationDefinition. If 'id' is provided, nameOrId is treated as the unique ID of the AiEvaluationDefinition.
* @returns A promise that resolves to an object containing the ID of the started AI evaluation run.
*/
public async start(nameOrId: string, type: 'name' | 'id' = 'name'): Promise<{ aiEvaluationId: string }> {
const url = '/einstein/ai-evaluations/runs';

return this.maybeMock.request<AgentTestStartResponse>('POST', url, {
aiEvaluationSuiteDefinition: suiteId,
[type === 'name' ? 'aiEvaluationDefinitionName' : 'aiEvaluationDefinitionVersionId']: nameOrId,
});
}

Expand Down Expand Up @@ -100,14 +130,7 @@ export class AgentTester {
const response = await this.maybeMock.request<AgentTestDetailsResponse>('GET', url);
return {
response,
formatted:
format === 'human'
? await humanFormat(response)
: format === 'tap'
? await tapFormat(response)
: format === 'junit'
? await junitFormat(response)
: await jsonFormat(response),
formatted: format === 'human' ? await humanFormat(jobId, response) : await jsonFormat(response),
};
}

Expand All @@ -118,100 +141,30 @@ export class AgentTester {
}
}

export async function humanFormat(details: AgentTestDetailsResponse): Promise<string> {
// TODO: the api response isn't finalized so this is just a POC
export async function humanFormat(name: string, details: AgentTestDetailsResponse): Promise<string> {
const { Ux } = await import('@salesforce/sf-plugins-core');
const ux = new Ux();

const tables: string[] = [];
for (const aiEvalDef of details.tests) {
for (const result of aiEvalDef.results) {
const table = ux.makeTable({
title: `Test Results for ${aiEvalDef.AiEvaluationDefinition} (#${result.test_number})`,
data: result.results.map((r) => ({
'TEST NAME': r.name,
OUTCOME: r.is_pass ? 'Pass' : 'Fail',
MESSAGE: r.error ?? '',
'RUNTIME (MS)': r.execution_time_ms,
})),
});
tables.push(table);
}
for (const testCase of details.testCases) {
const table = ux.makeTable({
title: `Test Case #${testCase.number}`,
data: testCase.expectationResults.map((r) => ({
name: r.name,
outcome: r.result === 'Passed' ? 'Pass' : 'Fail',
actualValue: r.actualValue,
expectedValue: r.expectedValue,
score: r.score,
'metric label': r.metricLabel,
message: r.errorMessage ?? '',
'runtime (MS)': r.endTime ? new Date(r.endTime).getTime() - new Date(r.startTime).getTime() : 0,
})),
});
tables.push(table);
}

return tables.join('\n');
}

/**
 * Placeholder for a JUnit-style formatter.
 *
 * Not implemented: the function always rejects with `Error('Not implemented')`.
 * The commented block below is a sample of Apex test JUnit output kept as a
 * reference for the intended shape.
 *
 * @param details - Test run details; only referenced by the unreachable fallback return.
 * @returns Never resolves successfully; the returned promise always rejects.
 * @throws Error with message 'Not implemented'.
 */
export async function junitFormat(details: AgentTestDetailsResponse): Promise<string> {
// APEX EXAMPLE
// <?xml version="1.0" encoding="UTF-8"?>
// <testsuites>
// <testsuite name="force.apex" timestamp="2024-11-13T19:19:23.000Z" hostname="https://energy-site-1368-dev-ed.scratch.my.salesforce.com" tests="11" failures="0" errors="0" time="2.57">
// <properties>
// <property name="outcome" value="Successful"/>
// <property name="testsRan" value="11"/>
// <property name="passing" value="11"/>
// <property name="failing" value="0"/>
// <property name="skipped" value="0"/>
// <property name="passRate" value="100%"/>
// <property name="failRate" value="0%"/>
// <property name="testStartTime" value="Wed Nov 13 2024 12:19:23 PM"/>
// <property name="testSetupTimeInMs" value="0"/>
// <property name="testExecutionTime" value="2.57 s"/>
// <property name="testTotalTime" value="2.57 s"/>
// <property name="commandTime" value="0.17 s"/>
// <property name="hostname" value="https://energy-site-1368-dev-ed.scratch.my.salesforce.com"/>
// <property name="orgId" value="00DEi000006OlrxMAC"/>
// <property name="username" value="[email protected]"/>
// <property name="testRunId" value="707Ei00000dTRSa"/>
// <property name="userId" value="005Ei00000FkGU9IAN"/>
// </properties>
// <testcase name="importSampleData" classname="TestSampleDataController" time="0.27">
// </testcase>
// <testcase name="blankAddress" classname="GeocodingServiceTest" time="0.01">
// </testcase>
// <testcase name="errorResponse" classname="GeocodingServiceTest" time="0.01">
// </testcase>
// <testcase name="successResponse" classname="GeocodingServiceTest" time="0.01">
// </testcase>
// <testcase name="createFileFailsWhenIncorrectBase64Data" classname="FileUtilitiesTest" time="0.10">
// </testcase>
// <testcase name="createFileFailsWhenIncorrectFilename" classname="FileUtilitiesTest" time="0.03">
// </testcase>
// <testcase name="createFileFailsWhenIncorrectRecordId" classname="FileUtilitiesTest" time="0.35">
// </testcase>
// <testcase name="createFileSucceedsWhenCorrectInput" classname="FileUtilitiesTest" time="0.22">
// </testcase>
// <testcase name="testGetPagedPropertyList" classname="TestPropertyController" time="1.01">
// </testcase>
// <testcase name="testGetPicturesNoResults" classname="TestPropertyController" time="0.06">
// </testcase>
// <testcase name="testGetPicturesWithResults" classname="TestPropertyController" time="0.51">
// </testcase>
// </testsuite>
// </testsuites>
// Awaiting a rejected promise throws here, so the return below is never reached.
await Promise.reject(new Error('Not implemented'));
return JSON.stringify(details, null, 2);
}

/**
 * Placeholder for a TAP-style formatter.
 *
 * Not implemented: the function always rejects with `Error('Not implemented')`.
 * The commented block below is a sample of Apex test TAP output kept as a
 * reference for the intended shape.
 *
 * @param details - Test run details; only referenced by the unreachable fallback return.
 * @returns Never resolves successfully; the returned promise always rejects.
 * @throws Error with message 'Not implemented'.
 */
export async function tapFormat(details: AgentTestDetailsResponse): Promise<string> {
// APEX EXAMPLE (these are streamed in chunks)
// 1..11
// ok 1 TestPropertyController.testGetPagedPropertyList
// ok 2 TestPropertyController.testGetPicturesNoResults
// ok 3 TestPropertyController.testGetPicturesWithResults
// ok 4 FileUtilitiesTest.createFileFailsWhenIncorrectBase64Data
// ok 5 FileUtilitiesTest.createFileFailsWhenIncorrectFilename
// ok 6 FileUtilitiesTest.createFileFailsWhenIncorrectRecordId
// ok 7 FileUtilitiesTest.createFileSucceedsWhenCorrectInput
// ok 8 TestSampleDataController.importSampleData
// ok 9 GeocodingServiceTest.blankAddress
// ok 10 GeocodingServiceTest.errorResponse
// ok 11 GeocodingServiceTest.successResponse
// # Run "sf apex get test -i 707Ei00000dUJry -o [email protected] --result-format <format>" to retrieve test results in a different format.
// Awaiting a rejected promise throws here, so the return below is never reached.
await Promise.reject(new Error('Not implemented'));
return JSON.stringify(details, null, 2);
}

/**
 * Serializes the test run details as pretty-printed JSON.
 *
 * @param details - The details response to serialize.
 * @returns A promise resolving to the JSON text, indented with two spaces.
 */
export async function jsonFormat(details: AgentTestDetailsResponse): Promise<string> {
  const indentWidth = 2;
  const serialized = JSON.stringify(details, null, indentWidth);
  return Promise.resolve(serialized);
}
7 changes: 4 additions & 3 deletions test/agentTester.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ describe('AgentTester', () => {
const output = await tester.poll('4KBSM000000003F4AQ');
expect(output).to.be.ok;
// TODO: make these assertions more meaningful
expect(output.formatted).to.include('Test Results for my first test');
expect(output.response.tests[0].results[0].results[0].is_pass).to.be.true;
expect(output.formatted).to.include('Test Case #1');
expect(output.formatted).to.include('Test Case #2');
expect(output.response.testCases[0].status).to.equal('Completed');
});

it('should poll until test run is complete (json format)', async () => {
Expand All @@ -68,7 +69,7 @@ describe('AgentTester', () => {
expect(output).to.be.ok;
// TODO: make these assertions more meaningful
expect(JSON.parse(output.formatted)).to.deep.equal(output.response);
expect(output.response.tests[0].results[0].results[0].is_pass).to.be.true;
expect(output.response.testCases[0].status).to.equal('Completed');
});
});

Expand Down
3 changes: 2 additions & 1 deletion test/mocks/einstein_ai-evaluations_runs.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{
"id": "4KBSM000000003F4AQ"
"aiEvaluationId": "4KBSM000000003F4AQ",
"status": "NEW"
}
Original file line number Diff line number Diff line change
@@ -1,39 +1,80 @@
{
"AiEvaluationSuiteDefinition": "",
"tests": [
"status": "Completed",
"startTime": "2024-11-28T12:00:00Z",
"endTime": "2024-11-28T12:05:00Z",
"errorMessage": null,
"testCases": [
{
"AiEvaluationDefinition": "my first test",
"results": [
"status": "Completed",
"number": 1,
"startTime": "2024-11-28T12:00:10Z",
"endTime": "2024-11-28T12:00:20Z",
"generatedData": {
"type": "AGENT",
"actionsSequence": ["Action1", "Action2"],
"outcome": "Success",
"topic": "Mathematics",
"inputTokensCount": 50,
"outputTokensCount": 55
},
"expectationResults": [
{
"test_number": 1,
"results": [
{
"name": "action_assertion",
"actual": ["Identify Record by Name", "Get Record Details"],
"is_pass": true,
"execution_time_ms": 3000,
"error": ""
},
{
"name": "action_assertion",
"actual": ["Identify Record by Name", "Get Record Details"],
"is_pass": false,
"execution_time_ms": 3000,
"error": "assertion failed"
}
]
"name": "topic_sequence_match",
"actualValue": "Result A",
"expectedValue": "Result A",
"score": 1.0,
"result": "Passed",
"metricLabel": "Accuracy",
"metricExplainability": "Measures the correctness of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:12Z",
"endTime": "2024-11-28T12:00:13Z",
"errorCode": null,
"errorMessage": null
},
{
"test_number": 2,
"results": [
{
"name": "action_assertion",
"actual": ["Identify Record by Name", "Get Record Details"],
"is_pass": true,
"execution_time_ms": 3000,
"error": ""
}
]
"name": "action_sequence_match",
"actualValue": "Result B",
"expectedValue": "Result B",
"score": 0.9,
"result": "Passed",
"metricLabel": "Precision",
"metricExplainability": "Measures the precision of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:14Z",
"endTime": "2024-11-28T12:00:15Z",
"errorCode": null,
"errorMessage": null
}
]
},
{
"status": "Failed",
"number": 2,
"startTime": "2024-11-28T12:00:30Z",
"endTime": "2024-11-28T12:00:40Z",
"generatedData": {
"type": "AGENT",
"actionsSequence": ["Action3", "Action4"],
"outcome": "Failure",
"topic": "Physics",
"inputTokensCount": 60,
"outputTokensCount": 50
},
"expectationResults": [
{
"name": "topic_sequence_match",
"actualValue": "Result C",
"expectedValue": "Result D",
"score": 0.5,
"result": "Failed",
"metricLabel": "Accuracy",
"metricExplainability": "Measures the correctness of the result.",
"status": "Completed",
"startTime": "2024-11-28T12:00:32Z",
"endTime": "2024-11-28T12:00:33Z",
"errorCode": null,
"errorMessage": null
}
]
}
Expand Down

0 comments on commit aaa5ad6

Please sign in to comment.