refactor: comply with latest api spec

forcedotcom · Dec 2, 2024 · aaa5ad6 · aaa5ad6
1 parent b468c27
commit aaa5ad6
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 149 deletions.
diff --git a/src/agentTester.ts b/src/agentTester.ts
@@ -8,47 +8,77 @@ import { Connection, Lifecycle, PollingClient, StatusResult } from '@salesforce/
 import { Duration } from '@salesforce/kit';
 import { MaybeMock } from './maybe-mock';
 
-type Format = 'human' | 'tap' | 'junit' | 'json';
+type Format = 'human' | 'json';
+
+type TestStatus = 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';
 
 type AgentTestStartResponse = {
-  id: string;
+  aiEvaluationId: string;
+  status: TestStatus;
 };
 
 type AgentTestStatusResponse = {
-  status: 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';
+  status: TestStatus;
   startTime: string;
   endTime?: string;
   errorMessage?: string;
 };
 
-type AgentTestDetailsResponse = {
-  AiEvaluationSuiteDefinition: string;
-  tests: Array<{
-    AiEvaluationDefinition: string;
-    results: Array<{
-      test_number: number;
-      results: Array<{
-        name: string;
-        actual: string[];
-        is_pass: boolean;
-        execution_time_ms: number;
-        error?: string;
-      }>;
-    }>;
+type TestCaseResult = {
+  status: TestStatus;
+  number: string;
+  startTime: string;
+  endTime?: string;
+  generatedData: {
+    type: 'AGENT';
+    actionsSequence: string[];
+    outcome: 'Success' | 'Failure';
+    topic: string;
+    inputTokensCount: string;
+    outputTokensCount: string;
+  };
+  expectationResults: Array<{
+    name: string;
+    actualValue: string;
+    expectedValue: string;
+    score: number;
+    result: 'Passed' | 'Failed';
+    metricLabel: 'Accuracy' | 'Precision';
+    metricExplainability: string;
+    status: TestStatus;
+    startTime: string;
+    endTime?: string;
+    errorCode?: string;
+    errorMessage?: string;
   }>;
 };
 
+type AgentTestDetailsResponse = {
+  status: TestStatus;
+  startTime: string;
+  endTime?: string;
+  errorMessage?: string;
+  testCases: TestCaseResult[];
+};
+
 export class AgentTester {
   private maybeMock: MaybeMock;
   public constructor(connection: Connection) {
     this.maybeMock = new MaybeMock(connection);
   }
 
-  public async start(suiteId: string): Promise<{ id: string }> {
+  /**
+   * Starts an AI evaluation run based on the provided name or ID.
+   *
+   * @param nameOrId - The name or ID of the AI evaluation definition.
+   * @param type - Whether nameOrId is a 'name' or an 'id'. Defaults to 'name'. With 'name', nameOrId is sent as the request's aiEvaluationDefinitionName; with 'id', nameOrId is sent as the request's aiEvaluationDefinitionVersionId.
+   * @returns A promise that resolves to an object containing the ID of the started AI evaluation run.
+   */
+  public async start(nameOrId: string, type: 'name' | 'id' = 'name'): Promise<{ aiEvaluationId: string }> {
     const url = '/einstein/ai-evaluations/runs';
 
     return this.maybeMock.request<AgentTestStartResponse>('POST', url, {
-      aiEvaluationSuiteDefinition: suiteId,
+      [type === 'name' ? 'aiEvaluationDefinitionName' : 'aiEvaluationDefinitionVersionId']: nameOrId,
     });
   }
 
@@ -100,14 +130,7 @@ export class AgentTester {
     const response = await this.maybeMock.request<AgentTestDetailsResponse>('GET', url);
     return {
       response,
-      formatted:
-        format === 'human'
-          ? await humanFormat(response)
-          : format === 'tap'
-          ? await tapFormat(response)
-          : format === 'junit'
-          ? await junitFormat(response)
-          : await jsonFormat(response),
+      formatted: format === 'human' ? await humanFormat(jobId, response) : await jsonFormat(response),
     };
   }
 
@@ -118,100 +141,30 @@ export class AgentTester {
   }
 }
 
-export async function humanFormat(details: AgentTestDetailsResponse): Promise<string> {
-  // TODO: the api response isn't finalized so this is just a POC
+export async function humanFormat(name: string, details: AgentTestDetailsResponse): Promise<string> {
   const { Ux } = await import('@salesforce/sf-plugins-core');
   const ux = new Ux();
+
   const tables: string[] = [];
-  for (const aiEvalDef of details.tests) {
-    for (const result of aiEvalDef.results) {
-      const table = ux.makeTable({
-        title: `Test Results for ${aiEvalDef.AiEvaluationDefinition} (#${result.test_number})`,
-        data: result.results.map((r) => ({
-          'TEST NAME': r.name,
-          OUTCOME: r.is_pass ? 'Pass' : 'Fail',
-          MESSAGE: r.error ?? '',
-          'RUNTIME (MS)': r.execution_time_ms,
-        })),
-      });
-      tables.push(table);
-    }
+  for (const testCase of details.testCases) {
+    const table = ux.makeTable({
+      title: `Test Case #${testCase.number}`,
+      data: testCase.expectationResults.map((r) => ({
+        name: r.name,
+        outcome: r.result === 'Passed' ? 'Pass' : 'Fail',
+        actualValue: r.actualValue,
+        expectedValue: r.expectedValue,
+        score: r.score,
+        'metric label': r.metricLabel,
+        message: r.errorMessage ?? '',
+        'runtime (MS)': r.endTime ? new Date(r.endTime).getTime() - new Date(r.startTime).getTime() : 0,
+      })),
+    });
+    tables.push(table);
   }
-
   return tables.join('\n');
 }
 
-export async function junitFormat(details: AgentTestDetailsResponse): Promise<string> {
-  // APEX EXAMPLE
-  // <?xml version="1.0" encoding="UTF-8"?>
-  // <testsuites>
-  //     <testsuite name="force.apex" timestamp="2024-11-13T19:19:23.000Z" hostname="https://energy-site-1368-dev-ed.scratch.my.salesforce.com" tests="11" failures="0"  errors="0"  time="2.57">
-  //         <properties>
-  //             <property name="outcome" value="Successful"/>
-  //             <property name="testsRan" value="11"/>
-  //             <property name="passing" value="11"/>
-  //             <property name="failing" value="0"/>
-  //             <property name="skipped" value="0"/>
-  //             <property name="passRate" value="100%"/>
-  //             <property name="failRate" value="0%"/>
-  //             <property name="testStartTime" value="Wed Nov 13 2024 12:19:23 PM"/>
-  //             <property name="testSetupTimeInMs" value="0"/>
-  //             <property name="testExecutionTime" value="2.57 s"/>
-  //             <property name="testTotalTime" value="2.57 s"/>
-  //             <property name="commandTime" value="0.17 s"/>
-  //             <property name="hostname" value="https://energy-site-1368-dev-ed.scratch.my.salesforce.com"/>
-  //             <property name="orgId" value="00DEi000006OlrxMAC"/>
-  //             <property name="username" value="[email protected]"/>
-  //             <property name="testRunId" value="707Ei00000dTRSa"/>
-  //             <property name="userId" value="005Ei00000FkGU9IAN"/>
-  //         </properties>
-  //         <testcase name="importSampleData" classname="TestSampleDataController" time="0.27">
-  //         </testcase>
-  //         <testcase name="blankAddress" classname="GeocodingServiceTest" time="0.01">
-  //         </testcase>
-  //         <testcase name="errorResponse" classname="GeocodingServiceTest" time="0.01">
-  //         </testcase>
-  //         <testcase name="successResponse" classname="GeocodingServiceTest" time="0.01">
-  //         </testcase>
-  //         <testcase name="createFileFailsWhenIncorrectBase64Data" classname="FileUtilitiesTest" time="0.10">
-  //         </testcase>
-  //         <testcase name="createFileFailsWhenIncorrectFilename" classname="FileUtilitiesTest" time="0.03">
-  //         </testcase>
-  //         <testcase name="createFileFailsWhenIncorrectRecordId" classname="FileUtilitiesTest" time="0.35">
-  //         </testcase>
-  //         <testcase name="createFileSucceedsWhenCorrectInput" classname="FileUtilitiesTest" time="0.22">
-  //         </testcase>
-  //         <testcase name="testGetPagedPropertyList" classname="TestPropertyController" time="1.01">
-  //         </testcase>
-  //         <testcase name="testGetPicturesNoResults" classname="TestPropertyController" time="0.06">
-  //         </testcase>
-  //         <testcase name="testGetPicturesWithResults" classname="TestPropertyController" time="0.51">
-  //         </testcase>
-  //     </testsuite>
-  // </testsuites>
-  await Promise.reject(new Error('Not implemented'));
-  return JSON.stringify(details, null, 2);
-}
-
-export async function tapFormat(details: AgentTestDetailsResponse): Promise<string> {
-  // APEX EXAMPLE (these are streamed in chunks)
-  // 1..11
-  // ok 1 TestPropertyController.testGetPagedPropertyList
-  // ok 2 TestPropertyController.testGetPicturesNoResults
-  // ok 3 TestPropertyController.testGetPicturesWithResults
-  // ok 4 FileUtilitiesTest.createFileFailsWhenIncorrectBase64Data
-  // ok 5 FileUtilitiesTest.createFileFailsWhenIncorrectFilename
-  // ok 6 FileUtilitiesTest.createFileFailsWhenIncorrectRecordId
-  // ok 7 FileUtilitiesTest.createFileSucceedsWhenCorrectInput
-  // ok 8 TestSampleDataController.importSampleData
-  // ok 9 GeocodingServiceTest.blankAddress
-  // ok 10 GeocodingServiceTest.errorResponse
-  // ok 11 GeocodingServiceTest.successResponse
-  // # Run "sf apex get test -i 707Ei00000dUJry -o [email protected] --result-format <format>" to retrieve test results in a different format.
-  await Promise.reject(new Error('Not implemented'));
-  return JSON.stringify(details, null, 2);
-}
-
 export async function jsonFormat(details: AgentTestDetailsResponse): Promise<string> {
   return Promise.resolve(JSON.stringify(details, null, 2));
 }
diff --git a/test/agentTester.test.ts b/test/agentTester.test.ts
@@ -57,8 +57,9 @@ describe('AgentTester', () => {
       const output = await tester.poll('4KBSM000000003F4AQ');
       expect(output).to.be.ok;
       // TODO: make these assertions more meaningful
-      expect(output.formatted).to.include('Test Results for my first test');
-      expect(output.response.tests[0].results[0].results[0].is_pass).to.be.true;
+      expect(output.formatted).to.include('Test Case #1');
+      expect(output.formatted).to.include('Test Case #2');
+      expect(output.response.testCases[0].status).to.equal('Completed');
     });
 
     it('should poll until test run is complete (json format)', async () => {
@@ -68,7 +69,7 @@ describe('AgentTester', () => {
       expect(output).to.be.ok;
       // TODO: make these assertions more meaningful
       expect(JSON.parse(output.formatted)).to.deep.equal(output.response);
-      expect(output.response.tests[0].results[0].results[0].is_pass).to.be.true;
+      expect(output.response.testCases[0].status).to.equal('Completed');
     });
   });
 

diff --git a/test/mocks/einstein_ai-evaluations_runs.json b/test/mocks/einstein_ai-evaluations_runs.json
@@ -1,3 +1,4 @@
 {
-  "id": "4KBSM000000003F4AQ"
+  "aiEvaluationId": "4KBSM000000003F4AQ",
+  "status": "NEW"
 }
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json
@@ -1,39 +1,80 @@
 {
-  "AiEvaluationSuiteDefinition": "",
-  "tests": [
+  "status": "Completed",
+  "startTime": "2024-11-28T12:00:00Z",
+  "endTime": "2024-11-28T12:05:00Z",
+  "errorMessage": null,
+  "testCases": [
     {
-      "AiEvaluationDefinition": "my first test",
-      "results": [
+      "status": "Completed",
+      "number": 1,
+      "startTime": "2024-11-28T12:00:10Z",
+      "endTime": "2024-11-28T12:00:20Z",
+      "generatedData": {
+        "type": "AGENT",
+        "actionsSequence": ["Action1", "Action2"],
+        "outcome": "Success",
+        "topic": "Mathematics",
+        "inputTokensCount": 50,
+        "outputTokensCount": 55
+      },
+      "expectationResults": [
         {
-          "test_number": 1,
-          "results": [
-            {
-              "name": "action_assertion",
-              "actual": ["Identify Record by Name", "Get Record Details"],
-              "is_pass": true,
-              "execution_time_ms": 3000,
-              "error": ""
-            },
-            {
-              "name": "action_assertion",
-              "actual": ["Identify Record by Name", "Get Record Details"],
-              "is_pass": false,
-              "execution_time_ms": 3000,
-              "error": "assertion failed"
-            }
-          ]
+          "name": "topic_sequence_match",
+          "actualValue": "Result A",
+          "expectedValue": "Result A",
+          "score": 1.0,
+          "result": "Passed",
+          "metricLabel": "Accuracy",
+          "metricExplainability": "Measures the correctness of the result.",
+          "status": "Completed",
+          "startTime": "2024-11-28T12:00:12Z",
+          "endTime": "2024-11-28T12:00:13Z",
+          "errorCode": null,
+          "errorMessage": null
         },
         {
-          "test_number": 2,
-          "results": [
-            {
-              "name": "action_assertion",
-              "actual": ["Identify Record by Name", "Get Record Details"],
-              "is_pass": true,
-              "execution_time_ms": 3000,
-              "error": ""
-            }
-          ]
+          "name": "action_sequence_match",
+          "actualValue": "Result B",
+          "expectedValue": "Result B",
+          "score": 0.9,
+          "result": "Passed",
+          "metricLabel": "Precision",
+          "metricExplainability": "Measures the precision of the result.",
+          "status": "Completed",
+          "startTime": "2024-11-28T12:00:14Z",
+          "endTime": "2024-11-28T12:00:15Z",
+          "errorCode": null,
+          "errorMessage": null
+        }
+      ]
+    },
+    {
+      "status": "Failed",
+      "number": 2,
+      "startTime": "2024-11-28T12:00:30Z",
+      "endTime": "2024-11-28T12:00:40Z",
+      "generatedData": {
+        "type": "AGENT",
+        "actionsSequence": ["Action3", "Action4"],
+        "outcome": "Failure",
+        "topic": "Physics",
+        "inputTokensCount": 60,
+        "outputTokensCount": 50
+      },
+      "expectationResults": [
+        {
+          "name": "topic_sequence_match",
+          "actualValue": "Result C",
+          "expectedValue": "Result D",
+          "score": 0.5,
+          "result": "Failed",
+          "metricLabel": "Accuracy",
+          "metricExplainability": "Measures the correctness of the result.",
+          "status": "Completed",
+          "startTime": "2024-11-28T12:00:32Z",
+          "endTime": "2024-11-28T12:00:33Z",
+          "errorCode": null,
+          "errorMessage": null
         }
       ]
     }