Skip to content

Commit

Permalink
Add local openai translator
Browse files Browse the repository at this point in the history
  • Loading branch information
icodesign committed Aug 29, 2024
1 parent 24ab36c commit ea9b288
Show file tree
Hide file tree
Showing 18 changed files with 616 additions and 325 deletions.
16 changes: 8 additions & 8 deletions apps/api/app/v1/config/route.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import { LLMProviderConfig } from '@repo/base/config';
import { OpenAITranslationProvider } from '@repo/provider/openai';
import { NextResponse } from 'next/server';

export const runtime = 'edge';

export async function GET() {
const config: LLMProviderConfig = {
provider: 'openai',
maxOutputTokens: 16383,
buffer: 0.3,
maxRetry: 1,
};
return NextResponse.json(config);
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error('OPENAI_API_KEY is not set');
}
const provider = new OpenAITranslationProvider({ apiKey });
console.log(`provider.config()`, await provider.config());
return NextResponse.json(await provider.config());
}
42 changes: 9 additions & 33 deletions apps/api/app/v1/localize/route.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { ApiError, ApiErrorCode, withApiHandler } from '@/lib/response';
import { openai } from '@ai-sdk/openai';
import { OpenAITranslationProvider } from '@repo/provider/openai';
import { ObjectStreamPart, StreamingTextResponse, streamObject } from 'ai';
import { NextRequest } from 'next/server';
import { z } from 'zod';
Expand Down Expand Up @@ -45,41 +46,16 @@ async function handlePOSTRequest(request: NextRequest): Promise<any> {
const validationError = fromZodError(requestBody.error);
throw new ApiError(ApiErrorCode.BAD_REQUEST, validationError.message);
}
const payload: RequestPayload = requestBody.data;
const model = openai('gpt-4o-mini');
let instructions = `As an app/website translator, your task is to translate texts to target languages, considering context and developer notes for accuracy and cultural appropriateness. It's essential to preserve original format, including line breaks, separators, escaping characters and localization symbols, otherwise, user interface may break.\nSource texts are in key=value format. Translate only the 'value', keeping the 'key' as is. Lines starting with "//" are developer notes for translation guidance.\nFor example, 'key=Hello "%@"\\nWelcome!' can be translate to 'key=你好 "%@"\\n欢迎!' in Chinese. \nOutput should be in JSON format: each source key links to an object with target languages as keys and translated texts as values. \n`;
if (payload.context) {
instructions += `\nTranslation context: \n${payload.context}\n`;
}
let userContent = `Translate from ${
payload.sourceLanguage
} to target languages: [${payload.targetLanguages.join(', ')}].\n\n`;
userContent += '=====\n\n';
for (const content of payload.contents) {
if (content.notes) {
for (const note of content.notes) {
userContent += `// ${note}\n`;
}
}
userContent += `${content.key}=${content.source}\n\n`;
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error('OPENAI_API_KEY is not set');
}
const TranslationReponseSchema = z.record(
z.string(),
z.record(
z.enum([payload.targetLanguages[0], ...payload.targetLanguages.slice(1)]),
z.string(),
),
);
const result = await streamObject({
model,
mode: 'json',
schema: TranslationReponseSchema,
system: instructions,
prompt: userContent,
onFinish: (e) => {
console.log(`Finished translating, usage: ${e.usage}`);
},
const payload: RequestPayload = requestBody.data;
const provider = new OpenAITranslationProvider({
apiKey,
});
const result = await provider.translate(payload);

/**
* Technically we can just wait for full response and return it as a single JSON,
* but it may time out, since processing can take long enough to exceed some platforms' (like Vercel's) limits
Expand Down
5 changes: 3 additions & 2 deletions apps/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
},
"dependencies": {
"@repo/base": "workspace:*",
"@ai-sdk/openai": "^0.0.36",
"@repo/provider": "workspace:*",
"@ai-sdk/openai": "^0.0.54",
"@vercel/edge": "^1.1.1",
"ai": "^3.2.27",
"ai": "^3.3.20",
"gpt-tokenizer": "^2.1.2",
"next": "14.2.5",
"openai": "^4.20.0",
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@icodesign/dolphin",
"version": "0.3.0",
"version": "0.4.0",
"main": "./dist/index.cjs",
"bin": "./bin/cli.cjs",
"type": "module",
Expand All @@ -25,8 +25,8 @@
"clean": "rimraf dist",
"build": "tsup",
"dev": "tsc -w --preserveWatchOutput",
"start": "node dist/index.cjs",
"start:debug": "node --inspect-brk dist/index.cjs",
"start": "pnpm build && node dist/index.cjs",
"start:debug": "pnpm build && node --inspect-brk dist/index.cjs",
"prepack": "pnpm build",
"prepublishOnly": "rm -rf ./package && clean-publish --fields 'dependencies,publishConfig' && cp ../../README.md ../../LICENSE ./package",
"postpublish": "rm -rf ./package",
Expand Down
4 changes: 1 addition & 3 deletions examples/apple/TranditionalXcodeDemo/dolphin.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
baseLanguage: en
translator:
agent: api
baseUrl: http://localhost:3000/v1/
mode: interactive
agent: openai
localizations:
- id: app
path: TranditionalXcodeDemo/${LANGUAGE}.lproj/Localizable.strings
Expand Down
13 changes: 6 additions & 7 deletions packages/base/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,15 @@ export const LLMTranslatorConfigSchema = z.object({
maxOutputTokens: z.number().default(4096),
buffer: z.number().default(0.3),
maxRetry: z.number().default(1),
tokenizer: z.enum(['openai']).default('openai'),
tokenizerModel: z.string().default('gpt-4'), // use for tiktoken cal
});

const TranslationProviderEnum = z.enum(['openai']);
const TranslationTokenizerEnum = z.enum(['openai']);

export type TranslationProvider = z.infer<typeof TranslationProviderEnum>;
export type TranslationTokenizer = z.infer<typeof TranslationTokenizerEnum>;

export const LLMProviderConfigSchema = LLMTranslatorConfigSchema.extend({
provider: TranslationProviderEnum,
});

export type LLMProviderConfig = z.infer<typeof LLMProviderConfigSchema>;
export type LLMTranslatorConfig = z.infer<typeof LLMTranslatorConfigSchema>;

const DolphinTranslatorConfigSchema = CommonTranslatorConfigSchema.extend({
agent: z.literal('api'),
Expand All @@ -72,6 +70,7 @@ const DolphinTranslatorConfigSchema = CommonTranslatorConfigSchema.extend({

const OpenAITranslatorConfigSchema = CommonTranslatorConfigSchema.extend({
agent: z.literal('openai'),
apiKey: z.string().optional(),
}).merge(LLMTranslatorConfigSchema);

const TranslatorConfigSchema = z.union([
Expand Down
24 changes: 24 additions & 0 deletions packages/provider/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"name": "@repo/provider",
"version": "1.0.0",
"private": true,
"description": "Translation providers",
"type": "module",
"exports": {
".": "./src/index.ts",
"./*": "./src/*.ts"
},
"scripts": {},
"dependencies": {
"@repo/base": "workspace:*",
"ai": "^3.3.20",
"openai": "^4.20.0",
"@ai-sdk/openai": "^0.0.54",
"zod": "^3.22.4",
"zod-validation-error": "^2.1.0"
},
"devDependencies": {
"@repo/typescript-config": "workspace:*",
"@types/node": "^20.5.4"
}
}
20 changes: 20 additions & 0 deletions packages/provider/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { LLMTranslatorConfig } from '@repo/base/config';
import { StreamObjectResult } from 'ai';

/**
 * Input for a translation request: the source language, the target
 * languages, and the keyed strings to translate (with optional developer notes).
 *
 * Exported so that consumers and implementations can reference the payload
 * type directly instead of re-declaring it inline.
 */
export interface TranslationPayload {
  /** Optional free-form context passed to the model to guide translation. */
  context?: string;
  sourceLanguage: string;
  targetLanguages: string[];
  contents: {
    /** Stable identifier for the string; preserved verbatim in the output. */
    key: string;
    /** Source text to translate. */
    source: string;
    /** Developer notes ("//" lines) that guide translation of this entry. */
    notes?: string[];
  }[];
}

/**
 * A translation backend that streams structured translation results.
 */
export interface TranslationProvider {
  /**
   * Streams translations for the given payload.
   *
   * @returns A stream whose final object maps each source key to a record of
   *   target language -> translated text.
   */
  translate(
    payload: TranslationPayload,
  ): Promise<StreamObjectResult<Record<string, Record<string, string>>>>;
  /** Translator configuration (token limits, buffer, retry, tokenizer). */
  config(): Promise<LLMTranslatorConfig>;
}
86 changes: 86 additions & 0 deletions packages/provider/src/openai.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import { OpenAIProvider, createOpenAI } from '@ai-sdk/openai';
import { LLMTranslatorConfig } from '@repo/base/config';
import { logger } from '@repo/base/logger';
import { LanguageModel, streamObject } from 'ai';
import { z } from 'zod';

import { TranslationProvider } from '.';

/**
 * TranslationProvider backed by the OpenAI API via the Vercel AI SDK.
 * Streams structured JSON translations using the `gpt-4o-mini` model.
 */
export class OpenAITranslationProvider implements TranslationProvider {
  private readonly openai: OpenAIProvider;
  private readonly model: LanguageModel;

  /**
   * @param options.apiKey - OpenAI API key used for all requests.
   */
  constructor(options: { apiKey: string }) {
    this.openai = createOpenAI({
      apiKey: options.apiKey,
      compatibility: 'strict', // https://sdk.vercel.ai/providers/ai-sdk-providers/openai#provider-instance
    });
    this.model = this.openai('gpt-4o-mini');
  }

  /**
   * Static translator configuration for this provider.
   *
   * NOTE(review): maxOutputTokens presumably mirrors gpt-4o-mini's output
   * token cap — confirm against the current OpenAI model limits.
   */
  async config() {
    const config: LLMTranslatorConfig = {
      maxOutputTokens: 16383,
      buffer: 0.3,
      maxRetry: 1,
      tokenizer: 'openai',
      tokenizerModel: 'gpt-4',
    };
    return config;
  }

  /**
   * Translates keyed source strings into the requested target languages.
   *
   * Builds a key=value prompt (with `//` developer-note lines) and streams a
   * JSON object mapping each source key to a record of target language ->
   * translated text.
   *
   * @param payload - Source language, target languages, and contents to translate.
   * @returns The `streamObject` result; consume its stream or await its object.
   * @throws Error if `payload.targetLanguages` is empty — the response schema
   *   needs at least one language key.
   */
  async translate(payload: {
    context?: string;
    sourceLanguage: string;
    targetLanguages: string[];
    contents: {
      key: string;
      source: string;
      notes?: string[];
    }[];
  }) {
    // z.enum requires a non-empty tuple; fail fast instead of silently
    // building a schema from `targetLanguages[0] === undefined`.
    if (payload.targetLanguages.length === 0) {
      throw new Error('translate requires at least one target language');
    }
    let instructions = `As an app/website translator, your task is to translate texts to target languages, considering context and developer notes for accuracy and cultural appropriateness. It's essential to preserve original format, including line breaks, separators, escaping characters and localization symbols, otherwise, user interface may break.\nSource texts are in key=value format. Translate only the 'value', keeping the 'key' as is. Lines starting with "//" are developer notes for translation guidance.\nFor example, 'key=Hello "%@"\\nWelcome!' can be translate to 'key=你好 "%@"\\n欢迎!' in Chinese. \nOutput should be in JSON format: each source key links to an object with target languages as keys and translated texts as values. \n`;
    if (payload.context) {
      instructions += `\nTranslation context: \n${payload.context}\n`;
    }
    let userContent = `Translate from ${
      payload.sourceLanguage
    } to target languages: [${payload.targetLanguages.join(', ')}].\n\n`;
    userContent += '=====\n\n';
    for (const content of payload.contents) {
      if (content.notes) {
        for (const note of content.notes) {
          userContent += `// ${note}\n`;
        }
      }
      userContent += `${content.key}=${content.source}\n\n`;
    }
    // Each source key maps to { targetLanguage: translatedText }.
    const TranslationResponseSchema = z.record(
      z.string(),
      z.record(
        z.enum([
          payload.targetLanguages[0],
          ...payload.targetLanguages.slice(1),
        ]),
        z.string(),
      ),
    );
    const result = await streamObject({
      model: this.model,
      mode: 'json',
      schema: TranslationResponseSchema,
      system: instructions,
      prompt: userContent,
      onFinish: (e) => {
        if (e.error) {
          logger.error(`Error translating streaming object error: ${e.error}`);
          return;
        }
        // Stringify usage: interpolating the object directly would log
        // "[object Object]".
        logger.info(
          `Finished translating, usage: ${JSON.stringify(
            e.usage,
          )}, object: ${JSON.stringify(e.object)}`,
        );
      },
    });
    return result;
  }
}
9 changes: 9 additions & 0 deletions packages/provider/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"extends": "@repo/typescript-config/lib.json",
"compilerOptions": {
"module": "esnext",
"moduleResolution": "bundler"
},
"include": ["src"],
"exclude": ["dist", "build", "node_modules"]
}
3 changes: 2 additions & 1 deletion packages/translate/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
"@inquirer/input": "^1.2.14",
"@inquirer/select": "^1.3.1",
"@repo/base": "workspace:*",
"@repo/provider": "workspace:*",
"@repo/ioloc": "workspace:*",
"ai": "^3.2.27",
"ai": "^3.3.20",
"chalk": "^5.3.0",
"js-tiktoken": "^1.0.8",
"langchain": "^0.0.141",
Expand Down
26 changes: 13 additions & 13 deletions packages/translate/src/batch.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { TranslationProvider } from '@repo/base/config';
import { TranslationTokenizer } from '@repo/base/config';
import { logger } from '@repo/base/logger';

import { LocalizationEntity } from './entity.js';
Expand All @@ -21,8 +21,8 @@ export function createBatches(
config: {
maxTokens: number;
buffer: number;
provider: TranslationProvider;
model: string;
tokenizer: TranslationTokenizer;
tokenizerModel: string;
},
): TranslationBatch[] {
if (entities.length === 0) {
Expand All @@ -44,8 +44,8 @@ export function createBatches(
}

const expectedTokens = calEntityExpectedTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
);
if (expectedTokens > maxSafeTokens) {
Expand Down Expand Up @@ -85,8 +85,8 @@ export function createBatches(
},
],
sourceTokens: calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
),
expectedTokens: expectedTokens * group.length,
Expand All @@ -95,8 +95,8 @@ export function createBatches(
} else {
let currentExpectedTokens = expectedTokens;
let currentSourceTokens = calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
);
let similarEntities = [entity];
Expand All @@ -108,8 +108,8 @@ export function createBatches(
) {
const expectedTokens =
calEntityExpectedTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
remainingEntity,
) * remainingTargetLanguages.length;
if (currentExpectedTokens + expectedTokens > maxSafeTokens) {
Expand All @@ -119,8 +119,8 @@ export function createBatches(
remainings.delete(remainingEntity);
currentExpectedTokens += expectedTokens;
currentSourceTokens += calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
remainingEntity,
);
}
Expand Down
Loading

0 comments on commit ea9b288

Please sign in to comment.