Skip to content

Commit

Permalink
Add local openai translator
Browse files Browse the repository at this point in the history
  • Loading branch information
icodesign committed Aug 29, 2024
1 parent 24ab36c commit ea9b288
Show file tree
Hide file tree
Showing 18 changed files with 616 additions and 325 deletions.
16 changes: 8 additions & 8 deletions apps/api/app/v1/config/route.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import { LLMProviderConfig } from '@repo/base/config';
import { OpenAITranslationProvider } from '@repo/provider/openai';
import { NextResponse } from 'next/server';

export const runtime = 'edge';

export async function GET() {
const config: LLMProviderConfig = {
provider: 'openai',
maxOutputTokens: 16383,
buffer: 0.3,
maxRetry: 1,
};
return NextResponse.json(config);
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error('OPENAI_API_KEY is not set');
}
const provider = new OpenAITranslationProvider({ apiKey });
console.log(`provider.config()`, await provider.config());
return NextResponse.json(await provider.config());
}
42 changes: 9 additions & 33 deletions apps/api/app/v1/localize/route.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { ApiError, ApiErrorCode, withApiHandler } from '@/lib/response';
import { openai } from '@ai-sdk/openai';
import { OpenAITranslationProvider } from '@repo/provider/openai';
import { ObjectStreamPart, StreamingTextResponse, streamObject } from 'ai';
import { NextRequest } from 'next/server';
import { z } from 'zod';
Expand Down Expand Up @@ -45,41 +46,16 @@ async function handlePOSTRequest(request: NextRequest): Promise<any> {
const validationError = fromZodError(requestBody.error);
throw new ApiError(ApiErrorCode.BAD_REQUEST, validationError.message);
}
const payload: RequestPayload = requestBody.data;
const model = openai('gpt-4o-mini');
let instructions = `As an app/website translator, your task is to translate texts to target languages, considering context and developer notes for accuracy and cultural appropriateness. It's essential to preserve original format, including line breaks, separators, escaping characters and localization symbols, otherwise, user interface may break.\nSource texts are in key=value format. Translate only the 'value', keeping the 'key' as is. Lines starting with "//" are developer notes for translation guidance.\nFor example, 'key=Hello "%@"\\nWelcome!' can be translate to 'key=你好 "%@"\\n欢迎!' in Chinese. \nOutput should be in JSON format: each source key links to an object with target languages as keys and translated texts as values. \n`;
if (payload.context) {
instructions += `\nTranslation context: \n${payload.context}\n`;
}
let userContent = `Translate from ${
payload.sourceLanguage
} to target languages: [${payload.targetLanguages.join(', ')}].\n\n`;
userContent += '=====\n\n';
for (const content of payload.contents) {
if (content.notes) {
for (const note of content.notes) {
userContent += `// ${note}\n`;
}
}
userContent += `${content.key}=${content.source}\n\n`;
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error('OPENAI_API_KEY is not set');
}
const TranslationReponseSchema = z.record(
z.string(),
z.record(
z.enum([payload.targetLanguages[0], ...payload.targetLanguages.slice(1)]),
z.string(),
),
);
const result = await streamObject({
model,
mode: 'json',
schema: TranslationReponseSchema,
system: instructions,
prompt: userContent,
onFinish: (e) => {
console.log(`Finished translating, usage: ${e.usage}`);
},
const payload: RequestPayload = requestBody.data;
const provider = new OpenAITranslationProvider({
apiKey,
});
const result = await provider.translate(payload);

/**
* Technically we can just wait for full response and return it as a single JSON,
* but it may time out, since processing can take long enough to exceed some platforms' (like Vercel's) limits
Expand Down
5 changes: 3 additions & 2 deletions apps/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
},
"dependencies": {
"@repo/base": "workspace:*",
"@ai-sdk/openai": "^0.0.36",
"@repo/provider": "workspace:*",
"@ai-sdk/openai": "^0.0.54",
"@vercel/edge": "^1.1.1",
"ai": "^3.2.27",
"ai": "^3.3.20",
"gpt-tokenizer": "^2.1.2",
"next": "14.2.5",
"openai": "^4.20.0",
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@icodesign/dolphin",
"version": "0.3.0",
"version": "0.4.0",
"main": "./dist/index.cjs",
"bin": "./bin/cli.cjs",
"type": "module",
Expand All @@ -25,8 +25,8 @@
"clean": "rimraf dist",
"build": "tsup",
"dev": "tsc -w --preserveWatchOutput",
"start": "node dist/index.cjs",
"start:debug": "node --inspect-brk dist/index.cjs",
"start": "pnpm build && node dist/index.cjs",
"start:debug": "pnpm build && node --inspect-brk dist/index.cjs",
"prepack": "pnpm build",
"prepublishOnly": "rm -rf ./package && clean-publish --fields 'dependencies,publishConfig' && cp ../../README.md ../../LICENSE ./package",
"postpublish": "rm -rf ./package",
Expand Down
4 changes: 1 addition & 3 deletions examples/apple/TranditionalXcodeDemo/dolphin.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
baseLanguage: en
translator:
agent: api
baseUrl: http://localhost:3000/v1/
mode: interactive
agent: openai
localizations:
- id: app
path: TranditionalXcodeDemo/${LANGUAGE}.lproj/Localizable.strings
Expand Down
13 changes: 6 additions & 7 deletions packages/base/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,15 @@ export const LLMTranslatorConfigSchema = z.object({
maxOutputTokens: z.number().default(4096),
buffer: z.number().default(0.3),
maxRetry: z.number().default(1),
tokenizer: z.enum(['openai']).default('openai'),
tokenizerModel: z.string().default('gpt-4'), // use for tiktoken cal
});

const TranslationProviderEnum = z.enum(['openai']);
const TranslationTokenizerEnum = z.enum(['openai']);

export type TranslationProvider = z.infer<typeof TranslationProviderEnum>;
export type TranslationTokenizer = z.infer<typeof TranslationTokenizerEnum>;

export const LLMProviderConfigSchema = LLMTranslatorConfigSchema.extend({
provider: TranslationProviderEnum,
});

export type LLMProviderConfig = z.infer<typeof LLMProviderConfigSchema>;
export type LLMTranslatorConfig = z.infer<typeof LLMTranslatorConfigSchema>;

const DolphinTranslatorConfigSchema = CommonTranslatorConfigSchema.extend({
agent: z.literal('api'),
Expand All @@ -72,6 +70,7 @@ const DolphinTranslatorConfigSchema = CommonTranslatorConfigSchema.extend({

const OpenAITranslatorConfigSchema = CommonTranslatorConfigSchema.extend({
agent: z.literal('openai'),
apiKey: z.string().optional(),
}).merge(LLMTranslatorConfigSchema);

const TranslatorConfigSchema = z.union([
Expand Down
24 changes: 24 additions & 0 deletions packages/provider/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"name": "@repo/provider",
"version": "1.0.0",
"private": true,
"description": "Translation providers",
"type": "module",
"exports": {
".": "./src/index.ts",
"./*": "./src/*.ts"
},
"scripts": {},
"dependencies": {
"@repo/base": "workspace:*",
"ai": "^3.3.20",
"openai": "^4.20.0",
"@ai-sdk/openai": "^0.0.54",
"zod": "^3.22.4",
"zod-validation-error": "^2.1.0"
},
"devDependencies": {
"@repo/typescript-config": "workspace:*",
"@types/node": "^20.5.4"
}
}
20 changes: 20 additions & 0 deletions packages/provider/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { LLMTranslatorConfig } from '@repo/base/config';
import { StreamObjectResult } from 'ai';

/**
 * Input for a translation request: the source language, the target
 * languages, and the keyed strings to translate (with optional developer notes).
 *
 * Exported so that consumers and implementations can reference the payload
 * type directly instead of re-declaring it inline.
 */
export interface TranslationPayload {
  /** Optional free-form context passed to the model to guide translation. */
  context?: string;
  sourceLanguage: string;
  targetLanguages: string[];
  contents: {
    /** Stable identifier for the string; preserved verbatim in the output. */
    key: string;
    /** Source text to translate. */
    source: string;
    /** Developer notes ("//" lines) that guide translation of this entry. */
    notes?: string[];
  }[];
}

/**
 * A translation backend that streams structured translation results.
 */
export interface TranslationProvider {
  /**
   * Streams translations for the given payload.
   *
   * @returns A stream whose final object maps each source key to a record of
   *   target language -> translated text.
   */
  translate(
    payload: TranslationPayload,
  ): Promise<StreamObjectResult<Record<string, Record<string, string>>>>;
  /** Translator configuration (token limits, buffer, retry, tokenizer). */
  config(): Promise<LLMTranslatorConfig>;
}
86 changes: 86 additions & 0 deletions packages/provider/src/openai.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import { OpenAIProvider, createOpenAI } from '@ai-sdk/openai';
import { LLMTranslatorConfig } from '@repo/base/config';
import { logger } from '@repo/base/logger';
import { LanguageModel, streamObject } from 'ai';
import { z } from 'zod';

import { TranslationProvider } from '.';

/**
 * TranslationProvider backed by the OpenAI API via the Vercel AI SDK.
 * Streams structured JSON translations using the `gpt-4o-mini` model.
 */
export class OpenAITranslationProvider implements TranslationProvider {
  private readonly openai: OpenAIProvider;
  private readonly model: LanguageModel;

  /**
   * @param options.apiKey - OpenAI API key used for all requests.
   */
  constructor(options: { apiKey: string }) {
    this.openai = createOpenAI({
      apiKey: options.apiKey,
      compatibility: 'strict', // https://sdk.vercel.ai/providers/ai-sdk-providers/openai#provider-instance
    });
    this.model = this.openai('gpt-4o-mini');
  }

  /**
   * Static translator configuration for this provider.
   *
   * NOTE(review): maxOutputTokens presumably mirrors gpt-4o-mini's output
   * token cap — confirm against the current OpenAI model limits.
   */
  async config() {
    const config: LLMTranslatorConfig = {
      maxOutputTokens: 16383,
      buffer: 0.3,
      maxRetry: 1,
      tokenizer: 'openai',
      tokenizerModel: 'gpt-4',
    };
    return config;
  }

  /**
   * Translates keyed source strings into the requested target languages.
   *
   * Builds a key=value prompt (with `//` developer-note lines) and streams a
   * JSON object mapping each source key to a record of target language ->
   * translated text.
   *
   * @param payload - Source language, target languages, and contents to translate.
   * @returns The `streamObject` result; consume its stream or await its object.
   * @throws Error if `payload.targetLanguages` is empty — the response schema
   *   needs at least one language key.
   */
  async translate(payload: {
    context?: string;
    sourceLanguage: string;
    targetLanguages: string[];
    contents: {
      key: string;
      source: string;
      notes?: string[];
    }[];
  }) {
    // z.enum requires a non-empty tuple; fail fast instead of silently
    // building a schema from `targetLanguages[0] === undefined`.
    if (payload.targetLanguages.length === 0) {
      throw new Error('translate requires at least one target language');
    }
    let instructions = `As an app/website translator, your task is to translate texts to target languages, considering context and developer notes for accuracy and cultural appropriateness. It's essential to preserve original format, including line breaks, separators, escaping characters and localization symbols, otherwise, user interface may break.\nSource texts are in key=value format. Translate only the 'value', keeping the 'key' as is. Lines starting with "//" are developer notes for translation guidance.\nFor example, 'key=Hello "%@"\\nWelcome!' can be translate to 'key=你好 "%@"\\n欢迎!' in Chinese. \nOutput should be in JSON format: each source key links to an object with target languages as keys and translated texts as values. \n`;
    if (payload.context) {
      instructions += `\nTranslation context: \n${payload.context}\n`;
    }
    let userContent = `Translate from ${
      payload.sourceLanguage
    } to target languages: [${payload.targetLanguages.join(', ')}].\n\n`;
    userContent += '=====\n\n';
    for (const content of payload.contents) {
      if (content.notes) {
        for (const note of content.notes) {
          userContent += `// ${note}\n`;
        }
      }
      userContent += `${content.key}=${content.source}\n\n`;
    }
    // Each source key maps to { targetLanguage: translatedText }.
    const TranslationResponseSchema = z.record(
      z.string(),
      z.record(
        z.enum([
          payload.targetLanguages[0],
          ...payload.targetLanguages.slice(1),
        ]),
        z.string(),
      ),
    );
    const result = await streamObject({
      model: this.model,
      mode: 'json',
      schema: TranslationResponseSchema,
      system: instructions,
      prompt: userContent,
      onFinish: (e) => {
        if (e.error) {
          logger.error(`Error translating streaming object error: ${e.error}`);
          return;
        }
        // Stringify usage: interpolating the object directly would log
        // "[object Object]".
        logger.info(
          `Finished translating, usage: ${JSON.stringify(
            e.usage,
          )}, object: ${JSON.stringify(e.object)}`,
        );
      },
    });
    return result;
  }
}
9 changes: 9 additions & 0 deletions packages/provider/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"extends": "@repo/typescript-config/lib.json",
"compilerOptions": {
"module": "esnext",
"moduleResolution": "bundler"
},
"include": ["src"],
"exclude": ["dist", "build", "node_modules"]
}
3 changes: 2 additions & 1 deletion packages/translate/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
"@inquirer/input": "^1.2.14",
"@inquirer/select": "^1.3.1",
"@repo/base": "workspace:*",
"@repo/provider": "workspace:*",
"@repo/ioloc": "workspace:*",
"ai": "^3.2.27",
"ai": "^3.3.20",
"chalk": "^5.3.0",
"js-tiktoken": "^1.0.8",
"langchain": "^0.0.141",
Expand Down
26 changes: 13 additions & 13 deletions packages/translate/src/batch.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { TranslationProvider } from '@repo/base/config';
import { TranslationTokenizer } from '@repo/base/config';
import { logger } from '@repo/base/logger';

import { LocalizationEntity } from './entity.js';
Expand All @@ -21,8 +21,8 @@ export function createBatches(
config: {
maxTokens: number;
buffer: number;
provider: TranslationProvider;
model: string;
tokenizer: TranslationTokenizer;
tokenizerModel: string;
},
): TranslationBatch[] {
if (entities.length === 0) {
Expand All @@ -44,8 +44,8 @@ export function createBatches(
}

const expectedTokens = calEntityExpectedTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
);
if (expectedTokens > maxSafeTokens) {
Expand Down Expand Up @@ -85,8 +85,8 @@ export function createBatches(
},
],
sourceTokens: calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
),
expectedTokens: expectedTokens * group.length,
Expand All @@ -95,8 +95,8 @@ export function createBatches(
} else {
let currentExpectedTokens = expectedTokens;
let currentSourceTokens = calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
entity,
);
let similarEntities = [entity];
Expand All @@ -108,8 +108,8 @@ export function createBatches(
) {
const expectedTokens =
calEntityExpectedTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
remainingEntity,
) * remainingTargetLanguages.length;
if (currentExpectedTokens + expectedTokens > maxSafeTokens) {
Expand All @@ -119,8 +119,8 @@ export function createBatches(
remainings.delete(remainingEntity);
currentExpectedTokens += expectedTokens;
currentSourceTokens += calEntitySourceTokens(
config.provider,
config.model,
config.tokenizer,
config.tokenizerModel,
remainingEntity,
);
}
Expand Down
Loading

0 comments on commit ea9b288

Please sign in to comment.