feat: allow passing custom models for speech transcription

jgw96 · Jul 10, 2024 · 632dd8f
1 parent 83e9aca
commit 632dd8f
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,6 @@
 
 # Web AI Toolkit
 
-**Currently in Alpha**
-
 The Web AI Toolkit simplifies the integration of AI features, such as OCR and audio file transcription, into your application. It ensures optimal performance by running all AI workloads locally, leveraging WebGPU and WASM technologies.
 
 ## Installation
@@ -18,7 +16,7 @@ npm install web-ai-toolkit
 | Function Name         | Parameter      | Type                   | Default Value |
 |-----------------------|----------------|------------------------|---------------|
 | transcribeAudioFile   | audioFile      | Blob                   | -             |
-|                       | model          | "tiny" \| "base"       | "tiny"        |
+|                       | model          | string                 | "Xenova/whisper-tiny" |
 |                       | timestamps     | boolean                | false         |
 |                       | language       | string                 | "en-US"       |
 | textToSpeech          | text           | string                 | -             |
@@ -42,7 +40,7 @@ Here are examples of how to use each function:
 import { transcribeAudioFile } from 'web-ai-toolkit';
 
 const audioFile = ...; // Your audio file Blob
-const transcription = await transcribeAudioFile(audioFile, "base", true, "en-US");
+const transcription = await transcribeAudioFile(audioFile, "Xenova/whisper-tiny", true, "en-US");
 console.log(transcription);
 ```
 

diff --git a/src/index.ts b/src/index.ts
@@ -1,4 +1,4 @@
-export async function transcribeAudioFile(audioFile: Blob, model: "tiny" | "base", timestamps: boolean = false, language: string = "en-US") {
+export async function transcribeAudioFile(audioFile: Blob, model: string = "Xenova/whisper-tiny", timestamps: boolean = false, language: string = "en-US") {
     try {
         const { loadTranscriber, doLocalWhisper } = await import("./services/speech-recognition/whisper-ai");
         await loadTranscriber(model, timestamps, language);

diff --git a/src/services/speech-recognition/whisper-ai.ts b/src/services/speech-recognition/whisper-ai.ts
@@ -3,7 +3,7 @@ let whisperWorker: Worker;
 // @ts-ignore
 import WhisperWorker from './worker?worker&inline'
 
-export async function loadTranscriber(model: "tiny" | "base", timestamps: boolean, language: string): Promise<void> {
+export async function loadTranscriber(model: string = "Xenova/whisper-tiny", timestamps: boolean, language: string): Promise<void> {
     return new Promise(async (resolve) => {
         whisperWorker = new WhisperWorker();
 
@@ -22,7 +22,7 @@ export async function loadTranscriber(model: "tiny" | "base", timestamps: boolea
     });
 }
 
-export function doLocalWhisper(audioFile: Blob, model: "tiny" | "base") {
+export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-tiny") {
     return new Promise((resolve, reject) => {
         try {
             const fileReader = new FileReader();
@@ -70,7 +70,7 @@ export function doLocalWhisper(audioFile: Blob, model: "tiny" | "base") {
                 whisperWorker.postMessage({
                     type: "transcribe",
                     blob: audio,
-                    model: model || "tiny",
+                    model: model || "Xenova/whisper-tiny",
                 })
 
             };

diff --git a/src/services/speech-recognition/worker.ts b/src/services/speech-recognition/worker.ts
@@ -18,7 +18,7 @@ self.onmessage = async (e) => {
         })
     }
     else if (e.data.type === "load") {
-        await loadTranscriber(e.data.model || "tiny", e.data.timestamps, e.data.language);
+        await loadTranscriber(e.data.model || 'Xenova/whisper-tiny', e.data.timestamps, e.data.language);
         self.postMessage({
             type: 'loaded'
         });
@@ -29,12 +29,12 @@ self.onmessage = async (e) => {
     }
 }
 
-export async function loadTranscriber(model: "tiny" | "base", timestamps: boolean, language: string): Promise<void> {
+export async function loadTranscriber(model: string = "Xenova/whisper-tiny", timestamps: boolean, language: string): Promise<void> {
     return new Promise(async (resolve) => {
         if (!transcriber) {
             env.allowLocalModels = false;
             env.useBrowserCache = false;
-            transcriber = await pipeline('automatic-speech-recognition', `Xenova/whisper-${model}`, {
+            transcriber = await pipeline('automatic-speech-recognition', model || 'Xenova/whisper-tiny', {
                 // @ts-ignore
                 return_timestamps: timestamps,
                 language