Commit

v0.3.1
jgw96 committed Oct 25, 2024
1 parent 5c0415f commit 1ffe92a
Showing 7 changed files with 101 additions and 23 deletions.
16 changes: 14 additions & 2 deletions README.md
@@ -18,7 +18,7 @@ the code will attempt to choose an NPU first, then a GPU and finally the CPU if

| Function Name | Parameter | Type | Default Value | Supported Hardware |
|-----------------------|----------------|------------------------|---------------|--------------------|
- | transcribeAudioFile | audioFile      | Blob                   | -             | NPU / GPU / CPU    |
+ | transcribeAudioFile | audioFile      | Blob                   | -             | GPU / CPU          |
| | model | string | "Xenova/whisper-tiny"| |
| | timestamps | boolean | false | |
| | language | string | "en-US" | |
@@ -28,10 +28,12 @@ the code will attempt to choose an NPU first, then a GPU and finally the CPU if
| | model | string | "Xenova/distilbart-cnn-6-6"| |
| ocr | image | Blob | - | GPU / CPU |
| | model | string | "Xenova/trocr-small-printed"| |
| image-classification | image | Blob | - | NPU / GPU / CPU |
| | model | string | "Xenova/resnet-50"| |

## Technical Details

- The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API. Both of these APIs are used to "hardware accelerate" the AI inferences, with WebNN targeting NPUs and GPUs, and WebGPU strictly targeting GPUs.
+ The Web AI Toolkit utilizes the [transformers.js project](https://huggingface.co/docs/transformers.js/index) to run AI workloads. All AI processing is performed locally on the device, ensuring data privacy and reducing latency. AI workloads are run using the [WebNN API](https://learn.microsoft.com/en-us/windows/ai/directml/webnn-overview) when available, otherwise falling back to the WebGPU API, or even to the CPU with WebAssembly. Choosing the correct hardware to target is handled by the library.
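The fallback order described above can be sketched as a plain function (illustrative only; the function name and capability flags here are hypothetical, not part of the toolkit's API):

```javascript
// Illustrative sketch of the documented device-selection order:
// WebNN first, then WebGPU, then WebAssembly on the CPU.
// The real selection logic lives inside the library and may differ.
function pickDevice({ hasWebNN, hasWebGPU }) {
  if (hasWebNN) return "webnn";
  if (hasWebGPU) return "webgpu";
  return "wasm";
}
```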

## Usage

@@ -77,6 +79,16 @@ const text = await ocr(image);
console.log(text);
```

### Image Classification

```javascript
import { classifyImage } from 'web-ai-toolkit';

const image = ...; // Your image Blob
const result = await classifyImage(image);
console.log(result);
```
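transformers.js image-classification pipelines typically resolve to an array of `{ label, score }` entries, so a caller might extract the top prediction with a helper like this (hypothetical, assuming that output shape; not part of the toolkit):

```javascript
// Assumes the classification result is an array like
// [{ label: "tabby", score: 0.72 }, ...] — the usual transformers.js shape.
// `topLabel` is a local helper, not a toolkit export.
function topLabel(results) {
  return results.reduce((best, r) => (r.score > best.score ? r : best)).label;
}
```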

## Contribution

We welcome contributions to the Web AI Toolkit. Please fork the repository and submit a pull request with your changes. For major changes, please open an issue first to discuss what you would like to change.
28 changes: 14 additions & 14 deletions package-lock.json

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "web-ai-toolkit",
- "version": "0.2.1",
+ "version": "0.3.1",
"repository": "https://github.com/jgw96/web-ai-toolkit",
"keywords": [
"ai",
@@ -38,7 +38,7 @@
"vitest": "^2.1.2"
},
"dependencies": {
- "@huggingface/transformers": "^3.0.0-alpha.16",
+ "@huggingface/transformers": "^3.0.0-alpha.22",
"@xenova/transformers": "^2.17.2"
}
}
11 changes: 11 additions & 0 deletions src/index.ts
@@ -42,3 +42,14 @@ export async function ocr(image: Blob, model: string = "Xenova/trocr-small-print
return err;
}
}

export async function classifyImage(image: Blob, model: string = "Xenova/resnet-50") {
try {
const { runClassifier } = await import("./services/image-classification/image-classification");
return runClassifier(image, model);
}
catch (err) {
console.error(err);
return err;
}
}
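Note that `classifyImage`, like the other wrappers in `index.ts`, catches errors and returns them rather than rethrowing. A caller that prefers throw semantics can restore them with a small helper (illustrative only; not part of the library):

```javascript
// Because the wrapper returns the caught error as a value, callers can
// distinguish success from failure with an instanceof check.
// `unwrap` is a hypothetical caller-side helper.
function unwrap(result) {
  if (result instanceof Error) throw result;
  return result;
}
```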
42 changes: 42 additions & 0 deletions src/services/image-classification/image-classification.ts
@@ -0,0 +1,42 @@
import { pipeline, env } from '@huggingface/transformers';
import { webGPUCheck } from '../../utils';

let classifier: any = undefined;

export async function runClassifier(image: Blob | string, model: string = "onnx-community/mobilenetv4s-webnn") {
    // Lazily create the pipeline on first use.
    if (!classifier) {
        await loadClassifier(model);
    }

    // The pipeline accepts a URL string; convert a Blob into an object URL.
    if (typeof image !== "string") {
        image = URL.createObjectURL(image);
    }

    return classifier(image);
}

async function loadClassifier(model: string): Promise<void> {
    if (classifier) {
        return;
    }

    env.allowLocalModels = false;
    env.useBrowserCache = false;

    classifier = await pipeline("image-classification", model || "Xenova/resnet-50", {
        // Prefer the NPU via WebNN when available, then WebGPU, then WASM on the CPU.
        device: (navigator as any).ml ? "webnn-npu" : await webGPUCheck() ? "webgpu" : "wasm"
    });
}
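The device string above also depends on `webGPUCheck` from `../../utils`, which this diff does not show. A minimal feature check might look like the following sketch (an assumption; the real implementation may differ, for example by requesting an adapter with `navigator.gpu.requestAdapter()`):

```javascript
// Hypothetical stand-in for the imported webGPUCheck: reports whether the
// WebGPU API object is exposed in the current environment. Always resolves
// to a boolean, including in non-browser environments.
async function webGPUCheck() {
  return typeof navigator !== "undefined" && "gpu" in navigator && !!navigator.gpu;
}
```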
9 changes: 4 additions & 5 deletions src/services/speech-recognition/recognition.ts
@@ -10,7 +10,7 @@ export function doLocalWhisper(audioFile: Blob, model: string = "Xenova/whisper-
if (!transcriber) {
await loadTranscriber(model || 'Xenova/whisper-tiny', false, 'en');
}

const fileReader = new FileReader();
fileReader.onloadend = async () => {
const audioCTX = new AudioContext({
@@ -58,9 +58,11 @@ export async function loadTranscriber(model: string = "Xenova/whisper-tiny", tim
// @ts-ignore
return_timestamps: timestamps,
language,
- device: (navigator as any).ml ? "webnn" : await webGPUCheck() ? "webgpu" : "wasm"
+ // @ts-ignore
+ device: await webGPUCheck() ? "webgpu" : "wasm"
});


resolve();
}
else {
@@ -127,8 +129,6 @@
// Update tokens of last chunk
last.tokens = [...item[0].output_token_ids];

console.log("callback_function", item, last)

// Merge text chunks
// TODO optimise so we don't have to decode all chunks every time
// @ts-ignore
@@ -138,7 +138,6 @@
force_full_sequences: false,
});

console.log("callback_function", data);

self.postMessage({
type: 'transcribe-interim',
Expand Down
14 changes: 14 additions & 0 deletions test.html
@@ -22,6 +22,11 @@
<button id="image-to-text-button">Test Image to Text</button>
</div>

<div id="image-classify-block">
<input type="file" id="image-classify-file" accept="image/*" />
<button id="image-classify-button">Test Image Classification</button>
</div>


<script type="module">
document.querySelector("#summarize_button").addEventListener("click", async () => {
@@ -56,6 +61,15 @@
console.log(text);
URL.revokeObjectURL(file);
});

document.querySelector("#image-classify-button").addEventListener("click", async () => {
const { classifyImage } = await import("/dist/index.js");

const file = document.querySelector("#image-classify-file").files[0];
// Keep a reference to the object URL so it (not the File) is revoked afterwards.
const url = URL.createObjectURL(file);
const result = await classifyImage(url);
console.log(result);
URL.revokeObjectURL(url);
});
</script>
</body>

