microsoft · miaobin · Nov 28, 2024 · Dec 5, 2024
diff --git a/demos/whisper-base/main.js b/demos/whisper-base/main.js
@@ -88,6 +88,7 @@ const SpeechStates = {
 let speechState = SpeechStates.UNINITIALIZED;
 
 let mask4d = true; // use 4D mask input for decoder models
+let ioBinding = true;
 let streamingNode = null;
 let sourceNode = null;
 let audioChunks = []; // member {isSubChunk: boolean, data: Float32Array}
@@ -137,29 +138,34 @@ function updateConfig() {
     let vars = query.split("&");
     for (let i = 0; i < vars.length; i++) {
         let pair = vars[i].split("=");
+        const key = pair[0].toLowerCase();
+        const value = pair[1].toLowerCase();
         if (pair[0] == "provider" && providers.includes(pair[1])) {
             provider = pair[1];
         }
-        if (pair[0].toLowerCase() == "devicetype" && deviceTypes.includes(pair[1])) {
+        if (key == "devicetype" && deviceTypes.includes(pair[1])) {
             deviceType = pair[1];
         }
-        if (pair[0].toLowerCase() == "datatype" && dataTypes.includes(pair[1])) {
+        if (key == "datatype" && dataTypes.includes(pair[1])) {
             dataType = pair[1];
         }
-        if (pair[0].toLowerCase() == "maxchunklength") {
+        if (key == "maxchunklength") {
             maxChunkLength = parseFloat(pair[1]);
         }
-        if (pair[0].toLowerCase() == "chunklength") {
+        if (key == "chunklength") {
             chunkLength = parseFloat(pair[1]);
         }
-        if (pair[0].toLowerCase() == "maxaudiolength") {
+        if (key == "maxaudiolength") {
             maxAudioLength = Math.min(parseInt(pair[1]), kMaxAudioLengthInSec);
         }
-        if (pair[0].toLowerCase() == "accumulatesubchunks") {
-            accumulateSubChunks = pair[1].toLowerCase() === "true";
+        if (key == "accumulatesubchunks") {
+            accumulateSubChunks = value === "true";
         }
         if (pair[0] == "mask_4d") {
-            mask4d = pair[1].toLowerCase() === "true";
+            mask4d = value === "true";
+        }
+        if (pair[0] == 'ioBinding') {
+            ioBinding = value === 'true';
         }
     }
 }
@@ -664,7 +670,7 @@ const main = async () => {
         const whisper_url = location.href.includes("github.io")
             ? "https://huggingface.co/microsoft/whisper-base-webnn/resolve/main/"
             : "./models/";
-        whisper = new Whisper(whisper_url, provider, deviceType, dataType, mask4d);
+        whisper = new Whisper(whisper_url, provider, deviceType, dataType, mask4d, ioBinding);
         await whisper.create_whisper_processor();
         await whisper.create_whisper_tokenizer();
         await whisper.create_ort_sessions();

diff --git a/demos/whisper-base/whisper.js b/demos/whisper-base/whisper.js
@@ -40,12 +40,13 @@ if (
 
 // wrapper around onnxruntime and model
 export class Whisper {
-    constructor(url, provider, deviceType = "gpu", dataType, mask_4d = true) {
+    constructor(url, provider, deviceType = "gpu", dataType, mask_4d = true, ioBinding) {
         this.url = url;
         this.provider = provider;
         this.deviceType = deviceType;
         this.dataType = dataType;
         this.mask_4d = mask_4d;
+        this.ioBinding = ioBinding && deviceType == 'gpu';
         ort.env.wasm.simd = true;
 
         this.models = {
@@ -101,6 +102,19 @@ export class Whisper {
                     deviceType: this.deviceType,
                 },
             ],
+            preferredOutputLocation: this.ioBinding ? (() => {
+                let pairs = {};
+                pairs['last_hidden_state'] = "ml-tensor";
+                for (let i = 0; i < 6; i++) {
+                    pairs[`padded_present_key_values.${i}.decoder.key`] = "ml-tensor";
+                    pairs[`padded_present_key_values.${i}.decoder.value`] = "ml-tensor";
+                    pairs[`present_key_values.${i}.encoder.key`] = "ml-tensor";
+                    pairs[`present_key_values.${i}.encoder.value`] = "ml-tensor";
+                    pairs[`updated_present_key_values.${i}.decoder.key`] = "ml-tensor";
+                    pairs[`updated_present_key_values.${i}.decoder.value`] = "ml-tensor";
+                }
+                return pairs;
+            })() : undefined,
             logSeverityLevel: 0,
         };
 
@@ -111,16 +125,22 @@ export class Whisper {
                 let url = this.url + this.models[name]["url"];
                 if (this.dataType == "float16") {
                     url = url.replace(".onnx", "_fp16_layernorm_gelu.onnx");
-                    if (name.includes("decoder") && this.mask_4d) {
-                        url = url.replace(".onnx", "_4dmask.onnx");
+                    if (name.includes("decoder")) {
+                        if (this.mask_4d) {
+                            url = url.replace(".onnx", "_4dmask.onnx");
+                        }
+                        if (this.ioBinding) {
+                            url = url.replace(".onnx", "_iobinding.onnx");
+                        }
                     }
                     log(`Loading ${this.models[name]["title"]} · ${this.dataType} · ${this.models[name]["fp16size"]}`);
                 } else {
                     url = url.replace(".onnx", "_layernorm.onnx");
                     log(`Loading ${this.models[name]["title"]} · ${this.dataType} · ${this.models[name]["fp32size"]}`);
                 }
 
-                const modelBuffer = await getModelOPFS(`${this.deviceType}_${name}_${this.dataType}`, url, false);
+                const modelBuffer = await getModelOPFS(`${this.ioBinding}_${this.deviceType}_${name}_${this.dataType}`,
+                    url, false);
                 log(`${this.models[name]["title"]} loaded`);
 
                 log(`Creating session for ${this.models[name]["title"]}`);
@@ -271,15 +291,24 @@ export class Whisper {
         }
 
         // modify the self attention kv cache in place
-        cache_update(
-            decoder_input,
-            decoder_output,
-            0,
-            this.max_sequence_length,
-            this.num_init_tokens,
-            this.num_init_tokens,
-            this.dataType,
-        );
+        if (this.ioBinding) {
+            for (let i = 0; i < 6; i++) {
+                decoder_input[`past_key_values.${i}.decoder.key`] =
+                    decoder_output[`padded_present_key_values.${i}.decoder.key`];
+                decoder_input[`past_key_values.${i}.decoder.value`] =
+                    decoder_output[`padded_present_key_values.${i}.decoder.value`];
+            }
+        } else {
+            cache_update(
+                decoder_input,
+                decoder_output,
+                0,
+                this.max_sequence_length,
+                this.num_init_tokens,
+                this.num_init_tokens,
+                this.dataType
+            );
+        }
 
         const position_ids = new Int32Array(decoder_input["position_ids"].cpuData.buffer);
         // run complete inference for every item in dataset
@@ -319,16 +348,40 @@ export class Whisper {
                 position_ids[0],
                 this.mask_4d,
             );
-            // modify the kv cache in place
-            cache_update(
-                decoder_input,
-                decoder_cached_output,
-                i,
-                this.max_sequence_length,
-                this.num_init_tokens,
-                position_ids[0],
-                this.dataType,
-            );
+
+          // modify the kv cache in place
+          if (this.ioBinding) {
+              for (let i = 0; i < 6; i++) {
+                  // dispose previous tensors
+                  decoder_input[`past_key_values.${i}.decoder.key`].mlTensor.destroy();
+                  decoder_input[`past_key_values.${i}.decoder.value`].mlTensor.destroy();
+                  // update the kv cache
+                  decoder_input[`past_key_values.${i}.decoder.key`] =
+                      decoder_cached_output[`updated_present_key_values.${i}.decoder.key`];
+                  decoder_input[`past_key_values.${i}.decoder.value`] =
+                      decoder_cached_output[`updated_present_key_values.${i}.decoder.value`];
+              }
+          } else {
+              cache_update(
+                  decoder_input,
+                  decoder_cached_output,
+                  i,
+                  this.max_sequence_length,
+                  this.num_init_tokens,
+                  position_ids[0],
+                  this.dataType
+              );
+          }
+
+        }
+
+        if (this.ioBinding) {
+            for (let i = 0; i < 6; i++) {
+                decoder_output[`padded_present_key_values.${i}.decoder.key`].mlTensor.destroy();
+                decoder_output[`padded_present_key_values.${i}.decoder.value`].mlTensor.destroy();
+                decoder_input[`past_key_values.${i}.encoder.key`].mlTensor.destroy();
+                decoder_input[`past_key_values.${i}.encoder.value`].mlTensor.destroy();
+            }
         }
 
         // add token to sentence decode time