From 555aeab40c4587eb20361aae4da56576ec36c989 Mon Sep 17 00:00:00 2001 From: Ningxin Hu Date: Mon, 6 May 2024 10:30:20 +0800 Subject: [PATCH] Support NPU for SSD MobileNetV1 example (#228) * Support NPU for SSD MobileNetV1 example This change converts the float32 weigths to float16 at loading time. And it uses WebNN cast operator to convert float32 inputs to float16 before compute and convert float16 outputs back to float32 after compute. * Fix lint error --- common/utils.js | 70 ++++++++++++++++++++++-- object_detection/index.html | 3 + object_detection/ssd_mobilenetv1_nchw.js | 27 +++++++-- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/common/utils.js b/common/utils.js index a588fd8e..20dbab84 100644 --- a/common/utils.js +++ b/common/utils.js @@ -34,7 +34,57 @@ export async function getBufferFromUrl(url) { return arrayBuffer; } -export async function buildConstantByNpy(builder, url) { +// ref: http://stackoverflow.com/questions/32633585/how-do-you-convert-to-half-floats-in-javascript +export const toHalf = (function() { + const floatView = new Float32Array(1); + const int32View = new Int32Array(floatView.buffer); + + /* This method is faster than the OpenEXR implementation (very often + * used, eg. in Ogre), with the additional benefit of rounding, inspired + * by James Tursa?s half-precision code. */ + return function toHalf(val) { + floatView[0] = val; + const x = int32View[0]; + + let bits = (x >> 16) & 0x8000; /* Get the sign */ + let m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ + const e = (x >> 23) & 0xff; /* Using int is faster here */ + + /* If zero, or denormal, or exponent underflows too much for a denormal + * half, return signed zero. */ + if (e < 103) { + return bits; + } + + /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ + if (e > 142) { + bits |= 0x7c00; + /* If exponent was 0xff and one mantissa bit was set, it means NaN, + * not Inf, so make sure we set one mantissa bit too. 
*/
+      bits |= ((e == 255) ? 0 : 1) && (x & 0x007fffff);
+      return bits;
+    }
+
+    /* If exponent underflows but not too much, return a denormal */
+    if (e < 113) {
+      m |= 0x0800;
+      /* Extra rounding may overflow and set mantissa to 0 and exponent
+       * to 1, which is OK. */
+      bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
+      return bits;
+    }
+
+    bits |= ((e - 112) << 10) | (m >> 1);
+    /* Extra rounding. An overflow will set mantissa to 0 and increment
+     * the exponent, which is OK. */
+    bits += m & 1;
+    return bits;
+  };
+})();
+
+// Convert npy data in original data type to `targetType`, only support
+// 'float32' to 'float16' conversion currently.
+export async function buildConstantByNpy(builder, url, targetType) {
   const dataTypeMap = new Map([
     ['f2', {type: 'float16', array: Uint16Array}],
     ['f4', {type: 'float32', array: Float32Array}],
@@ -55,11 +105,22 @@
     throw new Error(`Data type ${npArray.dataType} is not supported.`);
   }
   const dimensions = npArray.shape;
-  const type = dataTypeMap.get(npArray.dataType).type;
+  let type = dataTypeMap.get(npArray.dataType).type;
   const TypedArrayConstructor = dataTypeMap.get(npArray.dataType).array;
   const dataView = new Uint8Array(npArray.data.buffer);
   const dataView2 = dataView.slice();
-  const typedArray = new TypedArrayConstructor(dataView2.buffer);
+  let typedArray = new TypedArrayConstructor(dataView2.buffer);
+  if (type === 'float32' && targetType === 'float16') {
+    const uint16Array = new Uint16Array(typedArray.length);
+    for (let i = 0; i < typedArray.length; ++i) {
+      uint16Array[i] = toHalf(typedArray[i]);
+    }
+    typedArray = uint16Array;
+    type = targetType;
+  } else if (targetType !== undefined && type !== targetType) {
+    throw new Error(`Conversion from ${npArray.dataType} ` +
+        `to ${targetType} is not supported.`);
+  }
   return builder.constant({dataType: type, type, dimensions}, typedArray);
 }
 
@@ -494,7 +555,8 @@ export function getDefaultLayout(deviceType) {
   // Windows or Mac platform.
if (deviceType.indexOf('cpu') != -1) { return 'nhwc'; - } else if (deviceType.indexOf('gpu') != -1) { + } else if (deviceType.indexOf('gpu') != -1 || + deviceType.indexOf('npu') != -1) { return 'nchw'; } } diff --git a/object_detection/index.html b/object_detection/index.html index bcc5c3dc..e795f22f 100644 --- a/object_detection/index.html +++ b/object_detection/index.html @@ -43,6 +43,9 @@ + diff --git a/object_detection/ssd_mobilenetv1_nchw.js b/object_detection/ssd_mobilenetv1_nchw.js index 035eaf92..fbbab007 100644 --- a/object_detection/ssd_mobilenetv1_nchw.js +++ b/object_detection/ssd_mobilenetv1_nchw.js @@ -7,6 +7,7 @@ export class SsdMobilenetV1Nchw { constructor() { this.context_ = null; this.deviceType_ = null; + this.targetDataType_ = 'float32'; this.model_ = null; this.builder_ = null; this.graph_ = null; @@ -57,9 +58,11 @@ ${nameArray[1]}_BatchNorm_batchnorm`; } const weightsName = this.weightsUrl_ + prefix + weightSuffix; - const weights = await buildConstantByNpy(this.builder_, weightsName); + const weights = await buildConstantByNpy( + this.builder_, weightsName, this.targetDataType_); const biasName = this.biasUrl_ + prefix + biasSuffix; - const bias = await buildConstantByNpy(this.builder_, biasName); + const bias = await buildConstantByNpy( + this.builder_, biasName, this.targetDataType_); options.padding = computePadding2DForAutoPad( /* nchw */[input.shape()[2], input.shape()[3]], /* oihw */[weights.shape()[2], weights.shape()[3]], @@ -69,7 +72,7 @@ ${nameArray[1]}_BatchNorm_batchnorm`; // TODO: Set clamp activation to options once it's supported in // WebNN DML backend. 
// Implement `clip` by `clamp` of WebNN API
-    if (this.deviceType_ == 'gpu') {
+    if (this.deviceType_ == 'gpu' || this.deviceType_ == 'npu') {
       return this.builder_.clamp(
           this.builder_.conv2d(input, weights, options),
           {minValue: 0, maxValue: 6});
@@ -83,12 +86,17 @@ ${nameArray[1]}_BatchNorm_batchnorm`;
   async load(contextOptions) {
     this.context_ = await navigator.ml.createContext(contextOptions);
     this.deviceType_ = contextOptions.deviceType;
+    if (this.deviceType_ == 'gpu' || this.deviceType_ == 'npu') {
+      this.targetDataType_ = 'float16';
+    }
     this.builder_ = new MLGraphBuilder(this.context_);
-    const input = this.builder_.input('input', {
-      type: 'float32',
+    let input = this.builder_.input('input', {
       dataType: 'float32',
       dimensions: this.inputOptions.inputDimensions,
     });
+    if (this.targetDataType_ === 'float16') {
+      input = this.builder_.cast(input, 'float16');
+    }
     const strides = [2, 2];
     const conv0 = await this.buildConv_(
         input, ['', '0', '', '165__cf__168'],
@@ -249,7 +257,14 @@ ${nameArray[1]}_BatchNorm_batchnorm`;
     const concat1 = this.builder_.concat(
         [reshape6, reshape7, reshape8, reshape9, reshape10, reshape11], 1);
 
-    return {'boxes': concat0, 'scores': concat1};
+    let boxes = concat0;
+    let scores = concat1;
+
+    if (this.targetDataType_ === 'float16') {
+      boxes = this.builder_.cast(boxes, 'float32');
+      scores = this.builder_.cast(scores, 'float32');
+    }
+    return {boxes, scores};
   }
 
   async build(outputOperand) {