[js/web] FP16 Conv, ConvTranspose and MatMul #17514

Merged · 7 commits · Sep 30, 2023
Showing changes from 5 commits.

This PR threads each tensor's WGSL storage type ('f32' or 'f16') through the WebGPU shader generators for Conv, ConvTranspose and MatMul, replacing the previously hard-coded f32 types so that FP16 kernels can be emitted.
10 changes: 5 additions & 5 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts
@@ -21,16 +21,16 @@

 export declare type Activation = 'linear' | 'relu' | 'prelu' | 'elu' | 'relu6' | 'leakyrelu' | 'sigmoid' | 'gelu';

-export const typeSnippet = (component: number) => {
+export const typeSnippet = (component: number, dataType: string) => {
   switch (component) {
     case 1:
-      return 'f32';
+      return dataType;
     case 2:
-      return 'vec2<f32>';
+      return `vec2<${dataType}>`;
     case 3:
-      return 'vec3<f32>';
+      return `vec3<${dataType}>`;
     case 4:
-      return 'vec4<f32>';
+      return `vec4<${dataType}>`;
     default:
       throw new Error(`${component}-component is not supported.`);
   }
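For a quick sanity check of the new signature, here is a minimal, self-contained sketch (illustration only, not part of the diff) showing what the parameterized `typeSnippet` returns for FP16 and FP32 inputs:

```ts
// Mirror of the updated helper above, usable standalone.
const typeSnippet = (component: number, dataType: string): string => {
  switch (component) {
    case 1:
      return dataType;             // scalar, e.g. 'f16'
    case 2:
      return `vec2<${dataType}>`;  // e.g. 'vec2<f16>'
    case 3:
      return `vec3<${dataType}>`;
    case 4:
      return `vec4<${dataType}>`;
    default:
      throw new Error(`${component}-component is not supported.`);
  }
};

console.log(typeSnippet(4, 'f16'));  // 'vec4<f16>'
console.log(typeSnippet(1, 'f32'));  // 'f32' (the previously hard-coded behavior)
```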
42 changes: 23 additions & 19 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -23,6 +23,7 @@ import {LOG_DEBUG} from '../../../log';
 import {TensorView} from '../../../tensor-view';
 import {ShapeUtil} from '../../../util';
 import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types';
+import {tensorTypeToWsglStorageType} from '../common';
 import {ConvAttributes} from '../conv';

 import {Activation, activationFnSnippet, biasActivationSnippet, typeSnippet} from './activation_util';
@@ -32,13 +33,13 @@ import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packe
 const conv2dCommonSnippet =
     (isChannelsLast: boolean, fitAOuter: boolean, fitBOuter: boolean, fitInner: boolean, addBias = false,
      activation?: Activation, hasPreluActivationWeights = false, innerElementSizeX = 4, innerElementSizeW = 4,
-     innerElementSize = 4): string => {
+     innerElementSize = 4, dataType = 'f32'): string => {
       const getXSnippet = (innerElementSize: number) => {
         switch (innerElementSize) {
           case 1:
             return 'resData = x[xIndex];';
           case 3:
-            return 'resData = vec3<f32>(x[xIndex], x[xIndex + 1], x[xIndex + 2]);';
+            return `resData = vec3<${dataType}>(x[xIndex], x[xIndex + 1], x[xIndex + 2]);`;
           case 4:
             return 'resData = x[xIndex / 4];';
           default:
@@ -92,7 +93,7 @@ const conv2dCommonSnippet =
       let xRow = outRow * stride[0] + dilation[0] * WRow - pad[0];
       let xCol = outCol * stride[1] + dilation[1] * WCol - pad[1];
       let xCh = ${col} % inChannels;
-      var resData = ${typeSnippet(innerElementSizeX)}(0.0);
+      var resData = ${typeSnippet(innerElementSizeX, dataType)}(0.0);
       // The bounds checking is always needed since we use it to pad zero for
       // the 'same' padding type.
       if (xRow >= 0 && xRow < ${xHeight} && xCol >= 0 && xCol < ${xWidth}) {
@@ -110,7 +111,7 @@ const conv2dCommonSnippet =
         if (row < dimAOuter && col < dimInner) {
           ${readXSnippet}
         }
-        return ${typeSnippet(innerElementSizeX)}(0.0);`) :
+        return ${typeSnippet(innerElementSizeX, dataType)}(0.0);`) :
        (fitInner && fitBOuter ? `
         let col = colIn * ${innerElementSizeX};
         ${readXSnippet}` :
@@ -119,13 +120,15 @@ const conv2dCommonSnippet =
         if (row < dimInner && col < dimBOuter) {
           ${readXSnippet}
         }
-        return ${typeSnippet(innerElementSizeX)}(0.0);`);
+        return ${typeSnippet(innerElementSizeX, dataType)}(0.0);`);

       const sampleW = `${getWSnippet(innerElementSizeW)}`;

-      const resType = typeSnippet(innerElementSize);
-      const aType = isChannelsLast ? typeSnippet(innerElementSizeX) : typeSnippet(innerElementSizeW);
-      const bType = isChannelsLast ? typeSnippet(innerElementSizeW) : typeSnippet(innerElementSizeX);
+      const resType = typeSnippet(innerElementSize, dataType);
+      const aType =
+          isChannelsLast ? typeSnippet(innerElementSizeX, dataType) : typeSnippet(innerElementSizeW, dataType);
+      const bType =
+          isChannelsLast ? typeSnippet(innerElementSizeW, dataType) : typeSnippet(innerElementSizeX, dataType);
       const userCode = `
       ${activationFnSnippet(activation, hasPreluActivationWeights, innerElementSize === 4, 4)}
       fn mm_readA(batch: i32, row : i32, colIn : i32) -> ${aType} {
@@ -190,23 +193,24 @@ export const createConv2DMatMulProgramInfo =
       const fitInner = dimInner % tileInner === 0;

       const elementsSize = isVec4 ? [innerElementSize, 4, 4] : [1, 1, 1];
+      const t = tensorTypeToWsglStorageType(inputs[0].dataType);

       const declareInputs = [
-        `@group(0) @binding(0) var<storage, read> x: array<${isVec4 && innerElementSize === 4 ? 'vec4<f32>' : 'f32'}>;`,
-        `@group(0) @binding(1) var<storage, read> w: array<${isVec4 ? 'vec4<f32>' : 'f32'}>;`
+        `@group(0) @binding(0) var<storage, read> x: array<${isVec4 && innerElementSize === 4 ? `vec4<${t}>` : t}>;`,
+        `@group(0) @binding(1) var<storage, read> w: array<${isVec4 ? `vec4<${t}>` : t}>;`
       ];
       let declareFunctions = `
-      fn setOutputAtIndex(flatIndex : i32, value : ${isVec4 ? 'vec4<f32>' : 'f32'}) {
-        result[flatIndex] = ${isVec4 ? 'vec4<f32>' : 'f32'}(value);
+      fn setOutputAtIndex(flatIndex : i32, value : ${isVec4 ? `vec4<${t}>` : t}) {
+        result[flatIndex] = ${isVec4 ? `vec4<${t}>` : t}(value);
       }
-      fn setOutputAtCoords(d0 : i32, d1 : i32, d2 : i32, d3 : i32, value : ${isVec4 ? 'vec4<f32>' : 'f32'}) {
+      fn setOutputAtCoords(d0 : i32, d1 : i32, d2 : i32, d3 : i32, value : ${isVec4 ? `vec4<${t}>` : t}) {
        let flatIndex = getOutputIndexFromCoords(vec4<i32>(d0, d1, d2, d3));
        setOutputAtIndex(flatIndex ${isVec4 ? '/ 4' : ''}, value);
       }`;
       if (hasBias) {
-        declareInputs.push(`@group(0) @binding(2) var<storage, read> bias: array<${isVec4 ? 'vec4<f32>' : 'f32'}>;`);
+        declareInputs.push(`@group(0) @binding(2) var<storage, read> bias: array<${isVec4 ? `vec4<${t}>` : t}>;`);
         declareFunctions += `
-        fn getBiasByOutputCoords(coords : vec4<i32>) -> ${isVec4 ? 'vec4<f32>' : 'f32'} {
+        fn getBiasByOutputCoords(coords : vec4<i32>) -> ${isVec4 ? `vec4<${t}>` : t} {
          return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
         }`;
       }
@@ -222,7 +226,7 @@ export const createConv2DMatMulProgramInfo =
       // dilation : vec2<i32>, dimAOuter : i32, dimBOuter : i32, dimInner : i32 };
       ${declareInputs.join('')}
       @group(0) @binding(${declareInputs.length}) var<storage, read_write> result: array<${
-          isVec4 ? 'vec4<f32>' : 'f32'}>;
+          isVec4 ? `vec4<${t}>` : t}>;
       //@group(0) @binding(${declareInputs.length + 1}) var<uniform> uniforms: Uniforms;

       const xShape : vec4<i32> = vec4<i32>(${inputs[0].dims.join(',')});
@@ -240,12 +244,12 @@ export const createConv2DMatMulProgramInfo =
       ${
          conv2dCommonSnippet(
              isChannelsLast, fitAOuter, fitBOuter, fitInner, hasBias, undefined, false, elementsSize[0],
-             elementsSize[1], elementsSize[2])}
+             elementsSize[1], elementsSize[2], t)}
       ${
          isVec4 ?
-             makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, undefined, !isChannelsLast, tileInner) :
+             makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner) :
              makeMatMulPackedSource(
-                 elementsPerThread, workGroupSize, undefined, !isChannelsLast, tileInner, false, undefined,
+                 elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner, false, undefined,
                  sequentialAccessByThreads)}`
       };
     };
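The new `tensorTypeToWsglStorageType` import resolves a tensor's element type to the WGSL storage type string (`t` above). A hedged sketch of what such a helper does, assuming the enum values of ONNX's `TensorProto.DataType` (the real helper lives in `ops/common.ts` and may cover more types):

```ts
// Sketch only: map an ONNX element type to a WGSL storage type.
// Assumption: values follow ONNX TensorProto.DataType (FLOAT = 1, FLOAT16 = 10).
const enum DataType {
  float = 1,
  float16 = 10,
}

const tensorTypeToWsglStorageType = (type: DataType): string => {
  switch (type) {
    case DataType.float16:
      return 'f16';
    case DataType.float:
      return 'f32';
    default:
      throw new Error(`unsupported data type: ${type}`);
  }
};
```

With `t === 'f16'`, the `declareInputs` template above expands to, for example, `@group(0) @binding(0) var<storage, read> x: array<vec4<f16>>;`.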
47 changes: 25 additions & 22 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
@@ -21,12 +21,13 @@ import {LOG_DEBUG} from '../../../log';
 import {TensorView} from '../../../tensor-view';
 import {ShapeUtil} from '../../../util';
 import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types';
-import {inputVariable, outputVariable, ShaderHelper} from '../common';
+import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
 import {ConvTransposeAttributes} from '../conv-transpose';

 const createConvTranspose2DOpProgramShaderSource =
     (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: ConvTransposeAttributes,
-     outputShape: readonly number[], hasBias: boolean, is1DimensionDispatch: boolean, isVec4 = false): string => {
+     outputShape: readonly number[], hasBias: boolean, is1DimensionDispatch: boolean, isVec4 = false,
+     dataType: string): string => {
       const isChannelsLast = attributes.format === 'NHWC';
       const rowDim = isChannelsLast ? 1 : 2;
       const colDim = isChannelsLast ? 2 : 3;
@@ -39,12 +40,12 @@ const createConvTranspose2DOpProgramShaderSource =
       const outputChannelsPerGroup = wShape[1];

       let declareFunctions = `
-  fn setOutputAtIndex(flatIndex : u32, value : ${isVec4 ? 'vec4<f32>' : 'f32'}) {
-    result[flatIndex] = ${isVec4 ? 'vec4<f32>' : 'f32'}(value);
+  fn setOutputAtIndex(flatIndex : u32, value : ${isVec4 ? `vec4<${dataType}>` : dataType}) {
+    result[flatIndex] = ${isVec4 ? `vec4<${dataType}>` : dataType}(value);
   }`;
       if (hasBias) {
         declareFunctions += `
-    fn getBiasByOutputCoords(coords : vec4<u32>) -> ${isVec4 ? 'vec4<f32>' : 'f32'} {
+    fn getBiasByOutputCoords(coords : vec4<u32>) -> ${isVec4 ? `vec4<${dataType}>` : dataType} {
       return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
     }`;
       }
@@ -66,33 +67,33 @@ const createConvTranspose2DOpProgramShaderSource =

       // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
       // ? = to be determined. : = across all values in that axis.
-      var dotProd: array<vec4<f32>, ${workPerThread}>;
+      var dotProd: array<vec4<${dataType}>, ${workPerThread}>;
       for (var i = 0; i < ${workPerThread}; i++) {
-        dotProd[i] = vec4<f32>(0.0);
+        dotProd[i] = vec4<${dataType}>(0.0);
       }
       for (var wR: u32 = 0; wR < filterDims[0]; wR = wR + 1) {
-        var dyR = (f32(dyCorner.x) + f32(wR)) / f32(strides.x);
+        var dyR = (${dataType}(dyCorner.x) + ${dataType}(wR)) / ${dataType}(strides.x);
        let wRPerm = filterDims[0] - 1 - wR;
-        if (dyR < 0.0 || dyR >= f32(outBackprop[1]) ||
+        if (dyR < 0.0 || dyR >= ${dataType}(outBackprop[1]) ||
            fract(dyR) > 0.0 || wRPerm < 0) {
          continue;
        }
        let idyR: u32 = u32(dyR);

        for (var wC: u32 = 0; wC < filterDims[1]; wC = wC + 1) {
-         let dyC = (f32(dyCorner.y) + f32(wC)) / f32(strides.y);
-         let dyC2 = (f32(dyCorner.y) + 1.0 + f32(wC)) / f32(strides.y);
+         let dyC = (${dataType}(dyCorner.y) + ${dataType}(wC)) / ${dataType}(strides.y);
+         let dyC2 = (${dataType}(dyCorner.y) + 1.0 + ${dataType}(wC)) / ${dataType}(strides.y);
          let wCPerm = filterDims[1] - 1 - wC;
          if (wCPerm < 0) {
            continue;
          }
          var bDyCVal = true;
          var bDyCVal2 = true;
-         if (dyC < 0.0 || dyC >= f32(outBackprop[2]) ||
+         if (dyC < 0.0 || dyC >= ${dataType}(outBackprop[2]) ||
              fract(dyC) > 0.0) {
            bDyCVal = false;
          }
-         if (dyC2 < 0.0 || dyC2 >= f32(outBackprop[2]) ||
+         if (dyC2 < 0.0 || dyC2 >= ${dataType}(outBackprop[2]) ||
              fract(dyC2) > 0.0) {
            bDyCVal2 = false;
          }
@@ -108,15 +109,15 @@ const createConvTranspose2DOpProgramShaderSource =
            let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};

            var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
-           let tmpval = vec4<f32>(dot(xValue, wValue0),
+           let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
                                   dot(xValue, wValue1),
                                   dot(xValue, wValue2),
                                   dot(xValue, wValue3));
            dotProd[0] = dotProd[0] + tmpval;

            xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};

-           dotProd[1] = dotProd[1] + vec4<f32>(dot(xValue, wValue0),
+           dotProd[1] = dotProd[1] + vec4<${dataType}>(dot(xValue, wValue0),
                                                dot(xValue, wValue1),
                                                dot(xValue, wValue2),
                                                dot(xValue, wValue3));
@@ -130,7 +131,7 @@ const createConvTranspose2DOpProgramShaderSource =
            let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};

            var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
-           let tmpval = vec4<f32>(dot(xValue, wValue0),
+           let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
                                   dot(xValue, wValue1),
                                   dot(xValue, wValue2),
                                   dot(xValue, wValue3));
@@ -145,7 +146,7 @@ const createConvTranspose2DOpProgramShaderSource =
            let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};

            var xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};
-           let tmpval = vec4<f32>(dot(xValue, wValue0),
+           let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
                                   dot(xValue, wValue1),
                                   dot(xValue, wValue2),
                                   dot(xValue, wValue3));
@@ -178,9 +179,9 @@ const createConvTranspose2DOpProgramShaderSource =
          if (wR % dilations.x != 0) {
            continue;
          }
-         let dyR = (f32(dyRCorner) + f32(wR)) / f32(strides[0]);
+         let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(strides[0]);
          let wRPerm = filterDims.x - 1 - wR / dilations.x;
-         if (dyR < 0.0 || dyR >= f32(outBackprop[${rowDim}]) || fract(dyR) > 0.0 ||
+         if (dyR < 0.0 || dyR >= ${dataType}(outBackprop[${rowDim}]) || fract(dyR) > 0.0 ||
              wRPerm < 0) {
            continue;
          }
@@ -190,9 +191,9 @@ const createConvTranspose2DOpProgramShaderSource =
          if (wC % dilations.y != 0) {
            continue;
          }
-         let dyC = (f32(dyCCorner) + f32(wC)) / f32(strides.y);
+         let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(strides.y);
          let wCPerm = filterDims.y - 1 - wC / dilations.y;
-         if (dyC < 0.0 || dyC >= f32(outBackprop[${colDim}]) ||
+         if (dyC < 0.0 || dyC >= ${dataType}(outBackprop[${colDim}]) ||
              fract(dyC) > 0.0 || wCPerm < 0) {
            continue;
          }
@@ -256,6 +257,7 @@ export const createConvTranspose2DProgramInfo =
       ];
       LOG_DEBUG('verbose', () => `[conv2d_backprop_webgpu] dispatch = ${dispatch}`);

+      const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
       return {
         ...metadata,
         outputs: [{
@@ -265,6 +267,7 @@ export const createConvTranspose2DProgramInfo =
         }],
         dispatchGroup: () => ({x: dispatch[0], y: dispatch[1], z: dispatch[2]}),
         getShaderSource: (shaderHelper: ShaderHelper) => createConvTranspose2DOpProgramShaderSource(
-            shaderHelper, inputs, attributes, outputShape, hasBias, dispatch[1] === 1 && dispatch[2] === 1),
+            shaderHelper, inputs, attributes, outputShape, hasBias, dispatch[1] === 1 && dispatch[2] === 1, false,
+            dataType),
       };
     };
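One general caveat for FP16 WebGPU kernels (an illustration of the platform requirement, not something this diff itself shows): WGSL's `f16` type is an optional extension, so the device must be created with the `shader-f16` feature and every shader module using it must start with `enable f16;`. A minimal sketch:

```ts
// Sketch: acquire a WebGPU device capable of compiling f16 WGSL.
const adapter = await navigator.gpu.requestAdapter();
if (!adapter || !adapter.features.has('shader-f16')) {
  throw new Error('shader-f16 not supported; fall back to f32 kernels');
}
const device = await adapter.requestDevice({requiredFeatures: ['shader-f16']});

// Every WGSL module that touches f16 must opt in explicitly.
const module = device.createShaderModule({
  code: `
    enable f16;
    @group(0) @binding(0) var<storage, read_write> data: array<vec4<f16>>;
    @compute @workgroup_size(64)
    fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
      data[gid.x] = data[gid.x] * vec4<f16>(2.0);
    }`,
});
```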