diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index f3845e3110905..c054da51a3098 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper {
     const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2];
 
     const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
-    const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3<u32>' :
-                                             `@builtin(local_invocation_index) local_index : u32,
+    const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3<u32>,
+    @builtin(local_invocation_id) local_id : vec3<u32>` :
+        `@builtin(local_invocation_id) local_id : vec3<u32>, @builtin(local_invocation_index) local_index : u32,
     @builtin(workgroup_id) workgroup_id : vec3<u32>`;
     const globalIdxDefinition = is1DimensionDispatch ?
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
index f62c766aa9ed0..449073a133295 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@@ -1,83 +1,97 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {DataType} from '../../../wasm-common';
 import {TensorView} from '../../tensor';
 import {ShapeUtil} from '../../util';
 import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
 import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
 
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common';
 
 export interface InstanceNormAttributes extends AttributeWithCacheKey {
   epsilon: number;
   format: 'NHWC'|'NCHW';
 }
 
-const validateInputs = (inputs: readonly TensorView[]): void => {
-  if (!inputs || inputs.length !== 3) {
-    throw new Error('instanceNorm requires 3 inputs.');
-  }
-
-  if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) {
-    throw new Error('inputs should be float type');
-  }
-};
-
 const createInstanceNormProgramInfo =
     (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
       const xShape = inputs[0].dims;
-      const scale = inputs[1];
-      const bias = inputs[2];
 
       const outputShape = xShape;
-      const outputSize = ShapeUtil.size(outputShape);
       const axis = 2;
       const normCount = ShapeUtil.sizeToDimension(xShape, axis);
       const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
       const C = xShape[1];
-
-      const scaleSize = ShapeUtil.size(scale.dims);
-      const biasSize = bias ? ShapeUtil.size(bias.dims) : 0;
-      if (scaleSize !== normSize || (bias && biasSize !== normSize)) {
-        throw new Error(`Size of X.shape()[axis:] == ${normSize}.
-             Size of scale and bias (if provided) must match this. 
-             Got scale size of ${scaleSize} and bias size of ${biasSize}`);
-      }
-
-      const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
-
+      const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
+      const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+      const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+      const variables = [x, scale, bias, output];
+      const dataType = x.type.value;
+      const workgroupSize = 64;
       const getShaderSource = (shaderHelper: ShaderHelper) => `
+
   const C: u32 = ${C};
   const normSize: u32 = ${normSize};
-  const normSizeTyped: ${dataType} = ${normSize};
   const epsilon: f32 = ${attributes.epsilon};
+  var<workgroup> meanShared : ${dataType};
+  var<workgroup> squaredNormShared : ${dataType};
+  var<workgroup> workgroupShared : array<${dataType}, ${workgroupSize}>;
+  const workgroupSize = ${workgroupSize}u;
+  ${shaderHelper.declareVariables(...variables)}
+  ${shaderHelper.mainStart(workgroupSize)}
+    let norm = global_idx / workgroupSize;
+    let batch = norm / C;
+    let channel = norm % C;
+    let localIndex = local_id.x;
+
+    // Each invocation accumulates a strided partial sum of x into its workgroupShared slot.
+    var initial: ${dataType} = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      initial = initial + ${x.get('batch', 'channel', 'h')};
+    }
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();
 
-  @group(0) @binding(0) var<storage, read> x : array<${dataType}>;
-  @group(0) @binding(1) var<storage, read> scale : array<${dataType}>;
-  @group(0) @binding(2) var<storage, read> bias : array<${dataType}>;
-  @group(0) @binding(3) var<storage, read_write> output : array<${dataType}>;
-
-  ${shaderHelper.mainStart()}
-    let offset = global_idx * normSize;
-    if (offset + normSize >= ${outputSize}) { return; }
-    var mean: ${dataType} = 0;
+    // Calculate the mean of current channel data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
+    }
+    if (localIndex == 0) {
+      meanShared = workgroupShared[0] / ${dataType}(normSize);
+    }
+    workgroupBarrier();
 
-    for (var h: u32 = 0u; h < normSize; h++) {
-        mean = mean + x[h + offset];
+    // Reuse workgroup memory for per-invocation partial sums of squared deviations from the mean.
+    initial = 0;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let deviation =  ${x.get('batch', 'channel', 'h')} - meanShared;
+      initial = initial + deviation * deviation;
     }
-    mean = mean / normSizeTyped;
+    workgroupShared[localIndex] = initial;
+    workgroupBarrier();
 
-    var squaredNorm: ${dataType} = 0;
-    for (var h: u32 = 0u; h < normSize; h++) {
-        let deviation: f32 = x[h + offset] - mean;
-        squaredNorm = squaredNorm + deviation * deviation;
+    // Calculate the sum of squared deviations of the current channel's data.
+    for (var currSize = workgroupSize >> 1;  currSize > 0; currSize = currSize >> 1) {
+      if (localIndex < currSize) {
+        workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize];
+      }
+      workgroupBarrier();
     }
-    let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon);
-    let channelScale = invStdDev * scale[global_idx % C];
-    let channelShift = bias[global_idx % C] - mean * channelScale;
-    for (var j: u32 = 0; j < normSize; j++) {
-        output[j + offset] = x[j + offset] * channelScale + channelShift;
+    if (localIndex == 0) {
+      squaredNormShared = workgroupShared[0];
+    }
+    workgroupBarrier();
+
+    let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon);
+    let channelScale = invStdDev * ${scale.getByOffset('channel')};
+    let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale;
+    for (var h = localIndex; h < normSize; h += workgroupSize) {
+      let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift;
+      ${output.set('batch', 'channel', 'h', 'value')};
     }
   }`;
       return {
@@ -86,7 +100,7 @@ const createInstanceNormProgramInfo =
           {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
         ],
         getShaderSource,
-        dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)})
+        dispatchGroup: () => ({x: normCount})
       };
     };
 
@@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo =
   ${shaderHelper.mainStart()}
     let currentImageNumber = global_idx / C;
     let currentChannelNumber = global_idx % C;
-    
+
     // offset is channel num * N
     let offset = currentImageNumber * imageSize;
     if (offset >= ${outputSize}) { return; }
@@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes):
     createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});
 
 export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
-  validateInputs(context.inputs);
-
   const metadata = {
     name: 'InstanceNormalization',
     inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default],
diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc
new file mode 100644
index 0000000000000..6a4e6912405ee
--- /dev/null
+++ b/js/web/test/data/ops/instance-norm.jsonc
@@ -0,0 +1,79 @@
+[
+  {
+    "name": "Simple test with NHWC",
+    "operator": "InstanceNormalization",
+    "inputShapeDefinitions": "rankOnly",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
+  {
+    "name": "Simple test with NCHW",
+    "operator": "InstanceNormalization",
+    "opset": { "domain": "", "version": 17 },
+    "cases": [
+      {
+        "name": "Simple test",
+        "inputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [4],
+            "type": "float32"
+          },
+          {
+            "data": [4, 5, 6, 7],
+            "dims": [4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617,
+              4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125,
+              12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207
+            ],
+            "dims": [1, 4, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  }
+]
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index e580259071968..94592884ccad6 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -257,6 +257,7 @@
       "greater.jsonc",
       //"identity.jsonc",
       "image-scaler.jsonc",
+      "instance-norm.jsonc",
       "less.jsonc",
       "log.jsonc",
       "matmul.jsonc",
@@ -1347,6 +1348,7 @@
       "gemm.jsonc",
       "global-average-pool.jsonc",
       "greater.jsonc",
+      "instance-norm.jsonc",
       "less.jsonc",
       "log.jsonc",
       "matmul.jsonc",