diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 816eaaf9bc71a..ec9be80a63574 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 { // can be updated using: UpdateTensorRTProviderOptionsWithValue int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs - size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT. + size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name. diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 3ee9441eeb981..fe46165ffbd50 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -35,6 +35,7 @@ Do not modify directly.* | Cosh | ai.onnx(9+) | | | CumSum | ai.onnx(11-13,14+) | | | DepthToSpace | ai.onnx(11-12,13+); com.ms.internal.nhwc(11-12,13+) | | +| DequantizeLinear | ai.onnx(10-12,13-18,19-20,21+) | | | Div | ai.onnx(7-12,13,14+) | | | Einsum | ai.onnx(12+) | | | Elu | ai.onnx(6+) | | diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index ce5b4455fde60..e0288eebbe604 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -26,6 +26,7 @@ import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multihead-attention'; import {pad} from './ops/pad'; import * as pool from './ops/pool'; +import {dequantizeLinear, parseDequantizeLinearAttributes} from './ops/quantize-linear'; import {range} from './ops/range'; import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; import {parseResizeAttributes, resize} from './ops/resize'; @@ -71,6 +72,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Cosh', [unaryOps.cosh]], ['CumSum', [cumsum, parseCumSumAttributes]], ['DepthToSpace', [depthToSpace, parseDepthToSpaceAttributes]], + ['DequantizeLinear', [dequantizeLinear, parseDequantizeLinearAttributes]], ['Div', [binaryOps.div]], ['Einsum', [einsum, parseEinsumAttributes]], ['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts b/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts new file mode 100644 index 0000000000000..0d7c7ab408b3a --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts @@ -0,0 +1,219 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
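A note on the trt_max_workspace_size change at the top of this diff: with the default now 0, the TensorRT EP no longer caps the builder workspace at 1 GiB and instead lets TensorRT size the pool from the available device memory. Callers who want the old cap can still set the option explicitly; a minimal sketch using the Python provider-options API (the model path is a placeholder):

```python
import onnxruntime as ort

# Restore the previous 1 GiB workspace cap explicitly; leaving the option unset
# now lets TensorRT use up to the maximum device memory for its workspace pool.
trt_options = {"trt_max_workspace_size": 1 << 30}  # bytes

session = ort.InferenceSession(
    "model.onnx",  # placeholder path
    providers=[("TensorrtExecutionProvider", trt_options), "CUDAExecutionProvider"],
)
```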
+ +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; + +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; + +export interface DequantizeLinerAttributes extends AttributeWithCacheKey { + axis: number; + blockSize: number; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): void => { + if (inputs.length < 2 || inputs.length > 3) { + throw new Error('DequantizeLinear requires 2 or 3 inputs.'); + } + if (inputs.length === 3 && inputs[1].dims === inputs[2].dims) { + throw new Error('x-scale and x-zero-point must have the same shape.'); + } + if (inputs.length === 3 && inputs[0].dataType !== inputs[2].dataType) { + throw new Error('x and x-zero-point must have the same data type.'); + } + if (inputs[0].dataType === DataType.int32 && inputs.length > 2) { + throw new Error('In the case of dequantizing int32 there is no zero point.'); + } + if (inputs[1].dims.length !== 0 && inputs[1].dims.length !== 1 && inputs[1].dims.length !== inputs[0].dims.length) { + throw new Error('scale input must be a scalar, a 1D tensor, or have the same rank as the input tensor.'); + } + // validate scale and zero-point input shapes + if (inputs.length > 2) { + // zero-point input type should be the same as input data type. + if (inputs[0].dataType !== inputs[2].dataType) { + throw new Error('x and x-zero-point must have the same data type.'); + } + // Scale and zero-point inputs must have the same shape + if (inputs[1].dims.length !== inputs[2].dims.length) { + throw new Error('scale and zero-point inputs must have the same rank.'); + } + if (!inputs[1].dims.map((d, i) => d === inputs[2].dims[i]).reduce((a, b) => a && b, true)) { + throw new Error('scale and zero-point inputs must have the same shape.'); + } + } + // Validate blockSize + if (attributes.blockSize > 0) { + // Block qunatization + if (inputs[1].dims.length === 0 || (inputs[1].dims.length === 1 && inputs[1].dims[0] === 1)) { + throw new Error('blockSize must be set only for block quantization.'); + } + if (!inputs[1] + .dims.map((d, i) => i === attributes.axis || d === inputs[0].dims[i]) + .reduce((a, b) => a && b, true)) { + throw new Error('For block qunatization, scale input shape to match the input shape except for the axis'); + } + // Scale input rank should be same as the input rank + if (inputs[1].dims.length !== inputs[0].dims.length) { + throw new Error('For block qunatization the scale input rank must be the same as the x rank.'); + } + const dI = inputs[0].dims[attributes.axis]; + const si = inputs[1].dims[attributes.axis]; + if (attributes.blockSize < Math.ceil(dI / si) || attributes.blockSize > Math.ceil(dI / (si - 1) - 1)) { + throw new Error('blockSize must be with in the range [ceil(dI / Si), ceil(dI / (Si - 1) - 1)].'); + } + } +}; + +const createDequantizeLinearProgramInfo = + (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): ProgramInfo => { + const axis = ShapeUtil.normalizeAxis(attributes.axis, inputs[0].dims.length); + const inputType = inputs[0].dataType; + const isSigned = inputType === DataType.int8; + const outputShape = inputs[0].dims; // output shape is same as the input shape + const dataType = inputs[1].dataType; // output 
type is same as the the scale input type + const outputSize = ShapeUtil.size(outputShape); + const isPacked = inputType === DataType.int8 || inputType === DataType.uint8; + const inputShape = isPacked ? [Math.ceil(ShapeUtil.size(inputs[0].dims) / 4)] : inputs[0].dims; + const scaleShape = inputs[1].dims; + const zeroPointInput = inputs.length > 2 ? inputs[2] : undefined; + const zeroPointShape = zeroPointInput ? + (isPacked ? [Math.ceil(ShapeUtil.size(zeroPointInput.dims) / 4)] : zeroPointInput.dims) : + undefined; + // Scales input is a scaler for per-tensor/per-layer quantization, 1-D tensor for per-axis quantization + // or tensor with same rank as input for blocked quantization. + const perLayerQuantization = scaleShape.length === 0 || (scaleShape.length === 1 && scaleShape[0] === 1); + const perAxisQuantization = perLayerQuantization === false && scaleShape.length === 1; + // Left unnecessary commented-out assignment for documentation + // const blockQuantization = perLayerQuantization === false && perAxisQuantization === false; + const maxComponents = getMaxComponents(outputSize); + const useComponents = perLayerQuantization && (!isPacked || maxComponents === 4); + const components = useComponents ? maxComponents : 1; + const inputComponent = (useComponents && !isPacked) ? maxComponents : 1; + const input = inputVariable('input', isPacked ? DataType.uint32 : inputType, inputShape.length, inputComponent); + const scale = inputVariable('scale', dataType, scaleShape.length); + const zeroPoint = zeroPointInput ? + inputVariable('zero_point', isPacked ? DataType.uint32 : inputType, zeroPointShape!.length) : + undefined; + const output = outputVariable('output', dataType, outputShape.length, components); + const inputVariables = [input, scale]; + if (zeroPoint) { + inputVariables.push(zeroPoint); + } + const inputShapes = [inputShape, scaleShape]; + if (zeroPointInput) { + inputShapes.push(zeroPointShape!); + } + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize / components}, {type: DataType.uint32, data: axis}, + {type: DataType.uint32, data: attributes.blockSize}, ...createTensorShapeVariables(...inputShapes, outputShape) + ]; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const uniforms: UniformsArrayType = + [{name: 'output_size', type: 'u32'}, {name: 'axis', type: 'u32'}, {name: 'block_size', type: 'u32'}]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let output_indices = ${output.offsetToIndices('global_idx')}; + + // Set input x + ${(() => { + if (isPacked) { + return ` + let input = ${input.getByOffset('global_idx / 4')}; + let x_vec = ${isSigned ? 'unpack4xI8(input)' : 'unpack4xU8(input)'}; + let x_value = ${components === 1 ? 'x_vec[global_idx % 4]' : 'x_vec'};`; + } else { + return `let x_value = ${input.getByOffset('global_idx')};`; + } + })()}; + + // Set scale input + ${(() => { + if (perLayerQuantization) { + // scale input is a scalar () + return `let scale_value= ${scale.getByOffset('0')}`; + } else if (perAxisQuantization) { + // scale input is a 1D tensor + return ` + let scale_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let scale_value= ${scale.getByOffset('scale_index')};`; + } else { + // Block quantization. Scale input rank is same as input/output rank. 
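For reference, the expression the generated WGSL evaluates per element is y = (x - x_zero_point) * x_scale, with the scale (and optional zero point) broadcast per tensor, per axis, or per block as selected above. A NumPy sketch of the three cases (reference semantics only, not the kernel itself):

```python
import numpy as np

def dequantize_ref(x, scale, zero_point=None, axis=1, block_size=0):
    # y = (x - zero_point) * scale, broadcast per tensor, per axis, or per block.
    x = x.astype(np.float32)
    zp = np.zeros_like(scale) if zero_point is None else zero_point.astype(np.float32)
    if scale.ndim == 0 or scale.size == 1:               # per-tensor / per-layer
        return (x - zp.reshape(-1)[0]) * scale.reshape(-1)[0]
    if scale.ndim == 1:                                   # per-axis
        shape = [1] * x.ndim
        shape[axis] = -1
        return (x - zp.reshape(shape)) * scale.reshape(shape)
    # Blocked: scale has the same rank as x; the index along `axis` is divided by block_size.
    idx = np.arange(x.shape[axis]) // block_size
    return (x - np.take(zp, idx, axis=axis)) * np.take(scale, idx, axis=axis)

# Matches the blocked test case T[9] in the new dequantizelinear.jsonc tests below.
x = np.arange(1, 9).reshape(2, 2, 2)
s = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32).reshape(2, 1, 2)
print(dequantize_ref(x, s, axis=1, block_size=2))  # 0.1, 0.4, 0.3, 0.8, 1.5, 2.4, 2.1, 3.2
```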
+ return ` + var scale_indices: ${scale.type.indices} = output_indices; + let index = ${scale.indicesGet('scale_indices', 'uniforms.axis')} / uniforms.block_size; + ${scale.indicesSet('scale_indices', 'uniforms.axis', 'index')}; + let scale_value= ${scale.getByIndices('scale_indices')};`; + } + })()}; + + // Set zero-point input + ${(() => { + if (zeroPoint) { + if (perLayerQuantization) { + // zero-point input is a scalar + if (isPacked) { + return ` + let zero_point_input = ${zeroPoint.getByOffset('0')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value= zero_point_vec[0]`; + } else { + return `let zero_point_value = ${zeroPoint.getByOffset('0')}`; + } + } else if (perAxisQuantization) { + // zero-point input is a 1D tensor + if (isPacked) { + return ` + let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let zero_point_input = ${zeroPoint.getByOffset('zero_point_index / 4')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value = zero_point_vec[zero_point_index % 4]`; + } else { + return ` + let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let zero_point_value = ${zeroPoint.getByOffset('zero_point_index')};`; + } + } else { + // BlockedQuantization. The zero-point input shape is same as the input shape except along axis. + if (isPacked) { + return ` + let zero_point_offset = ${scale.indicesToOffset('scale_indices')}; + let zero_point_input = ${zeroPoint.getByOffset('zero_point_offset / 4')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value = zero_point_vec[zero_point_offset % 4];`; + } else { + return `let zero_point_value = ${zeroPoint.getByIndices('scale_indices')};`; + } + } + } else { + return `let zero_point_value = ${isPacked ? (isSigned ? 'i32' : 'u32') : input.type.value}(0);`; + } + })()}; + // Compute and write output + ${output.setByOffset('global_idx', `${output.type.value}(x_value - zero_point_value) * scale_value`)}; + }`; + }; + return { + name: 'DequantizeLinear', + shaderCache: + {hint: attributes.cacheKey, inputDependencies: zeroPoint ? 
['rank', 'rank', 'rank'] : ['rank', 'rank']}, + getShaderSource, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / components / 64), y: 1, z: 1}, + programUniforms + }) + }; + }; + +export const dequantizeLinear = (context: ComputeContext, attributes: DequantizeLinerAttributes): void => { + validateInputs(context.inputs, attributes); + context.compute(createDequantizeLinearProgramInfo(context.inputs, attributes)); +}; + +export const parseDequantizeLinearAttributes = (attributes: Record): DequantizeLinerAttributes => + createAttributeWithCacheKey({axis: attributes.axis as number, blockSize: attributes.blockSize as number}); diff --git a/js/web/test/data/ops/dequantizelinear.jsonc b/js/web/test/data/ops/dequantizelinear.jsonc new file mode 100644 index 0000000000000..2dc04d11f2889 --- /dev/null +++ b/js/web/test/data/ops/dequantizelinear.jsonc @@ -0,0 +1,385 @@ +[ + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 10 }, + "attributes": [], + "cases": [ + { + "name": "T[1]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 10 }, + "attributes": [], + "cases": [ + { + "name": "T[2]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[3]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[4]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[5]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.1], + "dims": [2], + "type": "float32" + }, + { + "data": [0, 0], + "dims": [2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + 
"attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[6]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.2], + "dims": [2], + "type": "float32" + }, + { + "data": [0, 0], + "dims": [2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.6, 0.8, 0.5, 0.6, 1.4, 1.6], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[7]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 21 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + }, + { + "name": "block_size", + "data": 2, + "type": "int" + } + ], + "cases": [ + { + "name": "T[8]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 1, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [2, 1, 2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.0, 0.0, 0.2, 0.4, 0.6, 0.8, 1.2, 1.6], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear block dequantization", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 21 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + }, + { + "name": "block_size", + "data": 2, + "type": "int" + } + ], + "cases": [ + { + "name": "T[9]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "int32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.4, 0.3, 0.8, 1.5, 2.4, 2.1, 3.2], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[3]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4aaf9d16b2b0e..ede89f7557dd8 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -477,8 +477,8 @@ "test_depthtospace_dcr_mode", "test_depthtospace_example", "test_depthtospace", - // // "test_dequantizelinear_axis", - // // "test_dequantizelinear", + "test_dequantizelinear_axis", + "test_dequantizelinear", // // "test_det_2d", // // "test_det_nd", // // "test_dft_axis", @@ -1352,6 +1352,7 @@ "div.jsonc", "div_int32.jsonc", "depth-to-space.jsonc", + "dequantizelinear.jsonc", "equal.jsonc", "exp.jsonc", "expand.jsonc", diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 
cbd53298ab2ad..42f491825462c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -1358,6 +1358,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& node, const std::filesystem::path& model_path, ONNX_NAMESPACE::TensorProto& tensor) { + ORT_ENFORCE(node.output_size() == 1, "NodeProto for Constant should have 1 output. Got:", node.output_size()); return ConstantNodeProtoToTensorProto(node, model_path, tensor, node.output(0)); } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index ee4d9f9154971..d38c1ace7d7a8 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -646,7 +646,7 @@ Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem:: return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold); } -Status Model::LoadFromBytes(int count, void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { +Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { const bool result = model_proto.ParseFromArray(p_bytes, count); if (!result) { return Status(ONNXRUNTIME, INVALID_PROTOBUF, "Protobuf parsing failed."); diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 728af727ac83b..ea34dba889277 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -234,7 +234,7 @@ class Model { const ModelOptions& options = {}); // 'int' rather than 'size_t' because of a protobuf design choice; let callers handle type checks - static common::Status LoadFromBytes(int count, void* pBytes, + static common::Status LoadFromBytes(int count, const void* pBytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto); // 'int' rather than 'size_t' because of a protobuf design choice; let callers handle type checks diff --git a/onnxruntime/core/optimizer/unsqueeze_elimination.cc b/onnxruntime/core/optimizer/unsqueeze_elimination.cc index 4efc8018f0217..d52cc82af02bb 100644 --- a/onnxruntime/core/optimizer/unsqueeze_elimination.cc +++ b/onnxruntime/core/optimizer/unsqueeze_elimination.cc @@ -40,6 +40,10 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& // Generate new dims. InlinedVector new_dims(output_rank, 0); for (int64_t axis : axes) { + if (static_cast(axis) >= new_dims.size()) { + LOGS(logger, WARNING) << "UnsqueezeElimination cannot remove node due to invalid axes" << node.Name(); + return Status::OK(); + } new_dims[static_cast(axis)] = 1; } diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc index 21a256eee6f14..7797cbe678bd4 100644 --- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc +++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc @@ -380,8 +380,8 @@ Status QLinearConv::PrePack(const Tensor& tensor, int input_idx, Alloca const int64_t M = shape[0]; const int64_t C = shape[1]; - // Verify that the total number of output channels is a multiple of the group count. - if (M % conv_attrs_.group != 0) { + // Verify that conv_attrs_.group is not 0 and the total number of output channels is a multiple of the group count. 
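On the unsqueeze_elimination.cc change above: the optimizer now skips the rewrite (with a warning) instead of indexing out of bounds when a fused Unsqueeze carries an axis outside [0, output_rank). An illustrative sketch of the bound being enforced (not the C++ implementation itself):

```python
def unsqueezed_dims(input_dims, axes):
    # Output rank is input rank plus the number of inserted axes; every
    # (already non-negative) axis must index into that output shape.
    output_rank = len(input_dims) + len(axes)
    new_dims = [0] * output_rank
    for axis in axes:
        if axis >= output_rank:      # the condition the new guard checks
            return None              # leave the node untouched
        new_dims[axis] = 1
    it = iter(input_dims)
    return [1 if marked else next(it) for marked in new_dims]

print(unsqueezed_dims([3, 4], [0, 3]))  # [1, 3, 4, 1]
print(unsqueezed_dims([3, 4], [5]))     # None: axis 5 exceeds output rank 3
```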
+ if (conv_attrs_.group == 0 || M % conv_attrs_.group != 0) { return Status::OK(); } diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 0ad62b87d33b5..e51b53686fafc 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -370,6 +370,19 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomai class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 13, CumSum); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, CumSum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, int32_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, int32_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, int32_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, int8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, int32_t, DequantizeLinear); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -670,6 +683,18 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/js/operators/quantize_linear.cc b/onnxruntime/core/providers/js/operators/quantize_linear.cc new file mode 100644 index 0000000000000..a3dd635f1fb13 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/quantize_linear.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
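The js_execution_provider.cc registrations above add DequantizeLinear kernels for uint8, int8 and int32 inputs over opset ranges 10-12, 13-18, 19-20 and 21+, and the opset-21 kernels pick up the new block_size attribute. A sketch of a blocked model those kernels would serve, built with onnx.helper (names are arbitrary; assumes an onnx package recent enough to know opset 21):

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

node = helper.make_node(
    "DequantizeLinear", ["x", "x_scale", "x_zero_point"], ["y"],
    axis=1, block_size=2,  # block_size is only defined from opset 21 on
)
graph = helper.make_graph(
    [node], "dql_blocked",
    [helper.make_tensor_value_info("x", TensorProto.UINT8, [2, 2, 2])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [2, 2, 2])],
    initializer=[
        # For blocked quantization the scale/zero-point shape matches the input
        # except along `axis`, where it is ceil(dim / block_size).
        numpy_helper.from_array(np.full((2, 1, 2), 0.1, dtype=np.float32), "x_scale"),
        numpy_helper.from_array(np.zeros((2, 1, 2), dtype=np.uint8), "x_zero_point"),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
onnx.checker.check_model(model)
```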
+ +#include "quantize_linear.h" + +namespace onnxruntime { +namespace js { +#define REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL(T, sinceVersion, endVerion) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, endVerion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_TYPED_KERNEL(T, sinceVersion) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, sinceVersion, endVerion) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, endVerion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(T) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, 10, 12) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, 13, 18) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL(T, 19, 20) \ + REGISTER_DEQUANTIZED_LINEAR_TYPED_KERNEL(T, 21) + +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(int8_t) +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(uint8_t) +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(int32_t) + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/quantize_linear.h b/onnxruntime/core/providers/js/operators/quantize_linear.h new file mode 100644 index 0000000000000..e15942aaf1a41 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/quantize_linear.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
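The PRE_19 macro above keeps the single "T" type constraint of the older schema, while the opset-19+ macros split the quantized input (T1) from the scale/output type (T2), which is why JsepSupportedFloatTypes() appears only in the newer registrations. From opset 19 on, a float16 scale is therefore legal and yields a float16 output; a minimal sketch (illustrative only):

```python
import numpy as np
from onnx import TensorProto, helper, numpy_helper

node = helper.make_node("DequantizeLinear", ["x", "x_scale"], ["y"])
graph = helper.make_graph(
    [node], "dql_fp16",
    [helper.make_tensor_value_info("x", TensorProto.INT8, [4])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT16, [4])],
    initializer=[numpy_helper.from_array(np.array(0.5, dtype=np.float16), "x_scale")],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 19)])
```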
+ +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +class DequantizeLinear : public JsKernel { + public: + DequantizeLinear(const OpKernelInfo& info) : JsKernel(info) { + int64_t axis; + int64_t block_size; + if (!info.GetAttr("axis", &axis).IsOK()) { + axis = 1; + } + if (!info.GetAttr("block_size", &block_size).IsOK()) { + block_size = 0; + } + JSEP_INIT_KERNEL_ATTRIBUTE(DequantizeLinear, ({ + "axis" : $1, + "blockSize" : $2 + }), + static_cast(axis), static_cast(block_size)); + } +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index a2b3ed068235b..f1df1abf4c49a 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -583,22 +583,23 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, // Handle Qs in the NodeUnit if (!node_unit.GetQNodes().empty()) { - ORT_ENFORCE(node_unit.GetQNodes().size() == 1); - const auto& q_node = node_unit.GetQNodes().at(0); - - SkipReason reason; - - bool keep_q = CheckQRuleSet(node_unit, q_node, src_graph, reason); - - if (keep_q) { - AddNode(initializers_to_keep, src_graph, dst_graph, *q_node); - // if keep_q, then output defs of the target node doesn't change - output_args.push_back(&dst_graph.GetOrCreateNodeArg(target_node.OutputDefs().at(0)->Name(), - target_node.OutputDefs().at(0)->TypeAsProto())); - } else { - // convert this Q to float - output_args.push_back(&ProcessNodeUnitIO(dst_graph, src_graph, initializers_to_keep, - node_unit_outputs.at(0))); + for (size_t i = 0; i < node_unit.GetQNodes().size(); i++) { + const auto& q_node = node_unit.GetQNodes().at(i); + + SkipReason reason; + + bool keep_q = CheckQRuleSet(node_unit, q_node, src_graph, reason); + + if (keep_q) { + AddNode(initializers_to_keep, src_graph, dst_graph, *q_node); + // if keep_q, then output defs of the target node doesn't change + output_args.push_back(&dst_graph.GetOrCreateNodeArg(target_node.OutputDefs().at(i)->Name(), + target_node.OutputDefs().at(i)->TypeAsProto())); + } else { + // convert this Q to float + output_args.push_back(&ProcessNodeUnitIO(dst_graph, src_graph, initializers_to_keep, + node_unit_outputs.at(i))); + } } } else { for (const auto& node_unit_output : node_unit_outputs) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index cdbb7bb2a8094..a7daa98902afb 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1"; min_subgraph_size_ = 1; } - if (max_workspace_size_ <= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)"; - max_workspace_size_ = 1 << 30; - } if (dla_core_ < 0) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. 
Set it to 0"; dla_core_ = 0; @@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); trt_parser->parse(string_buf.data(), string_buf.size(), model_path_); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } // Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow if (fp16_enable_ && layer_norm_fp32_fallback_) { @@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, - dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, + dla_enable_, dla_core_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_, @@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView trt_state->context->reset(); trt_state->engine->reset(); auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr)); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } for (auto trt_profile : trt_profiles) { trt_config->addOptimizationProfile(trt_profile); } @@ -3752,6 +3752,11 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView trt_context = trt_state->context->get(); } + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; @@ -4075,6 +4080,11 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); cudaStream_t stream = static_cast(cuda_stream); + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 3f20314438564..97c9367b0bb61 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -175,7 +175,6 @@ struct TensorrtFuncState { bool int8_calibration_cache_available = false; bool dla_enable = false; int dla_core = 0; - size_t* 
max_workspace_size_ptr = nullptr; std::string trt_node_name_with_precision; bool engine_cache_enable = false; std::string engine_cache_path; @@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; size_t min_subgraph_size_ = 1; - size_t max_workspace_size_ = 1 << 30; // 1GB + size_t max_workspace_size_ = 0; bool fp16_enable_ = false; bool int8_enable_ = false; bool dla_enable_ = false; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 50b934fd5fcbc..fa1bbd6d3d7e6 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo { bool has_trt_options{false}; int max_partition_iterations{1000}; int min_subgraph_size{1}; - size_t max_workspace_size{1 << 30}; + size_t max_workspace_size{0}; bool fp16_enable{false}; bool int8_enable{false}; std::string int8_calibration_table_name{""}; diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index aab04485246d6..d48964203ce76 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -515,8 +515,6 @@ def adjust_tensor_ranges(self): for node in self.model.nodes(): # adjust tensor_ranges for input of Clip and Relu node if node.op_type in ["Clip", "Relu"]: - if self.is_activation_symmetric: - continue if not self.should_quantize_node(node): continue if len(self.model.input_name_to_nodes()[node.input[0]]) != 1: diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index cc8bd622df9b1..c0cc4f038cd3b 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -712,14 +712,20 @@ def process(self): if self.algo_config.algorithm in ["HQQ", "DEFAULT"]: # use a stack to keep track of sub-graphs graph_stack = [self.model.graph()] - opset_import = self.model.opset_import() - - has_ms_domain = False - for opset in opset_import: - if opset.domain == "com.microsoft": - has_ms_domain = True - if not has_ms_domain: - opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + # Update domain opset + if self.algo_config.quant_format == QuantFormat.QOperator: + self.model.set_opset_import("com.microsoft", 1) + else: + opset_import = self.model.opset_import() + for opset in opset_import: + if opset.domain in [None, "ai.onnx", ""] and opset.version < 21: + logger.warning( + "The opset of the input model is under 21 and doesn't support int4 data type. " + "Force to update it to opset 21, but the generated model may not be a valid model." 
+ ) + self.model.set_opset_import(opset.domain, 21) + self._process_subgraph(graph_stack) self.model.clean_initializers() else: diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 60bf90c243db0..b71f332252850 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -989,8 +989,7 @@ def is_tensor_per_channel( per_chan_overrides = self.tensor_quant_overrides.get_per_channel_overrides(tensor_name) axis = per_chan_overrides[0]["axis"] # Prefer axis from user-specified tensor-level overrides if available - weight_nparray = tensor_proto_to_array(weight_initializer) - weight_rank = len(weight_nparray.shape) + weight_rank = len(weight_initializer.dims) axis_valid, axis = normalize_axis(axis, weight_rank) if not axis_valid: logging.warning(f"Axis {axis} is out-of-range for weight '{tensor_name}' with rank {weight_rank}") diff --git a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py index 6050bd2e05ec5..219d929d22fce 100644 --- a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py +++ b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py @@ -12,7 +12,7 @@ import onnx -from .quant_utils import QuantType, tensor_proto_to_array +from .quant_utils import QuantType @dataclass @@ -235,7 +235,7 @@ def _is_valid_per_channel( "the first channel dictionary.", ) - weight_shape = tensor_proto_to_array(initializers[tensor_name]).shape + weight_shape = list(initializers[tensor_name].dims) weight_rank = len(weight_shape) norm_axis = axis if norm_axis < 0: diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 4cc8a0c151d14..0438d93227524 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -156,6 +156,9 @@ def quant_test( } ) check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes) + for op in quant.model.opset_import(): + if op.domain in [None, "", "ai.onnx"] and op.version < 21: + self.fail(f"In QDQ format {op.domain} opset should be >= 21") data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 8691471b040a7..21a772c5f56c7 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -5,7 +5,9 @@ # license information. # -------------------------------------------------------------------------- +import os import struct +import tempfile import unittest import numpy as np @@ -1150,6 +1152,48 @@ def test_get_qnn_qdq_config_ext_data(self): self.assertEqual(set(qnn_config.op_types_to_quantize), {"Add"}) self.assertTrue(qnn_config.use_external_data_format) + def test_get_qnn_qdq_config_ext_data_separate_dir(self): + """ + Test that get_qnn_qdq_config() can validate per-channel quantization overrides for a model with external data + that is in a separate directory not in the cwd. + """ + + # Create model with a weight large enough (> 1024 bytes) to be stored externally. 
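The qdq_quantizer.py and tensor_quant_overrides.py changes above take the weight's rank and shape from the initializer's dims field instead of materializing the array with tensor_proto_to_array, so validating a per-channel axis no longer pulls in external weight data; that is exactly what the new test_get_qnn_qdq_config_ext_data_separate_dir test exercises. A hedged sketch of the idea:

```python
import onnx

def normalize_axis(axis: int, rank: int):
    # Same contract as quant_utils.normalize_axis: fold negative axes into range.
    norm = axis + rank if axis < 0 else axis
    return (0 <= norm < rank), norm

def per_channel_axis_is_valid(initializer: onnx.TensorProto, axis: int) -> bool:
    # initializer.dims is populated even when the raw data lives in an external
    # file, so this check never needs to load the weight payload itself.
    valid, _ = normalize_axis(axis, len(initializer.dims))
    return valid
```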
+ large_weight = onnx.numpy_helper.from_array(np.random.random((1, 2, 32, 32)).astype(np.float32), "weight") + graph = onnx.helper.make_graph( + [onnx.helper.make_node("Conv", ["input", "weight"], ["output"])], + "conv_ext_data", + [onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, (1, 2, 64, 64))], + [onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, None)], + initializer=[large_weight], + ) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("", 21)], + ) + + # Make a separate directory in which to save model and its external data. + model_dir_path = tempfile.mkdtemp(prefix="model_ext_data") + model_name = "conv_ext_data.onnx" + model_path = os.path.join(model_dir_path, model_name) + + onnx.save_model( + model, + str(model_path), + save_as_external_data=True, + ) + + # Use tensor quantization overrides to quantize Conv's weight input to 4 bits on axis 0. + init_overrides = {"weight": [{"quant_type": QuantType.QInt4, "axis": 0, "symmetric": True}]} + + # get_qnn_qdq_config() should be able to validate the per-channel axis without having to load + # the external weight data. + qnn_config = get_qnn_qdq_config( + str(model_path), DummyDataReader([]), init_overrides=init_overrides # Dummy data reader does nothing + ) + self.assertEqual(set(qnn_config.op_types_to_quantize), {"Conv"}) + self.assertTrue(qnn_config.use_external_data_format) + if __name__ == "__main__": t = TestTensorQuantOverridesOption() diff --git a/orttraining/orttraining/test/training_api/core/training_capi_tests.cc b/orttraining/orttraining/test/training_api/core/training_capi_tests.cc index 8f25e1e4c92b8..cff060134e679 100644 --- a/orttraining/orttraining/test/training_api/core/training_capi_tests.cc +++ b/orttraining/orttraining/test/training_api/core/training_capi_tests.cc @@ -265,6 +265,41 @@ TEST(TrainingCApiTest, LoadONNXModelsFromBuffer) { train_model_data); } +TEST(TrainingCApiTest, LoadONNXModelsFromBufferThenExport) { + auto model_path = MODEL_FOLDER "training_model.onnx"; + size_t model_data_len = 0; + ASSERT_STATUS_OK(Env::Default().GetFileLength(model_path, model_data_len)); + std::vector train_model_data(model_data_len); + std::ifstream bytes_stream(model_path, std::ifstream::in | std::ifstream::binary); + bytes_stream.read(reinterpret_cast(train_model_data.data()), model_data_len); + ASSERT_TRUE(train_model_data.size() == model_data_len); + + auto eval_model_path = MODEL_FOLDER "eval_model.onnx"; + size_t eval_model_data_len = 0; + ASSERT_STATUS_OK(Env::Default().GetFileLength(eval_model_path, eval_model_data_len)); + std::vector eval_model_data(eval_model_data_len); + std::ifstream eval_bytes_stream(eval_model_path, std::ifstream::in | std::ifstream::binary); + eval_bytes_stream.read(reinterpret_cast(eval_model_data.data()), eval_model_data_len); + ASSERT_TRUE(eval_model_data.size() == eval_model_data_len); + + Ort::Env env; + Ort::CheckpointState checkpoint_state = Ort::CheckpointState::LoadCheckpoint(MODEL_FOLDER "checkpoint.ckpt"); + Ort::TrainingSession training_session = Ort::TrainingSession(env, + Ort::SessionOptions(), + checkpoint_state, + train_model_data, + eval_model_data); + + // randomly selected output name + std::vector graph_output_names({"onnx::loss::21273"}); + training_session.ExportModelForInferencing(MODEL_FOLDER "inference_model.onnx", graph_output_names); + + // Check that the model is a valid inference model by loading into an InferenceSession + std::unique_ptr environment; + 
ASSERT_STATUS_OK(Environment::Create(nullptr, environment)); + InferenceSession inference_session = InferenceSession(SessionOptions(), *environment, MODEL_FOLDER "inference_model.onnx"); +} + TEST(TrainingCApiTest, LoadORTFormatModelsFromBuffer) { auto train_model_path = ORT_FORMAT_MODEL_FOLDER "training_model.ort"; auto eval_model_path = ORT_FORMAT_MODEL_FOLDER "eval_model.ort"; diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index dc724fbae48eb..939e1de334e52 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -412,11 +412,12 @@ Module::Module(const ModelIdentifiers& model_identifiers, eval_user_input_count_ = eval_user_input_names.size(); eval_input_names_.insert(eval_input_names_.end(), eval_param_input_names.begin(), eval_param_input_names.end()); - // Keep a copy of the eval model path to be able to later export the model for inferencing. + // Keep a copy of the eval model path or buffer to be able to later export the model for inferencing. // The inference model will be reconstructed from the eval model. - // TODO(askhade): Find a fix to export model for inference when the eval model is loaded from a buffer. if (std::holds_alternative>(model_identifiers.eval_model)) { eval_model_path_ = std::get>(model_identifiers.eval_model); + } else if (std::holds_alternative>(model_identifiers.eval_model)) { + eval_model_buffer_ = std::get>(model_identifiers.eval_model); } } @@ -658,11 +659,17 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path gsl::span graph_output_names) const { ORT_RETURN_IF(state_->module_checkpoint_state.is_nominal_state, "Cannot export the model with a nominal state. Please load the model parameters first."); - ORT_RETURN_IF(!eval_sess_ || !eval_model_path_.has_value(), + ORT_RETURN_IF(!eval_sess_ || (!eval_model_path_.has_value() && !eval_model_buffer_.has_value()), "Eval model was not provided. Cannot export a model for inferencing."); ONNX_NAMESPACE::ModelProto eval_model; - ORT_THROW_IF_ERROR(Model::Load(ToPathString(eval_model_path_.value()), eval_model)); + if (eval_model_path_.has_value()) { + ORT_THROW_IF_ERROR(Model::Load(ToPathString(eval_model_path_.value()), eval_model)); + } else if (eval_model_buffer_.has_value()) { + int eval_model_buffer_size = static_cast(eval_model_buffer_.value().size()); + const void* eval_model_buffer_ptr = static_cast(eval_model_buffer_.value().data()); + ORT_THROW_IF_ERROR(Model::LoadFromBytes(eval_model_buffer_size, eval_model_buffer_ptr, eval_model)); + } // Clone the eval mode into an inference onnxruntime::Model. 
std::shared_ptr<Model> inference_model; diff --git a/orttraining/orttraining/training_api/module.h b/orttraining/orttraining/training_api/module.h index 917887404217f..f4d894f33516a 100644 --- a/orttraining/orttraining/training_api/module.h +++ b/orttraining/orttraining/training_api/module.h @@ -198,6 +198,7 @@ struct Module { bool accumulate_gradient_ = false; std::optional<std::string> eval_model_path_; + std::optional<gsl::span<const uint8_t>> eval_model_buffer_; size_t eval_user_input_count_{0U}; }; diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index 3e8366b11f4aa..436d914c426ad 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -8,6 +8,7 @@ jobs: pool: vmImage: windows-2019 timeoutInMinutes: 60 + continueOnError: true workspace: clean: all steps:
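Tying together the matmul_4bits_quantizer.py change and the new test_op_matmul_4bits assertion above: in QOperator format the quantizer now registers the com.microsoft domain, while in QDQ format it bumps the ai.onnx opset to at least 21, since the int4/uint4 tensor types only exist from opset 21 onward. A sketch of the check the test performs (placeholder path):

```python
import onnx

model = onnx.load("model_int4.onnx")  # placeholder for the quantized output
ai_onnx_version = next(
    (op.version for op in model.opset_import if op.domain in ("", "ai.onnx")), None
)
# QDQ-format 4-bit models need opset 21+ for the int4/uint4 element types.
assert ai_onnx_version is not None and ai_onnx_version >= 21
```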