From 3f76000a82c1d2d44684f75a2f558985c90291ec Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 10 Oct 2022 16:28:35 -0700 Subject: [PATCH 01/81] enable JSEP (draft) --- cmake/CMakeLists.txt | 7 + cmake/onnxruntime_providers.cmake | 19 +++ cmake/onnxruntime_unittests.cmake | 15 ++ cmake/onnxruntime_webassembly.cmake | 6 + .../onnxruntime/core/framework/ortdevice.h | 1 + include/onnxruntime/core/graph/constants.h | 1 + js/web/lib/index.ts | 1 + js/web/lib/wasm/binding/ort-wasm.d.ts | 15 ++ js/web/lib/wasm/jsep.ts | 38 +++++ js/web/lib/wasm/session-options.ts | 3 + js/web/lib/wasm/wasm-core-impl.ts | 4 + js/web/script/test-runner-cli-args.ts | 5 +- js/web/test/test-runner.ts | 3 + .../core/providers/get_execution_providers.cc | 8 ++ onnxruntime/core/providers/js/allocator.cc | 36 +++++ onnxruntime/core/providers/js/allocator.h | 39 ++++++ .../core/providers/js/data_transfer.cc | 39 ++++++ onnxruntime/core/providers/js/data_transfer.h | 23 +++ .../providers/js/js_execution_provider.cc | 132 ++++++++++++++++++ .../core/providers/js/js_execution_provider.h | 52 +++++++ onnxruntime/core/providers/js/js_kernel.h | 20 +++ .../core/providers/js/js_provider_factory.cc | 30 ++++ .../js/js_provider_factory_creator.h | 17 +++ .../core/providers/js/operators/unary.cc | 45 ++++++ onnxruntime/core/providers/js/symbols.txt | 0 .../providers/provider_factory_creators.h | 4 + .../core/session/provider_registration.cc | 6 + onnxruntime/wasm/js_internal_api.js | 14 ++ tools/ci_build/build.py | 2 + 29 files changed, 583 insertions(+), 2 deletions(-) create mode 100644 js/web/lib/wasm/jsep.ts create mode 100644 onnxruntime/core/providers/js/allocator.cc create mode 100644 onnxruntime/core/providers/js/allocator.h create mode 100644 onnxruntime/core/providers/js/data_transfer.cc create mode 100644 onnxruntime/core/providers/js/data_transfer.h create mode 100644 onnxruntime/core/providers/js/js_execution_provider.cc create mode 100644 onnxruntime/core/providers/js/js_execution_provider.h create mode 100644 onnxruntime/core/providers/js/js_kernel.h create mode 100644 onnxruntime/core/providers/js/js_provider_factory.cc create mode 100644 onnxruntime/core/providers/js/js_provider_factory_creator.h create mode 100644 onnxruntime/core/providers/js/operators/unary.cc create mode 100644 onnxruntime/core/providers/js/symbols.txt create mode 100644 onnxruntime/wasm/js_internal_api.js diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1ff5760422177..d8ae4996701b2 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -57,6 +57,8 @@ option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android N option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) +option(onnxruntime_USE_JS "Build with JavaScript implemented kernels support" OFF) +option(onnxruntime_DEV_MODE "Enable developer warnings and treat most of them as error." 
OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF) @@ -635,6 +637,11 @@ if (onnxruntime_USE_NNAPI_BUILTIN) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_NNAPI_BUILTIN=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES nnapi) endif() +if (onnxruntime_USE_JS) + list(APPEND ORT_PROVIDER_FLAGS -DUSE_JS=1) + list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_JS=1) + list(APPEND ONNXRUNTIME_PROVIDER_NAMES js) +endif() if (onnxruntime_USE_SNPE) list(APPEND ORT_PROVIDER_FLAGS -DUSE_SNPE=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES snpe) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 6697493fbb3c9..2d101603ffbc5 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -114,6 +114,9 @@ endif() if(onnxruntime_USE_NNAPI_BUILTIN) set(PROVIDERS_NNAPI onnxruntime_providers_nnapi) endif() +if(onnxruntime_USE_JS) + set(PROVIDERS_JS onnxruntime_providers_js) +endif() if(onnxruntime_USE_RKNPU) set(PROVIDERS_RKNPU onnxruntime_providers_rknpu) endif() @@ -1050,6 +1053,22 @@ if (onnxruntime_USE_NNAPI_BUILTIN) endif() endif() +if (onnxruntime_USE_JS) + add_compile_definitions(USE_JS=1) + + file(GLOB_RECURSE onnxruntime_providers_js_cc_srcs + "${ONNXRUNTIME_ROOT}/core/providers/js/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/js/*.cc" + ) + + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_js_cc_srcs}) + onnxruntime_add_static_library(onnxruntime_providers_js ${onnxruntime_providers_js_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_js onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers) + + add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES}) + +endif() + if (onnxruntime_USE_RKNPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable -Wno-unused-parameter") add_definitions(-DUSE_RKNPU=1) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 6a9f6104dbf80..39a05a9c97afb 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -471,6 +471,10 @@ if(onnxruntime_USE_NNAPI_BUILTIN) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nnapi) endif() +if(onnxruntime_USE_JS) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_js) +endif() + if(onnxruntime_USE_RKNPU) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_rknpu) endif() @@ -518,6 +522,7 @@ set(ONNXRUNTIME_TEST_LIBS ${onnxruntime_libs} # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime ${PROVIDERS_NNAPI} + ${PROVIDERS_JS} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_DML} @@ -569,6 +574,13 @@ if(onnxruntime_USE_NNAPI_BUILTIN) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_nnapi) endif() +if(onnxruntime_USE_JS) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/js/*) + list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_js) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_js) + list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_js) +endif() + if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/snpe/*) list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_snpe) @@ -802,6 +814,9 @@ if 
(onnxruntime_BUILD_WEBASSEMBLY) if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " -s USE_PTHREADS=1 -s PROXY_TO_PTHREAD=1") endif() + if (onnxruntime_USE_JS) + set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\"") + endif() endif() if (onnxruntime_ENABLE_ATEN) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 25761aa8414a9..cf410a3e4f18f 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -108,6 +108,7 @@ if (onnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB) onnxruntime_mlas onnxruntime_optimizer onnxruntime_providers + ${PROVIDERS_JS} ${PROVIDERS_XNNPACK} onnxruntime_session onnxruntime_util @@ -183,6 +184,7 @@ else() onnxruntime_mlas onnxruntime_optimizer onnxruntime_providers + ${PROVIDERS_JS} ${PROVIDERS_XNNPACK} onnxruntime_session onnxruntime_util @@ -213,6 +215,10 @@ else() ${WASM_API_EXCEPTION_CATCHING} \ --no-entry") + if (onnxruntime_USE_JS) + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\"") + endif() + if (onnxruntime_EMSCRIPTEN_SETTINGS) foreach(setting IN LISTS onnxruntime_EMSCRIPTEN_SETTINGS) set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h index 77f7c3e1743f0..efc5923031262 100644 --- a/include/onnxruntime/core/framework/ortdevice.h +++ b/include/onnxruntime/core/framework/ortdevice.h @@ -23,6 +23,7 @@ struct OrtDevice { static const MemoryType CUDA_PINNED = 1; static const MemoryType HIP_PINNED = 2; static const MemoryType CANN_PINNED = 3; + static const MemoryType HANDLE = 4; }; constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_) diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h index fe54ef6a98e85..7f4cc0ebab974 100644 --- a/include/onnxruntime/core/graph/constants.h +++ b/include/onnxruntime/core/graph/constants.h @@ -43,6 +43,7 @@ constexpr const char* kAclExecutionProvider = "ACLExecutionProvider"; constexpr const char* kArmNNExecutionProvider = "ArmNNExecutionProvider"; constexpr const char* kRocmExecutionProvider = "ROCMExecutionProvider"; constexpr const char* kCoreMLExecutionProvider = "CoreMLExecutionProvider"; +constexpr const char* kJsExecutionProvider = "JsExecutionProvider"; constexpr const char* kSnpeExecutionProvider = "SNPEExecutionProvider"; constexpr const char* kTvmExecutionProvider = "TvmExecutionProvider"; constexpr const char* kXnnpackExecutionProvider = "XnnpackExecutionProvider"; diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 0e4a3f6d575f8..098b6603e5700 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -15,6 +15,7 @@ if (!BUILD_DEFS.DISABLE_WEBGL) { } if (!BUILD_DEFS.DISABLE_WASM) { const wasmBackend = require('./backend-wasm').wasmBackend; + registerBackend('js', wasmBackend, 11); registerBackend('cpu', wasmBackend, 10); registerBackend('wasm', wasmBackend, 10); registerBackend('xnnpack', wasmBackend, 9); diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index fd82a83bd716b..e2c12840e3e2f 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -1,6 +1,15 @@ // Copyright (c) Microsoft Corporation. 
All rights reserved. // Licensed under the MIT License. +declare namespace JSEP { + type BackendType = unknown; + type AllocFunction = (size: number) => number; + type FreeFunction = (size: number) => number; + type UploadFunction = (size: number) => number; + type DownloadFunction = (size: number) => number; + type RunFunction = (size: number) => number; +} + export interface OrtWasmModule extends EmscriptenModule { // #region emscripten functions stackSave(): number; @@ -51,6 +60,12 @@ export interface OrtWasmModule extends EmscriptenModule { // #region config mainScriptUrlOrBlob?: string|Blob; // #endregion + + // #region JSEP + jsepInit? + (backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, + download: JSEP.DownloadFunction, run: JSEP.RunFunction): void; + // #endregion } declare const moduleFactory: EmscriptenModuleFactory; diff --git a/js/web/lib/wasm/jsep.ts b/js/web/lib/wasm/jsep.ts new file mode 100644 index 0000000000000..6766840303077 --- /dev/null +++ b/js/web/lib/wasm/jsep.ts @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {OrtWasmModule} from './binding/ort-wasm'; + +export const init = (module: OrtWasmModule): void => { + // init JSEP if available + const init = module.jsepInit; + if (init) { + init( + {}, + (size: number) => { + // eslint-disable-next-line no-console + console.log(`jsepAlloc: ${size}`); + return 1234; + }, + (ptr: number) => { + // eslint-disable-next-line no-console + console.log(`jsepFree: ${ptr}`); + return 5678; + }, + (_a: number) => { + // eslint-disable-next-line no-console + console.log('jsepUpload'); + return 40; + }, + (_a: number) => { + // eslint-disable-next-line no-console + console.log('jsepDownload'); + return 41; + }, + (_a: number) => { + // eslint-disable-next-line no-console + console.log('jsepRun'); + return 42; + }); + } +}; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 6d4d8eeb345eb..f9612f26c9fb8 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -58,6 +58,9 @@ const setExecutionProviders = case 'xnnpack': epName = 'XNNPACK'; break; + case 'js': + epName = 'JS'; + break; case 'wasm': case 'cpu': continue; diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 4406f09714271..ab72fcc73f448 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -3,6 +3,7 @@ import {InferenceSession, Tensor} from 'onnxruntime-common'; +import {init} from './jsep'; import {SerializableModeldata, SerializableSessionMetadata, SerializableTensor} from './proxy-messages'; import {setRunOptions} from './run-options'; import {setSessionOptions} from './session-options'; @@ -19,6 +20,9 @@ export const initOrt = (numThreads: number, loggingLevel: number): void => { if (errorCode !== 0) { throw new Error(`Can't initialize onnxruntime. error code = ${errorCode}`); } + + // init JSEP if available + init(getInstance()); }; /** diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index c008933bc4ed5..289f9ad8eeddc 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -36,6 +36,7 @@ Options: webgl wasm xnnpack + js -e=<...>, --env=<...> Specify the environment to run the test. 
Should be one of the following: chrome (default) edge (Windows only) @@ -98,7 +99,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'wasm'|'onnxruntime'|'xnnpack'; + type Backend = 'cpu'|'webgl'|'wasm'|'onnxruntime'|'xnnpack'|'js'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'prod'|'dev'|'perf'; } @@ -334,7 +335,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'wasm', 'xnnpack']; + const browserBackends = ['webgl', 'wasm', 'xnnpack', 'js']; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 4bdea197bb1d5..83a1301c7a24b 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -259,6 +259,9 @@ export class TensorResultValidator { if (backend === 'cpu') { this.absoluteThreshold = CPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = CPU_THRESHOLD_RELATIVE_ERROR; + } else if (backend === 'js') { + this.absoluteThreshold = WEBGL_THRESHOLD_ABSOLUTE_ERROR; + this.relativeThreshold = WEBGL_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'webgl') { if (TensorResultValidator.isHalfFloat === undefined) { TensorResultValidator.isHalfFloat = !createWebGLContext(ort.env.webgl.contextId).isRenderFloat32Supported; diff --git a/onnxruntime/core/providers/get_execution_providers.cc b/onnxruntime/core/providers/get_execution_providers.cc index be29c32f68db7..566f18f17d1e6 100644 --- a/onnxruntime/core/providers/get_execution_providers.cc +++ b/onnxruntime/core/providers/get_execution_providers.cc @@ -90,6 +90,14 @@ constexpr ProviderInfo kProvidersInPriorityOrder[] = true, #else false, +#endif + }, + { + kJsExecutionProvider, +#ifdef USE_JS + true, +#else + false, #endif }, { diff --git a/onnxruntime/core/providers/js/allocator.cc b/onnxruntime/core/providers/js/allocator.cc new file mode 100644 index 0000000000000..aed52855c421f --- /dev/null +++ b/onnxruntime/core/providers/js/allocator.cc @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
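The allocator that follows is the native half of the jsepAlloc/jsepFree pair wired up in onnxruntime/wasm/js_internal_api.js. Because WebGPU buffers are opaque JavaScript objects, the value Alloc returns is not a wasm address but a handle minted on the JS side; in this draft the JS callbacks only log and return constants. A minimal sketch of what the JS half could look like (bufferTable and nextHandle are hypothetical names, not part of this patch):

const bufferTable = new Map<number, {size: number}>();
let nextHandle = 1;

const jsepAlloc = (size: number): number => {
  const handle = nextHandle++;       // an opaque handle, not a wasm address
  bufferTable.set(handle, {size});   // a real backend would also create a GPUBuffer here
  return handle;
};

const jsepFree = (handle: number): number => {
  const size = bufferTable.get(handle)?.size ?? 0;
  bufferTable.delete(handle);        // ...and destroy the corresponding GPUBuffer
  return size;                       // Free() below reads this back to update its stats
};

Note that Free() on the C++ side depends on the callback returning the freed size, since the handle alone carries no length information.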
+ +#include + +#include "core/framework/session_state.h" +#include "core/providers/js/allocator.h" + +namespace onnxruntime { +namespace js { + +void* JsCustomAllocator::Alloc(size_t size) { + printf("JsCustomAllocator::Alloc(%zu)\n", size); + void* p = EM_ASM_PTR({return Module.jsepAlloc($0);}, size); + stats_.num_allocs++; + stats_.bytes_in_use += size; + stats_.max_bytes_in_use =std::max(stats_.max_bytes_in_use, stats_.bytes_in_use); + stats_.max_alloc_size = std::max(stats_.max_alloc_size, static_cast(size)); + stats_.num_arena_extensions++; + stats_.num_arena_shrinkages = std::max(stats_.num_arena_shrinkages, stats_.num_arena_extensions); + stats_.total_allocated_bytes += size; + return p; +} + +void JsCustomAllocator::Free(void* p) { + size_t size = (size_t)(void*)EM_ASM_PTR({return Module.jsepFree($0);}, p); + stats_.num_arena_extensions--; + stats_.bytes_in_use -= size; +} + +void JsCustomAllocator::GetStats(AllocatorStats* stats) { + *stats = stats_; +} + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h new file mode 100644 index 0000000000000..6a6663c4c3e31 --- /dev/null +++ b/onnxruntime/core/providers/js/allocator.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocator.h" +#include "core/framework/ortdevice.h" + +namespace onnxruntime { +namespace js { + +class JsCPUAllocator : public CPUAllocator { + public: + JsCPUAllocator() + : CPUAllocator( + OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), + 0, OrtMemTypeCPUOutput)){}; +}; + +class JsCustomAllocator : public IAllocator { + public: + JsCustomAllocator() + : IAllocator( + OrtMemoryInfo("JsCustomAllocator", OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HANDLE, 0), + 0, OrtMemTypeDefault)) { + } + + virtual void* Alloc(size_t size) override; + virtual void Free(void* p) override; + void GetStats(AllocatorStats* stats) override; + + private: + AllocatorStats stats_; +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc new file mode 100644 index 0000000000000..9400145f554c0 --- /dev/null +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
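CopyTensor below routes CPU-to-GPU copies through Module.jsepUpload and GPU-to-CPU copies through Module.jsepDownload; in this draft both are logging placeholders that ignore the tensor bytes. For orientation, a real GPU-to-CPU readback in WebGPU goes through a mappable staging buffer, much like GpuDataManager.download later in this series. A sketch with hypothetical names:

const downloadFromGpu = async (device: GPUDevice, src: GPUBuffer, byteLength: number): Promise<Uint8Array> => {
  // eslint-disable-next-line no-bitwise
  const staging = device.createBuffer({size: byteLength, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});
  const encoder = device.createCommandEncoder();
  encoder.copyBufferToBuffer(src, 0, staging, 0, byteLength);
  device.queue.submit([encoder.finish()]);
  await staging.mapAsync(GPUMapMode.READ);  // resolves once the copy is CPU-visible
  const bytes = new Uint8Array(staging.getMappedRange().slice(0));
  staging.unmap();
  return bytes;
};

The upload direction is simpler: device.queue.writeBuffer(dst, 0, srcBytes) needs no staging buffer.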
+ +#include + +#include "core/providers/js/data_transfer.h" + +namespace onnxruntime { +namespace js { + +bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { + return (dst_device.Type() == OrtDevice::GPU && src_device.Type() == OrtDevice::CPU) || + (dst_device.Type() == OrtDevice::CPU && src_device.Type() == OrtDevice::GPU); +} + +common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*unused_arg*/) const { + size_t bytes = src.SizeInBytes(); + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + + auto& src_device = src.Location().device; + auto& dst_device = dst.Location().device; + + if (dst_device.Type() == OrtDevice::GPU) { + // copy from CPU to GPU + EM_ASM({ Module.jsepUpload(); }); + } else if (src_device.Type() == OrtDevice::GPU) { + // copy from GPU to CPU + EM_ASM({ Module.jsepDownload(); }); + } else { + // copy from CPU to CPU (don't think we ever get here) + memcpy(dst_data, src_data, bytes); + } + + return Status::OK(); +} + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h new file mode 100644 index 0000000000000..d1e703ec1dc0c --- /dev/null +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/data_transfer.h" +#include "core/framework/execution_provider.h" + +namespace onnxruntime { +namespace js { + +class DataTransfer : public IDataTransfer { + public: + DataTransfer() {}; + ~DataTransfer() {}; + + bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + + common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc new file mode 100644 index 0000000000000..b57697de818ce --- /dev/null +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
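Kernel registration below follows the standard EP pattern: a static table of BuildKernelCreateInfo<T> functions, with a leading BuildKernelCreateInfo<void> entry so the table never becomes empty in operator-reduced builds, and a null-kernel_def check that filters disabled entries at registration time. The KERNEL_CREATE_INFO* macros only abbreviate the class-name expansion; KERNEL_CREATE_INFO(11, Conv), for instance, expands to BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv)>. In this draft only Abs (opsets 1-14, ONNX domain) is live; the commented-out block records the NHWC-domain coverage being planned.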
+ +#include +#include +#include +#include + +#include "js_execution_provider.h" + +#include "core/graph/function_utils.h" +#include "core/framework/compute_capability.h" +#include "core/framework/kernel_registry.h" +#include "core/providers/shared/node_unit/node_unit.h" + +#include "allocator.h" +#include "data_transfer.h" + +namespace onnxruntime { + +namespace js { +template <> +KernelCreateInfo BuildKernelCreateInfo() { + KernelCreateInfo info; + return info; +} + +#define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, End, Op)> + +#define KERNEL_CREATE_INFO(Start, Op) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, Op)> + +#define KERNEL_CREATE_INFO_TYPED(Start, type, Op) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, type, Op)> + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 14, Abs); + +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); +// class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); +// class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Softmax); +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax); + +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, uint8_t, QLinearConv); +// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, int8_t, QLinearConv); +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, QLinearAveragePool); +// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, +// kDynamicDomainByCreate, 1, QLinearSoftmax); + +std::unique_ptr RegisterKernels() { + auto kernel_registry = std::make_unique(); + + static const BuildKernelCreateInfoFn function_table[] = { + BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing + BuildKernelCreateInfo, + // KERNEL_CREATE_INFO(11, Conv), + // KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), + // KERNEL_CREATE_INFO(12, MaxPool), + // KERNEL_CREATE_INFO(11, AveragePool), + // // layout insensitive, use ONNX-domain directly + // BuildKernelCreateInfo< + // ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax)>, + // BuildKernelCreateInfo< + // ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Softmax)>, + + // // quantization op + // KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv), + // KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv), + // KERNEL_CREATE_INFO(1, QLinearAveragePool), + // BuildKernelCreateInfo< + // ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax)>, + }; + + for (auto& function_table_entry : function_table) { + KernelCreateInfo info = function_table_entry(); + if (info.kernel_def != nullptr) { // filter disabled entries where type is void + ORT_THROW_IF_ERROR(kernel_registry->Register(std::move(info))); + } + } + + return kernel_registry; +} + +} // namespace js + 
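The provider implementation below registers two allocators side by side: a JsCPUAllocator for CPU-resident output memory, and the handle-based JsCustomAllocator placed on a GPU OrtDevice with the new MemType::HANDLE added to ortdevice.h earlier in this patch. The custom allocator is created with use_arena set to false; as the inline comment notes, arena reuse of opaque handles would first need the referenced change in execution_frame.cc.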
+using namespace js; + +JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) + : IExecutionProvider{kJsExecutionProvider, true} { +} + +// implement RegisterAllocator to test/validate sharing the CPU EP's allocator +void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { + AllocatorCreationInfo default_memory_info([&](int) { return std::make_unique(); }); + + AllocatorPtr default_allocator = CreateAllocator(default_memory_info); + InsertAllocator(default_allocator); + + // use_arena might have some issue, for this to work need to change + // https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/framework/execution_frame.cc#L507 + AllocatorCreationInfo memory_info( + [&](int) { return std::make_unique(); }, 0, false); + + AllocatorPtr allocator = CreateAllocator(memory_info); + InsertAllocator(allocator); +} + +std::vector> JsExecutionProvider::GetCapability( + const onnxruntime::GraphViewer& graph, + const IKernelLookup& kernel_lookup) const { + + return IExecutionProvider::GetCapability(graph, kernel_lookup); +} + +std::shared_ptr JsExecutionProvider::GetKernelRegistry() const { + static std::shared_ptr registry = js::RegisterKernels(); + return registry; +} + +std::unique_ptr JsExecutionProvider::GetDataTransfer() const { + return std::make_unique(); +} + +JsExecutionProvider::~JsExecutionProvider() { +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h new file mode 100644 index 0000000000000..ac5f20f185288 --- /dev/null +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2019, NXP Semiconductor, Inc. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocatormgr.h" +#include "core/framework/execution_provider.h" +#include "core/graph/constants.h" +#include "core/providers/providers.h" + +struct pthreadpool; +namespace onnxruntime { + +namespace js { + +// forward declaration for this EP's namespace. +template +KernelCreateInfo BuildKernelCreateInfo(); + +} + +// placeholder for future use. no options currently +struct JsExecutionProviderInfo { + JsExecutionProviderInfo() = default; + + JsExecutionProviderInfo(const ProviderOptions& po) { + } +}; + +class JsExecutionProvider : public IExecutionProvider { + public: + JsExecutionProvider(const JsExecutionProviderInfo& info); + ~JsExecutionProvider() override; + + std::vector> GetCapability( + const onnxruntime::GraphViewer& graph_viewer, + const IKernelLookup& /*kernel_lookup*/) const override; + + std::shared_ptr GetKernelRegistry() const override; + std::unique_ptr GetDataTransfer() const override; + + void RegisterAllocator(AllocatorManager& /*allocator_manager*/) override; + + DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; } + + FusionStyle GetFusionStyle() const override { return FusionStyle::FilteredGraphViewer; } + + bool ConcurrentRunSupported() const override { return false; } +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h new file mode 100644 index 0000000000000..72aab01221899 --- /dev/null +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
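JsKernel below is deliberately a pass-through for now: a common base class that later JSEP commits can extend with the attribute-serialization and dispatch-to-JS plumbing, while the draft's only concrete kernel (AbsImpl in operators/unary.cc) still derives from OpKernel directly.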
+
+#pragma once
+#include "core/framework/op_kernel.h"
+#include "core/providers/js/js_execution_provider.h"
+
+struct pthreadpool;
+
+namespace onnxruntime {
+namespace js {
+
+class JsKernel : public OpKernel {
+ public:
+  explicit JsKernel(const OpKernelInfo& info)
+      : OpKernel(info) {
+  }
+};
+}  // namespace js
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/js_provider_factory.cc b/onnxruntime/core/providers/js/js_provider_factory.cc
new file mode 100644
index 0000000000000..5b7329a87cf6a
--- /dev/null
+++ b/onnxruntime/core/providers/js/js_provider_factory.cc
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/error_code_helper.h"
+#include "core/providers/js/js_execution_provider.h"
+#include "core/providers/js/js_provider_factory_creator.h"
+#include "core/session/abi_session_options_impl.h"
+#include "core/session/ort_apis.h"
+
+namespace onnxruntime {
+
+struct JsProviderFactory : IExecutionProviderFactory {
+  JsProviderFactory(const ProviderOptions& provider_options)
+      : info_{provider_options} {
+  }
+
+  std::unique_ptr<IExecutionProvider> CreateProvider() override {
+    return std::make_unique<JsExecutionProvider>(info_);
+  }
+
+ private:
+  JsExecutionProviderInfo info_;
+};
+
+std::shared_ptr<IExecutionProviderFactory> JsProviderFactoryCreator::Create(
+    const ProviderOptions& provider_options) {
+  return std::make_shared<JsProviderFactory>(provider_options);
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/js_provider_factory_creator.h b/onnxruntime/core/providers/js/js_provider_factory_creator.h
new file mode 100644
index 0000000000000..dbabe255c2d7b
--- /dev/null
+++ b/onnxruntime/core/providers/js/js_provider_factory_creator.h
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <memory>
+
+#include "core/framework/provider_options.h"
+#include "core/providers/providers.h"
+
+namespace onnxruntime {
+
+struct JsProviderFactoryCreator {
+  static std::shared_ptr<IExecutionProviderFactory> Create(const ProviderOptions& provider_options);
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc
new file mode 100644
index 0000000000000..361adab12e985
--- /dev/null
+++ b/onnxruntime/core/providers/js/operators/unary.cc
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/js/js_execution_provider.h"
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace js {
+
+class AbsImpl : public OpKernel {
+ public:
+  AbsImpl(const OpKernelInfo& info) : OpKernel(info) {}
+
+  Status Compute(OpKernelContext* context) const override {
+    AllocatorPtr alloc;
+    ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc));
+    size_t temp_data_size = sizeof(size_t) * (1 + context->InputCount() * (3 + context->Input<Tensor>(0)->Shape().NumDimensions()));
+    printf("temp data size: %zu\n", temp_data_size);
+    void* p_inputs = alloc->Alloc(temp_data_size);
+
+    // layout of the temp data buffer, per input:
+    // type | data_ptr | dim_size | dim[0] ... dim[N-1]
+
+    Tensor* Y = context->Output(0, TensorShape(context->Input<Tensor>(0)->Shape()));
+    printf("Y.data=%zu\n", (size_t)(Y->DataRaw()));
+
+    alloc->Free(p_inputs);
+
+    return Status::OK();
+  }
+};
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Abs,
+    kOnnxDomain,
+    1,
+    14,
+    kJsExecutionProvider,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    AbsImpl);
+
+}  // namespace js
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/symbols.txt b/onnxruntime/core/providers/js/symbols.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/onnxruntime/core/providers/provider_factory_creators.h b/onnxruntime/core/providers/provider_factory_creators.h
index 521152612d3e2..9b67f9e9c3a17 100644
--- a/onnxruntime/core/providers/provider_factory_creators.h
+++ b/onnxruntime/core/providers/provider_factory_creators.h
@@ -46,6 +46,10 @@
 #include "core/providers/nnapi/nnapi_provider_factory_creator.h"
 #endif
 
+#if defined(USE_JS)
+#include "core/providers/js/js_provider_factory_creator.h"
+#endif
+
 #if defined(USE_OPENVINO)
 #include "core/providers/openvino/openvino_provider_factory_creator.h"
 #endif
diff --git a/onnxruntime/core/session/provider_registration.cc b/onnxruntime/core/session/provider_registration.cc
index cc9a319f90c2e..43f2d81602e5f 100644
--- a/onnxruntime/core/session/provider_registration.cc
+++ b/onnxruntime/core/session/provider_registration.cc
@@ -83,6 +83,12 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider,
     options->provider_factories.push_back(AzureProviderFactoryCreator::Create(provider_options));
 #else
     status = create_not_supported_status();
+#endif
+  } else if (strcmp(provider_name, "JS") == 0) {
+#if defined(USE_JS)
+    options->provider_factories.push_back(JsProviderFactoryCreator::Create(provider_options));
+#else
+    status = create_not_supported_status();
 #endif
   } else {
     ORT_UNUSED_PARAMETER(options);
diff --git a/onnxruntime/wasm/js_internal_api.js b/onnxruntime/wasm/js_internal_api.js
new file mode 100644
index 0000000000000..9fc4b297a2fac
--- /dev/null
+++ b/onnxruntime/wasm/js_internal_api.js
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
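This --pre-js file is the entire JS side of the bridge: jsepInit stores the six callbacks on Module, which is exactly where the EM_ASM/EM_ASM_PTR blocks in the C++ provider look them up. For example, EM_ASM_PTR({return Module.jsepAlloc($0);}, size) in allocator.cc ends up invoking the alloc callback that js/web/lib/wasm/jsep.ts passed to jsepInit.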
+
+'use strict';
+
+// init JSEP
+Module["jsepInit"] = function (backend, alloc, free, upload, download, run) {
+  Module.jsepBackend = backend;
+  Module.jsepAlloc = alloc;
+  Module.jsepFree = free;
+  Module.jsepUpload = upload;
+  Module.jsepDownload = download;
+  Module.jsepRun = run;
+};
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index f421800523667..19b78bd4a7f7a 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -484,6 +484,7 @@ def convert_arg_line_to_args(self, arg_line):
     parser.add_argument(
         "--nnapi_min_api", type=int, help="Minimum Android API level to enable NNAPI, should be no less than 27"
     )
+    parser.add_argument("--use_js", action="store_true", help="Build with JavaScript kernels.")
     parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.")
     parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.")
     parser.add_argument("--eigen_path", help="Path to pre-installed Eigen.")
@@ -933,6 +934,7 @@ def generate_build_tree(
         "-Donnxruntime_USE_ARMNN=" + ("ON" if args.use_armnn else "OFF"),
         "-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"),
         "-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"),
+        "-Donnxruntime_USE_JS=" + ("ON" if args.use_js else "OFF"),
         # Training related flags
         "-Donnxruntime_ENABLE_NVTX_PROFILE=" + ("ON" if args.enable_nvtx_profile else "OFF"),
         "-Donnxruntime_ENABLE_TRAINING=" + ("ON" if args.enable_training else "OFF"),

From 7a5405d62d43d9d75158182f2fb7e1c47e28a2e3 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Mon, 10 Oct 2022 16:32:14 -0700
Subject: [PATCH 02/81] Squashed commit of the following:

commit 340c88b252cd22fa0bb817cebaa7dfc23656a090
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Sep 8 13:40:31 2022 -0700

    batch mode

commit b16084035fae358c8acc8203a3e374c853876ab7
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Jul 26 17:00:39 2022 -0700

    sum

commit 306a19b5477d48e710cda98785e09757b00ca298
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Mon Jul 25 19:04:48 2022 -0700

    squeeze + transpose

commit 86d8d3a7144b219407a747e013b2c5b2ca84c995
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Mon Jul 18 16:31:59 2022 -0700

    fix webgpu test launch

commit e104d175b40864bfc04fee8333179694bf9d04e6
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Jul 12 16:52:54 2022 -0700

    shape

commit a2197f020858ce0a9cc59147438179a2566c8a56
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Jul 12 13:49:15 2022 -0700

    pool

commit 59b10fb42df82f83c6ca76f99ec3c7d94553afb2
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Jul 7 17:32:56 2022 -0700

    upgrade to latest webgpu spec

commit 4ed1bfbb027cfbbe64d962b4f7329d1d0787fe40
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Jun 28 14:23:08 2022 -0700

    naive conv

commit 7c5e44673b02af3dc834d44cf72c11f302f429af
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Jun 8 15:37:12 2022 -0700

    check webgpu backend in execution loop

commit b0d7dfaed1e3034a915d3ce235e6df24951a5294
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Jun 8 15:31:19 2022 -0700

    dump shader source only in debug mode

commit 7fca0ea607b981ccf511172c3ccf8ba62a0fe7a3
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Jun 8 15:17:27 2022 -0700

    add verbose log for buffer upload/download

commit 179712be6fd6ac6ca93025d335b5148cf5240873
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Jun 8 15:06:03 2022 -0700

    fix program key

commit 67ea4cb3c5d074d79164c56747b6ef80a7caf3fd
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Jun 8 15:05:20 2022 -0700

    concat: fix 1 input

commit 21b5dfe1c4de3b0d70fab044fb1ac92c51ffe152
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Jun 7 16:13:12 2022 -0700

    matmul (no-broadcast)

commit a8def8e1ef8b62bab6f6efcb313277e9e36a5485
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Jun 2 17:56:15 2022 -0700

    ...

commit e8711389907caf9f21e5150bfa58660be12b395d
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri May 27 16:12:56 2022 -0700

    slice (scalar)

commit 75c7941211aac7172bdc0f7c4a07728caa76681d
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu May 26 16:54:53 2022 -0700

    slice (...)

commit 40b15e40a0ce93697969bd8a225a8618c96184b8
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu May 26 12:45:16 2022 -0700

    slice

commit 9d92513c403797368bed23fadd2f7f7b34dae11d
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed May 25 22:37:48 2022 -0700

    gemm (scalar)

commit c1185b49845cffc5894609eb6b3beff2f42de3fb
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue May 24 16:54:43 2022 -0700

    gemm...

commit 99653f5d3f044195b0bd783a554e7f95a96540f2
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue May 24 16:54:20 2022 -0700

    format code

commit 86c75bb486e503d80d170de4884bedda1ca3fecd
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue May 24 11:39:35 2022 -0700

    gemm

commit 79dd53945b8356426a00359c17511de73daa89b2
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Apr 8 04:46:03 2022 -0700

    concat

commit 25c9d2a2e2596d4b57067f02d717d79eb8a87638
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Apr 7 19:32:48 2022 -0700

    gather

commit 6627349fb6192b0bcb7c75bbd5af9c1c25c30a95
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Apr 7 18:46:53 2022 -0700

    binary ops

commit fb81d7f4f8796dbb82ad210d7fced5373a9f632d
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Apr 6 17:55:07 2022 -0700

    binary - add

commit 073695ff1b3fbf4b614ae4b056a11fe4b6fec30f
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Apr 6 17:54:24 2022 -0700

    optimize types

commit e9775fe631a3cc8b138b9db542c60c9c1310e595
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Apr 5 16:45:27 2022 -0700

    working

commit cba119cd8ad11d9066c848183b9155148d07476b
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Apr 5 15:10:26 2022 -0700

    upgrade @webgpu/types@0.1.13

commit ed17c576837ca13f6a91062fae3d90b1282ede82
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Tue Apr 5 03:37:29 2022 -0700

    neg

commit e8e4d8841edfe5a19a0cbad0b3a2a33f1fb4f091
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Mon Apr 4 16:28:52 2022 -0700

    other f32 unary operators

commit a1fbcfd51ad59b0cf7136e9cddf0f516557458dc
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Apr 1 17:24:10 2022 -0700

    leaky relu

commit dbe57febfa534afaaee1903ebc2338f0befaa8a1
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Apr 1 17:09:27 2022 -0700

    exp, floor

commit 3b883b940b19a4b9f3be4f72e63e56ad4b8915f7
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Apr 1 16:43:15 2022 -0700

    elu

commit aac2fc61fc6457b42603a6132e4f42a75902d009
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Mar 24 20:30:54 2022 -0700

    always create storage buffer with 16 bytes alignment

commit ad6bd01df68c32ab3a524d4b681613a4426af578
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Mar 24 20:30:07 2022 -0700

    fix unary funcs async signature

commit a782667d5bbfd2dfe300ce9b99bdc79173b2d81c
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Mar 23 19:57:38 2022 -0700

    fix upload

commit b6e7fbae4d20e323daff6e92572dd7755afdbc19
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Wed Mar 23 15:36:58 2022 -0700

    reshape

commit dfbf6f36b11e542b86f689400d82fd3dbbc6ee27
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Mar 24 16:11:31 2022 -0700

    clip and ceil

commit 55af08e97fcfa07671b0733af2fe808f0d15b7f9
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Mar 24 15:57:58 2022 -0700

    fix clip

commit 41274ba24fe3e3979326bdbdf1f4c81347ceac20
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Thu Mar 24 14:58:23 2022 -0700

    try more unary ops

commit fe850d16b445efbae09b981173930517b8d08bf7
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Mon Mar 14 16:15:58 2022 -0700

    first operator (correctness validated)

commit ba09337db3af2140daa4163beb17277024dc5598
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Jan 28 17:50:56 2022 -0800

    enable initialization of webgpu

commit 3fb2712619e4238f910814c98b947b283fbe2070
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Jan 28 17:50:24 2022 -0800

    install webgpu typescript type declaration

commit ed352620b8b946d7df56fa4a57066c2250fb2486
Author: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date:   Fri Jan 28 14:53:50 2022 -0800

    [POC] __blank ( npm test -- -b=webgpu )
---
 js/web/karma.conf.js                          |  25 +-
 js/web/lib/backend-onnxjs.ts                  |  11 +-
 js/web/lib/index.ts                           |   1 +
 js/web/lib/onnxjs/backend.ts                  |   2 +
 js/web/lib/onnxjs/backends/backend-webgpu.ts  |  95 +++++
 .../backends/webgpu/gpu-data-manager.ts       | 167 ++++++++
 .../backends/webgpu/inference-handler.ts      |  89 +++++
 .../backends/webgpu/op-resolve-rules.ts       |  90 +++++
 .../onnxjs/backends/webgpu/ops/binary-op.ts   | 217 ++++++++++
 .../lib/onnxjs/backends/webgpu/ops/common.ts  |  91 +++++
 .../lib/onnxjs/backends/webgpu/ops/concat.ts  | 176 ++++++++
 .../backends/webgpu/ops/conv-grouped.ts       | 127 ++++++
 js/web/lib/onnxjs/backends/webgpu/ops/conv.ts | 150 +++++++
 .../onnxjs/backends/webgpu/ops/fuse-utils.ts  |  39 ++
 .../lib/onnxjs/backends/webgpu/ops/gather.ts  | 131 ++++++
 js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts | 165 ++++++++
 .../lib/onnxjs/backends/webgpu/ops/matmul.ts  | 115 ++++++
 js/web/lib/onnxjs/backends/webgpu/ops/pool.ts | 376 ++++++++++++++++++
 .../backends/webgpu/ops/reduce-tensors.ts     |  85 ++++
 .../lib/onnxjs/backends/webgpu/ops/reshape.ts |  22 +
 .../lib/onnxjs/backends/webgpu/ops/shape.ts   |  16 +
 .../lib/onnxjs/backends/webgpu/ops/slice.ts   | 180 +++++++++
 .../lib/onnxjs/backends/webgpu/ops/squeeze.ts |  44 ++
 .../onnxjs/backends/webgpu/ops/transpose.ts   | 116 ++++++
 .../onnxjs/backends/webgpu/ops/unary-op.ts    | 197 +++++++++
 .../onnxjs/backends/webgpu/ops/unsqueeze.ts   |  43 ++
 .../onnxjs/backends/webgpu/program-manager.ts |  75 ++++
 .../onnxjs/backends/webgpu/session-handler.ts |  47 +++
 .../backends/webgpu/tensor-data-manager.ts    | 140 +++++++
 js/web/lib/onnxjs/backends/webgpu/types.ts    |  96 +++++
 js/web/lib/onnxjs/execution-plan.ts           |  22 +-
 js/web/lib/onnxjs/operators.ts                |   6 +-
 js/web/lib/onnxjs/opset.ts                    |  10 +-
 js/web/lib/onnxjs/tensor.ts                   |  14 +-
 js/web/package-lock.json                      |  13 +
 js/web/package.json                           |   1 +
 js/web/script/test-runner-cli-args.ts         |   5 +-
 js/web/script/test-runner-cli.ts              |  39 +-
 js/web/test/suite-test-list.jsonc             | 270 +++++++++++++
 js/web/test/test-runner.ts                    |  20 +-
 .../unittests/backends/webgl/test-conv-new.ts |   2 +-
 js/web/tsconfig.json                          |   1 +
 42 files changed, 3490 insertions(+), 41 deletions(-)
 create mode 100644 js/web/lib/onnxjs/backends/backend-webgpu.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/inference-handler.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/common.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/concat.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/conv.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/gather.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/pool.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/shape.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/slice.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/program-manager.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/session-handler.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts
 create mode 100644 js/web/lib/onnxjs/backends/webgpu/types.ts

diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js
index 49553088bd2d8..bb85fa90bafe1 100644
--- a/js/web/karma.conf.js
+++ b/js/web/karma.conf.js
@@ -6,6 +6,7 @@
 const bundleMode = require('minimist')(process.argv)['bundle-mode'] || 'dev';  // 'dev'|'perf'|undefined;
 const karmaPlugins = require('minimist')(process.argv)['karma-plugins'] || undefined;
 const timeoutMocha = require('minimist')(process.argv)['timeout-mocha'] || 60000;
+const forceLocalHost = !!require('minimist')(process.argv)['force-localhost'];
 const commonFile = bundleMode === 'dev' ? '../common/dist/ort-common.js' : '../common/dist/ort-common.min.js'
 const mainFile = bundleMode === 'dev' ?
'test/ort.dev.js' : 'test/ort.perf.js'; // https://stackoverflow.com/a/8440736 // function getMachineIpAddress() { - var os = require('os'); - var ifaces = os.networkInterfaces(); + if (!forceLocalHost) { + var os = require('os'); + var ifaces = os.networkInterfaces(); - for (const ifname in ifaces) { - for (const iface of ifaces[ifname]) { - if ('IPv4' !== iface.family || iface.internal !== false) { - // skip over internal (i.e. 127.0.0.1) and non-ipv4 addresses - continue; - } + for (const ifname in ifaces) { + for (const iface of ifaces[ifname]) { + if ('IPv4' !== iface.family || iface.internal !== false) { + // skip over internal (i.e. 127.0.0.1) and non-ipv4 addresses + continue; + } - // returns the first available IP address - return iface.address; + // returns the first available IP address + return iface.address; + } } } @@ -80,6 +83,8 @@ module.exports = function (config) { ChromeTest: { base: 'ChromeHeadless', flags: ['--enable-features=SharedArrayBuffer'] }, ChromePerf: { base: 'Chrome', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer'] }, ChromeDebug: { debug: true, base: 'Chrome', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer'] }, + ChromeCanaryTest: { base: 'ChromeCanary', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu'] }, + ChromeCanaryDebug: { debug: true, base: 'ChromeCanary', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu'] }, // // ==== BrowserStack browsers ==== diff --git a/js/web/lib/backend-onnxjs.ts b/js/web/lib/backend-onnxjs.ts index 18a068e0ced8b..c4fe1f1db38af 100644 --- a/js/web/lib/backend-onnxjs.ts +++ b/js/web/lib/backend-onnxjs.ts @@ -17,7 +17,16 @@ class OnnxjsBackend implements Backend { // onnxruntime-common). // In future we should remove Session.Config and use InferenceSession.SessionOptions. // Currently we allow this to happen to make test runner work. - const session = new Session(options as unknown as Session.Config); + const onnxjsOptions = {...options as unknown as Session.Config}; + if (!onnxjsOptions.backendHint && options?.executionProviders && options?.executionProviders[0]) { + const ep = options?.executionProviders[0]; + if (typeof ep === 'string') { + onnxjsOptions.backendHint = ep; + } else { + onnxjsOptions.backendHint = ep.name; + } + } + const session = new Session(onnxjsOptions); // typescript cannot merge method override correctly (so far in 4.2.3). need if-else to call the method. if (typeof pathOrBuffer === 'string') { diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 098b6603e5700..eefdbcfb63b05 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -12,6 +12,7 @@ import {registerBackend} from 'onnxruntime-common'; if (!BUILD_DEFS.DISABLE_WEBGL) { const onnxjsBackend = require('./backend-onnxjs').onnxjsBackend; registerBackend('webgl', onnxjsBackend, -10); + registerBackend('webgpu', onnxjsBackend, 999); // set to 999 as the highest priority } if (!BUILD_DEFS.DISABLE_WASM) { const wasmBackend = require('./backend-wasm').wasmBackend; diff --git a/js/web/lib/onnxjs/backend.ts b/js/web/lib/onnxjs/backend.ts index a363ec9f21368..5ac77ae2f5fcb 100644 --- a/js/web/lib/onnxjs/backend.ts +++ b/js/web/lib/onnxjs/backend.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
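With 'webgpu' registered at priority 999 in index.ts and the backendHint forwarding added to backend-onnxjs.ts above, the new backend is selected through ordinary session options. A usage sketch, assuming the public onnxruntime-web API surface:

import * as ort from 'onnxruntime-web';

// 'webgpu' resolves to the onnxjs backend (priority 999) and is forwarded
// as Session.Config.backendHint, so WebGpuBackend is chosen instead of WebGL.
const session = await ort.InferenceSession.create('model.onnx', {executionProviders: ['webgpu']});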
import {WebGLBackend} from './backends/backend-webgl'; +import {WebGpuBackend} from './backends/backend-webgpu'; import {Graph} from './graph'; import {Operator} from './operators'; import {OpSet} from './opset'; @@ -79,6 +80,7 @@ const backendsCache: Map = new Map(); export const backend: {[name: string]: Backend} = { webgl: new WebGLBackend(), + webgpu: new WebGpuBackend() }; /** diff --git a/js/web/lib/onnxjs/backends/backend-webgpu.ts b/js/web/lib/onnxjs/backends/backend-webgpu.ts new file mode 100644 index 0000000000000..e0f247eb135cb --- /dev/null +++ b/js/web/lib/onnxjs/backends/backend-webgpu.ts @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {env} from 'onnxruntime-common'; + +import {Backend, SessionHandler} from '../backend'; +import {Logger} from '../instrument'; +import {Session} from '../session'; + +import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; +import {WebGpuSessionHandler} from './webgpu/session-handler'; + +export class WebGpuBackend implements Backend { + device: GPUDevice; + gpuDataManager: GpuDataManager; + + commandEncoder: GPUCommandEncoder|null = null; + computePassEncoder: GPUComputePassEncoder|null = null; + pendingDispatchNumber = 0; + + // #region interface Backend + + async initialize(): Promise { + try { + if (!navigator.gpu) { + // WebGPU is not available. + Logger.warning('WebGpuBackend', 'WebGPU is not available.'); + return false; + } + + const adapter = await navigator.gpu.requestAdapter(); + if (!adapter) { + Logger.warning('WebGpuBackend', 'Failed to get GPU adapter.'); + return false; + } + this.device = await adapter.requestDevice(); + this.gpuDataManager = createGpuDataManager(this); + + // TODO: set up flags + + Logger.setWithEnv(env); + + Logger.verbose('WebGpuBackend', 'Initialized successfully.'); + + this.device.onuncapturederror = ev => { + if (ev.error instanceof GPUValidationError) { + // eslint-disable-next-line no-console + console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`); + } + }; + + return true; + } catch (e) { + Logger.warning('WebGpuBackend', `Unable to initialize WebGpuBackend. ${e}`); + return false; + } + } + createSessionHandler(context: Session.Context): SessionHandler { + return new WebGpuSessionHandler(this, context); + } + dispose(): void { + // TODO: uninitialization + // this.glContext.dispose(); + } + + // #endregion interface Backend + + getCommandEncoder(): GPUCommandEncoder { + if (!this.commandEncoder) { + this.commandEncoder = this.device.createCommandEncoder(); + } + return this.commandEncoder; + } + + getComputePassEncoder(): GPUComputePassEncoder { + if (!this.computePassEncoder) { + this.computePassEncoder = this.getCommandEncoder().beginComputePass(); + } + return this.computePassEncoder; + } + + endComputePass(): void { + if (this.computePassEncoder) { + this.computePassEncoder.end(); + this.computePassEncoder = null; + } + } + + flush(): void { + this.endComputePass(); + this.device.queue.submit([this.commandEncoder!.finish()]); + this.commandEncoder = null; + this.pendingDispatchNumber = 0; + } +} diff --git a/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts b/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts new file mode 100644 index 0000000000000..297d4bae64aed --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts @@ -0,0 +1,167 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
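Every storage buffer below is rounded up to a 16-byte multiple (see the 'always create storage buffer with 16 bytes alignment' commit in the squashed log). A worked example of the rule, duplicating calcNormalizedBufferSize from a few lines down:

const calcNormalizedBufferSize = (size: number) => Math.ceil(size / 16) * 16;
calcNormalizedBufferSize(100);  // 112: a 100-byte tensor gets a 112-byte GPUBuffer
calcNormalizedBufferSize(128);  // 128: already aligned, unchanged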
+ +import {Guid} from 'guid-typescript'; + +import {Logger} from '../../instrument'; + +import {sizeof, Tensor} from '../../tensor'; +import {ShapeUtil} from '../../util'; +import {WebGpuBackend} from '../backend-webgpu'; +import {GpuData, GpuDataId, GpuDataType} from './types'; + +/** + * manages GpuDataId -> GpuBuffer + */ +export interface GpuDataManager { + /** + * upload data to GPU. if the ID already exists in cache, returns the cached value without uploading anything. + */ + upload(data: Tensor.NumberType, gpuDataType: GpuDataType): Promise; + /** + * create new data on GPU. + */ + create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData; + /** + * get GPU data by ID. + */ + get(id: GpuDataId): GpuData|undefined; + /** + * release the data on GPU by ID. + */ + release(id: GpuDataId): void; + /** + * download the data from GPU. + */ + download(id: GpuDataId): Promise; +} + +interface StorageCacheValue { + gpuData: GpuData; + size: number; +} + +interface DownloadCacheValue { + gpuData: GpuData; + data: Promise; +} + +/** + * normalize the buffer size so that it fits the 128-bits (16 bytes) alignment. + */ +const calcNormalizedBufferSize = (size: number) => Math.ceil(size / 16) * 16; + +class GpuDataManagerImpl implements GpuDataManager { + // GPU Data ID => GPU Data ( storage buffer ) + storageCache: Map; + + // GPU Data ID => GPU Data ( read buffer ) + downloadCache: Map; + + constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) { + this.storageCache = new Map(); + this.downloadCache = new Map(); + } + + async upload(data: Tensor.NumberType, gpuDataType: GpuDataType): Promise { + if (gpuDataType !== GpuDataType.default) { + throw new Error('we only support default GPU data type now'); + } + + Logger.verbose('GpuData', `Uploading data to GPU: {${data.length}}`); + + const srcArrayBuffer = data.buffer; + const srcOffset = data.byteOffset; + const srcLength = data.byteLength; + const size = calcNormalizedBufferSize(srcLength); + + // create gpu buffer + const gpuBuffer = this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE}); + + // copy (upload) data + const arrayBuffer = gpuBuffer.getMappedRange(); + new Uint8Array(arrayBuffer).set(new Uint8Array(srcArrayBuffer, srcOffset, srcLength)); + gpuBuffer.unmap(); + + const gpuData = {id: Guid.create(), type: GpuDataType.default, buffer: gpuBuffer}; + this.storageCache.set(gpuData.id, {gpuData, size: srcLength}); + return gpuData; + } + + create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData { + if (gpuDataType !== GpuDataType.default) { + throw new Error('we only support default GPU data type now'); + } + + // !!! + // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. + // !!! This need to be figured out by performance test results. + // !!! 
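// (the tradeoff behind this TODO: reusing cached buffers would save a createBuffer
// call per output tensor but needs a free-list keyed by normalized size; always
// creating, as done below, keeps lifetimes trivial since release() destroys
// unconditionally.)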
+ + const elemCount = ShapeUtil.size(dims); + const bufferLength = sizeof(type) * elemCount; + const size = calcNormalizedBufferSize(bufferLength); + + // create gpu buffer + const gpuBuffer = + // eslint-disable-next-line no-bitwise + this.backend.device.createBuffer({size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC}); + + const gpuData = {id: Guid.create(), type: GpuDataType.default, buffer: gpuBuffer}; + this.storageCache.set(gpuData.id, {gpuData, size: bufferLength}); + return gpuData; + } + + get(id: GpuDataId): GpuData|undefined { + return this.storageCache.get(id)?.gpuData; + } + + release(id: GpuDataId): void { + const cachedData = this.storageCache.get(id); + if (!cachedData) { + throw new Error('the data to release does not exist'); + } + + this.storageCache.delete(id); + cachedData.gpuData.buffer.destroy(); + + const downloadingData = this.downloadCache.get(id); + if (downloadingData) { + void downloadingData.data.then(() => { + downloadingData.gpuData.buffer.destroy(); + }); + this.downloadCache.delete(id); + } + } + + async download(id: GpuDataId): Promise<ArrayBufferLike> { + const downloadData = this.downloadCache.get(id); + if (downloadData) { + return downloadData.data; + } + + const cachedData = this.storageCache.get(id); + if (!cachedData) { + throw new Error('data does not exist'); + } + + Logger.verbose('GpuData', `Downloading data from GPU: {${id}}`); + + const commandEncoder = this.backend.getCommandEncoder(); + this.backend.endComputePass(); + const gpuReadBuffer = this.backend.device.createBuffer( + // eslint-disable-next-line no-bitwise + {size: cachedData.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ}); + commandEncoder.copyBufferToBuffer( + cachedData.gpuData.buffer /* source buffer */, 0 /* source offset */, gpuReadBuffer /* destination buffer */, + 0 /* destination offset */, cachedData.size /* size */ + ); + this.backend.flush(); + + await gpuReadBuffer.mapAsync(GPUMapMode.READ); + return gpuReadBuffer.getMappedRange(); + } +} + +export const createGpuDataManager = (...args: ConstructorParameters<typeof GpuDataManagerImpl>): GpuDataManager => + new GpuDataManagerImpl(...args); diff --git a/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts b/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts new file mode 100644 index 0000000000000..2509814c353f1 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
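download() above stages the readback through a dedicated MAP_READ buffer: it ends the open compute pass, records copyBufferToBuffer, flushes the queue, then maps the staging buffer. A minimal sketch of reading a tensor back on the caller's side, assuming an f32 tensor and a previously created GpuData id (the variable names are illustrative):

    const raw = await gpuDataManager.download(gpuData.id);
    // copy out of the mapped range before it is invalidated; the Float32Array
    // view is only valid because the tensor was created as f32
    const values = new Float32Array(raw.slice(0));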
+ +import {InferenceHandler} from '../../backend'; +import {Tensor} from '../../tensor'; + +import {WebGpuSessionHandler} from './session-handler'; +import {createTensorDataManager, TensorDataManager} from './tensor-data-manager'; +import {GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './types'; + +const getProgramInfoUniqueKey = + (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly Tensor[], inputGpuDatas: readonly GpuData[]): + string => { + const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_'); + const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_'); + let key = programInfo.name; + if (programInfo.cacheHint) { + key += '[' + programInfo.cacheHint + ']'; + } + key += ':' + inputTensorShapes + ';' + inputGpuDataTypes; + return key; + }; + +export class WebGpuInferenceHandler implements InferenceHandler { + // per inference context + dataManager: TensorDataManager; + + constructor(public session: WebGpuSessionHandler) { + this.dataManager = createTensorDataManager(session.backend.gpuDataManager); + } + + private async uploadGpuData(tensor: Tensor, textureType: GpuDataType): Promise<GpuData> { + if (this.session.isInitializer(tensor.dataId)) { + return this.session.dataManager.uploadTensorToGpu(tensor, textureType); + } + + return this.dataManager.uploadTensorToGpu(tensor, textureType); + } + + private createGpuData(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { + return this.dataManager.createGpuTensor(type, dims, gpuDataType); + } + + async run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly Tensor[]): Promise<Tensor[]> { + if (inputs.length !== program.inputTypes.length) { + throw new Error(`Input size must be equal to ${program.inputTypes.length}.`); + } + + // create info for inputs + const inputDatas: GpuData[] = []; + for (let i = 0; i < program.inputTypes.length; ++i) { + inputDatas[i] = await this.uploadGpuData(inputs[i], program.inputTypes[i]); + } + + const key = getProgramInfoUniqueKey(program, inputs, inputDatas); + let artifact = this.session.programManager.getArtifact(key); + const programInfo = artifact ? + artifact.programInfo : + (typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() : + (program as ProgramInfo)); + + // create info for outputs + const outputDatas: GpuData[] = []; + const outputTensors: Tensor[] = []; + for (let i = 0; i < programInfo.outputs.length; ++i) { + const [tensor, gpuData] = this.createGpuData( + programInfo.outputs[i].type, programInfo.outputs[i].dims, programInfo.outputs[i].gpuDataType); + outputTensors.push(tensor); + outputDatas.push(gpuData); + } + + if (!artifact) { + artifact = this.session.programManager.build(programInfo); + this.session.programManager.setArtifact(key, artifact); + } + + this.session.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs)); + + return outputTensors; + } + + reshape(input: Tensor, reshapedDims: readonly number[]): Tensor { + return this.dataManager.hasGpuData(input.dataId) ?
+ this.dataManager.createGpuRef(input.dataId, input.type, reshapedDims)[0] : + new Tensor(reshapedDims, input.type, undefined, undefined, input.data); + } + + dispose(): void {} +} diff --git a/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts b/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts new file mode 100644 index 0000000000000..4adfb180893a6 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {OpSet} from '../../opset'; + +import * as binaryOps from './ops/binary-op'; +import {concat, parseConcatAttributes} from './ops/concat'; +import {conv, parseConvAttributes} from './ops/conv'; +import {gather, parseGatherAttributes} from './ops/gather'; +import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; +import {matMul, parseMatMulAttributes} from './ops/matmul'; +import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; +import {sum} from './ops/reduce-tensors'; +import {reshape} from './ops/reshape'; +import {shape} from './ops/shape'; +import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; +import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; +import {parseTransposeAttributes, transpose} from './ops/transpose'; +import * as unaryOps from './ops/unary-op'; +import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; + +export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [ + ['Abs', '', '6+', unaryOps.abs], ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], + // ['And', '', '7+', binaryOps.and], + ['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan], + // TODO: support new attributes for AveragePool-10 + ['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], + // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], + // ['Cast', '', '6+', cast, parseCastAttributes], + ['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], + ['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], + ['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], + // ['Dropout', '', '7+', unaryOps.identity], + // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], + // ['Equal', '', '7+', binaryOps.equal], + ['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', '', '6+', unaryOps.exp], + // ['Flatten', '', '1+', flatten, parseFlattenAttributes], + ['Floor', '', '6+', unaryOps.floor], + // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], + ['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], + ['Gemm', '', '11+', gemm, parseGemmAttributesV11], + ['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes], + ['GlobalMaxPool', '', '1+', globalMaxPool], + // ['Greater', '', '7+', binaryOps.greater], + // ['Identity', '', '1+', unaryOps.identity], + // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], + // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes], + ['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], + // ['Less', 
'', '7+', binaryOps.less], + ['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], + // TODO: support new attributes for MaxPool-8 and MaxPool-10 + ['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], + ['Neg', '', '6+', unaryOps.neg], + // ['Not', '', '1+', unaryOps.not], + // ['Or', '', '7+', binaryOps.or], + // ['Pad', '', '2-10', padV2, parsePadAttributesV2], + // ['Pad', '', '11+', padV11, parsePadAttributesV11], + ['Pow', '', '7+', binaryOps.pow], + // ['PRelu', '', '7+', binaryOps.pRelu], + // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], + // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], + // ['ReduceMean', '', '1+', reduceMean, parseReduceAttributes], + // ['ReduceMin', '', '1+', reduceMin, parseReduceAttributes], + // ['ReduceProd', '', '1+', reduceProd, parseReduceAttributes], + // ['ReduceSum', '', '1-12', reduceSum, parseReduceAttributes], + // ['ReduceSumSquare', '', '1+', reduceLogSumSquare, parseReduceAttributes], + ['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], + // ['Resize', '', '10', resize, parseResizeAttributesV10], + // ['Resize', '', '11+', resize, parseResizeAttributesV11], + ['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], + ['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 + ['Slice', '', '1-9', slice, parseSliceAttributes], + // // The "semantic" meaning of axis has changed in opset-13. + // ['Softmax', '', '1-12', softmax, parseSoftmaxAttributes], + // ['Softmax', '', '13+', softmaxV13, parseSoftmaxAttributesV13], + // // 'Split' operator has an optional attribute 'split' + // // this attribute determines how the specified axis of input data is split. + // // When the attribute is missing, we need the count of number of outputs + // // so that we can determine the 'split' attribute from the runtime input to the Operator + // ['Split', '', '2-12', split, parseSplitAttributes], + ['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], + ['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], + ['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], + // ['Tile', '', '6+', tile], + ['Transpose', '', '1+', transpose, parseTransposeAttributes], + // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], + // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], + ['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], + // ['Xor', '', '7+', binaryOps.xor], +]; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts b/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts new file mode 100644 index 0000000000000..8997932602a2f --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts @@ -0,0 +1,217 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
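Each entry in WEBGPU_OP_RESOLVE_RULES above is an OpSet.ResolveRule tuple of the form [opType, opsetDomain, versionSelector, implementation, attributeParser?]. A sketch of two rules taken directly from the table, where '' is the default ONNX domain and '6+' matches opset version 6 and above:

    const reluRule: OpSet.ResolveRule = ['Relu', '', '6+', unaryOps.relu];
    const convRule: OpSet.ResolveRule = ['Conv', '', '1+', conv, parseConvAttributes];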
+ +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Tensor} from '../../../tensor'; +import {BroadcastUtil, ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +type BuiltinFunctionName = string; +type BinaryCustomExpression = (expressionA: string, expressionB: string) => string; +type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{ + scalar: BinaryCustomExpression; + vector: BinaryCustomExpression; +}; + +const createBinaryOpProgramShader = + (dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[], vectorize: boolean, + doBroadcast: boolean, funcCall: BinaryFunctionCall, additionalImplementation?: string, typeA = 'f32', + typeB = 'f32', typeOutput = 'f32') => { + const outputSize = ShapeUtil.size(dimsOutput); + const vecSize = Math.ceil(outputSize / 4); + + let expressionScalar: BinaryCustomExpression; + let expressionVector: BinaryCustomExpression; + if (typeof funcCall === 'string') { + expressionScalar = expressionVector = (a, b) => `${funcCall}((${a}),(${b}))`; + } else if (typeof funcCall === 'function') { + expressionScalar = expressionVector = funcCall; + } else { + expressionScalar = funcCall.scalar; + expressionVector = funcCall.vector; + } + + let broadcastImpl = ''; + const outputIndicesHelper = createIndicesHelper('output', dimsOutput); + if (doBroadcast) { + const calcOffsetImpl = (dims: readonly number[]) => { + const strides = ShapeUtil.computeStrides(dims); + const offsets: string[] = []; + for (let i = dims.length - 1; i >= 0; i--) { + offsets.push(`${strides[i]}u * ((*outputIndices)[${i + dimsOutput.length - dims.length}] % ${dims[i]}u)`); + } + return offsets.length > 0 ? 
offsets.join('+') : '0u'; + }; + + broadcastImpl = ` + ${outputIndicesHelper.o2iImpl} + + fn calcOffsetA(outputIndices: ptr<function, ${outputIndicesHelper.iType}>) -> u32 { + return ${calcOffsetImpl(dimsA)}; + } + + fn calcOffsetB(outputIndices: ptr<function, ${outputIndicesHelper.iType}>) -> u32 { + return ${calcOffsetImpl(dimsB)}; + } + `; + } + + let assignment: string; + if (vectorize) { + if (doBroadcast) { + assignment = ` + ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} + ${outputIndicesHelper.o2iCall('global_id.x * 4u', 'outputIndices')} + let offsetA = calcOffsetA(&outputIndices); + let offsetB = calcOffsetB(&outputIndices); + outputData[global_id.x] = ${expressionVector('aData[offsetA / 4u]', 'bData[offsetB / 4u]')};`; + } else { + assignment = `outputData[global_id.x] = ${expressionVector('aData[global_id.x]', 'bData[global_id.x]')};`; + } + } else { + if (!doBroadcast) { + throw new Error('not necessary to use scalar implementation for element-wise binary op implementation.'); + } + const singleAssignment = (x: number) => { + const expressionA = `aData[indexA${x}][componentA${x}]`; + const expressionB = `bData[indexB${x}][componentB${x}]`; + return ` + ${outputIndicesHelper.o2iCall(`global_id.x * 4u + ${x}u`, 'outputIndices')} + let offsetA${x} = calcOffsetA(&outputIndices); + let offsetB${x} = calcOffsetB(&outputIndices); + let indexA${x} = offsetA${x} / 4u; + let indexB${x} = offsetB${x} / 4u; + let componentA${x} = offsetA${x} % 4u; + let componentB${x} = offsetB${x} % 4u; + outputData[global_id.x][${x}] = ${expressionScalar(expressionA, expressionB)};`; + }; + + assignment = ` + ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} + ${singleAssignment(0)} + ${singleAssignment(1)} + ${singleAssignment(2)} + ${singleAssignment(3)}`; + } + + return ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + @group(0) @binding(0) var<storage, read> aData : array<vec4<${typeA}>>; + @group(0) @binding(1) var<storage, read> bData : array<vec4<${typeB}>>; + @group(0) @binding(2) var<storage, read_write> outputData : array<vec4<${typeOutput}>>; + + ${additionalImplementation ?? ''} + ${broadcastImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${vecSize}u) { + return; + } + + ${assignment} + }`; + }; + +const createBinaryOpProgramInfo = + (metadata: ProgramMetadata, a: Tensor, b: Tensor, funcCall: BinaryFunctionCall, additionalImplementation?: string, + outputTensorType: Tensor.DataType = a.type): ProgramInfo => { + const isBroadcast = !ShapeUtil.areEqual(a.dims, b.dims); + let outputShape = a.dims; + let outputSize = a.size; + + let vectorize = false; + + // TODO: deal with zero-sized tensors (eg. dims=[1,0]) + + if (isBroadcast) { + const calculatedShape = BroadcastUtil.calcShape(a.dims, b.dims, false); + if (!calculatedShape) { + throw new Error('Can\'t perform binary op on the given tensors'); + } + outputShape = calculatedShape; + outputSize = ShapeUtil.size(outputShape); + + // check whether vectorize can be enabled + let sharedDimension = 1; + for (let i = 0; i < outputShape.length; i++) { + const dimA = a.dims[a.dims.length - i] ?? 1; + const dimB = b.dims[b.dims.length - i] ??
1; + if (dimA === dimB) { + sharedDimension *= dimA; + } else { + break; + } + } + if (sharedDimension % 4 === 0) { + vectorize = true; + } + + + } else { + // element-wise + vectorize = true; + } + + return { + ...metadata, + shaderSource: createBinaryOpProgramShader( + a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation), + outputs: [{dims: outputShape, type: outputTensorType, gpuDataType: GpuDataType.default}], + dispatchGroup: () => + ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)}) + }; + }; + +const createBinaryOpProgramInfoLoader = + (inputs: Tensor[], name: string, funcCall: BinaryFunctionCall, additionalImplementation?: string, + cacheKey?: string): ProgramInfoLoader => { + const metadata: + ProgramMetadata = {name, inputTypes: [GpuDataType.default, GpuDataType.default], cacheHint: cacheKey}; + return { + ...metadata, + get: () => createBinaryOpProgramInfo(metadata, inputs[0], inputs[1], funcCall, additionalImplementation) + }; + }; + +export const add = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createBinaryOpProgramInfoLoader(inputs, 'Add', (a, b) => `${a}+${b}`), inputs); + +// export const and = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslAnd(), 'bool'), inputs)]; + +export const div = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createBinaryOpProgramInfoLoader(inputs, 'Div', (a, b) => `${a}/${b}`), inputs); + +// export const equal = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslEqual(), 'bool'), inputs)]; + +// export const greater = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslGreater(), 'bool'), inputs)]; + +// export const less = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslLess(), 'bool'), inputs)]; + +export const mul = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createBinaryOpProgramInfoLoader(inputs, 'Mul', (a, b) => `${a}*${b}`), inputs); + +// export const or = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslOr(), 'bool'), inputs)]; + +export const pow = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createBinaryOpProgramInfoLoader(inputs, 'Pow', 'pow'), inputs); + +// export const pRelu = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslPRelu()), inputs)]; + +export const sub = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createBinaryOpProgramInfoLoader(inputs, 'Sub', (a, b) => `${a}-${b}`), inputs); + +// export const xor = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslXor(), 'bool'), inputs)]; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/common.ts b/js/web/lib/onnxjs/backends/webgpu/ops/common.ts new file mode 100644 index 0000000000000..ec7ec3107e084 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/common.ts @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. + +import {ShapeUtil} from '../../../util'; + +/** + * constant value for a workgroup size. + * + * We can definitely do further optimization in the future, but for now we use 64. + * + * rule of thumb: Use [a workgroup size of] 64 unless you know what GPU you are targeting or that your workload + * needs something different. + * + * from: https://surma.dev/things/webgpu/ + **/ +export const WORKGROUP_SIZE = 64; + +export interface IndicesHelper { + /** + * WGSL code of function implementation for offset-to-indices + */ + o2iImpl: string; + /** + * WGSL code of function call for offset-to-indices + */ + o2iCall: (varOffset: string, varIndices: string) => string; + /** + * WGSL code of function implementation for indices-to-offset + */ + i2oImpl: string; + /** + * WGSL code of the expression for indices-to-offset + * + * @param isPtr - whether the variable is a pointer. default is false. + */ + i2oExpression: (varIndices: string, isPtr?: boolean) => string; + /** + * WGSL code of indices variable declaration + * + * @param v - variable name. + * @param init - initial value. + */ + indicesVariableDeclaration: (v: string, init?: string[]) => string; + /** + * data type of indices + */ + iType: string; +} + +export const createIndicesHelper = (name: string, shape: readonly number[]) => { + const iType = shape.length < 2 ? 'u32' : `array<u32, ${shape.length}>`; + + const strides = ShapeUtil.computeStrides(shape); + let o2iSnippet = ''; + for (let i = 0; i < shape.length - 1; i++) { + o2iSnippet += ` + let dim${i} = current / ${strides[i]}u; + let rest${i} = current % ${strides[i]}u; + (*indices)[${i}] = dim${i}; + current = rest${i}; + `; + } + o2iSnippet += `(*indices)[${shape.length - 1}] = current;`; + + const o2iImpl = shape.length < 2 ? '' : ` + fn ih_o2i_${name}(offset: u32, indices: ptr<function, ${iType}>) { + var current = offset; + ${o2iSnippet} + }`; + + const o2iCall = (varOffset: string, varIndices: string) => + shape.length < 2 ? `${varIndices}=${varOffset};` : `ih_o2i_${name}(${varOffset}, &${varIndices});`; + + const offsets: string[] = []; + for (let i = shape.length - 1; i >= 0; i--) { + offsets.push(`${strides[i]}u * ((*indices)[${i}])`); + } + + const i2oImpl = shape.length < 2 ? '' : ` + fn ih_i2o_${name}(indices: ptr<function, ${iType}>) -> u32 { + return ${offsets.length > 0 ? offsets.join('+') : '0u'}; + }`; + + const i2oExpression = (varIndices: string, isPtr?: boolean) => + shape.length < 2 ? `(${isPtr ? '*' : ''}${varIndices})` : `ih_i2o_${name}(${isPtr ? '' : '&'}${varIndices})`; + + const indicesVariableDeclaration = (v: string, init?: string[]) => + `var ${v}:${iType}${init ? `=${iType}(${init.join(',')})` : ''};`; + + return {o2iImpl, o2iCall, i2oImpl, i2oExpression, indicesVariableDeclaration, iType}; +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts b/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts new file mode 100644 index 0000000000000..874aef1e44bff --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts @@ -0,0 +1,176 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
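To make the helper in common.ts concrete: for a tensor of shape [2, 3, 4] (strides [12, 4, 1]) named 'x', createIndicesHelper emits roughly the following WGSL, reconstructed by hand from the template above:

    fn ih_o2i_x(offset: u32, indices: ptr<function, array<u32, 3>>) {
      var current = offset;
      let dim0 = current / 12u;
      let rest0 = current % 12u;
      (*indices)[0] = dim0;
      current = rest0;
      let dim1 = current / 4u;
      let rest1 = current % 4u;
      (*indices)[1] = dim1;
      current = rest1;
      (*indices)[2] = current;
    }

    fn ih_i2o_x(indices: ptr<function, array<u32, 3>>) -> u32 {
      return 1u * ((*indices)[2])+4u * ((*indices)[1])+12u * ((*indices)[0]);
    }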
+ +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, IndicesHelper, WORKGROUP_SIZE} from './common'; + +export interface ConcatAttributes extends AttributeWithCacheKey { + readonly axis: number; +} + +export const concat = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConcatAttributes): Promise<Tensor[]> => { + validateInputs(inputs); + return inferenceHandler.run(createConcatProgramInfoLoader(inputs, attributes), inputs); +}; + +const createConcatProgramMetadata = (inputCount: number, cacheHint: string) => + ({name: 'Concat', inputTypes: Array(inputCount).fill(GpuDataType.default), cacheHint}); + +const createConcatProgramInfo = + (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { + const inputShape = inputs[0].dims.slice(); + if (axis >= inputShape.length || axis < (-1 * inputShape.length)) { + throw new Error('axis specified for concat doesn\'t match input dimensionality'); + } + if (axis < 0) { + axis = inputShape.length + axis; + } + // ensure all of the non-concatenated axes match each other + // calculate the shape of the output tensor while we do that + const outputShape = inputShape.slice(0); + for (let i = 1; i < inputs.length; i++) { + const dataNShape = inputs[i].dims.slice(); + for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) { + // add to the placeholder for computing output shape + if (axisIndex === axis) { + outputShape[axis] += dataNShape[axisIndex]; + } + // ensure all non-concatenated axes match each other + else if (inputShape[axisIndex] !== dataNShape[axisIndex]) { + throw new Error('non concat dimensions must match'); + } + } + } + + const outputSize = ShapeUtil.size(outputShape); + const rank = outputShape.length; + + const sizeInConcatAxis = new Array<number>(inputs.length); + const inputStorageBuffersDeclarations = new Array<string>(inputs.length); + const inputIndicesHelpers = new Array<IndicesHelper>(inputs.length); + + let previousSum = 0; + for (let i = 0; i < inputs.length; ++i) { + previousSum += inputs[i].dims[axis]; + sizeInConcatAxis[i] = previousSum; + + inputStorageBuffersDeclarations[i] = + `@group(0) @binding(${i}) var<storage, read> input${i} : array<${dataType}>;`; + + inputIndicesHelpers[i] = createIndicesHelper(`input${i}`, inputs[i].dims); + } + + const outputIndicesHelper = createIndicesHelper('output', outputShape); + + const indicesAxis = rank < 2 ?
'indices' : `indices[${axis}]`; + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + ${inputStorageBuffersDeclarations.join('\n')} + @group(0) @binding(${inputs.length}) var<storage, read_write> output : array<${dataType}>; + + ${inputIndicesHelpers.map(i => i.i2oImpl).join('\n')} + ${outputIndicesHelper.o2iImpl} + + let sizeInConcatAxis = array<u32, ${sizeInConcatAxis.length}>(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); + ${calculateInputIndexImpl(sizeInConcatAxis.length)} + ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + + let textureIndex = calculateInputIndex(${indicesAxis}); + if (textureIndex != 0u) { + ${indicesAxis} -= sizeInConcatAxis[textureIndex - 1u]; + } + + output[global_id.x] = readBufferData(textureIndex, &indices); + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const createConcatProgramInfoLoader = (inputs: Tensor[], attributes: ConcatAttributes): ProgramInfoLoader => { + const metadata = createConcatProgramMetadata(inputs.length, attributes.cacheKey); + return {...metadata, get: () => createConcatProgramInfo(metadata, inputs, attributes.axis)}; +}; + +const calculateInputIndexImpl = (numberOfTensors: number): string => ` + fn calculateInputIndex(index: u32) -> u32 { + for (var i: u32 = 0u; i < ${numberOfTensors}u; i += 1u ) { + if (index < sizeInConcatAxis[i]) { + return i; + } + } + return ${numberOfTensors}u; + }`; + +const readBufferDataImpl = (indicesHelper: readonly IndicesHelper[], tensorRank: number, dataType: string) => { + const numberOfTensors = indicesHelper.length; + const codeLines: string[] = []; + for (let i = 0; i < numberOfTensors; ++i) { + const returnSnippet = `return input${i}[${indicesHelper[i].i2oExpression('indices', true)}];`; + if (numberOfTensors === 1) { + codeLines.push(returnSnippet); + } else if (i === 0) { + codeLines.push(`if (textureIndex == ${i}u) { ${returnSnippet} }`); + } else if (i === numberOfTensors - 1) { + codeLines.push(`else { ${returnSnippet} }`); + } else { + codeLines.push(`else if (textureIndex == ${i}) { ${returnSnippet} }`); + } + } + return ` + fn readBufferData(textureIndex: u32, indices: ptr<function, array<u32, ${tensorRank}>>) -> ${dataType} { + ${codeLines.join('\n')} + }`; +}; + +export const parseConcatAttributes: OperatorInitialization<ConcatAttributes> = (node: Graph.Node): ConcatAttributes => + createAttributeWithCacheKey({axis: node.attributes.getInt('axis')}); + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length < 1) { + throw new Error('too few inputs'); + } + + const inputType = inputs[0].type; + const inputDimensionality = inputs[0].dims.length; + + // TODO: Support string concat + if (inputType === 'string') { + throw new Error('string tensor is not supported yet'); + } + + for (const input of inputs) { + // make sure types of all inputs match + if (input.type !== inputType) { + throw new Error('input tensors should be one type'); + } + + // make sure the dimensionality of all inputs are the same + if (input.dims.length !== inputDimensionality) { + throw new Error('input tensors should have the same shape'); + } + } +}; diff
--git a/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts b/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts new file mode 100644 index 0000000000000..570ec041a34fc --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Logger} from '../../../instrument'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {calculateOutputShape, ConvAttributes} from './conv'; +import {getActicationSnippet} from './fuse-utils'; + +const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({ + name: 'GroupedConv', + inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : + [GpuDataType.default, GpuDataType.default], + cacheHint +}); + +const createGroupedConvProgramInfo = + (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata, + attributes: ConvAttributes): ProgramInfo => { + const hasBias = inputs.length > 2; + const processBias = hasBias ? 'value += b[output_channel];' : ''; + const xShape = inputs[0].dims; + const wShape = inputs[1].dims; + const outputChannelsPerGroup = wShape[0] / attributes.group; + + const dataType = 'f32'; // TODO: support other data type + const {activationFunction, applyActivation} = getActicationSnippet(attributes); + const inputStorageBuffersDeclarations = [ + `@group(0) @binding(0) var<storage, read> x : array<${dataType}>;`, + `@group(0) @binding(1) var<storage, read> w : array<${dataType}>;` + ]; + if (hasBias) { + inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> b : array<${dataType}>;`); + } + + Logger.verbose( + 'GroupedConv', + `autoPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, kernelShape:${ + attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`); + const outputShape = + calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); + const outputSize = ShapeUtil.size(outputShape); + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const xIndicesHelper = createIndicesHelper('x', xShape); + const wIndicesHelper = createIndicesHelper('w', wShape); + + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const strides: vec2<u32> = vec2<u32>(${attributes.strides[0]}u, ${attributes.strides[1]}u); + const pads: vec2<u32> = vec2<u32>(${attributes.pads[0]}u, ${attributes.pads[1]}u); + + ${inputStorageBuffersDeclarations.join('\n')} + @group(0) @binding(${inputStorageBuffersDeclarations.length}) var<storage, read_write> output : array<${dataType}>; + + ${activationFunction} + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + ${wIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} + let batch: u32 = outputIndices[0]; + let output_channel: u32 = outputIndices[1]; + let xRCCorner: vec2<u32> = vec2<u32>(outputIndices[2], outputIndices[3]) * strides - pads; + let
group_id: u32 = output_channel / ${outputChannelsPerGroup}u; + + var value: ${dataType} = ${dataType}(0); + for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { + let input_channel = group_id * ${wShape[1]}u + wInChannel; + for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { + let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; + + if (xHeight < 0u || xHeight >= ${xShape[2]}u) { + continue; + } + + for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { + let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; + if (xWidth < 0u || xWidth >= ${xShape[3]}u) { + continue; + } + + ${ + xIndicesHelper.indicesVariableDeclaration( + 'xIndices', + [ + 'batch', 'input_channel', 'xHeight', 'xWidth' + ])} + let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${ + wIndicesHelper.indicesVariableDeclaration('wIndices', [ + 'output_channel', 'wInChannel', 'wHeight', 'wWidth' + ])} + let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; + value += xVal*wVal; + } + } + } + ${processBias} + ${applyActivation} + output[global_id.x] = value; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +export const createGroupedConvProgramInfoLoader = + (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): + ProgramInfoLoader => { + const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); + return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, attributes)}; + }; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts b/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts new file mode 100644 index 0000000000000..644e9b08c7030 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts @@ -0,0 +1,150 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
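In the grouped kernel above, each output channel is mapped back to its group so that only that group's slice of input channels is read. The same arithmetic in TypeScript, using illustrative numbers wShape = [8, 4, 3, 3] and group = 2:

    const outputChannelsPerGroup = 8 / 2;                                // 4
    const outputChannel = 5;
    const groupId = Math.floor(outputChannel / outputChannelsPerGroup);  // 1
    const firstInputChannel = groupId * 4;                               // group_id * wShape[1] === 4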
+ +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {InferenceHandler} from '../../../backend'; +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {PoolConvUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +import {createGroupedConvProgramInfoLoader} from './conv-grouped'; +// import {createDotProductProgramInfoLoader} from './dot-product'; +import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +// import {createIm2ColProgramInfoLoader} from './im2col'; +// import {createMatmulProgramInfoLoader} from './matmul'; + + +export const calculateOutputShape = + (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], + adjustPads: readonly number[], strides: readonly number[]): number[] => { + const batchSize = inputShape[0]; + const inputSpatialShape = inputShape.slice(2); + const spatialRank = inputSpatialShape.length; + const outChannels = kernelShape[0]; + const kernelSpatialShape = kernelShape.slice(2); + const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); + const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); + const outputSpatialShape = + inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); + const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); + return outputShape; + }; + +export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { + readonly autoPad: string; + readonly dilations: readonly number[]; + readonly group: number; + readonly kernelShape: readonly number[]; + readonly pads: readonly number[]; + readonly strides: readonly number[]; +} + +export const conv: OperatorAsyncImplementation<ConvAttributes> = + async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise<Tensor[]> => { + validateInputs(inputs, attributes); // currently will fail if not conv2D + return conv2d(inferenceHandler, inputs, attributes); +}; + +const conv2d: OperatorAsyncImplementation<ConvAttributes> = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise<Tensor[]> => { + const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); + // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; + // if (adjustedAttributes.group > 1) { + return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), inputs); + // } else if (isPointwise) { + // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); + // } else { + // return conv2D(inferenceHandler, inputs, adjustedAttributes); + // } +}; + +const getAdjustedConvAttributes = <T extends ConvAttributes>(attributes: T, inputs: Tensor[]): T => { + const kernelShape = attributes.kernelShape.slice(); + // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims + if (attributes.kernelShape.length === 0) { + for (let i = 2; i < inputs[1].dims.length; ++i) { + kernelShape.push(inputs[1].dims[i]); + } + } + const pads = attributes.pads.slice(); + PoolConvUtil.adjustPadsBasedOnAutoPad( + inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); + + //
always return a new object so it does not modify the original attributes + const newAttributes: T = Object.assign({}, attributes); + Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); + return newAttributes; +}; + +export const parseConvAttributes: OperatorInitialization<ConvAttributes> = (node: Graph.Node): ConvAttributes => { + const attributes = node.attributes; + const activationAttributes = parseInternalActivationAttributes(attributes); + // TODO : Make this generic enough to compute default attributes for multi-dimensional conv + const autoPad = attributes.getString('auto_pad', 'NOTSET'); + const dilations = attributes.getInts('dilations', [1, 1]); + const group = attributes.getInt('group', 1); + const kernelShape = attributes.getInts('kernel_shape', []); + const pads = attributes.getInts('pads', [0, 0, 0, 0]); + const strides = attributes.getInts('strides', [1, 1]); + + return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes}); +}; + +const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => { + // Refer to the below link for all input checks + // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv + if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { + throw new Error('Conv requires 2 or 3 inputs'); + } + + // TODO : Need to add support for multi-dimensional conv + if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { + throw new Error('currently only support 2-dimensional conv'); + } + + // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL + const dataChannel = inputs[0].dims[1]; + const filterInChannel = inputs[1].dims[1] * attributes.group; + if (dataChannel !== filterInChannel) { + throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); + } + + // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps + if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { + throw new Error('invalid bias'); + } + + const spatialRank = inputs[0].dims.length - 2; + // wrong dilations dimension + if (attributes.dilations.length !== spatialRank) { + throw new Error(`dilations should be ${spatialRank}D`); + } + + // Wrong strides dimension + if (attributes.strides.length !== spatialRank) { + throw new Error(`strides should be ${spatialRank}D`); + } + + // Wrong pads dimension + if (attributes.pads.length !== spatialRank * 2) { + throw new Error(`pads should be ${spatialRank * 2}D`); + } + + // if kernelShape is specified, its data length must be 2 less than dims length of the weights tensor + // (the first 2 dims are batch_size and channels) + if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { + throw new Error('invalid kernel shape'); + } + + // TODO : Need to add support for float64 + if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') { + throw new Error('Conv input(X,W) should be float tensor'); + } + + if (inputs.length === 3 && inputs[2].type !== 'float32') { + throw new Error('Conv input(bias) should be float tensor'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts b/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts new file mode 100644 index 0000000000000..fae2c9fb6e9b2 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
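As a spot check of calculateOutputShape above, the classic ResNet-style first convolution, input [1, 3, 224, 224] with kernel [64, 3, 7, 7], dilations [1, 1], pads [3, 3, 3, 3] and strides [2, 2], gives floor((224 + 3 + 3 - 7 + 2) / 2) = 112 per spatial dimension:

    calculateOutputShape([1, 3, 224, 224], [64, 3, 7, 7], [1, 1], [3, 3, 3, 3], [2, 2]);
    // -> [1, 64, 112, 112]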
+ +import {Attribute} from '../../../attribute'; +import {MAX_CLIP, MIN_CLIP} from '../../../util'; + +export interface InternalActivationAttributes { + readonly activation: string; + readonly clipMin?: number; + readonly clipMax?: number; + readonly activationCacheKey: string; +} + +export function getActicationSnippet(attributes: InternalActivationAttributes) { + switch (attributes.activation) { + case 'Relu': + return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; + case 'Sigmoid': + return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; + case 'Clip': + return { + activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, + applyActivation: 'value = clamp(value, clip_min_, clip_max_);' + }; + // TODO: add other activations that can be fused. + default: + return {activationFunction: '', applyActivation: ''}; + } +} + +export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => { + const activation = attributes.getString('activation', ''); + + if (activation === 'Clip') { + const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]); + return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; + } + return {activation, activationCacheKey: activation}; +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts b/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts new file mode 100644 index 0000000000000..65f679a2cea83 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts @@ -0,0 +1,131 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {NUMBER_TYPES, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +interface GatherAttributes extends AttributeWithCacheKey { + readonly axis: number; +} + +export const gather = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise<Tensor[]> => { + validateInputs(inputs, attributes.axis); + return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs); +}; + +export const parseGatherAttributes: OperatorInitialization<GatherAttributes> = (node: Graph.Node): GatherAttributes => + createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)}); + +const gatherProgramMetadata = { + name: 'Gather', + inputTypes: [GpuDataType.default, GpuDataType.default] +}; + +const createGatherProgramInfo = + (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { + const dataShape = inputs[0].dims.slice(); + const indicesShape = inputs[1].dims.slice(); + const outputShape = new Array<number>(dataShape.length + indicesShape.length - 1); + + axis = ShapeUtil.normalizeAxis(axis, dataShape.length); + const indexCopyOps: string[] = []; + if (indicesShape.length > 1) { + indexCopyOps.push('indicesIdx[0] = 0u;'); + } else { + indexCopyOps.push('indicesIdx = 0u;'); + } + for (let i = 0; i < outputShape.length; i++) { + // outputShape is divided into three parts:
A, B, C + // |0 axis| axis + indicesShape.length | end| + // | A | B | C | + // + // dataIdx: [A, inputs[1][B], C] + const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx'; + if (i < axis) { // A + const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i}]` : 'dataIdx'; + outputShape[i] = dataShape[i]; + indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); + } else { + if (i < axis + indicesShape.length) { // B + const indicesIdxLValue = indicesShape.length > 1 ? `indicesIdx[${i - axis}]` : 'indicesIdx'; + outputShape[i] = indicesShape[i - axis]; + indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`); + } else { // C + const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx'; + outputShape[i] = dataShape[i - indicesShape.length + 1]; // skip 1 for axis + indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); + } + } + } + const outputSize = ShapeUtil.size(outputShape); + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const dataIndicesHelper = createIndicesHelper('data', dataShape); + const indicesIndicesHelper = createIndicesHelper('indices', indicesShape); + + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + @group(0) @binding(0) var<storage, read> data : array<${dataType}>; + @group(0) @binding(1) var<storage, read> indices : array<i32>; + @group(0) @binding(2) var<storage, read_write> output : array<${dataType}>; + + ${outputIndicesHelper.o2iImpl} + ${indicesIndicesHelper.i2oImpl} + ${dataIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')} + ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')} + ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')} + ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')} + ${indexCopyOps.join('\n ')} + let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}]; + dataIdx${dataShape.length > 1 ?
`[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0)); + output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}]; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => { + const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey}; + return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)}; +}; + +const validateInputs = (inputs: Tensor[], axis: number): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('Gather requires 2 inputs.'); + } + const tensorRank = inputs[0].dims.length; + if (tensorRank < 1) { + throw new Error('Invalid input shape.'); + } + if (axis < -tensorRank || axis > tensorRank - 1) { + throw new Error('Invalid axis.'); + } + if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { + throw new Error('Invalid input type.'); + } + if (inputs[1].type !== 'int32') { + throw new Error('Invalid input type.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts b/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts new file mode 100644 index 0000000000000..3eeb49c91033a --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts @@ -0,0 +1,165 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {GemmUtil, ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {WORKGROUP_SIZE} from './common'; + +export interface GemmAttributes extends AttributeWithCacheKey { + transA: boolean; + transB: boolean; + alpha: number; + beta: number; + isOptionalC: boolean; // in opset 11, C becomes optional +} + +export const gemm: OperatorAsyncImplementation<GemmAttributes> = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise<Tensor[]> => { + validateInputs(inputs, attributes); + return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs); +}; + +const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => { + const transA = node.attributes.getInt('transA', 0) !== 0; + const transB = node.attributes.getInt('transB', 0) !== 0; + const alpha = node.attributes.getFloat('alpha', 1.0); + const beta = node.attributes.getFloat('beta', 1.0); + return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC}); +}; + +export const parseGemmAttributesV7: OperatorInitialization<GemmAttributes> = (node: Graph.Node): GemmAttributes => + parseGemmAttributes(node, false); + +export const parseGemmAttributesV11: OperatorInitialization<GemmAttributes> = (node: Graph.Node): GemmAttributes => + parseGemmAttributes(node, true); + +const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => { + const metadata = { + name: 'Gemm', + inputTypes: inputs.length === 3 ?
[GpuDataType.default, GpuDataType.default, GpuDataType.default] : + [GpuDataType.default, GpuDataType.default], + cacheHint: attributes.cacheKey + }; + + return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)}; +}; + +const offsetC = (m: number, n: number, dims: readonly number[]): string => { + const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); + const broadcastN = dims[dims.length - 1] !== n; + + let offset = '0u'; + if (!broadcastM) { + offset += `+ m * ${dims[dims.length - 1]}u`; + } + if (!broadcastN) { + offset += '+n'; + } + + return offset; +}; + +const createGemmProgramInfo = + (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => { + const aShape = inputs[0].dims.slice(); + const bShape = inputs[1].dims.slice(); + const [M, N, K] = GemmUtil.getShapeOfGemmResult( + aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? inputs[2].dims : undefined); + const outputShape = [M, N]; + if (!outputShape) { + throw new Error('Can\'t use gemm on the given tensors'); + } + const outputSize = ShapeUtil.size(outputShape); + let line = ''; + if (attributes.transA && attributes.transB) { + line = 'value += a[k * M + m] * b[n * K + k];'; + } else if (attributes.transA && !attributes.transB) { + line = 'value += a[k * M + m] * b[k * N + n];'; + } else if (!attributes.transA && attributes.transB) { + line = 'value += a[m * K + k] * b[n * K + k];'; + } else if (!attributes.transA && !attributes.transB) { + line = 'value += a[m * K + k] * b[k * N + n];'; + } + + const dataType = 'f32'; // TODO: support other data type + const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; + const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; + const inputStorageBuffersDeclarations = [ + `@group(0) @binding(0) var<storage, read> a : array<${dataType}>;`, + `@group(0) @binding(1) var<storage, read> b : array<${dataType}>;` + ]; + if (inputs.length === 3) { + inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> c : array<${dataType}>;`); + } + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const M: u32 = ${M}u; + const N: u32 = ${N}u; + const K: u32 = ${K}u; + const alpha = ${dataType}(${attributes.alpha}); + const beta = ${dataType}(${attributes.beta}); + + ${inputStorageBuffersDeclarations.join('\n')} + @group(0) @binding(${inputs.length}) var<storage, read_write> output : array<${dataType}>; + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + let m = global_id.x / N; + let n = global_id.x % N; + + var value = ${dataType}(0); + for (var k: u32 = 0u; k<${K}u; k++) { + ${line} + } + + ${calculateAlpha} + ${calculateC} + output[global_id.x] = value; + + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => { + if (!inputs) { + throw new Error('Input is missing'); + } + if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) { + throw new Error('Invalid input shape.'); + } + if (!attributes.isOptionalC && inputs.length !== 3) { + throw new Error('Gemm requires 3 inputs'); + } + + // 'C' can be of dimensionality 1 or 2
only + if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) { + throw new Error('Invalid input shape of C'); + } + + if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || + (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') || + (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) { + throw new Error('Invalid input type.'); + } + + if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) { + throw new Error('Input types are mismatched'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts b/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts new file mode 100644 index 0000000000000..5b8f0bf94733e --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {BroadcastUtil, ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {WORKGROUP_SIZE} from './common'; +import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +export const matMul: OperatorAsyncImplementation<InternalActivationAttributes> = + async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes): + Promise<Tensor[]> => { + validateInputs(inputs); + + return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs); + }; + +export const parseMatMulAttributes: OperatorInitialization<InternalActivationAttributes> = + (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes); + +const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ + name: 'MatMul', + inputTypes: hasBias ?
[GpuDataType.default, GpuDataType.default, GpuDataType.default] : + [GpuDataType.default, GpuDataType.default], + cacheHint +}); + +function createMatmulProgramInfo( + metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo { + const aShape = inputs[0].dims; + const bShape = inputs[1].dims; + const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); + if (!outputShape) { + throw new Error('Can\'t use matmul on the given tensors'); + } + const outputSize = ShapeUtil.size(outputShape); + // TODO: support broadcasting + + const dataType = 'f32'; // TODO: support other data type + const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); + + const M = outputShape[outputShape.length - 2]; + const K = aShape[aShape.length - 1]; + const N = outputShape[outputShape.length - 1]; + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const M: u32 = ${M}u; + const N: u32 = ${N}u; + const K: u32 = ${K}u; + + @group(0) @binding(0) var a : array<${dataType}>; + @group(0) @binding(1) var b : array<${dataType}>; + @group(0) @binding(2) var output : array<${dataType}>; + + ${activationFunction} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + let stack = global_id.x / (M * N); + let mn = global_id.x % (M * N); + let n = global_id.x % N; + let m = mn / N; + + let offsetA = stack * (M * K); + let offsetB = stack * (K * N); + + var value = ${dataType}(0); + for (var k: u32 = 0u; k<${K}u; k++) { + value += a[offsetA + m * K + k] * b[offsetB + k * N + n]; + } + ${applyActivation} + output[global_id.x] = value; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; +} + +export function createMatmulProgramInfoLoader( + inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader { + const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); + return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; +} + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('MatMul requires 2 inputs.'); + } + + if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { + throw new Error('shared dimension does not match.'); + } + + if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || + (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) { + throw new Error('inputs should be float type'); + } + + if (inputs[0].type !== inputs[1].type) { + throw new Error('inputs types should match'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts b/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts new file mode 100644 index 0000000000000..0e92ff8cb906a --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts @@ -0,0 +1,376 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
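For reference, the generatePoolingCode helper in this file emits WGSL that walks the kernel window for each output element and counts how many window positions fell into padding. The following plain-TypeScript sketch mirrors that arithmetic for the 2-D average-pool case; the function name and the [top, left, bottom, right] pad ordering are illustrative assumptions, not part of this patch:

// Illustrative CPU reference for the generated WGSL (not part of the patch).
function averagePool2d(
    x: Float32Array, [h, w]: [number, number], kernel: [number, number],
    strides: [number, number], pads: [number, number, number, number],
    countIncludePad: boolean): Float32Array {
  const [kh, kw] = kernel;
  const [sh, sw] = strides;
  const [padTop, padLeft, padBottom, padRight] = pads;
  const oh = Math.floor((h + padTop + padBottom - kh) / sh) + 1;
  const ow = Math.floor((w + padLeft + padRight - kw) / sw) + 1;
  const out = new Float32Array(oh * ow);
  for (let m = 0; m < oh; m++) {
    for (let n = 0; n < ow; n++) {
      let value = 0;
      let pad = 0;  // number of window positions that landed in padding
      for (let j = 0; j < kh; j++) {
        for (let i = 0; i < kw; i++) {
          const row = m * sh - padTop + j;
          const col = n * sw - padLeft + i;
          if (row < 0 || row >= h || col < 0 || col >= w) { pad++; continue; }
          value += x[row * w + col];
        }
      }
      // divide by the full kernel size, or only the valid (non-pad) count
      out[m * ow + n] = value / (countIncludePad ? kh * kw : kh * kw - pad);
    }
  }
  return out;
}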
+ +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {PoolConvUtil, ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +export interface AveragePoolAttributes extends AttributeWithCacheKey { + readonly autoPad: string; + readonly ceilMode: number; + readonly countIncludePad: boolean; + readonly kernelShape: readonly number[]; + readonly strides: readonly number[]; + readonly pads: readonly number[]; +} + +export const averagePool: OperatorAsyncImplementation = + async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): + Promise => { + validateInputs(inputs); + const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; + return inferenceHandler.run( + {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs); + }; + +export const parseAveragePoolAttributes: OperatorInitialization = + (node: Graph.Node): AveragePoolAttributes => { + const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); + const ceilMode = node.attributes.getInt('ceil_mode', 0); + const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); + const kernelShape = node.attributes.getInts('kernel_shape'); + const strides = node.attributes.getInts('strides', []); + const pads = node.attributes.getInts('pads', []); + + // TODO: support attribute 'ceil_mode' + if (ceilMode !== 0) { + throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); + } + + return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads}); + }; + +const createAveragePoolProgramInfo = + (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, + attributes: AveragePoolAttributes): ProgramInfo => { + const [adjustedAttributes, outputShape] = + getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); + const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); + + const dataType = 'f32'; + + const op1 = 'value += x_val;'; + let op2 = ''; + if (adjustedAttributes.countIncludePad) { + op2 += `value /= ${dataType}(${kernelSize});`; + } else { + op2 += `value /= ${dataType}(${kernelSize} - pad);`; + } + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) + }; + }; + +export const globalAveragePool: OperatorAsyncImplementation = + async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): + Promise => { + validateInputs(inputs); + const metadata = { + name: 'GlobalAveragePool', + inputTypes: [GpuDataType.default], + cacheHint: `${attributes.countIncludePad}` + }; + return inferenceHandler.run( + {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs); + }; + +export const parseGlobalAveragePoolAttributes: 
OperatorInitialization = + (node: Graph.Node): AveragePoolAttributes => { + const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); + return createAttributeWithCacheKey( + {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []}); + }; + +export interface MaxPoolAttributes extends AveragePoolAttributes { + readonly storageOrder: number; + readonly dilations: number[]; +} + +export const maxPool: OperatorAsyncImplementation = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise => { + validateInputs(inputs); + const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; + return inferenceHandler.run( + {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs); +}; + +export const parseMaxPoolAttributes: OperatorInitialization = + (node: Graph.Node): MaxPoolAttributes => { + const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); + const ceilMode = node.attributes.getInt('ceil_mode', 0); + const kernelShape = node.attributes.getInts('kernel_shape'); + const strides = node.attributes.getInts('strides', []); + const pads = node.attributes.getInts('pads', []); + const storageOrder = node.attributes.getInt('storage_order', 0); + const dilations = node.attributes.getInts('dilations', []); + + // TODO: support attribute 'ceil_mode' and 'storage_order' + if (storageOrder !== 0) { + throw new Error('column major storage order is not yet supported for MaxPool'); + } + if (ceilMode !== 0) { + throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); + } + + return createAttributeWithCacheKey( + {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations}); + }; + +const createMaxPoolProgramInfo = + (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes): + ProgramInfo => { + const [adjustedAttributes, outputShape] = + getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); + const op1 = ` + value = max(x_val, value); + `; + const op2 = ''; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) + }; + }; + +const getAdjustedPoolAttributesAndOutputShape = + (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean): + [AveragePoolAttributes|MaxPoolAttributes, number[]] => { + const inputShape = inputs[0].dims.slice(); + const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); + const kernelShape = attributes.kernelShape.slice(); + const strides = attributes.strides.slice(); + const dilations: number[] = hasDilations ? 
(attributes as MaxPoolAttributes).dilations.slice() : []; + const pads = attributes.pads.slice(); + PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); + + const outputShape = PoolConvUtil.computePoolOutputShape( + isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); + + const newAttributes = Object.assign({}, attributes); + if (hasDilations) { + Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); + } else { + Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); + } + return [newAttributes, outputShape]; + }; + +const globalMaxPoolAttributes = { + autoPad: '', + ceilMode: 0, + countIncludePad: false, + kernelShape: [], + strides: [], + pads: [], + storageOrder: 0, + dilations: [], + cacheKey: '' +}; + +const globalMaxPoolMetadata = { + name: 'GlobalMaxPool', + inputTypes: [GpuDataType.default] +}; + +export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { + validateInputs(inputs); + return inferenceHandler.run( + { + ...globalMaxPoolMetadata, + get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes) + }, + inputs); +}; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Pool ops requires 1 input.'); + } + if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { + throw new Error('Invalid input type.'); + } +}; + +const generatePoolingCode = + (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string, + op2: string, dataType: string, start: string): string => { + const rank = inputDims.length; + const outputSize = ShapeUtil.size(outputShape); + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const xIndicesHelper = createIndicesHelper('x', inputDims); + + if (attributes.kernelShape.length <= 2) { + const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; + const sw = attributes.strides[attributes.strides.length - 1]; + const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; + const pwEnd = attributes.pads[attributes.pads.length - 1]; + const dimW = inputDims[rank - 1]; + let codeW = ''; + let codeH = ''; + let codeHEnd = ''; + if (pwStart + pwEnd !== 0) { + codeW = ` + for (var i: u32 = 0u; i < ${kw}u; i++) { + xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; + if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { + pad++; + continue; + } + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } else { + codeW = ` + for (var i: u32 = 0u; i < ${kw}u; i++) { + xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } + + if (attributes.kernelShape.length === 2) { + const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; + const sh = attributes.strides[attributes.strides.length - 2]; + const phStart = attributes.pads[attributes.pads.length / 2 - 2]; + const phEnd = attributes.pads[attributes.pads.length - 2]; + const dimH = inputDims[rank - 2]; + if (phStart + phEnd !== 0) { + codeH = ` + for (var j: u32 = 0u; j < ${kh}u; j++) { + xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; + if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { + pad+= ${kw}; + 
continue; + } + `; + } else { + codeH = ` + for (var j: u32 = 0u; j < ${kh}u; j++) { + xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; + `; + } + codeHEnd = ` + } + `; + } + + const poolingCode = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + @group(0) @binding(0) var x : array<${dataType}>; + @group(0) @binding(1) var output : array<${dataType}>; + + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + + var value: ${dataType} = ${dataType}(${start}); + var pad = 0; + ${codeH} + ${codeW} + ${codeHEnd} + ${op2} + + output[global_id.x] = value; + }`; + return poolingCode; + } else { + const kernelSize = ShapeUtil.size(attributes.kernelShape); + const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); + const stridesRank = kernelStrides.length; + const padsRank = attributes.pads.length; + const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); + let padCode = ''; + if (hasPads) { + padCode = ` + if (xIndices[j] >= inputDims[j]) { + pad++; + isPad = true; + break; + } + } + if (!isPad) { + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } else { + padCode = ` + } + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + `; + } + const poolingCode = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + @group(0) @binding(0) var x : array<${dataType}>; + @group(0) @binding(1) var output : array<${dataType}>; + + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + + const pads = array(${attributes.pads.map(i => `${i}u`).join(',')}); + const inputDims = array(${inputDims.map(i => `${i}u`).join(',')}); + const kernelStrides = array(${kernelStrides.map(i => `${i}u`).join(',')}); + const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + + var offsets: array; + + var value = ${dataType}(${start}); + var pad = 0; + var isPad = false; + + for (var i: u32 = 0u; i < ${kernelSize}u; i++) { + var offset = i; + for (var j = 0u; j < ${stridesRank - 1}u; j++) { + offsets[j] = offset / kernelStrides[j]; + offset -= offsets[j] * kernelStrides[j]; + } + offsets[${stridesRank - 1}] = offset; + + isPad = false; + for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { + xIndices[j] = indices[j] * strides[j - ${rank - stridesRank}u] + + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; + ${padCode} + } + ${op2} + + output[global_id.x] = value; + }`; + return poolingCode; + } + }; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts b/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts new file mode 100644 index 0000000000000..763a656d92abb --- /dev/null +++ 
b/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; + +import {WORKGROUP_SIZE} from './common'; + +export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { + validateInputs(inputs); + + const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)}; + + return inferenceHandler.run( + {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, inputs); +}; + +const createSumProgramInfo = + (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo => { + const dataType = 'f32'; + const outputShape = inputs[0].dims; + const outputSize = ShapeUtil.size(outputShape); + + + const inputsDeclaration = + inputs.map((_, i) => `@group(0) @binding(${i}) var input${i} : array<${dataType}>;`); + const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+'); + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + ${inputsDeclaration.join('\n')} + @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + let offset = global_id.x; + + var value = ${dataType}(0); + value = ${sumLine}; + + output[offset] = value; + }`; + return { + ...sumProgramMetadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length === 0) { + throw new Error('Sum requires inputs.'); + } + + const length = inputs[0].dims.length; + for (let i = 1; i < inputs.length; i++) { + if (length !== inputs[i].dims.length) { + throw new Error('Input shapes are mismatched. broadcasting not supported yet'); + } + + for (let j = 0; j < length; j++) { + if (inputs[0].dims[j] !== inputs[i].dims[j]) { + throw new Error('Input shapes are not matched. broadcasting not supported yet'); + } + } + } + + if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { + throw new Error('Invalid input type.'); + } + for (let i = 1; i < inputs.length; i++) { + if (inputs[0].type !== inputs[i].type) { + throw new Error('Input types are not matched.'); + } + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts b/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts new file mode 100644 index 0000000000000..323e80bdb596a --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
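The reshape kernel below performs no data movement: it asks the inference handler for a new tensor view over the same GPU data. The shape tensor follows ONNX Reshape semantics, which ShapeUtil.calculateReshapedDims is expected to implement; a minimal sketch of those semantics (hypothetical helper, error handling omitted):

// Illustrative only: 0 copies the corresponding input dim, -1 is inferred.
function calcReshapedDims(inputDims: readonly number[], shape: ArrayLike<number>): number[] {
  const out: number[] = [];
  let inferredAxis = -1;
  let product = 1;
  for (let i = 0; i < shape.length; i++) {
    let d = shape[i];
    if (d === 0) { d = inputDims[i]; }      // 0 means "keep input dim"
    if (d === -1) { inferredAxis = i; out.push(1); continue; }
    out.push(d);
    product *= d;
  }
  const inputSize = inputDims.reduce((a, b) => a * b, 1);
  if (inferredAxis >= 0) { out[inferredAxis] = inputSize / product; }
  return out;
}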
+ +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { + validateInputs(inputs); + const shape = await inputs[1].getData(); + const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array); + return [handler.reshape(inputs[0], reshapedDims)]; +}; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('Reshape requires 2 inputs.'); + } + if (inputs[1].type !== 'int32') { + throw new Error('Invalid input type.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts b/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts new file mode 100644 index 0000000000000..94ba9293c457a --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Tensor} from '../../../tensor'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +export const shape = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { + validateInputs(inputs); + return [new Tensor([inputs[0].dims.length], 'int32', undefined, undefined, new Int32Array(inputs[0].dims))]; +}; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Shape requires 1 input.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts b/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts new file mode 100644 index 0000000000000..fd5d6e2d2299e --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts @@ -0,0 +1,180 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
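The slice kernel below generates WGSL from two small string builders, offsetToIndices and indicesToOffset, which are the textual form of ordinary stride arithmetic. A CPU sketch of the same computation, for reference only (assumes strides with an implicit trailing stride of 1, as ShapeUtil.computeStrides produces):

// Decompose a flat offset into per-axis indices using strides.
function offsetToIndicesRef(offset: number, strides: readonly number[]): number[] {
  const indices: number[] = [];
  for (let i = 0; i < strides.length - 1; i++) {
    indices.push(Math.floor(offset / strides[i]));
    offset %= strides[i];
  }
  indices.push(offset);  // last stride is 1
  return indices;
}

// Recombine per-axis indices into a flat offset.
function indicesToOffsetRef(indices: readonly number[], strides: readonly number[]): number {
  return indices.reduce((offset, idx, i) => offset + idx * strides[i], 0);
}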
+ +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo} from '../types'; + +import {WORKGROUP_SIZE} from './common'; + +export interface SliceAttributes extends AttributeWithCacheKey { + readonly axes: number[]; + readonly ends: number[]; + readonly starts: number[]; +} + +const sliceProgramMetadata = { + name: 'Slice', + inputTypes: [GpuDataType.default] +}; + +export const slice: OperatorAsyncImplementation = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise => { + validateInputs(inputs); + return inferenceHandler.run( + { + ...sliceProgramMetadata, + cacheHint: attributes.cacheKey, + get: () => createSliceProgramInfo(inputs[0], attributes) + }, + inputs); +}; + +export const parseSliceAttributes: OperatorInitialization = (node: Graph.Node): SliceAttributes => { + const starts = node.attributes.getInts('starts'); + const ends = node.attributes.getInts('ends'); + const axes = node.attributes.getInts('axes', []); + return createAttributeWithCacheKey({starts, ends, axes}); +}; + +const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => { + const outputLines: string[] = []; + + for (let i = 0; i < strides.length - 1; i++) { + outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`); + outputLines.push(`${offset}%=${strides[i]}u;`); + } + outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`); + + return outputLines.join('\n'); +}; + +const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => { + const outputLines: string[] = []; + + for (let i = 0; i < strides.length - 1; i++) { + outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`); + } + outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`); + + return outputLines.join('\n'); +}; + +const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => { + const axes = (attributes.axes.length === 0) ? 
input.dims.slice(0).map((val, i) => i) : attributes.axes; + const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length); + const starts = attributes.starts.map((start, i) => { + if (start > input.dims[normalizedAxes[i]] - 1) { + return input.dims[normalizedAxes[i]]; + } + return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]); + }); + const ends = attributes.ends.map((end, i) => { + if (end > input.dims[normalizedAxes[i]] - 1) { + return input.dims[normalizedAxes[i]]; + } + return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]); + }); + + const outputShape = input.dims.slice(); + + const sliceOps: string[] = []; + for (let i = 0; i < normalizedAxes.length; i++) { + outputShape[normalizedAxes[i]] = ends[i] - starts[i]; + if (starts[i] > 0) { + sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`); + } // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); } + } + + const outputSize = ShapeUtil.size(outputShape); + const outputStrides = ShapeUtil.computeStrides(outputShape); + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + @group(0) @binding(0) var input : array<${dataType}>; + @group(0) @binding(1) var output : array<${dataType}>; + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + var offset = global_id.x; + ${offsetToIndices('offset', outputStrides, 'idx_')} + ${sliceOps.join('')} + var offsetInput = 0u; + ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')} + output[global_id.x] = input[offsetInput]; + }`; + return { + ...sliceProgramMetadata, + outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; +}; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Slice requires 1 input.'); + } + if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { + throw new Error('Invalid input type.'); + } +}; + +export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { + validateInputsV10(inputs); + const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs); + return inferenceHandler.run( + { + ...sliceProgramMetadata, + cacheHint: attributes.cacheKey, + get: () => createSliceProgramInfo(inputs[0], attributes) + }, + [inputs[0]]); +}; + +const generateSliceAttributesFromInputs = + (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => { + if (!inferenceHandler.session.isInitializer(inputs[1].dataId) || + !inferenceHandler.session.isInitializer(inputs[2].dataId) || + (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) || + (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) { + throw new Error('dynamic slice attributes are not allowed'); + } + + if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) { + throw new Error('currently non-1 steps is not supported for Slice'); + } + + const starts = Array.from(inputs[1].integerData); + const ends = Array.from(inputs[2].integerData); + const axes = inputs.length >= 4 ? 
Array.from(inputs[3].integerData) : []; + const cacheKey = `${axes};${starts};${ends}`; + return {starts, ends, axes, cacheKey}; + }; + +const validateInputsV10 = (inputs: Tensor[]): void => { + if (!inputs || inputs.length < 3 || inputs.length > 5) { + throw new Error('Invalid input number.'); + } + if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) { + throw new Error('Invalid input type.'); + } + if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) { + throw new Error('Invalid input type.'); + } + if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) { + throw new Error('Invalid input type.'); + } + if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) { + throw new Error('Invalid input type.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts b/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts new file mode 100644 index 0000000000000..7cd85e6877b03 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Graph} from '../../../graph'; +import {OperatorImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +export const squeeze: OperatorImplementation = + (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { + validateInputs(inputs); + const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes); + const output = inferenceHandler.reshape(inputs[0], outputShape); + return [output]; + }; + +export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { + validateInputsV13(inputs); + return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); +}; + +export const parseSqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => + node.attributes.getInts('axes'); + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Squeeze requires 1 input.'); + } + + if (inputs[0].type === 'string') { + throw new Error('invalid input tensor types.'); + } +}; + +const validateInputsV13 = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('Squeeze requires 2 inputs.'); + } + + if (inputs[1].type !== 'int32') { + throw new Error('Invalid input type.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts b/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts new file mode 100644 index 0000000000000..e83dd7fcbb0b9 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts @@ -0,0 +1,116 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
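The transpose shader below encodes one mapping: for each output element, the input index along axis perm[i] equals the output index along axis i. A CPU sketch of that permutation logic (illustrative names, not part of the patch):

// Illustrative reference for the perm() function the shader generates.
function transposeRef(input: Float32Array, dims: readonly number[], perm: readonly number[]):
    {data: Float32Array; dims: number[]} {
  const outDims = perm.map(p => dims[p]);
  const stridesOf = (d: readonly number[]) =>
      d.map((_, i) => d.slice(i + 1).reduce((a, b) => a * b, 1));
  const inStrides = stridesOf(dims);
  const outStrides = stridesOf(outDims);
  const out = new Float32Array(input.length);
  for (let o = 0; o < out.length; o++) {
    let rem = o;
    let inOffset = 0;
    for (let i = 0; i < outDims.length; i++) {
      const idx = Math.floor(rem / outStrides[i]);
      rem %= outStrides[i];
      inOffset += idx * inStrides[perm[i]];  // aIndices[perm[i]] = indices[i]
    }
    out[o] = input[inOffset];
  }
  return {data: out, dims: outDims};
}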
+ +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +export interface TransposeAttributes extends AttributeWithCacheKey { + readonly perm: number[]; +} + +const transposeProgramMetadata = { + name: 'Transpose', + inputTypes: [GpuDataType.default] +}; + +export const transpose: OperatorAsyncImplementation<TransposeAttributes> = async( + inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise<Tensor[]> => { + validateInputs(inputs); + return inferenceHandler.run( + { + ...transposeProgramMetadata, + cacheHint: attributes.cacheKey, + get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm) + }, + inputs); +}; + +export const parseTransposeAttributes: OperatorInitialization<TransposeAttributes> = + (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', [])}); + +const createTransposeProgramInfo = + (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => { + const dataType = 'f32'; // TODO: support other data type + const inputShape = input.dims; + perm = getAdjustedPerm(inputShape, perm); + const outputShape = getOutputShape(inputShape, perm); + const rank = inputShape.length; + const outputSize = ShapeUtil.size(outputShape); + // A dims=[${inputs[0].dims.toString()}] + // out Dims=[${unpackedOutputShape.toString()}] + // based on perm=[${perm.toString()}] + + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const inputIndicesHelper = createIndicesHelper('a', inputShape); + + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + @group(0) @binding(0) var<storage, read> a : array<${dataType}>; + @group(0) @binding(1) var<storage, write> output : array<${dataType}>; + + ${permFunctionBody(perm, rank)} + ${outputIndicesHelper.o2iImpl} + ${inputIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} + perm(&aIndices, &indices); + + output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; + }`; + return { + ...transposeProgramMetadata, + outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => { + if (perm && perm.length !== inputShape.length) { + perm = [...(inputShape.keys())].reverse(); + } + return perm; +}; + +const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => { + perm = getAdjustedPerm(inputShape, perm); + return ShapeUtil.sortBasedOnPerm(inputShape, perm); +}; + +const permFunctionBody = (perm: number[], rank: number): string => { + const reverseFunc = []; + reverseFunc.push(`fn perm(a: ptr<function, array<u32, ${rank}>>, i: 
ptr<function, array<u32, ${rank}>>) {`); + for (let i = 0; i < rank; ++i) { + reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); + } + reverseFunc.push('\t}'); + return reverseFunc.join('\n'); +}; + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Transpose requires 1 input.'); + } + + if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { + throw new Error('input should be float tensor'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts b/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts new file mode 100644 index 0000000000000..54213cfdd2313 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts @@ -0,0 +1,197 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {Graph} from '../../../graph'; +import {Tensor} from '../../../tensor'; +import {MAX_CLIP, MIN_CLIP} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {WORKGROUP_SIZE} from './common'; + +type BuiltinFunctionName = string; +type ElementwiseCustomExpression = (expression: string) => string; +type ElementwiseFunctionCall = BuiltinFunctionName|ElementwiseCustomExpression; + +const createElementwiseProgramShader = + (datasize: number, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): string => { + const vecSize = Math.ceil(datasize / 4); + + let expression = ''; + if (typeof funcCall === 'string') { + expression = `${funcCall}(a)`; + } else { + expression = funcCall('a'); + } + return ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + @group(0) @binding(0) var<storage, read> inputData : array<vec4<f32>>; + @group(0) @binding(1) var<storage, write> outputData : array<vec4<f32>>; + + ${additionalImplementation ?? 
''} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${vecSize}u) { + return; + } + + let a = inputData[global_id.x]; + outputData[global_id.x] = ${expression}; + }`; + }; + +const createElementwiseProgramInfo = + (metadata: ProgramMetadata, input: Tensor, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): + ProgramInfo => ({ + ...metadata, + shaderSource: createElementwiseProgramShader(input.size, funcCall, additionalImplementation), + outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}], + dispatchGroup: (inputTensors) => + ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)}) + }); + +const createElementwiseProgramInfoLoader = + (input: Tensor, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string, + cacheKey?: string): ProgramInfoLoader => { + const metadata: ProgramMetadata = {name, inputTypes: [GpuDataType.default], cacheHint: cacheKey}; + return { + ...metadata, + get: () => createElementwiseProgramInfo(metadata, input, funcCall, additionalImplementation) + }; + }; + +export const abs = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Abs', 'abs'), inputs); + +export const acos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Acos', 'acos'), inputs); + +export const asin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Asin', 'asin'), inputs); + +export const atan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Atan', 'atan'), inputs); + +export interface ClipAttributes extends AttributeWithCacheKey { + readonly min: number; + readonly max: number; +} + +export const clip = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ClipAttributes): + Promise<Tensor[]>=>handler.run( + createElementwiseProgramInfoLoader( + inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` + let clip_min_: vec4<f32> = vec4<f32>(f32(${attributes.min})); + let clip_max_: vec4<f32> = vec4<f32>(f32(${attributes.max})); +`, + attributes.cacheKey), + inputs); + +export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey( + {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)}); + +const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => { + if (inputs.length >= 3 && + (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) { + throw new Error('dynamic clip attributes are not allowed'); + } + + const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP; + const max = (inputs.length >= 3) ? 
inputs[2].numberData[0] : MAX_CLIP; + return createAttributeWithCacheKey({min, max}); +}; + +export const clipV11 = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => { + const attributes = generateClipAttributesFromInputs(handler, inputs); + return clip(handler, [inputs[0]], attributes); +}; + +export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs); + +export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs); + +export interface EluAttributes extends AttributeWithCacheKey { + readonly alpha: number; +} + +export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): + Promise<Tensor[]>=>handler.run( + createElementwiseProgramInfoLoader( + inputs[0], 'Elu', a => `elu_vf32(${a})`, ` + let elu_alpha_: f32 = f32(${attributes.alpha}); + + fn elu_f32(a: f32) -> f32 { + return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); + } + + fn elu_vf32(v: vec4<f32>) -> vec4<f32> { + return vec4<f32>(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); + }`, + attributes.cacheKey), + inputs); + +export const parseEluAttributes = (node: Graph.Node): EluAttributes => + createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)}); + +export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); + +export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs); + +export interface LeakyReluAttributes extends AttributeWithCacheKey { + readonly alpha: number; +} + +export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: LeakyReluAttributes): + Promise<Tensor[]>=>handler.run( + createElementwiseProgramInfoLoader( + inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, ` + let leaky_relu_alpha_: f32 = f32(${attributes.alpha}); + + fn leaky_relu_f32(a: f32) -> f32 { + return select(a, a * leaky_relu_alpha_, a < 0.0); + } + + fn leaky_relu_vf32(v: vec4<f32>) -> vec4<f32> { + return vec4<f32>(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w)); + }`, + attributes.cacheKey), + inputs); + +export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes => + createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)}); + +export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); + +export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs); + +// export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; + +export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run( + createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4<f32>(0.0))`), inputs); + +export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run( + createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4<f32>(1.0) / (vec4<f32>(1.0) + 
exp(-${a})))`), inputs); + +export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs); + +export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs); + +export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs); + +export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => + handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs); diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts b/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts new file mode 100644 index 0000000000000..8a099dc92cbd9 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Graph} from '../../../graph'; +import {OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { + validateInputs(inputs); + const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes); + const output = inferenceHandler.reshape(inputs[0], outputShape); + return [output]; +}; + +export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { + validateInputsV13(inputs); + return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); +}; + +export const parseUnsqueezeAttributes: OperatorInitialization<number[]> = (node: Graph.Node): number[] => + node.attributes.getInts('axes'); + +const validateInputs = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Unsqueeze requires 1 input.'); + } + + if (inputs[0].type === 'string') { + throw new Error('invalid input tensor types.'); + } +}; + +const validateInputsV13 = (inputs: Tensor[]): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('Unsqueeze requires 2 inputs.'); + } + + if (inputs[1].type !== 'int32') { + throw new Error('Invalid input type.'); + } +}; diff --git a/js/web/lib/onnxjs/backends/webgpu/program-manager.ts b/js/web/lib/onnxjs/backends/webgpu/program-manager.ts new file mode 100644 index 0000000000000..dac32ccbe4f72 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/program-manager.ts @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
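The ProgramManager below caches compiled pipelines keyed by the loader object. The contract it relies on, sketched in isolation (the key shape is an assumption; both loader identity and a name-plus-cacheHint string appear plausible from this patch):

// Illustrative memoization pattern, not part of the patch.
interface MiniArtifact { pipeline: unknown; }

class ArtifactCache {
  private repo = new Map<string, MiniArtifact>();
  getOrBuild(key: string, build: () => MiniArtifact): MiniArtifact {
    let artifact = this.repo.get(key);
    if (!artifact) {
      artifact = build();             // compile the shader module once...
      this.repo.set(key, artifact);   // ...then reuse it for every later run
    }
    return artifact;
  }
}

This is why ProgramMetadata carries an optional cacheHint: two instances of the same operator with different attributes must not share one compiled shader.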
+ +import {env} from 'onnxruntime-common'; + +import {Logger, Profiler} from '../../instrument'; +import {WebGpuBackend} from '../backend-webgpu'; + +import {Artifact, GpuData, ProgramInfo} from './types'; + +/** + * ProgramManager is the main class behind running computations. + * It compiles the given ProgramInfo's into WebGPU compute pipelines (cached as Artifacts) + * and uses an Artifact to run a computation by binding the input and output GPU buffers, + * in @binding order, and dispatching a compute pass. + */ +export class ProgramManager { + repo: Map<unknown, Artifact>; // this should be per-session object + attributesBound: boolean; + + constructor(private backend: WebGpuBackend, public profiler: Readonly<Profiler>) { + this.repo = new Map(); + this.attributesBound = false; + } + getArtifact(key: unknown): Artifact|undefined { + return this.repo.get(key); + } + setArtifact(key: unknown, artifact: Artifact): void { + this.repo.set(key, artifact); + } + run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], + dispatchGroup: {x: number; y?: number; z?: number}): void { + const device = this.backend.device; + + const computePassEncoder = this.backend.getComputePassEncoder(); + + computePassEncoder.setPipeline(buildArtifact.computePipeline); + const entries = []; + for (const input of inputs) { + entries.push({binding: entries.length, resource: {buffer: input.buffer}}); + } + for (const output of outputs) { + entries.push({binding: entries.length, resource: {buffer: output.buffer}}); + } + const bindGroup = device.createBindGroup({layout: buildArtifact.computePipeline.getBindGroupLayout(0), entries}); + computePassEncoder.setBindGroup(0, bindGroup); + + const {x, y, z} = dispatchGroup; + computePassEncoder.dispatch(x, y, z); + + this.backend.pendingDispatchNumber++; + + if (this.backend.pendingDispatchNumber >= 16) { + this.backend.flush(); + } + } + dispose(): void { + // this.repo.forEach(a => this.glContext.deleteProgram(a.program)); + } + build(programInfo: ProgramInfo): Artifact { + const device = this.backend.device; + + const shaderModule = device.createShaderModule({code: programInfo.shaderSource}); + if (env.debug) { + Logger.verbose('WebGpuProgram', programInfo.shaderSource); + } + + const computePipeline = device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}}); + + return {programInfo, computePipeline}; + } +} diff --git a/js/web/lib/onnxjs/backends/webgpu/session-handler.ts b/js/web/lib/onnxjs/backends/webgpu/session-handler.ts new file mode 100644 index 0000000000000..1fe288c36dd1e --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/session-handler.ts @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
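The session handler below tracks which tensor IDs are graph initializers. Ops such as Slice-10 and Clip-11 read attribute-style inputs (starts, ends, min, max) at resolve time, which is only safe when those inputs are constants. A minimal sketch of the check, with hypothetical names and a number ID in place of Tensor.Id:

// Illustrative only; the real handler populates the set in onGraphInitialized.
const initializers = new Set<number>();
const isInitializer = (tensorId: number): boolean => initializers.has(tensorId);

function foldStarts(tensorId: number, starts: Int32Array): number[] {
  if (!isInitializer(tensorId)) {
    // mirrors the guard in generateSliceAttributesFromInputs above
    throw new Error('dynamic slice attributes are not allowed');
  }
  return Array.from(starts);
}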
+ +import {SessionHandler} from '../../backend'; +import {Graph} from '../../graph'; +import {Operator} from '../../operators'; +import {OpSet, resolveOperator} from '../../opset'; +import {Session} from '../../session'; +import {Tensor} from '../../tensor'; +import {WebGpuBackend} from '../backend-webgpu'; + +import {WebGpuInferenceHandler} from './inference-handler'; +import {WEBGPU_OP_RESOLVE_RULES} from './op-resolve-rules'; +import {ProgramManager} from './program-manager'; +import {createTensorDataManager, TensorDataManager} from './tensor-data-manager'; + +export class WebGpuSessionHandler implements SessionHandler { + private initializers: Set<Tensor.Id>; + readonly dataManager: TensorDataManager; + readonly programManager: ProgramManager; + + constructor(public readonly backend: WebGpuBackend, public readonly context: Session.Context) { + this.dataManager = createTensorDataManager(this.backend.gpuDataManager); + this.programManager = new ProgramManager(this.backend, this.context.profiler); + } + + createInferenceHandler() { + return new WebGpuInferenceHandler(this); + } + onGraphInitialized(graph: Graph): void { + const initializers = graph.getValues().filter(v => v.from === -1 && v.tensor).map(v => v.tensor!.dataId); + this.initializers = new Set(initializers); + } + isInitializer(tensorId: Tensor.Id): boolean { + return this.initializers ? this.initializers.has(tensorId) : false; + } + addInitializer(tensorId: Tensor.Id): void { + this.initializers.add(tensorId); + } + dispose(): void { + // TODO + } + resolve(node: Graph.Node, opsets: readonly OpSet[], graph: Graph): Operator { + const op = resolveOperator(node, opsets, WEBGPU_OP_RESOLVE_RULES); + return {impl: op.opImpl, context: op.opInit ? op.opInit(node, graph) : node}; + } +} diff --git a/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts b/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts new file mode 100644 index 0000000000000..bdf6c7f9ebe42 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {createView, Tensor} from '../../tensor'; + +import {GpuDataManager} from './gpu-data-manager'; +import {GpuData, GpuDataId, GpuDataType} from './types'; + +/** + * manages the Tensor ID -> GPU Data ID mapping + * + * A tensor ID is a unique ID representing a value (tensor), which is a graph node's input or output. + * A GPU Data ID is a unique ID representing an abstract piece of data in GPU memory. Specifically, for current WebGPU scenarios, + * GPU Data is a storage buffer, and GPU Data ID is a handle to a storage buffer. + * + * - a value is different from a graph edge: if a node's output is consumed by 2 downstream nodes, there are + * 2 edges, but only one value. + * + * - a tensor ID maps to 0 or 1 GPU Data ID, depending on whether the data is available on GPU or not. + * + * - a GPU Data ID maps to 1 or more tensor IDs. + * + */ +export interface TensorDataManager { + /** + * upload a CPU tensor to GPU. + */ + uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): Promise<GpuData>; + + /** + * create a new GPU tensor. + */ + createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData]; + + /** + * check whether the tensor has GPU data + */ + hasGpuData(tensorId: Tensor.Id): boolean; + + /** + * create a reference to the GPU data. 
+ */ + createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData]; + + /** + * release the GPU resources referred by the tensor. + */ + releaseGpuTensor(tensorId: Tensor.Id): void; +} + +class TensorDataManagerImpl implements TensorDataManager { + private map: Map<Tensor.Id, GpuDataId>; + private reverseMap: Map<GpuDataId, Set<Tensor.Id>>; + + constructor(private gpuDataManager: GpuDataManager) { + this.map = new Map(); + this.reverseMap = new Map(); + } + + private registerIdMapping(tensorId: Tensor.Id, gpuDataId: GpuDataId): void { + this.map.set(tensorId, gpuDataId); + + let tensorIds = this.reverseMap.get(gpuDataId); + if (!tensorIds) { + tensorIds = new Set(); + this.reverseMap.set(gpuDataId, tensorIds); + } + tensorIds.add(tensorId); + } + + async uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): Promise<GpuData> { + const gpuDataId = this.map.get(tensor.dataId); + if (gpuDataId) { + const gpuData = this.gpuDataManager.get(gpuDataId); + if (!gpuData) { + throw new Error('internal error. this should never happen'); + } + return gpuData; + } + + const gpuData = await this.gpuDataManager.upload(tensor.numberData, gpuDataType); + this.registerIdMapping(tensor.dataId, gpuData.id); + return gpuData; + } + + createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { + const gpuData = this.gpuDataManager.create(type, dims, gpuDataType); + const tensor = new Tensor(dims, type, undefined, async () => { + const data = await this.gpuDataManager.download(gpuData.id); + return createView(data, type); + }); + + this.registerIdMapping(tensor.dataId, gpuData.id); + return [tensor, gpuData]; + } + + hasGpuData(tensorId: Tensor.Id): boolean { + return this.map.has(tensorId); + } + + createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData] { + const gpuDataId = this.map.get(tensorId); + if (!gpuDataId) { + throw new Error('internal error. this should never happen'); + } + + const gpuData = this.gpuDataManager.get(gpuDataId); + if (!gpuData) { + throw new Error('internal error. this should never happen'); + } + + const tensor = new Tensor(dims, type, undefined, async () => { + const data = await this.gpuDataManager.download(gpuData.id); + return createView(data, type); + }); + + this.registerIdMapping(tensor.dataId, gpuData.id); + return [tensor, gpuData]; + } + + releaseGpuTensor(tensorId: Tensor.Id): void { + const gpuDataId = this.map.get(tensorId); + if (gpuDataId) { + this.map.delete(tensorId); + + const tensorIds = this.reverseMap.get(gpuDataId); + if (!tensorIds) { + throw new Error('internal error. this should never happen'); + } + tensorIds.delete(tensorId); + if (tensorIds.size === 0) { + this.gpuDataManager.release(gpuDataId); + this.reverseMap.delete(gpuDataId); + } + } + } +} + +export const createTensorDataManager = (gpuDataManager: GpuDataManager): TensorDataManager => + new TensorDataManagerImpl(gpuDataManager); diff --git a/js/web/lib/onnxjs/backends/webgpu/types.ts b/js/web/lib/onnxjs/backends/webgpu/types.ts new file mode 100644 index 0000000000000..96f6e247de5a3 --- /dev/null +++ b/js/web/lib/onnxjs/backends/webgpu/types.ts @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
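The types below formalize the deferred-creation pattern the kernels above all use: a loader carries the cheap metadata (name, input types, cacheHint) so the artifact cache can be consulted first, and get() builds the full ProgramInfo, including the shader source string, only on a cache miss. A self-contained sketch of the pattern (simplified interfaces, illustrative names):

// Illustrative only; mirrors ProgramMetadata / ProgramInfoLoader below.
interface MiniMetadata { name: string; cacheHint?: string; }
interface MiniProgramInfo extends MiniMetadata { shaderSource: string; }
interface MiniLoader extends MiniMetadata { get(): MiniProgramInfo; }

const makeLoader = (size: number): MiniLoader => {
  const metadata: MiniMetadata = {name: 'Abs', cacheHint: `${size}`};
  // shader source is only generated when get() is actually called
  return {...metadata, get: () => ({...metadata, shaderSource: `/* WGSL for ${size} elements */`})};
};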
+ +import {Guid} from 'guid-typescript'; + +import {Tensor} from '../../tensor'; + +export enum GpuDataType { + default = 0 +} +export type GpuDataId = Guid; + +export interface GpuData { + type: GpuDataType; + id: GpuDataId; + buffer: GPUBuffer; +} + +export interface TensorInfo { + id?: Tensor.Id; + dims: readonly number[]; + type: Tensor.DataType; + gpuDataType: GpuDataType; +} + + +export interface ProgramVariable { + type: 'float'|'int'; + name: string; + arrayLength?: number; + data: number|number[]; +} + + +export interface ProgramMetadata { + /** + * the name of the program. used for debugging and profiling + */ + name: string; + + // inputLayouts: GPUBindGroupLayoutEntry[]; + // outputLayouts: GPUBindGroupLayoutEntry[]; + + /** + * gpu data types for each input + */ + inputTypes: GpuDataType[]; + /** + * an optional string as a cache hint in the artifact cache + */ + cacheHint?: string; +} + +/** + * A ProgramInfoLoader allows the creation of a ProgramInfo to be deferred until it is needed, + * so that the artifact cache can be queried using the metadata alone. + */ +export interface ProgramInfoLoader extends ProgramMetadata { + /** + * a function to get the program info + */ + get(): ProgramInfo; +} + +/** + * A set of data that represent a shader program + */ +export interface ProgramInfo extends ProgramMetadata { + /** + * information of uniform variables + */ + variables?: ProgramVariable[]; + /** + * tensor info for outputs + */ + outputs: TensorInfo[]; + /** + * the shader's processing source code + */ + shaderSource: string; + /** + * default is "main" + */ + // entryPoint: string; + + dispatchGroup: (inputs: readonly Tensor[]) => { + x: number; + y?: number; + z?: number; + }; +} + +export interface Artifact { + programInfo: ProgramInfo; + computePipeline: GPUComputePipeline; + // attribLocations: {position: number; textureCoord: number}; +} diff --git a/js/web/lib/onnxjs/execution-plan.ts b/js/web/lib/onnxjs/execution-plan.ts index b95e639817dbf..5136e1283d119 100644 --- a/js/web/lib/onnxjs/execution-plan.ts +++ b/js/web/lib/onnxjs/execution-plan.ts @@ -1,7 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+import {env} from 'onnxruntime-common';
+
 import {SessionHandler} from './backend';
+import {WebGpuBackend} from './backends/backend-webgpu';
 import {Graph} from './graph';
 import {Logger, Profiler} from './instrument';
 import {Operator} from './operators';
@@ -57,6 +60,7 @@ export class ExecutionPlan {
 
     // create inference handler
     const inferenceHandler = sessionHandler.createInferenceHandler();
+    const IS_WEBGPU = sessionHandler.backend instanceof WebGpuBackend;
 
     // populate inputs value
     const graphInputs = this.graph.getInputIndices();
@@ -103,6 +107,17 @@ export class ExecutionPlan {
         throw new Error('the size of output does not match model definition.');
       }
 
+      if (env.debug) {
+        for (let i = 0; i < outputList.length; i++) {
+          if (IS_WEBGPU) {
+            await outputList[i].getData();
+          } else {
+            // eslint-disable-next-line no-unused-expressions
+            outputList[i].data;
+          }
+        }
+      }
+
       // fill value
       outputList.forEach((output, i) => {
         const j = thisOp.node.outputs[i];
@@ -110,6 +125,10 @@ export class ExecutionPlan {
           throw new Error(`output [${j}] already has value: op:${thisOp.node.name}`);
         }
         this._values[j] = output;
+
+        if (env.debug) {
+          Logger.verbose('ExecPlanDataDump', `output${i}[${output.dims}]:${output.data}`);
+        }
       });
 
       // resolve downstream nodes
@@ -140,7 +159,8 @@ export class ExecutionPlan {
       if (outputTensor === undefined) {
         throw new Error(`required output [${outputIndex}] does not have value`);
       }
-      if (outputIndex === 0) {
+
+      if (IS_WEBGPU) {
         await outputTensor.getData();
       } else {
         // eslint-disable-next-line no-unused-expressions
diff --git a/js/web/lib/onnxjs/operators.ts b/js/web/lib/onnxjs/operators.ts
index 4d664f6dcda5a..2117484316dca 100644
--- a/js/web/lib/onnxjs/operators.ts
+++ b/js/web/lib/onnxjs/operators.ts
@@ -5,11 +5,13 @@ import {InferenceHandler} from './backend';
 import {Graph} from './graph';
 import {Tensor} from './tensor';
 
-export type OperatorImplementation<T = unknown> = (inferenceHandler: InferenceHandler, inputs: Tensor[], context: T) => Tensor[];
+export type OperatorImplementation<ContextType = unknown, ReturnType = Tensor[]> =
+    (inferenceHandler: InferenceHandler, inputs: Tensor[], context: ContextType) => ReturnType;
+export type OperatorAsyncImplementation<ContextType = unknown> = OperatorImplementation<ContextType, Promise<Tensor[]>>;
 export type OperatorInitialization<T = unknown> = (node: Graph.Node, graph: Graph) => T;
 
 export interface Operator {
-  readonly impl: OperatorImplementation;
+  readonly impl: OperatorImplementation|OperatorAsyncImplementation;
 
   readonly context: Graph.Node|unknown;
 }
diff --git a/js/web/lib/onnxjs/opset.ts b/js/web/lib/onnxjs/opset.ts
index e23a288b4e22b..12618969efc1a 100644
--- a/js/web/lib/onnxjs/opset.ts
+++ b/js/web/lib/onnxjs/opset.ts
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
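This split is what lets WebGPU kernels return promises while the existing CPU/WebGL kernels stay synchronous. A minimal sketch of both shapes, together with the ResolveRule tuples (defined in opset.ts below) that would register them; the kernels and the version selector are hypothetical:

    const identity: OperatorImplementation = (_handler, inputs) => inputs;
    const identityAsync: OperatorAsyncImplementation = async (_handler, inputs) => inputs;

    const rules: readonly OpSet.ResolveRule[] = [
      ['Identity', '', '1+', identity],
      ['Identity', '', '1+', identityAsync],
    ];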
 import {Graph} from './graph';
-import {OperatorImplementation, OperatorInitialization} from './operators';
+import {OperatorAsyncImplementation, OperatorImplementation, OperatorInitialization} from './operators';
 
 export interface OpSet {
   domain: string;
@@ -19,9 +19,11 @@ export declare namespace OpSet {
    * A resolve rule consists of 4 or 5 items: opType, opSetDomain, versionSelector, operatorImplementation and
    * operatorInitialization (optional)
    */
-  type ResolveRule = [
-    string, Domain, string, OperatorImplementation
-  ]|[string, Domain, string, OperatorImplementation, OperatorInitialization];
+  type ResolveRule =
+      [
+        string, Domain, string, OperatorImplementation|OperatorAsyncImplementation
+      ]|[string, Domain, string, OperatorImplementation|OperatorAsyncImplementation,
+         OperatorInitialization];
 }
 
 export function resolveOperator(node: Graph.Node, opsets: readonly OpSet[], rules: readonly OpSet.ResolveRule[]) {
diff --git a/js/web/lib/onnxjs/tensor.ts b/js/web/lib/onnxjs/tensor.ts
index 4ec49f7b936ea..db5e599fd68dc 100644
--- a/js/web/lib/onnxjs/tensor.ts
+++ b/js/web/lib/onnxjs/tensor.ts
@@ -131,7 +131,15 @@ export class Tensor {
    */
   async getData(): Promise<TensorData> {
     if (this.cache === undefined) {
-      this.cache = await this.asyncDataProvider!(this.dataId);
+      if (this.asyncDataProvider) {
+        const data = await this.asyncDataProvider(this.dataId);
+        if (data.length !== this.size) {
+          throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.');
+        }
+        this.cache = data;
+      } else {
+        return this.data;
+      }
     }
     return this.cache;
   }
@@ -348,7 +356,7 @@ export class Tensor {
   }
 }
 
-function sizeof(type: Tensor.DataType): number {
+export function sizeof(type: Tensor.DataType): number {
   switch (type) {
     case 'bool':
     case 'int8':
@@ -390,7 +398,7 @@ function sizeofProto(type: onnx.TensorProto.DataType|ortFbs.TensorDataType): num
   }
 }
 
-function createView(dataBuffer: ArrayBuffer, type: Tensor.DataType) {
+export function createView(dataBuffer: ArrayBuffer, type: Tensor.DataType) {
   return new (dataviewConstructor(type))(dataBuffer);
 }
 
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 186fee0dcfd54..4e68256a80774 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -29,6 +29,7 @@
         "@types/mocha": "^8.2.2",
         "@types/npmlog": "^4.1.2",
         "@types/platform": "^1.3.3",
+        "@webgpu/types": "^0.1.13",
         "base64-js": "^1.5.1",
         "chai": "^4.3.4",
         "dir-compare": "^3.3.0",
@@ -591,6 +592,12 @@
         "@xtuc/long": "4.2.2"
       }
     },
+    "node_modules/@webgpu/types": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.13.tgz",
+      "integrity": "sha512-SAq8FRONvMANQi/eXw5ArKfSvih6am/EC+5y7+du2xf1VyprtKn4ylUPKGW4T6ZkDogtH3xZgGE+J/cx601L5w==",
+      "dev": true
+    },
     "node_modules/@webpack-cli/configtest": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/@webpack-cli/configtest/-/configtest-2.0.1.tgz",
@@ -7552,6 +7559,12 @@
         "@xtuc/long": "4.2.2"
       }
     },
+    "@webgpu/types": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.13.tgz",
+      "integrity": "sha512-SAq8FRONvMANQi/eXw5ArKfSvih6am/EC+5y7+du2xf1VyprtKn4ylUPKGW4T6ZkDogtH3xZgGE+J/cx601L5w==",
+      "dev": true
+    },
     "@webpack-cli/configtest": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/@webpack-cli/configtest/-/configtest-2.0.1.tgz",
diff --git a/js/web/package.json b/js/web/package.json
index 42ecd1ec81d6c..1aa8e3e673de7 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -45,6 +45,7 @@
"@types/mocha": "^8.2.2", "@types/npmlog": "^4.1.2", "@types/platform": "^1.3.3", + "@webgpu/types": "^0.1.13", "base64-js": "^1.5.1", "chai": "^4.3.4", "dir-compare": "^3.3.0", diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 289f9ad8eeddc..c0921ff65470e 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -34,6 +34,7 @@ Options: -b=<...>, --backend=<...> Specify one or more backend(s) to run the test upon. Backends can be one or more of the following, splitted by comma: webgl + webgpu wasm xnnpack js @@ -99,7 +100,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'wasm'|'onnxruntime'|'xnnpack'|'js'; + type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'js'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'prod'|'dev'|'perf'; } @@ -335,7 +336,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'wasm', 'xnnpack', 'js']; + const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'js']; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index f4d2d0a90b4fd..9b242b036b691 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -50,7 +50,7 @@ if (shouldLoadSuiteTestData) { // The default backends and opset version lists. Those will be used in suite tests. const DEFAULT_BACKENDS: readonly TestRunnerCliArgs.Backend[] = - args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl']; + args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu']; const DEFAULT_OPSET_VERSIONS: readonly number[] = [13, 12, 11, 10, 9, 8, 7]; const FILE_CACHE_ENABLED = args.fileCache; // whether to enable file cache @@ -454,11 +454,13 @@ function run(config: Test.Config) { // STEP 5. use Karma to run test npmlog.info('TestRunnerCli.Run', '(5/5) Running karma to start test runner...'); const karmaCommand = path.join(npmBin, 'karma'); + const webgpu = args.backends.indexOf('webgpu') > -1; const browser = getBrowserNameFromEnv( args.env, args.bundleMode === 'perf' ? 'perf' : args.debug ? 
'debug' : - 'test'); + 'test', + webgpu); const karmaArgs = ['start', `--browsers ${browser}`]; if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); @@ -468,6 +470,9 @@ function run(config: Test.Config) { if (args.noSandbox) { karmaArgs.push('--no-sandbox'); } + if (webgpu) { + karmaArgs.push('--force-localhost'); + } karmaArgs.push(`--bundle-mode=${args.bundleMode}`); if (browser === 'Edge') { // There are currently 2 Edge browser launchers: @@ -559,10 +564,11 @@ function saveConfig(config: Test.Config) { fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config); } -function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test') { + +function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode); + return selectChromeBrowser(mode, webgpu); case 'edge': return 'Edge'; case 'firefox': @@ -578,13 +584,22 @@ function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'per } } -function selectChromeBrowser(mode: 'debug'|'perf'|'test') { - switch (mode) { - case 'debug': - return 'ChromeDebug'; - case 'perf': - return 'ChromePerf'; - default: - return 'ChromeTest'; +function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) { + if (webgpu) { + switch (mode) { + case 'debug': + return 'ChromeCanaryDebug'; + default: + return 'ChromeCanaryTest'; + } + } else { + switch (mode) { + case 'debug': + return 'ChromeDebug'; + case 'perf': + return 'ChromePerf'; + default: + return 'ChromeTest'; + } } } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index cb59689c4b027..eb84e2babc754 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -282,6 +282,276 @@ "xor.jsonc" ] }, + "webgpu": { + "onnx": [], + "node": [ + "test_abs", + "test_acos_example", + "test_acos", + "test_add_bcast", + "test_add", + // "test_and_bcast3v1d", + // "test_and_bcast3v2d", + // "test_and_bcast4v2d", + // "test_and_bcast4v3d", + // "test_and_bcast4v4d", + // "test_and2d", + // "test_and3d", + // "test_and4d", + "test_asin_example", + "test_asin", + "test_atan_example", + "test_atan", + "test_averagepool_1d_default", + "test_averagepool_2d_default", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_strides", + "test_averagepool_3d_default", + "test_basic_conv_with_padding", + "test_basic_conv_without_padding", + // "test_batchnorm_epsilon", + // "test_batchnorm_example", + // "test_cast_DOUBLE_to_FLOAT", + // "test_cast_FLOAT_to_DOUBLE", + "v{7,8,9,10}/test_clip_splitbounds", + "v{7,8,9,10}/test_clip_outbounds", + "v{7,8,9,10}/test_clip_inbounds", + "v{7,8,9,10}/test_clip_example", + "v{7,8,9,10}/test_clip_default_min", + "v{7,8,9,10}/test_clip_default_max", + "v{7,8,9,10}/test_clip_default_inbounds", + "v{7,8,9,10}/test_clip", + "test_concat_1d_axis_0", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_conv_with_strides_and_asymmetric_padding", + "test_conv_with_strides_no_padding", + "test_conv_with_strides_padding", + "test_constant", + "test_cos_example", + "test_cos", + "test_div_bcast", + "test_div_example", + "test_div", + // "test_dropout_default", 
+ // "test_dropout_random", + // "test_depthtospace_crd_mode", + // "test_depthtospace_crd_mode_example", + // "test_depthtospace_dcr_mode", + // "test_depthtospace_example", + "test_elu_example", + "test_elu", + "test_elu_default", + // "test_flatten_axis0", + // "test_flatten_axis1", + // "test_flatten_axis2", + // "test_flatten_axis3", + // "test_flatten_default_axis", + "test_gather_0", + "test_gather_1", + "test_gemm_nobroadcast", + "test_gemm_broadcast", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + "test_globalmaxpool_precomputed", + "test_globalmaxpool", + // "test_greater_bcast", + // "test_greater", + // "test_instancenorm_epsilon", + // "test_instancenorm_example", + // "test_less_bcast", + // "test_less", + // "test_equal_bcast", + // "test_equal", + // "test_identity", + "test_leakyrelu_default", + "test_leakyrelu_example", + "test_leakyrelu", + // "test_lrn_default", <-- failing due to low precison. If absolute CPU error threshold is increased from 1e-4 to 1e-2 (100x increase), it passes the test. + // "test_lrn", <-- failing due to low precison. If absolute CPU error threshold is increased from 1e-4 to 1e-3 (10x increase), it passes the test. + "test_matmul_2d", + "test_matmul_3d", + "test_matmul_4d", + "test_maxpool_1d_default", + "test_maxpool_2d_default", + "test_maxpool_2d_pads", + "test_maxpool_2d_precomputed_pads", + "test_maxpool_2d_precomputed_same_upper", + "test_maxpool_2d_precomputed_strides", + "test_maxpool_2d_same_lower", + "test_maxpool_2d_same_upper", + "test_maxpool_2d_strides", + "test_maxpool_3d_default", + "test_mul_bcast", + "test_mul_example", + "test_mul", + "test_neg", + "test_neg_example", + // "test_not_2d", + // "test_not_3d", + // "test_not_4d", + // "test_or_bcast3v1d", + // "test_or_bcast3v2d", + // "test_or_bcast4v2d", + // "test_or_bcast4v3d", + // "test_or_bcast4v4d", + // "test_prelu_broadcast", + // "test_prelu_example", + "test_relu", + // "test_reshape_extended_dims", + // "test_reshape_negative_dim", + // "test_reshape_one_dim", + // "test_reshape_reduced_dims", + // "test_reshape_reordered_dims", + "test_sigmoid", + "test_sigmoid_example", + "test_sin_example", + "test_sin", + // "test_softmax_axis_0", + // "test_softmax_axis_1", + // "test_softmax_axis_2", + // "test_softmax_default_axis", + // "test_softmax_example", + // { + // "name": "test_softmax_large_number", + // "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment + // }, + "test_sub_bcast", + "test_sub_example", + "test_sub", + // "test_sum_example", + // "test_sum_one_input", + // "test_sum_two_inputs", + // "test_reduce_log_sum_asc_axes", + // "test_reduce_log_sum_default", + // "test_reduce_log_sum_desc_axes", + // "test_reduce_max_default_axes_keepdim_example", + // "test_reduce_max_default_axes_keepdims_random", + // "test_reduce_max_do_not_keepdims_example", + // "test_reduce_max_do_not_keepdims_random", + // "test_reduce_max_keepdims_example", + // "test_reduce_max_keepdims_random", + // "test_reduce_mean_default_axes_keepdims_example", + // "test_reduce_mean_default_axes_keepdims_random", + // "test_reduce_mean_do_not_keepdims_example", + // "test_reduce_mean_do_not_keepdims_random", + // "test_reduce_mean_keepdims_example", + // "test_reduce_mean_keepdims_random", + // "test_reduce_min_default_axes_keepdims_example", + // "test_reduce_min_default_axes_keepdims_random", + // "test_reduce_min_do_not_keepdims_example", + // "test_reduce_min_do_not_keepdims_random", + // 
"test_reduce_min_keepdims_example", + // "test_reduce_min_keepdims_random", + // { + // "name": "test_reduce_prod_default_axes_keepdims_example", + // "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment + // }, + // "test_reduce_prod_default_axes_keepdims_random", + // "test_reduce_prod_do_not_keepdims_example", + // "test_reduce_prod_do_not_keepdims_random", + // "test_reduce_prod_keepdims_example", + // "test_reduce_prod_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_random", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_example", + // "v{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_random", + // "v{7,8,9,10,11,12}/test_split_variable_parts_default_axis", + // "v{7,8,9,10,11,12}/test_split_variable_parts_1d", + // "v{7,8,9,10,11,12}/test_split_variable_parts_2d", + // "v{7,8,9,10,11,12}/test_split_equal_parts_default_axis", + // "v{7,8,9,10,11,12}/test_split_equal_parts_1d", + // "v{7,8,9,10,11,12}/test_split_equal_parts_2d", + "v{7,8,9}/test_slice", + "v{7,8,9}/test_slice_default_axes", + "v{7,8,9}/test_slice_end_out_of_bounds", + "v{7,8,9}/test_slice_neg", + // "test_slice_start_out_of_bounds", // tensor shape of 0 + // "test_squeeze", + "test_tan_example", + "test_tan", + "test_tanh_example", + "test_tanh", + // "test_tile", + // "test_tile_precomputed", + "test_transpose_all_permutations_0", + "test_transpose_all_permutations_1", + "test_transpose_all_permutations_2", + "test_transpose_all_permutations_3", + "test_transpose_all_permutations_4", + "test_transpose_all_permutations_5", + "test_transpose_default" + // "test_unsqueeze", + // "test_xor_bcast3v1d", + // "test_xor_bcast3v2d", + // "test_xor_bcast4v2d", + // "test_xor_bcast4v3d", + // "test_xor_bcast4v4d", + // "test_xor2d", + // "test_xor3d", + // "test_xor4d" + ], + "ops": [ + "abs.jsonc", + "acos.jsonc", + "add.jsonc", + //"and.jsonc", + "asin.jsonc", + "ceil.jsonc", + "concat.jsonc", + "conv.jsonc", + "cos.jsonc", + "div.jsonc", + //"depth-to-space.jsonc", + //"equal.jsonc", + "exp.jsonc", + "floor.jsonc", + "global-average-pool.jsonc", + "gemm.jsonc", + //"greater.jsonc", + ////"identity.jsonc", + //"image-scaler.jsonc", + //"less.jsonc", + "log.jsonc", + //"matmul.jsonc", + "mul.jsonc", + "neg.jsonc", + //"not.jsonc", + //"or.jsonc", + "leaky-relu.jsonc", + //"reduce-min.jsonc", + "relu.jsonc", + //"pad.jsonc", + //"pad-big.jsonc", + "pow.jsonc", + "pow-big-number.jsonc", + //"reshape.jsonc", + //"softmax.jsonc", + "sin.jsonc", + //"split.jsonc", + "sqrt.jsonc", + "sub.jsonc", + "tan.jsonc", + "transpose.jsonc" + //"xor.jsonc" + ] + }, "wasm": { "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], "node": [ diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 83a1301c7a24b..f7aa8d39ef219 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ 
-25,6 +25,8 @@ const WEBGL_THRESHOLD_ABSOLUTE_ERROR = 1.0e-3; const WEBGL_THRESHOLD_RELATIVE_ERROR = 1.00001; const WEBGL_HALF_FLOAT_THRESHOLD_ABSOLUTE_ERROR = 0.1; const WEBGL_HALF_FLOAT_THRESHOLD_RELATIVE_ERROR = 1.02; +const WEBGPU_THRESHOLD_ABSOLUTE_ERROR = 1.0e-3; +const WEBGPU_THRESHOLD_RELATIVE_ERROR = 1.00001; const WASM_THRESHOLD_ABSOLUTE_ERROR = 1.0e-4; const WASM_THRESHOLD_RELATIVE_ERROR = 1.000001; const ONNXRUNTIME_THRESHOLD_ABSOLUTE_ERROR = 1.0e-3; @@ -274,6 +276,9 @@ export class TensorResultValidator { this.absoluteThreshold = WEBGL_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGL_THRESHOLD_RELATIVE_ERROR; } + } else if (backend === 'webgpu') { + this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; + this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'wasm' || backend === 'xnnpack') { this.absoluteThreshold = WASM_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WASM_THRESHOLD_RELATIVE_ERROR; @@ -518,7 +523,7 @@ export class OpTestContext { inferenceHandler: InferenceHandler; constructor(protected opTest: Test.OperatorTest) { - this.backendHint = opTest.backend === 'webgl' ? 'webgl' : 'cpu'; + this.backendHint = opTest.backend ?? 'cpu'; } createOperator(): Operator { return initializeOperator( @@ -556,10 +561,15 @@ async function runOpTestcase( const inputTensors = testcase.inputs.map(input => createTensor(input.dims, input.type as Tensor.DataType, input.data)); - const results = operator.impl(inferenceHandler, inputTensors, operator.context); - // if ('then' in results) { - // results = await results; - // } + const results = await operator.impl(inferenceHandler, inputTensors, operator.context); + + // try async data read. + for (const result of results) { + try { + await result.getData(); + } catch { + } + } results.forEach((output, i) => { Logger.verbose('TestOpRunner', ` Result'${i}': ${output.type}[${output.dims.join(',')}]`); diff --git a/js/web/test/unittests/backends/webgl/test-conv-new.ts b/js/web/test/unittests/backends/webgl/test-conv-new.ts index 0fddddf58181c..fa783acb6c4d0 100644 --- a/js/web/test/unittests/backends/webgl/test-conv-new.ts +++ b/js/web/test/unittests/backends/webgl/test-conv-new.ts @@ -832,7 +832,7 @@ function webglConv( if (biasTensor) { inputs.push(biasTensor); } - return (op.impl(webglInferenceHandler!, inputs, op.context))[0]; + return (op.impl(webglInferenceHandler!, inputs, op.context) as Tensor[])[0]; } function cpuConv( inputTensor: Tensor, kernelTensor: Tensor, biasTensor: Tensor|null, autoPad: string|undefined, dilations: number[], diff --git a/js/web/tsconfig.json b/js/web/tsconfig.json index b9dc974997b28..865c393b5b2b6 100644 --- a/js/web/tsconfig.json +++ b/js/web/tsconfig.json @@ -4,6 +4,7 @@ "module": "CommonJS", "downlevelIteration": true, "declarationDir": "./types", + "typeRoots": ["./node_modules/@webgpu/types", "./node_modules/@types"] }, "include": ["lib", "script", "test"], "exclude": ["lib/wasm/proxy-worker"] From 93e6d0a076b00db1c8233bf1361cf07b1ee26951 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 13 Oct 2022 16:08:27 -0700 Subject: [PATCH 03/81] working --- bb.bat | 6 + js/web/lib/wasm/jsep/backend-webgpu.ts | 149 +++++ js/web/lib/wasm/{jsep.ts => jsep/init.ts} | 29 +- js/web/lib/wasm/jsep/tensor.ts | 264 +++++++++ js/web/lib/wasm/jsep/util.ts | 511 ++++++++++++++++++ .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 162 ++++++ .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 90 +++ js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 
217 ++++++++ js/web/lib/wasm/jsep/webgpu/ops/common.ts | 91 ++++ js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 176 ++++++ .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 127 +++++ js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 150 +++++ js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts | 39 ++ js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 131 +++++ js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 165 ++++++ js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 115 ++++ js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 376 +++++++++++++ .../wasm/jsep/webgpu/ops/reduce-tensors.ts | 85 +++ js/web/lib/wasm/jsep/webgpu/ops/reshape.ts | 22 + js/web/lib/wasm/jsep/webgpu/ops/shape.ts | 16 + js/web/lib/wasm/jsep/webgpu/ops/slice.ts | 180 ++++++ js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts | 44 ++ js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 116 ++++ js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 197 +++++++ js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts | 43 ++ .../lib/wasm/jsep/webgpu/program-manager.ts | 75 +++ .../lib/wasm/jsep/webgpu/session-handler.ts | 47 ++ .../wasm/jsep/webgpu/tensor-data-manager.ts | 140 +++++ js/web/lib/wasm/jsep/webgpu/types.ts | 94 ++++ js/web/lib/wasm/proxy-wrapper.ts | 8 +- js/web/lib/wasm/wasm-core-impl.ts | 3 - js/web/script/test-runner-cli.ts | 2 +- js/web/test/test-runner.ts | 4 +- .../core/providers/js/data_transfer.cc | 4 +- 34 files changed, 3861 insertions(+), 17 deletions(-) create mode 100644 bb.bat create mode 100644 js/web/lib/wasm/jsep/backend-webgpu.ts rename js/web/lib/wasm/{jsep.ts => jsep/init.ts} (54%) create mode 100644 js/web/lib/wasm/jsep/tensor.ts create mode 100644 js/web/lib/wasm/jsep/util.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/common.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/concat.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/conv.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/gather.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/gemm.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/matmul.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/pool.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/reshape.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/shape.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/slice.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/transpose.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/program-manager.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/session-handler.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/types.ts diff --git a/bb.bat b/bb.bat new file mode 100644 index 0000000000000..97193fce3a950 --- /dev/null +++ b/bb.bat @@ -0,0 +1,6 @@ +call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly + +IF %ERRORLEVEL% == 0 ( +copy /Y .\build\Windows\Debug\ort-wasm.js .\js\web\lib\wasm\binding\ +copy /Y 
.\build\Windows\Debug\ort-wasm.wasm .\js\web\dist\
+)
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
new file mode 100644
index 0000000000000..c4a9c05af26a7
--- /dev/null
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -0,0 +1,149 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Tensor} from './tensor';
+import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager';
+import {ProgramManager} from './webgpu/program-manager';
+import {createTensorDataManager, TensorDataManager} from './webgpu/tensor-data-manager';
+import {GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
+
+const getProgramInfoUniqueKey =
+    (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly Tensor[],
+     inputGpuDatas: readonly GpuData[]): string => {
+      const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_');
+      const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_');
+      let key = programInfo.name;
+      if (programInfo.cacheHint) {
+        key += '[' + programInfo.cacheHint + ']';
+      }
+      key += ':' + inputTensorShapes + ';' + inputGpuDataTypes;
+      return key;
+    };
+
+export class WebGpuBackend {
+  device: GPUDevice;
+  gpuDataManager: GpuDataManager;
+  dataManager: TensorDataManager;
+  programManager: ProgramManager;
+
+  commandEncoder: GPUCommandEncoder|null = null;
+  computePassEncoder: GPUComputePassEncoder|null = null;
+  pendingDispatchNumber = 0;
+
+  async initialize(): Promise<void> {
+    if (!navigator.gpu) {
+      // WebGPU is not available.
+      throw new Error('WebGpuBackend: WebGPU is not available.');
+    }
+
+    const adapter = await navigator.gpu.requestAdapter();
+    if (!adapter) {
+      throw new Error('WebGpuBackend: Failed to get GPU adapter.');
+    }
+    this.device = await adapter.requestDevice();
+    this.gpuDataManager = createGpuDataManager(this);
+    this.dataManager = createTensorDataManager(this.gpuDataManager);
+    this.programManager = new ProgramManager(this);
+    // TODO: set up flags
+
+    this.device.onuncapturederror = ev => {
+      if (ev.error instanceof GPUValidationError) {
+        // eslint-disable-next-line no-console
+        console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`);
+      }
+    };
+  }
+
+  dispose(): void {
+    // TODO: uninitialization
+    // this.glContext.dispose();
+  }
+
+  getCommandEncoder(): GPUCommandEncoder {
+    if (!this.commandEncoder) {
+      this.commandEncoder = this.device.createCommandEncoder();
+    }
+    return this.commandEncoder;
+  }
+
+  getComputePassEncoder(): GPUComputePassEncoder {
+    if (!this.computePassEncoder) {
+      this.computePassEncoder = this.getCommandEncoder().beginComputePass();
+    }
+    return this.computePassEncoder;
+  }
+
+  endComputePass(): void {
+    if (this.computePassEncoder) {
+      this.computePassEncoder.end();
+      this.computePassEncoder = null;
+    }
+  }
+
+  flush(): void {
+    this.endComputePass();
+    this.device.queue.submit([this.getCommandEncoder().finish()]);
+    this.commandEncoder = null;
+    this.pendingDispatchNumber = 0;
+  }
+
+  private uploadGpuData(tensor: Tensor, textureType: GpuDataType): GpuData {
+    return this.dataManager.uploadTensorToGpu(tensor, textureType);
+  }
+
+  private createGpuData(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] {
+    return this.dataManager.createGpuTensor(type, dims, gpuDataType);
+  }
+
+  run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly Tensor[]): Tensor[] {
+    if (inputs.length !==
program.inputTypes.length) { + throw new Error(`Input size must be equal to ${program.inputTypes.length}.`); + } + + // create info for inputs + const inputDatas: GpuData[] = []; + for (let i = 0; i < program.inputTypes.length; ++i) { + inputDatas[i] = this.uploadGpuData(inputs[i], program.inputTypes[i]); + } + + const key = getProgramInfoUniqueKey(program, inputs, inputDatas); + let artifact = this.programManager.getArtifact(key); + const programInfo = artifact ? + artifact.programInfo : + (typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() : + (program as ProgramInfo)); + + // create info for outputs + const outputDatas: GpuData[] = []; + const outputTensors: Tensor[] = []; + for (let i = 0; i < programInfo.outputs.length; ++i) { + const [tensor, gpuData] = this.createGpuData( + programInfo.outputs[i].type, programInfo.outputs[i].dims, programInfo.outputs[i].gpuDataType); + outputTensors.push(tensor); + outputDatas.push(gpuData); + } + + if (!artifact) { + artifact = this.programManager.build(programInfo); + this.programManager.setArtifact(key, artifact); + } + + this.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs)); + + return outputTensors; + } + + reshape(input: Tensor, reshapedDims: readonly number[]): Tensor { + return this.dataManager.hasGpuData(input.dataId) ? + this.dataManager.createGpuRef(input.dataId, input.type, reshapedDims)[0] : + new Tensor(reshapedDims, input.type, undefined, undefined, input.data); + } + + upload(dataOffset: number, data: Uint8Array, gpuDataId: number) { + throw new Error('Method not implemented.'); + } + + alloc(size: number): number { + throw new Error('Method not implemented.'); + } +} diff --git a/js/web/lib/wasm/jsep.ts b/js/web/lib/wasm/jsep/init.ts similarity index 54% rename from js/web/lib/wasm/jsep.ts rename to js/web/lib/wasm/jsep/init.ts index 6766840303077..c003cd01f99fa 100644 --- a/js/web/lib/wasm/jsep.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,30 +1,43 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
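Before the init glue, a sketch of the life cycle that run() above implies: inputs are uploaded through the tensor-data manager, the artifact cache is keyed on program name, cache hint, input shapes and GPU data types, and results are read back lazily through each output tensor's async data provider. Hypothetical usage only; absProgram is the illustrative ProgramInfo from earlier, and Tensor.fromData is defined later in this patch:

    const backend = new WebGpuBackend();
    await backend.initialize();

    const input = Tensor.fromData(new Float32Array([-1, 2, -3, 4]), [2, 2], 'float32');
    const [output] = backend.run(absProgram, [input]);  // first call builds and caches the pipeline
    backend.flush();                                    // submit any pending dispatches
    const result = await output.getData();              // read back through the GPU data manager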
-import {OrtWasmModule} from './binding/ort-wasm';
+import {OrtWasmModule} from '../binding/ort-wasm';
 
-export const init = (module: OrtWasmModule): void => {
+import {WebGpuBackend} from './backend-webgpu';
+
+export const init = async(module: OrtWasmModule): Promise<void> => {
   // init JSEP if available
   const init = module.jsepInit;
   if (init) {
+    const backend = new WebGpuBackend();
+    await backend.initialize();
+
     init(
-        {},
+        // backend
+        {backend},
+
+        // jsepAlloc()
         (size: number) => {
           // eslint-disable-next-line no-console
           console.log(`jsepAlloc: ${size}`);
-          return 1234;
+          return backend.alloc(size);
         },
+
+        // jsepFree()
         (ptr: number) => {
           // eslint-disable-next-line no-console
           console.log(`jsepFree: ${ptr}`);
-          return 5678;
+          return backend.free(ptr);
         },
-        (_a: number) => {
+
+        // jsepUpload(src, dst, size)
+        (dataOffset: number, gpuDataId: number, size: number) => {
           // eslint-disable-next-line no-console
           console.log('jsepUpload');
-          return 40;
+          const data = module.HEAPU8.subarray(dataOffset, dataOffset + size);
+          backend.upload(dataOffset, data, gpuDataId);
         },
-        (_a: number) => {
+        (_src: number, _dst: number) => {
           // eslint-disable-next-line no-console
           console.log('jsepDownload');
           return 41;
diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts
new file mode 100644
index 0000000000000..7dd23f4e7edc1
--- /dev/null
+++ b/js/web/lib/wasm/jsep/tensor.ts
@@ -0,0 +1,264 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {ShapeUtil} from './util';
+
+export declare namespace Tensor {
+  export interface DataTypeMap {
+    bool: Uint8Array;
+    float32: Float32Array;
+    float64: Float64Array;
+    string: string[];
+    int8: Int8Array;
+    uint8: Uint8Array;
+    int16: Int16Array;
+    uint16: Uint16Array;
+    int32: Int32Array;
+    uint32: Uint32Array;
+    int64: BigInt64Array;
+    uint64: BigUint64Array;
+  }
+
+  export type DataType = keyof DataTypeMap;
+
+  export type StringType = Tensor.DataTypeMap['string'];
+  export type BooleanType = Tensor.DataTypeMap['bool'];
+  export type IntegerType = Tensor.DataTypeMap['int8']|Tensor.DataTypeMap['uint8']|Tensor.DataTypeMap['int16']|
+      Tensor.DataTypeMap['uint16']|Tensor.DataTypeMap['int32']|Tensor.DataTypeMap['uint32']|
+      Tensor.DataTypeMap['int64']|Tensor.DataTypeMap['uint64'];
+  export type FloatType = Tensor.DataTypeMap['float32']|Tensor.DataTypeMap['float64'];
+  export type NumberType = BooleanType|IntegerType|FloatType;
+
+  export type Id = number;
+}
+
+type TensorData = Tensor.DataTypeMap[Tensor.DataType];
+
+type DataProvider = (id: Tensor.Id) => TensorData;
+type AsyncDataProvider = (id: Tensor.Id) => Promise<TensorData>;
+
+let guid = 0;
+const createNewTensorId = () => guid++;
+
+
+export const sizeof = (type: Tensor.DataType): number => {
+  switch (type) {
+    case 'bool':
+    case 'int8':
+    case 'uint8':
+      return 1;
+    case 'int16':
+    case 'uint16':
+      return 2;
+    case 'int32':
+    case 'uint32':
+    case 'float32':
+      return 4;
+    case 'int64':
+    case 'uint64':
+    case 'float64':
+      return 8;
+    default:
+      throw new Error(`cannot calculate sizeof() on type ${type}`);
+  }
+};
+
+const dataviewConstructor = (type: Tensor.DataType) => {
+  switch (type) {
+    case 'bool':
+    case 'uint8':
+      return Uint8Array;
+    case 'int8':
+      return Int8Array;
+    case 'int16':
+      return Int16Array;
+    case 'uint16':
+      return Uint16Array;
+    case 'int32':
+      return Int32Array;
+    case 'uint32':
+      return Uint32Array;
+    case 'int64':
+      return BigInt64Array;
+    case 'uint64':
+      return BigUint64Array;
+    case 'float32':
+      return
Float32Array; + case 'float64': + return Float64Array; + default: + // should never run to here + throw new Error('unspecified error'); + } +}; + +export const createView = (dataBuffer: ArrayBuffer, type: Tensor.DataType): Int32Array|Uint32Array|BigInt64Array| + BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => + new (dataviewConstructor(type))(dataBuffer); + +export class Tensor { + /** + * get the underlying tensor data + */ + get data(): TensorData { + if (this.cache === undefined) { + const data = this.dataProvider!(this.dataId); + if (data.length !== this.size) { + throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.'); + } + this.cache = data; + } + return this.cache; + } + + /** + * get the underlying string tensor data. Should only use when type is STRING + */ + get stringData(): Tensor.StringType { + if (this.type !== 'string') { + throw new TypeError('data type is not string'); + } + + return this.data as Tensor.StringType; + } + + /** + * get the underlying integer tensor data. Should only use when type is one of the following: (UINT8, INT8, UINT16, + * INT16, INT32, UINT32, BOOL) + */ + get integerData(): Tensor.IntegerType { + switch (this.type) { + case 'uint8': + case 'int8': + case 'uint16': + case 'int16': + case 'int32': + case 'uint32': + case 'int64': + case 'uint64': + case 'bool': + return this.data as Tensor.IntegerType; + + default: + throw new TypeError( + 'data type is not integer (uint8, int8, uint16, int16, int32, uint32, int64, uint64, bool)'); + } + } + + /** + * get the underlying float tensor data. Should only use when type is one of the following: (FLOAT, DOUBLE) + */ + get floatData(): Tensor.FloatType { + switch (this.type) { + case 'float32': + case 'float64': + return this.data as Tensor.FloatType; + + default: + throw new TypeError('data type is not float (float32, float64)'); + } + } + + /** + * get the underlying number tensor data. 
Should only use when type is one of the following: (UINT8, INT8, UINT16,
+   * INT16, INT32, UINT32, BOOL, FLOAT, DOUBLE)
+   */
+  get numberData(): Tensor.NumberType {
+    if (this.type !== 'string') {
+      return this.data as Tensor.NumberType;
+    }
+    throw new TypeError('type cannot be non-number (string)');
+  }
+
+  /**
+   * get the underlying tensor data asynchronously
+   */
+  async getData(): Promise<TensorData> {
+    if (this.cache === undefined) {
+      if (this.asyncDataProvider) {
+        const data = await this.asyncDataProvider(this.dataId);
+        if (data.length !== this.size) {
+          throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.');
+        }
+        this.cache = data;
+      } else {
+        return this.data;
+      }
+    }
+    return this.cache;
+  }
+
+  /**
+   * get the number of elements in the tensor
+   */
+  public readonly size: number;
+
+  private _strides: readonly number[];
+  /**
+   * get the strides for each dimension
+   */
+  get strides(): readonly number[] {
+    if (!this._strides) {
+      this._strides = ShapeUtil.computeStrides(this.dims);
+    }
+    return this._strides;
+  }
+
+  constructor(
+      /**
+       * get the dimensions of the tensor
+       */
+      public readonly dims: readonly number[],
+      /**
+       * get the type of the tensor
+       */
+      public readonly type: Tensor.DataType, private dataProvider?: DataProvider,
+      private asyncDataProvider?: AsyncDataProvider, private cache?: TensorData,
+      /**
+       * get the data ID that used to map to a tensor data
+       */
+      public readonly dataId: Tensor.Id = createNewTensorId()) {
+    this.size = ShapeUtil.validateDimsAndCalcSize(dims);
+    const size = this.size;
+    const empty = (dataProvider === undefined && asyncDataProvider === undefined && cache === undefined);
+
+    if (cache !== undefined) {
+      if (cache.length !== size) {
+        throw new RangeError('Input dims doesn\'t match data length.');
+      }
+    }
+
+    if (type === 'string') {
+      if (cache !== undefined && (!Array.isArray(cache) || !cache.every(i => typeof i === 'string'))) {
+        throw new TypeError('cache should be a string array');
+      }
+
+      if (empty) {
+        this.cache = new Array<string>(size);
+      }
+    } else {
+      if (cache !== undefined) {
+        const constructor = dataviewConstructor(type);
+        if (!(cache instanceof constructor)) {
+          throw new TypeError(`cache should be type ${constructor.name}`);
+        }
+      }
+
+      if (empty) {
+        const buf = new ArrayBuffer(size * sizeof(type));
+        this.cache = createView(buf, type);
+      }
+    }
+  }
+
+  /**
+   * Construct new Tensor from raw data
+   * @param data the raw data object. Should be a string array for 'string' tensor, and the corresponding typed array
+   * for other types of tensor.
+   * @param dims the dimensions of the tensor
+   * @param type the type of the tensor
+   */
+  static fromData(data: Tensor.DataTypeMap[Tensor.DataType], dims: readonly number[], type: Tensor.DataType): Tensor {
+    return new Tensor(dims, type, undefined, undefined, data);
+  }
+}
diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts
new file mode 100644
index 0000000000000..72f4ae13056b6
--- /dev/null
+++ b/js/web/lib/wasm/jsep/util.ts
@@ -0,0 +1,511 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+/* eslint-disable no-param-reassign */
+
+export class MatMulUtil {
+  /**
+   * Fix the input shapes for MatMul operation if they need fixing
+   * @param dimsA The shape of tensor A. Should be an array of positive integers
+   * @param dimsB The shape of tensor B. Should be an array of positive integers
+   * @returns A tuple containing the preprocessed input shapes as required by ONNX specifications
+   */
+  static preprocessInputShapes(dimsA: readonly number[], dimsB: readonly number[]):
+      [readonly number[], readonly number[]] {
+    // If the first argument is 1-D, it is promoted to a matrix by prepending
+    // a 1 to its dimensions. After matrix multiplication the prepended 1 is
+    // removed.
+    const a = (dimsA.length === 1) ? [1, dimsA[0]] : dimsA;
+
+    // If the second argument is 1-D, it is promoted to a matrix by appending
+    // a 1 to its dimensions. After matrix multiplication the appended 1 is
+    // removed.
+    const b = (dimsB.length === 1) ? [dimsB[0], 1] : dimsB;
+
+    return [a, b];
+  }
+
+  /**
+   * Fix the output shape computed for MatMul operation if it needs fixing
+   * @param outputShape The computed outputShape. Should be an array (at least of length 2) of positive integers.
+   * This will be mutated.
+   * @param aRank The rank of tensor A.
+   * @param bRank The rank of tensor B.
+   */
+  static postprocessOutputShape(outputShape: number[], aRank: number, bRank: number): void {
+    // Remove prepended dimension if first input is 1d
+    if (aRank === 1) {
+      // outputShape = outputShape.slice(0, outputShape.length - 2).concat(outputShape.slice(outputShape.length - 1));
+      outputShape.splice(outputShape.length - 2, 1);
+    }
+    // Remove appended dimension if second input is 1d
+    if (bRank === 1) {
+      outputShape.pop();
+    }
+  }
+
+  /**
+   * Calculate the expected shape when matrix multiplication
+   * @param a The shape of tensor A. Should be a tuple of 2 positive integers
+   * @param b The shape of tensor B. Should be a tuple of 2 positive integers
+   * @returns The expected shape of the result, or undefined if N/A
+   */
+  static calcMatMulShape(a: [number, number], b: [number, number]): [number, number]|undefined {
+    return (a[1] !== b[0]) ? undefined : [a[0], b[1]];
+  }
+}
+
+
+export class BroadcastUtil {
+  /**
+   * Calculate the expected shape when broadcasting 2 tensors
+   * @param adims The shape of tensor A. Should be an array of positive integers
+   * @param bdims The shape of tensor B. Should be an array of positive integers
+   * @param isMatMul Whether the operation is MatMul
+   * @returns The expected shape of the result, or undefined if N/A
+   */
+  static calcShape(adims: readonly number[], bdims: readonly number[], isMatMul = false): readonly number[]|undefined {
+    const arank = adims.length;
+    const brank = bdims.length;
+    if (arank === 0) {
+      return bdims;
+    }
+    if (brank === 0) {
+      return adims;
+    }
+    const crank = Math.max(adims.length, bdims.length);
+    const cdims = new Array<number>(crank);
+
+    // calculate the last 2 dimension if it is MatMul
+    if (isMatMul) {
+      if (arank < 2 || brank < 2) {
+        return undefined;
+      }
+      const cShapeMatMul =
+          MatMulUtil.calcMatMulShape([adims[arank - 2], adims[arank - 1]], [bdims[brank - 2], bdims[brank - 1]]);
+      if (cShapeMatMul === undefined) {
+        return undefined;
+      }
+      [cdims[crank - 2], cdims[crank - 1]] = cShapeMatMul;
+    }
+
+    for (let i = isMatMul ? 3 : 1; i <= crank; i++) {
+      const aLen = arank - i < 0 ? 1 : adims[arank - i];
+      const bLen = brank - i < 0 ? 1 : bdims[brank - i];
+
+      if (aLen !== bLen && aLen > 1 && bLen > 1) {
+        return undefined;
+      }
+      cdims[crank - i] = Math.max(aLen, bLen);
+    }
+
+    return cdims;
+  }
+
+  /**
+   * Given the indices of a broadcasted tensor, calculate the original indices
+   * @param broadcastedIndices The given indices of the broadcasted tensor.
+   * @param originalShape The original shape of the tensor before broadcast
+   * @returns The calculated indices that maps to the original tensor.
+   */
+  static index(broadcastedIndices: readonly number[], originalShape: readonly number[]): number[] {
+    // NOTE 1: we assume the parameter broadcastedIndices is valid. ie. it should have the same
+    // length as the broadcasted shape, and for each dimension the index should
+    // not be out of range.
+    const originalIndices = new Array<number>(originalShape.length);
+    BroadcastUtil.fillIndex(broadcastedIndices, originalShape, originalIndices);
+    return originalIndices;
+  }
+
+  /**
+   * Given the indices of a broadcasted tensor, calculate the original indices
+   * @param broadcastedIndices The given indices of the broadcasted tensor.
+   * @param originalShape The original shape of the tensor before broadcast
+   * @param originalIndices The mapping of broadcastedIndices to the originalIndices (output parameter - will be
+   * mutated).
+   */
+  static fillIndex(broadcastedIndices: readonly number[], originalShape: readonly number[], originalIndices: number[]):
+      void {
+    // NOTE 1: we assume the parameter broadcastedIndices is valid. ie. it should have the same length as the
+    // broadcasted shape, and for each dimension the index should not be out of range.
+    // NOTE 2: we assume the parameter originalIndices has the same length as the originalShape
+    const dimOffset = broadcastedIndices.length - originalShape.length;
+    for (let i = 0; i < originalShape.length; i++) {
+      originalIndices[i] = broadcastedIndices[dimOffset + i] % originalShape[i];
+    }
+  }
+
+  /**
+   * Determine if a shape is unidirectional broadcastable to another shape
+   * @param shape The input shape
+   * @param finalShape The desired shape after broadcasting
+   */
+  static isValidBroadcast(shape: readonly number[], finalShape: readonly number[]): boolean {
+    // align shape to the right
+    const inputRank = shape.length;
+    const finalRank = finalShape.length;
+    if (inputRank > finalRank) {
+      return false;
+    }
+    for (let i = 1; i <= inputRank; i++) {
+      if (shape[inputRank - i] !== 1 && shape[inputRank - i] !== finalShape[finalRank - i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Determine the broadcasted dims in input shape based on the given output shape.
+   * Note that this function only returns the broadcasted dims.
+   * @param inputShape The input shape
+   * @param outputShape The output shape
+   * @returns The broadcasted dims in input shape.
+   */
+  static getBroadcastDims(inputShape: readonly number[], outputShape: readonly number[]): number[] {
+    const inRank = inputShape.length;
+    const dims: number[] = [];
+    for (let i = 0; i < inRank; i++) {
+      const dim = inRank - 1 - i;
+      const a = inputShape[dim] || 1;
+      const b = outputShape[outputShape.length - 1 - i] || 1;
+      if (b > 1 && a === 1) {
+        dims.unshift(dim);
+      }
+    }
+    return dims;
+  }
+}
+
+
+export class ShapeUtil {
+  static size(dims: readonly number[]): number {
+    return ShapeUtil.getSizeFromDimensionRange(dims, 0, dims.length);
+  }
+
+  // `axis` inclusive
+  static sizeFromDimension(dims: readonly number[], axis: number): number {
+    if (axis < 0 || axis > dims.length) {
+      throw new Error(`invalid dimension of ${axis} for sizeFromDimension as Tensor has ${dims.length} dimensions.`);
+    }
+    return ShapeUtil.getSizeFromDimensionRange(dims, axis, dims.length);
+  }
+
+  // `axis` exclusive
+  static sizeToDimension(dims: readonly number[], axis: number): number {
+    if (axis < 0 || axis > dims.length) {
+      throw new Error(`invalid dimension of ${axis} for sizeToDimension as Tensor has ${dims.length} dimensions.`);
+    }
+    return ShapeUtil.getSizeFromDimensionRange(dims, 0, axis);
+  }
+
+  static getSizeFromDimensionRange(dims: readonly number[], start: number, end: number): number {
+    let size = 1;
+    for (let i = start; i < end; i++) {
+      // safety check as this method is called by multiple other methods requiring size.
+      // size cannot be 0 or negative.
+      if (dims[i] <= 0) {
+        throw new Error(
+            // eslint-disable-next-line max-len
+            'cannot get valid size from specified dimension range. Most likely the range contains 0 or negative values in them.');
+      }
+      size *= dims[i];
+    }
+    return size;
+  }
+
+  static computeStrides(dims: readonly number[]): readonly number[] {
+    const rank = dims.length;
+    if (rank === 0) {
+      return [];
+    } else if (rank === 1) {
+      return [1];
+    }
+    const strides = new Array<number>(rank);
+    strides[rank - 1] = 1;
+    strides[rank - 2] = dims[rank - 1];
+    for (let i = rank - 3; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dims[i + 1];
+    }
+    return strides;
+  }
+
+  static transpose(dims: readonly number[]): readonly number[] {
+    const copy = dims.slice();
+    return copy.reverse();
+  }
+
+  static indicesToOffset(indices: readonly number[], strides: readonly number[], axis?: number): number {
+    if (axis === undefined) {
+      axis = indices.length;
+    }
+    let offset = 0;
+    for (let i = 0; i < axis; ++i) {
+      offset += strides[i] * indices[i];
+    }
+    return offset;
+  }
+
+  static offsetToIndices(offset: number, strides: readonly number[]): readonly number[] {
+    const rank = strides.length;
+    if (rank === 0) {
+      return [];
+    } else if (rank === 1) {
+      return [offset * strides[0]];
+    }
+    const indices: number[] = new Array(strides.length);
+    for (let i = 0; i < indices.length - 1; ++i) {
+      indices[i] = Math.floor(offset / strides[i]);
+      offset -= indices[i] * strides[i];
+    }
+    indices[indices.length - 1] = offset;
+    return indices;
+  }
+
+  /**
+   * normalize axis of range [-r, r) into [0, r).
+   */
+  static normalizeAxis(axis: number, tensorRank: number): number {
+    if (axis < -tensorRank || axis >= tensorRank) {
+      throw new Error('unsupported axis for this operation.');
+    }
+    return axis < 0 ? axis + tensorRank : axis;
+  }
+
+  static normalizeAxes(axes: readonly number[], tensorRank: number): number[] {
+    return axes.map(x => this.normalizeAxis(x, tensorRank));
+  }
+
+  // Increment an index into a tensor (in lexicographic
+  // ordering), wrapping around the specified upper_bound.
+ /** + * Increment an index into a tensor (in lexicographic ordering), wrapping around the specified upper_bound. + * @param index Given index to increment (Will be mutated) + * @param dims The dimensions of the tensor for which the given index corresponds to + * @param axisToIncrementOn The 1-indexed axis to increment on. If undefined, axisToIncrementOn == rank + */ + static incrementIndex(index: number[], dims: readonly number[], axisToIncrementOn?: number): void { + if (dims.length === 0 || index.length === 0) { + throw new Error('Index incrementing unsupported for scalar Tensor'); + } + if (axisToIncrementOn === undefined) { + axisToIncrementOn = dims.length; + } else { + if (axisToIncrementOn <= 0 || axisToIncrementOn > dims.length) { + throw new Error('Incorrect axis to increment on'); + } + } + + for (let k = axisToIncrementOn - 1; k >= 0; --k) { + index[k]++; + if (index[k] < dims[k]) { + break; + } + index[k] = 0; + } + } + + /** + * Produces a new dimensions array based on the values in the 'originalDimensions' and 'shape' array + * Used in Reshape + * @param originalDims Original Shape array + * @param shapeHints array containing values to compute the new dimensions + * For example: + * originalDims = [2,2] and shapeHints = [0,-1] will return [2,2] + * originalDims = [2,2] and shapeHints = [4] will return [4] + * originalDims = [2,2] and shapeHints = [5] will throw an exception + * https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape + */ + + static calculateReshapedDims(originalDims: readonly number[], shapeHints: ArrayLike): number[] { + // reshape to a Scalar Tensor + if (shapeHints.length === 0) { + if (originalDims.length === 0 || ShapeUtil.size(originalDims) === 1) { + return []; + } else { + throw new Error('cannot reshape to a scalar Tensor'); + } + } + + const nDims = shapeHints.length; + const reshapedDims = new Array(nDims); + let unknownDimension = -1; + let newTensorSize = 1; + for (let i = 0; i < nDims; i++) { + if (shapeHints[i] < -1) { + throw new Error('a dimension in shape hints cannot be less than -1'); + } + if (shapeHints[i] === -1) { + if (unknownDimension !== -1) { + throw new Error('at most one dimension in shape hints can be -1'); + } + unknownDimension = i; + } else { + if (shapeHints[i] === 0) { + if (i >= originalDims.length) { + throw new Error('the dimension with value zero exceeds the dimension size of the input tensor'); + } + reshapedDims[i] = originalDims[i]; + } else { + reshapedDims[i] = shapeHints[i]; + } + newTensorSize *= reshapedDims[i]; + } + } + + const oldTensorSize = ShapeUtil.size(originalDims); + if (unknownDimension !== -1) { + if (oldTensorSize % newTensorSize !== 0) { + throw new Error(`the input tensor cannot be reshaped to the requested shape. 
Input shape: [${ + originalDims}] Output shape: [${shapeHints}]`); + } + reshapedDims[unknownDimension] = oldTensorSize / newTensorSize; + } + // validate sizes from originalDims and reshapedDims match + else { + if (newTensorSize !== oldTensorSize) { + throw new Error('reshapedDims and originalDims don\'t have matching sizes'); + } + } + return reshapedDims; + } + + /** + * Sorts a given array based on the indices in the Perm array + * Used in Transpose + * @param a Array to be sorted such as dims or strides + * @param perm Perm given; if null a will be reversed + */ + static sortBasedOnPerm(a: readonly number[], perm?: readonly number[]): readonly number[] { + if (perm) { + return perm.map((v) => a[v]); + } else { + return a.slice().reverse(); + } + } + + /** + * Pads a given shape according to the padding values + * @param dims shape of the Tensor to be padded + * @param pad pad values + */ + static padShape(dims: readonly number[], pad: readonly number[]): readonly number[] { + const rank = dims.length; + return dims.map((v, i) => v + pad[i] + pad[i + rank]); + } + + /** + * Determines if the two shapes are identical + * @param shape1 + * @param shape2 + */ + static areEqual(shape1: readonly number[], shape2: readonly number[]): boolean { + if (shape1.length !== shape2.length) { + return false; + } + return shape1.every((v, i) => v === shape2[i]); + } + + /** + * Validates if the given `dims` or `shape` is valid in ONNX.js context and returns data size + * @param dims - input `dims` that needs to be checked + */ + static validateDimsAndCalcSize(dims: readonly number[]): number { + if (dims.length > 6) { + throw new TypeError('Only rank 0 to 6 is supported for tensor shape.'); + } + let size = 1; + for (const n of dims) { + if (!Number.isInteger(n)) { + throw new TypeError(`Invalid shape: ${n} is not an integer`); + } + if (n < 0 || n > 2147483647) { + throw new TypeError(`Invalid shape: length ${n} is not allowed`); + } + size *= n; + } + return size; + } + + /** + * Determines the shape of output tensor y = flatten(x, axis) + * @param dims - shape of input tensor + * @param axis - flatten axis, in the range [-r, r] + */ + static flattenShape(dims: readonly number[], axis: number): readonly number[] { + if (axis < 0) { + axis += dims.length; + } + const total = dims.reduce((x, y) => x * y, 1); + const right = dims.slice(axis).reduce((x, y) => x * y, 1); + const outputDims = [total / right, right]; + + return outputDims; + } + + /** + * Determines the shape of output tensor y = squeeze(x, axes) + * @param dims - shape of input tensor + * @param axes - squeeze axes + */ + static squeezeShape(dims: readonly number[], axes: readonly number[]): readonly number[] { + const outputDims = new Array(); + + // sanity check + axes = ShapeUtil.normalizeAxes(axes, dims.length); + + for (let i = 0; i < dims.length; i++) { + const inSqueezeList = axes.indexOf(i) >= 0; + if (inSqueezeList && dims[i] !== 1) { + throw new Error('squeeze an axis of size different than 1'); + } + + if ((axes.length === 0 && dims[i] > 1) || (axes.length > 0 && !inSqueezeList)) { + outputDims.push(dims[i]); + } + } + + return outputDims; + } + + /** + * Determines the shape of output tensor y = unsqueeze(x, axes) + * @param dims - shape of input tensor + * @param axes - unsqueeze axes + */ + static unsqueezeShape(dims: readonly number[], axes: readonly number[]): readonly number[] { + const outputDims = new Array(dims.length + axes.length); + + // initialize the array elements to 0 + outputDims.fill(0); + + // set all axes 
indices to 1 in outputDims and check for duplicates
+    for (let i = 0; i < axes.length; i++) {
+      const axis = ShapeUtil.normalizeAxis(axes[i], outputDims.length);
+      if (axis >= outputDims.length) {
+        throw new Error('\'axes\' has an out of range axis');
+      }
+      if (outputDims[axis] !== 0) {
+        throw new Error('\'axes\' has a duplicate axis');
+      }
+
+      outputDims[axis] = 1;
+    }
+
+    // fill in the zero entries of outputDims with the input tensor's shape
+    let inputDimsIterator = 0;
+    for (let i = 0; i < outputDims.length; i++) {
+      if (outputDims[i] === 0) {
+        outputDims[i] = dims[inputDimsIterator++];
+      }
+    }
+
+    // sanity check assertion. 'inputDimsIterator'
+    // should be equal to the length of 'dims'
+    if (inputDimsIterator !== dims.length) {
+      throw new Error('the unsqueezed dimension could not be established');
+    }
+
+    return outputDims;
+  }
+}
diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
new file mode 100644
index 0000000000000..527219a97d210
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
@@ -0,0 +1,162 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {sizeof, Tensor} from '../tensor';
+import {ShapeUtil} from '../util';
+import {WebGpuBackend} from '../backend-webgpu';
+import {GpuData, GpuDataId, GpuDataType} from './types';
+
+/**
+ * manages GpuDataId -> GpuBuffer
+ */
+export interface GpuDataManager {
+  /**
+   * upload data to GPU. if the ID already exists in cache, returns the cached value without uploading anything.
+   */
+  upload(id: GpuDataId, data: Uint8Array, gpuDataType: GpuDataType): GpuData;
+  /**
+   * create new data on GPU.
+   */
+  create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData;
+  /**
+   * get GPU data by ID.
+   */
+  get(id: GpuDataId): GpuData|undefined;
+  /**
+   * release the data on GPU by ID.
+   */
+  release(id: GpuDataId): void;
+  /**
+   * download the data from GPU.
+   */
+  download(id: GpuDataId): Promise<ArrayBufferLike>;
+}
+
+interface StorageCacheValue {
+  gpuData: GpuData;
+  size: number;
+}
+
+interface DownloadCacheValue {
+  gpuData: GpuData;
+  data: Promise<ArrayBufferLike>;
+}
+
+/**
+ * normalize the buffer size so that it fits 128-bit (16-byte) alignment.
+ */
+const calcNormalizedBufferSize = (size: number) => Math.ceil(size / 16) * 16;
+
+let guid = 0;
+const createNewGpuDataId = () => guid++;
+
+class GpuDataManagerImpl implements GpuDataManager {
+  // GPU Data ID => GPU Data ( storage buffer )
+  storageCache: Map<GpuDataId, StorageCacheValue>;
+
+  // GPU Data ID => GPU Data ( read buffer )
+  downloadCache: Map<GpuDataId, DownloadCacheValue>;
+
+  constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) {
+    this.storageCache = new Map();
+    this.downloadCache = new Map();
+  }
+
+  upload(data: Tensor.NumberType, gpuDataType: GpuDataType): GpuData {
+    if (gpuDataType !== GpuDataType.default) {
+      throw new Error('we only support default GPU data type now');
+    }
+
+    const srcArrayBuffer = data.buffer;
+    const srcOffset = data.byteOffset;
+    const srcLength = data.byteLength;
+    const size = calcNormalizedBufferSize(srcLength);
+
+    // create gpu buffer
+    const gpuBuffer = this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE});
+
+    // copy (upload) data
+    const arrayBuffer = gpuBuffer.getMappedRange();
+    new Uint8Array(arrayBuffer).set(new Uint8Array(srcArrayBuffer, srcOffset, srcLength));
+    gpuBuffer.unmap();
+
+    const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer};
+    this.storageCache.set(gpuData.id, {gpuData, size: srcLength});
+    return gpuData;
+  }
+
+  create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData {
+    if (gpuDataType !== GpuDataType.default) {
+      throw new Error('we only support default GPU data type now');
+    }
+
+    // !!!
+    // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones.
+    // !!!                  This needs to be figured out from performance test results.
+    // !!!
+
+    const elemCount = ShapeUtil.size(dims);
+    const bufferLength = sizeof(type) * elemCount;
+    const size = calcNormalizedBufferSize(bufferLength);
+
+    // create gpu buffer
+    const gpuBuffer =
+        // eslint-disable-next-line no-bitwise
+        this.backend.device.createBuffer({size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC});
+
+    const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer};
+    this.storageCache.set(gpuData.id, {gpuData, size: bufferLength});
+    return gpuData;
+  }
+
+  get(id: GpuDataId): GpuData|undefined {
+    return this.storageCache.get(id)?.gpuData;
+  }
+
+  release(id: GpuDataId): void {
+    const cachedData = this.storageCache.get(id);
+    if (!cachedData) {
+      throw new Error('releasing data that does not exist');
+    }
+
+    this.storageCache.delete(id);
+    cachedData.gpuData.buffer.destroy();
+
+    const downloadingData = this.downloadCache.get(id);
+    if (downloadingData) {
+      void downloadingData.data.then(() => {
+        downloadingData.gpuData.buffer.destroy();
+      });
+      this.downloadCache.delete(id);
+    }
+  }
+
+  async download(id: GpuDataId): Promise<ArrayBufferLike> {
+    const downloadData = this.downloadCache.get(id);
+    if (downloadData) {
+      return downloadData.data;
+    }
+
+    const cachedData = this.storageCache.get(id);
+    if (!cachedData) {
+      throw new Error('data does not exist');
+    }
+
+    const commandEncoder = this.backend.getCommandEncoder();
+    this.backend.endComputePass();
+    const gpuReadBuffer = this.backend.device.createBuffer(
+        // eslint-disable-next-line no-bitwise
+        {size: cachedData.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});
+    commandEncoder.copyBufferToBuffer(
+        cachedData.gpuData.buffer /* source buffer */, 0 /* source offset */, gpuReadBuffer /* destination buffer */,
+        0 /* destination offset */,
cachedData.size /* size */
+    );
+    this.backend.flush();
+
+    await gpuReadBuffer.mapAsync(GPUMapMode.READ);
+    return gpuReadBuffer.getMappedRange();
+  }
+}
+
+export const createGpuDataManager = (...args: ConstructorParameters<typeof GpuDataManagerImpl>): GpuDataManager =>
+    new GpuDataManagerImpl(...args);
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
new file mode 100644
index 0000000000000..4adfb180893a6
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {OpSet} from '../../opset';
+
+import * as binaryOps from './ops/binary-op';
+import {concat, parseConcatAttributes} from './ops/concat';
+import {conv, parseConvAttributes} from './ops/conv';
+import {gather, parseGatherAttributes} from './ops/gather';
+import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm';
+import {matMul, parseMatMulAttributes} from './ops/matmul';
+import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool';
+import {sum} from './ops/reduce-tensors';
+import {reshape} from './ops/reshape';
+import {shape} from './ops/shape';
+import {parseSliceAttributes, slice, sliceV10} from './ops/slice';
+import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze';
+import {parseTransposeAttributes, transpose} from './ops/transpose';
+import * as unaryOps from './ops/unary-op';
+import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze';
+
+export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [
+  ['Abs', '', '6+', unaryOps.abs], ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add],
+  // ['And', '', '7+', binaryOps.and],
+  ['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan],
+  // TODO: support new attributes for AveragePool-10
+  ['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes],
+  // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes],
+  // ['Cast', '', '6+', cast, parseCastAttributes],
+  ['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes],
+  ['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes],
+  ['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div],
+  // ['Dropout', '', '7+', unaryOps.identity],
+  // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes],
+  // ['Equal', '', '7+', binaryOps.equal],
+  ['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', '', '6+', unaryOps.exp],
+  // ['Flatten', '', '1+', flatten, parseFlattenAttributes],
+  ['Floor', '', '6+', unaryOps.floor],
+  // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes],
+  ['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7],
+  ['Gemm', '', '11+', gemm, parseGemmAttributesV11],
+  ['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes],
+  ['GlobalMaxPool', '', '1+', globalMaxPool],
+  // ['Greater', '', '7+', binaryOps.greater],
+  // ['Identity', '', '1+', unaryOps.identity],
+  // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes],
+  // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes],
+  ['LeakyRelu',
'', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], + // ['Less', '', '7+', binaryOps.less], + ['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], + // TODO: support new attributes for MaxPool-8 and MaxPool-10 + ['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], + ['Neg', '', '6+', unaryOps.neg], + // ['Not', '', '1+', unaryOps.not], + // ['Or', '', '7+', binaryOps.or], + // ['Pad', '', '2-10', padV2, parsePadAttributesV2], + // ['Pad', '', '11+', padV11, parsePadAttributesV11], + ['Pow', '', '7+', binaryOps.pow], + // ['PRelu', '', '7+', binaryOps.pRelu], + // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], + // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], + // ['ReduceMean', '', '1+', reduceMean, parseReduceAttributes], + // ['ReduceMin', '', '1+', reduceMin, parseReduceAttributes], + // ['ReduceProd', '', '1+', reduceProd, parseReduceAttributes], + // ['ReduceSum', '', '1-12', reduceSum, parseReduceAttributes], + // ['ReduceSumSquare', '', '1+', reduceLogSumSquare, parseReduceAttributes], + ['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], + // ['Resize', '', '10', resize, parseResizeAttributesV10], + // ['Resize', '', '11+', resize, parseResizeAttributesV11], + ['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], + ['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 + ['Slice', '', '1-9', slice, parseSliceAttributes], + // // The "semantic" meaning of axis has changed in opset-13. + // ['Softmax', '', '1-12', softmax, parseSoftmaxAttributes], + // ['Softmax', '', '13+', softmaxV13, parseSoftmaxAttributesV13], + // // 'Split' operator has an optional attribute 'split' + // // this attribute determines how the specified axis of input data is split. + // // When the attribute is missing, we need the count of number of outputs + // // so that we can determine the 'split' attribute from the runtime input to the Operator + // ['Split', '', '2-12', split, parseSplitAttributes], + ['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], + ['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], + ['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], + // ['Tile', '', '6+', tile], + ['Transpose', '', '1+', transpose, parseTransposeAttributes], + // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], + // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], + ['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], + // ['Xor', '', '7+', binaryOps.xor], +]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts new file mode 100644 index 0000000000000..31642e47503c7 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -0,0 +1,217 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
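+
+// [editor's note] A hedged sketch of the broadcast-offset scheme implemented below, assuming the
+// strides/indices conventions from ShapeUtil (illustrative values only, not part of the original patch).
+// For each input, the flat offset is the sum over that input's axes of
+//   strides[i] * (outputIndices[i + rankOut - rankIn] % dims[i])
+// where the modulo implements broadcasting and the index shift right-aligns the trailing axes. For example:
+//   dimsOutput = [2, 3] -> output strides = [3, 1]
+//   dimsB      = [3]    -> offsetB for outputIndices [1, 2] is 1u * (2 % 3u) = 2
+// so every output row reads the same 3-element b, i.e. NumPy-style broadcasting.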
+ +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {WebGpuBackend} from '../../backend-webgpu'; +import {Tensor} from '../../tensor'; +import {BroadcastUtil, ShapeUtil} from '../../util'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +type BuiltinFunctionName = string; +type BinaryCustomExpression = (expressionA: string, expressionB: string) => string; +type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{ + scalar: BinaryCustomExpression; + vector: BinaryCustomExpression; +}; + +const createBinaryOpProgramShader = + (dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[], vectorize: boolean, + doBroadcast: boolean, funcCall: BinaryFunctionCall, additionalImplementation?: string, typeA = 'f32', + typeB = 'f32', typeOutput = 'f32') => { + const outputSize = ShapeUtil.size(dimsOutput); + const vecSize = Math.ceil(outputSize / 4); + + let expressionScalar: BinaryCustomExpression; + let expressionVector: BinaryCustomExpression; + if (typeof funcCall === 'string') { + expressionScalar = expressionVector = (a, b) => `${funcCall}((${a}),(${b}))`; + } else if (typeof funcCall === 'function') { + expressionScalar = expressionVector = funcCall; + } else { + expressionScalar = funcCall.scalar; + expressionVector = funcCall.vector; + } + + let broadcastImpl = ''; + const outputIndicesHelper = createIndicesHelper('output', dimsOutput); + if (doBroadcast) { + const calcOffsetImpl = (dims: readonly number[]) => { + const strides = ShapeUtil.computeStrides(dims); + const offsets: string[] = []; + for (let i = dims.length - 1; i >= 0; i--) { + offsets.push(`${strides[i]}u * ((*outputIndices)[${i + dimsOutput.length - dims.length}] % ${dims[i]}u)`); + } + return offsets.length > 0 ? 
offsets.join('+') : '0u';
+        };
+
+        broadcastImpl = `
+  ${outputIndicesHelper.o2iImpl}
+
+  fn calcOffsetA(outputIndices: ptr<function, ${outputIndicesHelper.iType}>) -> u32 {
+    return ${calcOffsetImpl(dimsA)};
+  }
+
+  fn calcOffsetB(outputIndices: ptr<function, ${outputIndicesHelper.iType}>) -> u32 {
+    return ${calcOffsetImpl(dimsB)};
+  }
+  `;
+      }
+
+      let assignment: string;
+      if (vectorize) {
+        if (doBroadcast) {
+          assignment = `
+      ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')}
+      ${outputIndicesHelper.o2iCall('global_id.x * 4u', 'outputIndices')}
+      let offsetA = calcOffsetA(&outputIndices);
+      let offsetB = calcOffsetB(&outputIndices);
+      outputData[global_id.x] = ${expressionVector('aData[offsetA / 4u]', 'bData[offsetB / 4u]')};`;
+        } else {
+          assignment = `outputData[global_id.x] = ${expressionVector('aData[global_id.x]', 'bData[global_id.x]')};`;
+        }
+      } else {
+        if (!doBroadcast) {
+          throw new Error('no need to use the scalar implementation for an element-wise binary op.');
+        }
+        const singleAssignment = (x: number) => {
+          const expressionA = `aData[indexA${x}][componentA${x}]`;
+          const expressionB = `bData[indexB${x}][componentB${x}]`;
+          return `
+      ${outputIndicesHelper.o2iCall(`global_id.x * 4u + ${x}u`, 'outputIndices')}
+      let offsetA${x} = calcOffsetA(&outputIndices);
+      let offsetB${x} = calcOffsetB(&outputIndices);
+      let indexA${x} = offsetA${x} / 4u;
+      let indexB${x} = offsetB${x} / 4u;
+      let componentA${x} = offsetA${x} % 4u;
+      let componentB${x} = offsetB${x} % 4u;
+      outputData[global_id.x][${x}] = ${expressionScalar(expressionA, expressionB)};`;
+        };
+
+        assignment = `
+      ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')}
+      ${singleAssignment(0)}
+      ${singleAssignment(1)}
+      ${singleAssignment(2)}
+      ${singleAssignment(3)}`;
+      }
+
+      return `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+  @group(0) @binding(0) var<storage, read> aData : array<vec4<${typeA}>>;
+  @group(0) @binding(1) var<storage, read> bData : array<vec4<${typeB}>>;
+  @group(0) @binding(2) var<storage, read_write> outputData : array<vec4<${typeOutput}>>;
+
+  ${additionalImplementation ?? ''}
+  ${broadcastImpl}
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${vecSize}u) {
+      return;
+    }
+
+    ${assignment}
+  }`;
+    };
+
+const createBinaryOpProgramInfo =
+    (metadata: ProgramMetadata, a: Tensor, b: Tensor, funcCall: BinaryFunctionCall, additionalImplementation?: string,
+     outputTensorType: Tensor.DataType = a.type): ProgramInfo => {
+      const isBroadcast = !ShapeUtil.areEqual(a.dims, b.dims);
+      let outputShape = a.dims;
+      let outputSize = a.size;
+
+      let vectorize = false;
+
+      // TODO: deal with zero-sized tensors (eg. dims=[1,0])
+
+      if (isBroadcast) {
+        const calculatedShape = BroadcastUtil.calcShape(a.dims, b.dims, false);
+        if (!calculatedShape) {
+          throw new Error('Can\'t perform binary op on the given tensors');
+        }
+        outputShape = calculatedShape;
+        outputSize = ShapeUtil.size(outputShape);
+
+        // check whether vectorize can be enabled
+        let sharedDimension = 1;
+        for (let i = 0; i < outputShape.length; i++) {
+          const dimA = a.dims[a.dims.length - i] ?? 1;
+          const dimB = b.dims[b.dims.length - i] ??
1;
+          if (dimA === dimB) {
+            sharedDimension *= dimA;
+          } else {
+            break;
+          }
+        }
+        if (sharedDimension % 4 === 0) {
+          vectorize = true;
+        }
+      } else {
+        // element-wise
+        vectorize = true;
+      }
+
+      return {
+        ...metadata,
+        shaderSource: createBinaryOpProgramShader(
+            a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation),
+        outputs: [{dims: outputShape, type: outputTensorType, gpuDataType: GpuDataType.default}],
+        dispatchGroup: () =>
+            ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)})
+      };
+    };
+
+const createBinaryOpProgramInfoLoader =
+    (inputs: Tensor[], name: string, funcCall: BinaryFunctionCall, additionalImplementation?: string,
+     cacheKey?: string): ProgramInfoLoader => {
+      const metadata:
+          ProgramMetadata = {name, inputTypes: [GpuDataType.default, GpuDataType.default], cacheHint: cacheKey};
+      return {
+        ...metadata,
+        get: () => createBinaryOpProgramInfo(metadata, inputs[0], inputs[1], funcCall, additionalImplementation)
+      };
+    };
+
+export const add = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
+    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Add', (a, b) => `${a}+${b}`), inputs);
+
+// export const and = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslAnd(), 'bool'), inputs)];
+
+export const div = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
+    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Div', (a, b) => `${a}/${b}`), inputs);
+
+// export const equal = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslEqual(), 'bool'), inputs)];
+
+// export const greater = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslGreater(), 'bool'), inputs)];
+
+// export const less = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslLess(), 'bool'), inputs)];
+
+export const mul = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
+    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Mul', (a, b) => `${a}*${b}`), inputs);
+
+// export const or = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslOr(), 'bool'), inputs)];
+
+export const pow = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
+    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Pow', 'pow'), inputs);
+
+// export const pRelu = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslPRelu()), inputs)];
+
+export const sub = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
+    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Sub', (a, b) => `${a}-${b}`), inputs);
+
+// export const xor = (backend: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslXor(), 'bool'), inputs)];
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
new file mode 100644
index 0000000000000..f006d175da0ed
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -0,0 +1,91 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
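+
+// [editor's note] A small worked example of the offset<->indices mapping implemented below
+// (illustrative values, not part of the original patch): for shape [2, 3, 4] the strides are
+// [12, 4, 1], so offset 17 maps to indices [1, 1, 1]:
+//   dim0 = 17 / 12 = 1, rest0 = 5;  dim1 = 5 / 4 = 1, rest1 = 1;  indices[2] = 1
+// and the generated ih_i2o function inverts it: 12*1 + 4*1 + 1*1 = 17.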
+
+import {ShapeUtil} from '../../util';
+
+/**
+ * constant value for a workgroup size.
+ *
+ * We definitely can do further optimization in future, but for now we use 64.
+ *
+ * rule of thumb: Use [a workgroup size of] 64 unless you know what GPU you are targeting or that your workload
+ * needs something different.
+ *
+ * from: https://surma.dev/things/webgpu/
+ **/
+export const WORKGROUP_SIZE = 64;
+
+export interface IndicesHelper {
+  /**
+   * WGSL code of function implementation for offset-to-indices
+   */
+  o2iImpl: string;
+  /**
+   * WGSL code of function call for offset-to-indices
+   */
+  o2iCall: (varOffset: string, varIndices: string) => string;
+  /**
+   * WGSL code of function implementation for indices-to-offset
+   */
+  i2oImpl: string;
+  /**
+   * WGSL code of function call for indices-to-offset
+   *
+   * @param isPtr - whether the variable is a pointer. default is false.
+   */
+  i2oExpression: (varIndices: string, isPtr?: boolean) => string;
+  /**
+   * WGSL code of indices variable declaration
+   *
+   * @param v - variable name.
+   * @param init - initial value.
+   */
+  indicesVariableDeclaration: (v: string, init?: string[]) => string;
+  /**
+   * data type of indices
+   */
+  iType: string;
+}
+
+export const createIndicesHelper = (name: string, shape: readonly number[]): IndicesHelper => {
+  const iType = shape.length < 2 ? 'u32' : `array<u32, ${shape.length}>`;
+
+  const strides = ShapeUtil.computeStrides(shape);
+  let o2iSnippet = '';
+  for (let i = 0; i < shape.length - 1; i++) {
+    o2iSnippet += `
+    let dim${i} = current / ${strides[i]}u;
+    let rest${i} = current % ${strides[i]}u;
+    (*indices)[${i}] = dim${i};
+    current = rest${i};
+    `;
+  }
+  o2iSnippet += `(*indices)[${shape.length - 1}] = current;`;
+
+  const o2iImpl = shape.length < 2 ? '' : `
+  fn ih_o2i_${name}(offset: u32, indices: ptr<function, ${iType}>) {
+    var current = offset;
+    ${o2iSnippet}
+  }`;
+
+  const o2iCall = (varOffset: string, varIndices: string) =>
+      shape.length < 2 ? `${varIndices}=${varOffset};` : `ih_o2i_${name}(${varOffset}, &${varIndices});`;
+
+  const offsets: string[] = [];
+  for (let i = shape.length - 1; i >= 0; i--) {
+    offsets.push(`${strides[i]}u * ((*indices)[${i}])`);
+  }
+
+  const i2oImpl = shape.length < 2 ? '' : `
+  fn ih_i2o_${name}(indices: ptr<function, ${iType}>) -> u32 {
+    return ${offsets.length > 0 ? offsets.join('+') : '0u'};
+  }`;
+
+  const i2oExpression = (varIndices: string, isPtr?: boolean) =>
+      shape.length < 2 ? `(${isPtr ? '*' : ''}${varIndices})` : `ih_i2o_${name}(${isPtr ? '' : '&'}${varIndices})`;
+
+  const indicesVariableDeclaration = (v: string, init?: string[]) =>
+      `var ${v}:${iType}${init ? `=${iType}(${init.join(',')})` : ''};`;
+
+  return {o2iImpl, o2iCall, i2oImpl, i2oExpression, indicesVariableDeclaration, iType};
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts
new file mode 100644
index 0000000000000..37fb8be4536fa
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts
@@ -0,0 +1,176 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
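+
+// [editor's note] Sketch of how the kernel below locates the source tensor, assuming the
+// cumulative-size scheme of sizeInConcatAxis (illustrative numbers, not from the original patch):
+// concatenating inputs with axis sizes [2, 3, 4] gives sizeInConcatAxis = [2, 5, 9]. An output
+// index of 6 on the concat axis falls in the half-open range [5, 9), so calculateInputIndex
+// returns 2 and the local index becomes 6 - sizeInConcatAxis[1] = 1.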
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {createIndicesHelper, IndicesHelper, WORKGROUP_SIZE} from './common';
+
+export interface ConcatAttributes extends AttributeWithCacheKey {
+  readonly axis: number;
+}
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length < 1) {
+    throw new Error('too few inputs');
+  }
+
+  const inputType = inputs[0].type;
+  const inputDimensionality = inputs[0].dims.length;
+
+  // TODO: Support string concat
+  if (inputType === 'string') {
+    throw new Error('string tensor is not supported yet');
+  }
+
+  for (const input of inputs) {
+    // make sure types of all inputs match
+    if (input.type !== inputType) {
+      throw new Error('input tensors should be one type');
+    }
+
+    // make sure the dimensionality of all inputs are the same
+    if (input.dims.length !== inputDimensionality) {
+      throw new Error('input tensors should have the same shape');
+    }
+  }
+};
+
+export const concat = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConcatAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  return inferenceHandler.run(createConcatProgramInfoLoader(inputs, attributes), inputs);
+};
+
+const createConcatProgramMetadata = (inputCount: number, cacheHint: string) =>
+    ({name: 'Concat', inputTypes: Array(inputCount).fill(GpuDataType.default), cacheHint});
+
+const createConcatProgramInfo =
+    (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => {
+      const inputShape = inputs[0].dims.slice();
+      if (axis >= inputShape.length || axis < (-1 * inputShape.length)) {
+        throw new Error('axis specified for concat doesn\'t match input dimensionality');
+      }
+      if (axis < 0) {
+        axis = inputShape.length + axis;
+      }
+      // ensure all of the non-concatenated axes match each other
+      // calculate the shape of the output tensor while we do that
+      const outputShape = inputShape.slice(0);
+      for (let i = 1; i < inputs.length; i++) {
+        const dataNShape = inputs[i].dims.slice();
+        for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) {
+          // add to the placeholder for computing output shape
+          if (axisIndex === axis) {
+            outputShape[axis] += dataNShape[axisIndex];
+          }
+          // ensure all non-concatenated axes match each other
+          else if (inputShape[axisIndex] !== dataNShape[axisIndex]) {
+            throw new Error('non concat dimensions must match');
+          }
+        }
+      }
+
+      const outputSize = ShapeUtil.size(outputShape);
+      const rank = outputShape.length;
+
+      const sizeInConcatAxis = new Array<number>(inputs.length);
+      const inputStorageBuffersDeclarations = new Array<string>(inputs.length);
+      const inputIndicesHelpers = new Array<IndicesHelper>(inputs.length);
+
+      let previousSum = 0;
+      for (let i = 0; i < inputs.length; ++i) {
+        previousSum += inputs[i].dims[axis];
+        sizeInConcatAxis[i] = previousSum;
+
+        inputStorageBuffersDeclarations[i] =
+            `@group(0) @binding(${i}) var<storage, read> input${i} : array<${dataType}>;`;
+
+        inputIndicesHelpers[i] = createIndicesHelper(`input${i}`, inputs[i].dims);
+      }
+
+      const outputIndicesHelper = createIndicesHelper('output', outputShape);
+
+      const indicesAxis = rank < 2 ?
'indices' : `indices[${axis}]`;
+      const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+  ${inputStorageBuffersDeclarations.join('\n')}
+  @group(0) @binding(${inputs.length}) var<storage, read_write> output : array<${dataType}>;
+
+  ${inputIndicesHelpers.map(i => i.i2oImpl).join('\n')}
+  ${outputIndicesHelper.o2iImpl}
+
+  let sizeInConcatAxis = array<u32, ${sizeInConcatAxis.length}>(${sizeInConcatAxis.map(i => `${i}u`).join(',')});
+  ${calculateInputIndexImpl(sizeInConcatAxis.length)}
+  ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)}
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    ${outputIndicesHelper.indicesVariableDeclaration('indices')}
+    ${outputIndicesHelper.o2iCall('global_id.x', 'indices')}
+
+    let textureIndex = calculateInputIndex(${indicesAxis});
+    if (textureIndex != 0u) {
+      ${indicesAxis} -= sizeInConcatAxis[textureIndex - 1u];
+    }
+
+    output[global_id.x] = readBufferData(textureIndex, &indices);
+  }`;
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const createConcatProgramInfoLoader = (inputs: Tensor[], attributes: ConcatAttributes): ProgramInfoLoader => {
+  const metadata = createConcatProgramMetadata(inputs.length, attributes.cacheKey);
+  return {...metadata, get: () => createConcatProgramInfo(metadata, inputs, attributes.axis)};
+};
+
+const calculateInputIndexImpl = (numberOfTensors: number): string => `
+  fn calculateInputIndex(index: u32) -> u32 {
+    for (var i: u32 = 0u; i < ${numberOfTensors}u; i += 1u ) {
+      if (index < sizeInConcatAxis[i]) {
+        return i;
+      }
+    }
+    return ${numberOfTensors}u;
+  }`;
+
+const readBufferDataImpl = (indicesHelper: readonly IndicesHelper[], tensorRank: number, dataType: string) => {
+  const numberOfTensors = indicesHelper.length;
+  const codeLines: string[] = [];
+  for (let i = 0; i < numberOfTensors; ++i) {
+    const returnSnippet = `return input${i}[${indicesHelper[i].i2oExpression('indices', true)}];`;
+    if (numberOfTensors === 1) {
+      codeLines.push(returnSnippet);
+    } else if (i === 0) {
+      codeLines.push(`if (textureIndex == ${i}u) { ${returnSnippet} }`);
+    } else if (i === numberOfTensors - 1) {
+      codeLines.push(`else { ${returnSnippet} }`);
+    } else {
+      codeLines.push(`else if (textureIndex == ${i}u) { ${returnSnippet} }`);
+    }
+  }
+  return `
+  fn readBufferData(textureIndex: u32, indices: ptr<function, array<u32, ${tensorRank}>>) -> ${dataType} {
+    ${codeLines.join('\n')}
+  }`;
+};
+
+export const parseConcatAttributes: OperatorInitialization<ConcatAttributes> = (node: Graph.Node): ConcatAttributes =>
+    createAttributeWithCacheKey({axis: node.attributes.getInt('axis')});
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
new file mode 100644
index 0000000000000..570ec041a34fc
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
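+
+// [editor's note] A hedged example of the grouped-conv shape math used below (values are
+// hypothetical): x = [1, 8, 5, 5], w = [8, 2, 3, 3], group = 4, strides/dilations = [1, 1],
+// pads = [0, 0, 0, 0]. calculateOutputShape yields [1, 8, 3, 3] (floor((5 - 3 + 1) / 1) = 3),
+// outputChannelsPerGroup = 8 / 4 = 2, and each output channel reads wShape[1] = 2 input channels
+// starting at group_id * 2.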
+
+import {Logger} from '../../../instrument';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {createIndicesHelper, WORKGROUP_SIZE} from './common';
+import {calculateOutputShape, ConvAttributes} from './conv';
+import {getActicationSnippet} from './fuse-utils';
+
+const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({
+  name: 'GroupedConv',
+  inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+                        [GpuDataType.default, GpuDataType.default],
+  cacheHint
+});
+
+const createGroupedConvProgramInfo =
+    (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata,
+     attributes: ConvAttributes): ProgramInfo => {
+      const hasBias = inputs.length > 2;
+      const processBias = hasBias ? 'value += b[output_channel];' : '';
+      const xShape = inputs[0].dims;
+      const wShape = inputs[1].dims;
+      const outputChannelsPerGroup = wShape[0] / attributes.group;
+
+      const dataType = 'f32';  // TODO: support other data type
+      const {activationFunction, applyActivation} = getActicationSnippet(attributes);
+      const inputStorageBuffersDeclarations = [
+        `@group(0) @binding(0) var<storage, read> x : array<${dataType}>;`,
+        `@group(0) @binding(1) var<storage, read> w : array<${dataType}>;`
+      ];
+      if (hasBias) {
+        inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> b : array<${dataType}>;`);
+      }
+
+      Logger.verbose(
+          'GroupedConv',
+          `autoPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, kernelShape:${
+              attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`);
+      const outputShape =
+          calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides);
+      const outputSize = ShapeUtil.size(outputShape);
+      const outputIndicesHelper = createIndicesHelper('output', outputShape);
+      const xIndicesHelper = createIndicesHelper('x', xShape);
+      const wIndicesHelper = createIndicesHelper('w', wShape);
+
+      const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+  const strides: vec2<u32> = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u);
+  const pads: vec2<u32> = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u);
+
+  ${inputStorageBuffersDeclarations.join('\n')}
+  @group(0) @binding(${inputStorageBuffersDeclarations.length}) var<storage, read_write> output : array<${dataType}>;
+
+  ${activationFunction}
+  ${outputIndicesHelper.o2iImpl}
+  ${xIndicesHelper.i2oImpl}
+  ${wIndicesHelper.i2oImpl}
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')}
+    ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')}
+    let batch: u32 = outputIndices[0];
+    let output_channel: u32 = outputIndices[1];
+    let xRCCorner: vec2<u32> = vec2(outputIndices[2], outputIndices[3]) * strides - pads;
+    let group_id: u32 = output_channel / ${outputChannelsPerGroup}u;
+
+    var value: ${dataType} = ${dataType}(0);
+    for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) {
+      let input_channel = group_id * ${wShape[1]}u + wInChannel;
+      for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) {
+        let xHeight = xRCCorner.x + wHeight *
${attributes.dilations[0]}u; + + if (xHeight < 0u || xHeight >= ${xShape[2]}u) { + continue; + } + + for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { + let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; + if (xWidth < 0u || xWidth >= ${xShape[3]}u) { + continue; + } + + ${ + xIndicesHelper.indicesVariableDeclaration( + 'xIndices', + [ + 'batch', 'input_channel', 'xHeight', 'xWidth' + ])} + let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${ + wIndicesHelper.indicesVariableDeclaration('wIndices', [ + 'output_channel', 'wInChannel', 'wHeight', 'wWidth' + ])} + let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; + value += xVal*wVal; + } + } + } + ${processBias} + ${applyActivation} + output[global_id.x] = value; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +export const createGroupedConvProgramInfoLoader = + (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): + ProgramInfoLoader => { + const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); + return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, attributes)}; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts new file mode 100644 index 0000000000000..644e9b08c7030 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -0,0 +1,150 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +import {InferenceHandler} from '../../../backend'; +import {Graph} from '../../../graph'; +import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +import {Tensor} from '../../../tensor'; +import {PoolConvUtil} from '../../../util'; +import {WebGpuInferenceHandler} from '../inference-handler'; + +import {createGroupedConvProgramInfoLoader} from './conv-grouped'; +// import {createDotProductProgramInfoLoader} from './dot-product'; +import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +// import {createIm2ColProgramInfoLoader} from './im2col'; +// import {createMatmulProgramInfoLoader} from './matmul'; + + +export const calculateOutputShape = + (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], + adjustPads: readonly number[], strides: readonly number[]): number[] => { + const batchSize = inputShape[0]; + const inputSpatialShape = inputShape.slice(2); + const spatialRank = inputSpatialShape.length; + const outChannels = kernelShape[0]; + const kernelSpatialShape = kernelShape.slice(2); + const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); + const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); + const outputSpatialShape = + inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); + const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); + return outputShape; + }; + +export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { + readonly autoPad: string; + readonly dilations: readonly 
number[];
+  readonly group: number;
+  readonly kernelShape: readonly number[];
+  readonly pads: readonly number[];
+  readonly strides: readonly number[];
+}
+
+export const conv: OperatorAsyncImplementation<ConvAttributes> =
+    async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs, attributes);  // currently will fail if not conv2D
+  return conv2d(inferenceHandler, inputs, attributes);
+};
+
+const conv2d: OperatorAsyncImplementation<ConvAttributes> = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise<Tensor[]> => {
+  const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs);
+  // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1;
+  // if (adjustedAttributes.group > 1) {
+  return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), inputs);
+  // } else if (isPointwise) {
+  //   return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes);
+  // } else {
+  //   return conv2D(inferenceHandler, inputs, adjustedAttributes);
+  // }
+};
+
+const getAdjustedConvAttributes = <T extends ConvAttributes>(attributes: T, inputs: Tensor[]): T => {
+  const kernelShape = attributes.kernelShape.slice();
+  // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims
+  if (attributes.kernelShape.length === 0) {
+    for (let i = 2; i < inputs[1].dims.length; ++i) {
+      kernelShape.push(inputs[1].dims[i]);
+    }
+  }
+  const pads = attributes.pads.slice();
+  PoolConvUtil.adjustPadsBasedOnAutoPad(
+      inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad);
+
+  // always return a new object so does not modify the original attributes
+  const newAttributes: T = Object.assign({}, attributes);
+  Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey});
+  return newAttributes;
+};
+
+export const parseConvAttributes: OperatorInitialization<ConvAttributes> = (node: Graph.Node): ConvAttributes => {
+  const attributes = node.attributes;
+  const activationAttributes = parseInternalActivationAttributes(attributes);
+  // TODO : Make this generic enough to compute default attributes for multi-dimensional conv
+  const autoPad = attributes.getString('auto_pad', 'NOTSET');
+  const dilations = attributes.getInts('dilations', [1, 1]);
+  const group = attributes.getInt('group', 1);
+  const kernelShape = attributes.getInts('kernel_shape', []);
+  const pads = attributes.getInts('pads', [0, 0, 0, 0]);
+  const strides = attributes.getInts('strides', [1, 1]);
+
+  return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes});
+};
+
+const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => {
+  // Refer to the below link for all input checks
+  // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv
+  if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) {
+    throw new Error('Conv requires 2 or 3 inputs');
+  }
+
+  // TODO : Need to add support for multi-dimensional conv
+  if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) {
+    throw new Error('currently only support 2-dimensional conv');
+  }
+
+  // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL
+  const dataChannel = inputs[0].dims[1];
+  const filterInChannel = inputs[1].dims[1] * attributes.group;
+  if (dataChannel !== filterInChannel) {
+    throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL');
+  }
+
+  // if bias
is provided it should be 1D and the number of elements should be equal to the number of feature maps
+  if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) {
+    throw new Error('invalid bias');
+  }
+
+  const spatialRank = inputs[0].dims.length - 2;
+  // wrong dilations dimension
+  if (attributes.dilations.length !== spatialRank) {
+    throw new Error(`dilations should be ${spatialRank}D`);
+  }
+
+  // Wrong strides dimension
+  if (attributes.strides.length !== spatialRank) {
+    throw new Error(`strides should be ${spatialRank}D`);
+  }
+
+  // Wrong pads dimension
+  if (attributes.pads.length !== spatialRank * 2) {
+    throw new Error(`pads should be ${spatialRank * 2}D`);
+  }
+
+  // if kernelShape is specified, its data length must be 2 less than the dims length of the weights tensor
+  // (the first 2 dims are batch_size and channels)
+  if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) {
+    throw new Error('invalid kernel shape');
+  }
+
+  // TODO : Need to add support for float64
+  if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') {
+    throw new Error('Conv input(X,W) should be float tensor');
+  }
+
+  if (inputs.length === 3 && inputs[2].type !== 'float32') {
+    throw new Error('Conv input(bias) should be float tensor');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts
new file mode 100644
index 0000000000000..fae2c9fb6e9b2
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Attribute} from '../../../attribute';
+import {MAX_CLIP, MIN_CLIP} from '../../../util';
+
+export interface InternalActivationAttributes {
+  readonly activation: string;
+  readonly clipMin?: number;
+  readonly clipMax?: number;
+  readonly activationCacheKey: string;
+}
+
+export function getActicationSnippet(attributes: InternalActivationAttributes) {
+  switch (attributes.activation) {
+    case 'Relu':
+      return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'};
+    case 'Sigmoid':
+      return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'};
+    case 'Clip':
+      return {
+        activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`,
+        applyActivation: 'value = clamp(value, clip_min_, clip_max_);'
+      };
+      // TODO: add other activations that can be fused.
+    default:
+      return {activationFunction: '', applyActivation: ''};
+  }
+}
+
+export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => {
+  const activation = attributes.getString('activation', '');
+
+  if (activation === 'Clip') {
+    const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]);
+    return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`};
+  }
+  return {activation, activationCacheKey: activation};
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
new file mode 100644
index 0000000000000..65f679a2cea83
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
@@ -0,0 +1,131 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
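+
+// [editor's note] Illustrative example of the A|B|C index split described in the kernel below
+// (hypothetical shapes): data = [3, 4, 5], indices = [2], axis = 1 -> output = [3, 2, 5]. For an
+// output index [a, b, c], dataIdx becomes [a, u32(indices[b]), c], with negative indices wrapped
+// by adding dataShape[axis] = 4, matching the ONNX Gather definition.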
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {NUMBER_TYPES, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {createIndicesHelper, WORKGROUP_SIZE} from './common';
+
+interface GatherAttributes extends AttributeWithCacheKey {
+  readonly axis: number;
+}
+
+export const gather = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs, attributes.axis);
+  return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs);
+};
+
+export const parseGatherAttributes: OperatorInitialization<GatherAttributes> = (node: Graph.Node): GatherAttributes =>
+    createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)});
+
+const gatherProgramMetadata = {
+  name: 'Gather',
+  inputTypes: [GpuDataType.default, GpuDataType.default]
+};
+
+const createGatherProgramInfo =
+    (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => {
+      const dataShape = inputs[0].dims.slice();
+      const indicesShape = inputs[1].dims.slice();
+      const outputShape = new Array<number>(dataShape.length + indicesShape.length - 1);
+
+      axis = ShapeUtil.normalizeAxis(axis, dataShape.length);
+      const indexCopyOps: string[] = [];
+      if (indicesShape.length > 1) {
+        indexCopyOps.push('indicesIdx[0] = 0u;');
+      } else {
+        indexCopyOps.push('indicesIdx = 0u;');
+      }
+      for (let i = 0; i < outputShape.length; i++) {
+        // outputShape is divided into three parts: A, B, C
+        // |0        axis|  axis + indicesShape.length |          end|
+        // |      A      |              B              |      C      |
+        //
+        // dataIdx: [A, inputs[1][B], C]
+        const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx';
+        if (i < axis) {  // A
+          const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i}]` : 'dataIdx';
+          outputShape[i] = dataShape[i];
+          indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`);
+        } else {
+          if (i < axis + indicesShape.length) {  // B
+            const indicesIdxLValue = indicesShape.length > 1 ? `indicesIdx[${i - axis}]` : 'indicesIdx';
+            outputShape[i] = indicesShape[i - axis];
+            indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`);
+          } else {  // C
+            const dataIdxLValue = dataShape.length > 1 ?
`dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx';
+            outputShape[i] = dataShape[i - indicesShape.length + 1];  // skip 1 for axis
+            indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`);
+          }
+        }
+      }
+      const outputSize = ShapeUtil.size(outputShape);
+      const outputIndicesHelper = createIndicesHelper('output', outputShape);
+      const dataIndicesHelper = createIndicesHelper('data', dataShape);
+      const indicesIndicesHelper = createIndicesHelper('indices', indicesShape);
+
+      const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+  @group(0) @binding(0) var<storage, read> data : array<${dataType}>;
+  @group(0) @binding(1) var<storage, read> indices : array<i32>;
+  @group(0) @binding(2) var<storage, read_write> output : array<${dataType}>;
+
+  ${outputIndicesHelper.o2iImpl}
+  ${indicesIndicesHelper.i2oImpl}
+  ${dataIndicesHelper.i2oImpl}
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')}
+    ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')}
+    ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')}
+    ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')}
+    ${indexCopyOps.join('\n        ')}
+    let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}];
+    dataIdx${dataShape.length > 1 ? `[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0));
+    output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}];
+  }`;
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => {
+  const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey};
+  return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)};
+};
+
+const validateInputs = (inputs: Tensor[], axis: number): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('Gather requires 2 inputs.');
+  }
+  const tensorRank = inputs[0].dims.length;
+  if (tensorRank < 1) {
+    throw new Error('Invalid input shape.');
+  }
+  if (axis < -tensorRank || axis > tensorRank - 1) {
+    throw new Error('Invalid axis.');
+  }
+  if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) {
+    throw new Error('Invalid input type.');
+  }
+  if (inputs[1].type !== 'int32') {
+    throw new Error('Invalid input type.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
new file mode 100644
index 0000000000000..3eeb49c91033a
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
@@ -0,0 +1,165 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
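+
+// [editor's note] The kernel below computes Y = alpha * op(A) * op(B) + beta * C, one output
+// element per invocation. A hedged example (hypothetical sizes): M = 2, K = 3, N = 4 with
+// transA = transB = false gives value += a[m * 3 + k] * b[k * 4 + n] for k in [0, 3), and offsetC
+// broadcasts a C of shape [1, N] (or [N]) across rows, per the ONNX Gemm spec.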
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {GemmUtil, ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {WORKGROUP_SIZE} from './common';
+
+export interface GemmAttributes extends AttributeWithCacheKey {
+  transA: boolean;
+  transB: boolean;
+  alpha: number;
+  beta: number;
+  isOptionalC: boolean;  // in opset 11, C becomes optional
+}
+
+export const gemm: OperatorAsyncImplementation<GemmAttributes> = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs, attributes);
+  return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs);
+};
+
+const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => {
+  const transA = node.attributes.getInt('transA', 0) !== 0;
+  const transB = node.attributes.getInt('transB', 0) !== 0;
+  const alpha = node.attributes.getFloat('alpha', 1.0);
+  const beta = node.attributes.getFloat('beta', 1.0);
+  return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC});
+};
+
+export const parseGemmAttributesV7: OperatorInitialization<GemmAttributes> = (node: Graph.Node): GemmAttributes =>
+    parseGemmAttributes(node, false);
+
+export const parseGemmAttributesV11: OperatorInitialization<GemmAttributes> = (node: Graph.Node): GemmAttributes =>
+    parseGemmAttributes(node, true);
+
+const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => {
+  const metadata = {
+    name: 'Gemm',
+    inputTypes: inputs.length === 3 ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+                                      [GpuDataType.default, GpuDataType.default],
+    cacheHint: attributes.cacheKey
+  };
+
+  return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)};
+};
+
+const offsetC = (m: number, n: number, dims: readonly number[]): string => {
+  const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m);
+  const broadcastN = dims[dims.length - 1] !== n;
+
+  let offset = '0u';
+  if (!broadcastM) {
+    offset += `+ m * ${dims[dims.length - 1]}u`;
+  }
+  if (!broadcastN) {
+    offset += '+n';
+  }
+
+  return offset;
+};
+
+const createGemmProgramInfo =
+    (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => {
+      const aShape = inputs[0].dims.slice();
+      const bShape = inputs[1].dims.slice();
+      const [M, N, K] = GemmUtil.getShapeOfGemmResult(
+          aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ?
inputs[2].dims : undefined);
+      const outputShape = [M, N];
+      if (!outputShape) {
+        throw new Error('Can\'t use gemm on the given tensors');
+      }
+      const outputSize = ShapeUtil.size(outputShape);
+      let line = '';
+      if (attributes.transA && attributes.transB) {
+        line = 'value += a[k * M + m] * b[n * K + k];';
+      } else if (attributes.transA && !attributes.transB) {
+        line = 'value += a[k * M + m] * b[k * N + n];';
+      } else if (!attributes.transA && attributes.transB) {
+        line = 'value += a[m * K + k] * b[n * K + k];';
+      } else if (!attributes.transA && !attributes.transB) {
+        line = 'value += a[m * K + k] * b[k * N + n];';
+      }
+
+      const dataType = 'f32';  // TODO: support other data type
+      const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;';
+      const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : '';
+      const inputStorageBuffersDeclarations = [
+        `@group(0) @binding(0) var<storage, read> a : array<${dataType}>;`,
+        `@group(0) @binding(1) var<storage, read> b : array<${dataType}>;`
+      ];
+      if (inputs.length === 3) {
+        inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> c : array<${dataType}>;`);
+      }
+      const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+  const M: u32 = ${M}u;
+  const N: u32 = ${N}u;
+  const K: u32 = ${K}u;
+  const alpha = ${dataType}(${attributes.alpha});
+  const beta = ${dataType}(${attributes.beta});
+
+  ${inputStorageBuffersDeclarations.join('\n')}
+  @group(0) @binding(${inputs.length}) var<storage, read_write> output : array<${dataType}>;
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    let m = global_id.x / N;
+    let n = global_id.x % N;
+
+    var value = ${dataType}(0);
+    for (var k: u32 = 0u; k<${K}u; k++) {
+      ${line}
+    }
+
+    ${calculateAlpha}
+    ${calculateC}
+    output[global_id.x] = value;
+
+  }`;
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => {
+  if (!inputs) {
+    throw new Error('Input is missing');
+  }
+  if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) {
+    throw new Error('Invalid input shape.');
+  }
+  if (!attributes.isOptionalC && inputs.length !== 3) {
+    throw new Error('Gemm requires 3 inputs');
+  }
+
+  // 'C' can be of dimensionality 1 or 2 only
+  if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) {
+    throw new Error('Invalid input shape of C');
+  }
+
+  if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') ||
+      (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') ||
+      (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) {
+    throw new Error('Invalid input type.');
+  }
+
+  if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) {
+    throw new Error('Input types are mismatched');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
new file mode 100644
index 0000000000000..5b8f0bf94733e
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
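+
+// [editor's note] Sketch of the batched ("stacked") MatMul indexing used below, with hypothetical
+// shapes a = [2, 3, 4], b = [2, 4, 5] -> output = [2, 3, 5] (M = 3, K = 4, N = 5). Each invocation
+// decodes stack = global_id.x / (M * N), then m and n, and reads a[stack * M * K + m * K + k] and
+// b[stack * K * N + k * N + n]; true broadcasting across stacks is still a TODO in this draft.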
+
+import {Graph} from '../../../graph';
+import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {BroadcastUtil, ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {WORKGROUP_SIZE} from './common';
+import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils';
+
+export const matMul: OperatorAsyncImplementation<InternalActivationAttributes> =
+    async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes):
+        Promise<Tensor[]> => {
+          validateInputs(inputs);
+
+          return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs);
+        };
+
+export const parseMatMulAttributes: OperatorInitialization<InternalActivationAttributes> =
+    (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes);
+
+const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({
+  name: 'MatMul',
+  inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+                        [GpuDataType.default, GpuDataType.default],
+  cacheHint
+});
+
+function createMatmulProgramInfo(
+    metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo {
+  const aShape = inputs[0].dims;
+  const bShape = inputs[1].dims;
+  const outputShape = BroadcastUtil.calcShape(aShape, bShape, true);
+  if (!outputShape) {
+    throw new Error('Can\'t use matmul on the given tensors');
+  }
+  const outputSize = ShapeUtil.size(outputShape);
+  // TODO: support broadcasting
+
+  const dataType = 'f32';  // TODO: support other data type
+  const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes);
+
+  const M = outputShape[outputShape.length - 2];
+  const K = aShape[aShape.length - 1];
+  const N = outputShape[outputShape.length - 1];
+  const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+  const M: u32 = ${M}u;
+  const N: u32 = ${N}u;
+  const K: u32 = ${K}u;
+
+  @group(0) @binding(0) var<storage, read> a : array<${dataType}>;
+  @group(0) @binding(1) var<storage, read> b : array<${dataType}>;
+  @group(0) @binding(2) var<storage, read_write> output : array<${dataType}>;
+
+  ${activationFunction}
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    let stack = global_id.x / (M * N);
+    let mn = global_id.x % (M * N);
+    let n = global_id.x % N;
+    let m = mn / N;
+
+    let offsetA = stack * (M * K);
+    let offsetB = stack * (K * N);
+
+    var value = ${dataType}(0);
+    for (var k: u32 = 0u; k<${K}u; k++) {
+      value += a[offsetA + m * K + k] * b[offsetB + k * N + n];
+    }
+    ${applyActivation}
+    output[global_id.x] = value;
+  }`;
+  return {
+    ...metadata,
+    outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+    shaderSource,
+    dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+  };
+}
+
+export function createMatmulProgramInfoLoader(
+    inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader {
+  const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey);
+  return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)};
}
+
+const validateInputs = (inputs:
Tensor[]): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('MatMul requires 2 inputs.');
+  }
+
+  if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) {
+    throw new Error('Shared dimension does not match.');
+  }
+
+  if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') ||
+      (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) {
+    throw new Error('Inputs should be float type.');
+  }
+
+  if (inputs[0].type !== inputs[1].type) {
+    throw new Error('Input types should match.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
new file mode 100644
index 0000000000000..0e92ff8cb906a
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
@@ -0,0 +1,376 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {PoolConvUtil, ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
+
+import {createIndicesHelper, WORKGROUP_SIZE} from './common';
+
+export interface AveragePoolAttributes extends AttributeWithCacheKey {
+  readonly autoPad: string;
+  readonly ceilMode: number;
+  readonly countIncludePad: boolean;
+  readonly kernelShape: readonly number[];
+  readonly strides: readonly number[];
+  readonly pads: readonly number[];
+}
+
+export const averagePool: OperatorAsyncImplementation<AveragePoolAttributes> =
+    async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes):
+        Promise<Tensor[]> => {
+          validateInputs(inputs);
+          const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey};
+          return inferenceHandler.run(
+              {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs);
+        };
+
+export const parseAveragePoolAttributes: OperatorInitialization<AveragePoolAttributes> =
+    (node: Graph.Node): AveragePoolAttributes => {
+      const autoPad = node.attributes.getString('auto_pad', 'NOTSET');
+      const ceilMode = node.attributes.getInt('ceil_mode', 0);
+      const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true);
+      const kernelShape = node.attributes.getInts('kernel_shape');
+      const strides = node.attributes.getInts('strides', []);
+      const pads = node.attributes.getInts('pads', []);
+
+      // TODO: support attribute 'ceil_mode'
+      if (ceilMode !== 0) {
+        throw new Error('using ceil() in shape computation is not yet supported for AveragePool');
+      }
+
+      return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads});
+    };
+
+const createAveragePoolProgramInfo =
+    (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean,
+     attributes: AveragePoolAttributes): ProgramInfo => {
+      const [adjustedAttributes, outputShape] =
+          getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator);
+      const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape);
+
+      const dataType = 'f32';
+
+      const op1 = 'value += x_val;';
+      let op2 = '';
+      if (adjustedAttributes.countIncludePad) {
+        op2 += `value /= ${dataType}(${kernelSize});`;
+      } else {
+        op2 += `value /= ${dataType}(${kernelSize} - pad);`;
+      }
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'),
+        dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)})
+      };
+    };
+
+export const globalAveragePool: OperatorAsyncImplementation<AveragePoolAttributes> =
+    async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes):
+        Promise<Tensor[]> => {
+          validateInputs(inputs);
+          const metadata = {
+            name: 'GlobalAveragePool',
+            inputTypes: [GpuDataType.default],
+            cacheHint: `${attributes.countIncludePad}`
+          };
+          return inferenceHandler.run(
+              {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs);
+        };
+
+export const parseGlobalAveragePoolAttributes: OperatorInitialization<AveragePoolAttributes> =
+    (node: Graph.Node): AveragePoolAttributes => {
+      const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true);
+      return createAttributeWithCacheKey(
+          {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []});
+    };
+
+export interface MaxPoolAttributes extends AveragePoolAttributes {
+  readonly storageOrder: number;
+  readonly dilations: number[];
+}
+
+export const maxPool: OperatorAsyncImplementation<MaxPoolAttributes> = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey};
+  return inferenceHandler.run(
+      {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs);
+};
+
+export const parseMaxPoolAttributes: OperatorInitialization<MaxPoolAttributes> =
+    (node: Graph.Node): MaxPoolAttributes => {
+      const autoPad = node.attributes.getString('auto_pad', 'NOTSET');
+      const ceilMode = node.attributes.getInt('ceil_mode', 0);
+      const kernelShape = node.attributes.getInts('kernel_shape');
+      const strides = node.attributes.getInts('strides', []);
+      const pads = node.attributes.getInts('pads', []);
+      const storageOrder = node.attributes.getInt('storage_order', 0);
+      const dilations = node.attributes.getInts('dilations', []);
+
+      // TODO: support attribute 'ceil_mode' and 'storage_order'
+      if (storageOrder !== 0) {
+        throw new Error('column-major storage order is not yet supported for MaxPool');
+      }
+      if (ceilMode !== 0) {
+        throw new Error('using ceil() in shape computation is not yet supported for MaxPool');
+      }
+
+      return createAttributeWithCacheKey(
+          {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations});
+    };
+
+const createMaxPoolProgramInfo =
+    (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes):
+        ProgramInfo => {
+          const [adjustedAttributes, outputShape] =
+              getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator);
+          const op1 = `
+              value = max(x_val, value);
+          `;
+          const op2 = '';
+          return {
+            ...metadata,
+            outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+            shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'),
+            dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)})
+          };
+        };
+
+const getAdjustedPoolAttributesAndOutputShape =
+    (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean):
+        [AveragePoolAttributes|MaxPoolAttributes, number[]] => {
+          const inputShape = inputs[0].dims.slice();
+          const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations');
+          const kernelShape = attributes.kernelShape.slice();
+          const strides = attributes.strides.slice();
+          const dilations: number[] = hasDilations ? (attributes as MaxPoolAttributes).dilations.slice() : [];
+          const pads = attributes.pads.slice();
+          PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads);
+
+          const outputShape = PoolConvUtil.computePoolOutputShape(
+              isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad);
+
+          const newAttributes = Object.assign({}, attributes);
+          if (hasDilations) {
+            Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey});
+          } else {
+            Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey});
+          }
+          return [newAttributes, outputShape];
+        };
+
+const globalMaxPoolAttributes = {
+  autoPad: '',
+  ceilMode: 0,
+  countIncludePad: false,
+  kernelShape: [],
+  strides: [],
+  pads: [],
+  storageOrder: 0,
+  dilations: [],
+  cacheKey: ''
+};
+
+const globalMaxPoolMetadata = {
+  name: 'GlobalMaxPool',
+  inputTypes: [GpuDataType.default]
+};
+
+export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  return inferenceHandler.run(
+      {
+        ...globalMaxPoolMetadata,
+        get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes)
+      },
+      inputs);
+};
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Pool ops require 1 input.');
+  }
+  if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') {
+    throw new Error('Invalid input type.');
+  }
+};
+
+const generatePoolingCode =
+    (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string,
+     op2: string, dataType: string, start: string): string => {
+      const rank = inputDims.length;
+      const outputSize = ShapeUtil.size(outputShape);
+      const outputIndicesHelper = createIndicesHelper('output', outputShape);
+      const xIndicesHelper = createIndicesHelper('x', inputDims);
+
+      if (attributes.kernelShape.length <= 2) {
+        const kw = attributes.kernelShape[attributes.kernelShape.length - 1];
+        const sw = attributes.strides[attributes.strides.length - 1];
+        const pwStart = attributes.pads[attributes.pads.length / 2 - 1];
+        const pwEnd = attributes.pads[attributes.pads.length - 1];
+        const dimW = inputDims[rank - 1];
+        let codeW = '';
+        let codeH = '';
+        let codeHEnd = '';
+        if (pwStart + pwEnd !== 0) {
+          codeW = `
+              for (var i: u32 = 0u; i < ${kw}u; i++) {
+                xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i;
+                if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) {
+                  pad++;
+                  continue;
+                }
+                let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}];
+                ${op1}
+              }`;
+        } else {
+          codeW = `
+              for (var i: u32 = 0u; i < ${kw}u; i++) {
+                xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i;
+                let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}];
+                ${op1}
+              }`;
+        }
+
+        if (attributes.kernelShape.length === 2) {
+          const kh = attributes.kernelShape[attributes.kernelShape.length - 2];
+          const sh = attributes.strides[attributes.strides.length - 2];
+          const phStart = attributes.pads[attributes.pads.length / 2 - 2];
+          const phEnd = attributes.pads[attributes.pads.length - 2];
+          const dimH = inputDims[rank - 2];
+          if (phStart + phEnd !== 0) {
+            codeH = `
+                for (var j: u32 = 0u; j < ${kh}u; j++) {
+                  xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j;
+                  if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) {
+                    pad+= ${kw};
+                    continue;
+                  }
+              `;
+          } else {
+            codeH = `
+                for (var j: u32 = 0u; j < ${kh}u; j++) {
+                  xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j;
+              `;
+          }
+          codeHEnd = `
+              }
+            `;
+        }
+
+        const poolingCode = `
+        const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+        @group(0) @binding(0) var<storage, read> x : array<${dataType}>;
+        @group(0) @binding(1) var<storage, write> output : array<${dataType}>;
+
+        ${outputIndicesHelper.o2iImpl}
+        ${xIndicesHelper.i2oImpl}
+
+        @compute @workgroup_size(WORKGROUP_SIZE)
+        fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+          // Guard against out-of-bounds work group sizes
+          if (global_id.x >= ${outputSize}u) {
+            return;
+          }
+
+          ${outputIndicesHelper.indicesVariableDeclaration('indices')}
+          ${outputIndicesHelper.o2iCall('global_id.x', 'indices')}
+          ${outputIndicesHelper.indicesVariableDeclaration('xIndices')}
+          ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')}
+
+          var value: ${dataType} = ${dataType}(${start});
+          var pad = 0;
+          ${codeH}
+          ${codeW}
+          ${codeHEnd}
+          ${op2}
+
+          output[global_id.x] = value;
+        }`;
+        return poolingCode;
+      } else {
+        const kernelSize = ShapeUtil.size(attributes.kernelShape);
+        const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape);
+        const stridesRank = kernelStrides.length;
+        const padsRank = attributes.pads.length;
+        const hasPads = attributes.pads.reduce((sum, cur) => sum + cur);
+        let padCode = '';
+        if (hasPads) {
+          padCode = `
+              if (xIndices[j] >= inputDims[j]) {
+                pad++;
+                isPad = true;
+                break;
+              }
+            }
+            if (!isPad) {
+              let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}];
+              ${op1}
+            }`;
+        } else {
+          padCode = `
+            }
+            let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}];
+            ${op1}
+          `;
+        }
+        const poolingCode = `
+        const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+        @group(0) @binding(0) var<storage, read> x : array<${dataType}>;
+        @group(0) @binding(1) var<storage, write> output : array<${dataType}>;
+
+        ${outputIndicesHelper.o2iImpl}
+        ${xIndicesHelper.i2oImpl}
+
+        const pads = array<u32, ${padsRank}>(${attributes.pads.map(i => `${i}u`).join(',')});
+        const inputDims = array<u32, ${rank}>(${inputDims.map(i => `${i}u`).join(',')});
+        const kernelStrides = array<u32, ${stridesRank}>(${kernelStrides.map(i => `${i}u`).join(',')});
+        const strides = array<u32, ${stridesRank}>(${attributes.strides.map(i => `${i}u`).join(',')});
+
+        @compute @workgroup_size(WORKGROUP_SIZE)
+        fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+          // Guard against out-of-bounds work group sizes
+          if (global_id.x >= ${outputSize}u) {
+            return;
+          }
+
+          ${outputIndicesHelper.indicesVariableDeclaration('indices')}
+          ${outputIndicesHelper.o2iCall('global_id.x', 'indices')}
+          ${outputIndicesHelper.indicesVariableDeclaration('xIndices')}
+          ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')}
+
+          var offsets: array<u32, ${stridesRank}>;
+
+          var value = ${dataType}(${start});
+          var pad = 0;
+          var isPad = false;
+
+          for (var i: u32 = 0u; i < ${kernelSize}u; i++) {
+            var offset = i;
+            for (var j = 0u; j < ${stridesRank - 1}u; j++) {
+              offsets[j] = offset / kernelStrides[j];
+              offset -= offsets[j] * kernelStrides[j];
+            }
+            offsets[${stridesRank - 1}] = offset;
+
+            isPad = false;
+            for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) {
+              xIndices[j] = indices[j] * strides[j - ${rank - stridesRank}u]
+                  + offsets[j - ${rank - stridesRank}u] - pads[j - 2u];
+              ${padCode}
+          }
+          ${op2}
+
+          output[global_id.x] = value;
+        }`;
+        return poolingCode;
+      }
+    };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts
new file mode 100644
index 0000000000000..763a656d92abb
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
+
+import {WORKGROUP_SIZE} from './common';
+
+export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  validateInputs(inputs);
+
+  const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)};
+
+  return inferenceHandler.run(
+      {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, inputs);
+};
+
+const createSumProgramInfo =
+    (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo => {
+      const dataType = 'f32';
+      const outputShape = inputs[0].dims;
+      const outputSize = ShapeUtil.size(outputShape);
+
+
+      const inputsDeclaration =
+          inputs.map((_, i) => `@group(0) @binding(${i}) var<storage, read> input${i} : array<${dataType}>;`);
+      const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+');
+      const shaderSource = `
+      const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+      ${inputsDeclaration.join('\n')}
+      @group(0) @binding(${inputs.length}) var<storage, write> output : array<${dataType}>;
+
+      @compute @workgroup_size(WORKGROUP_SIZE)
+      fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+        // Guard against out-of-bounds work group sizes
+        if (global_id.x >= ${outputSize}u) {
+          return;
+        }
+
+        let offset = global_id.x;
+
+        var value = ${dataType}(0);
+        value = ${sumLine};
+
+        output[offset] = value;
+      }`;
+      return {
+        ...sumProgramMetadata,
+        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length === 0) {
+    throw new Error('Sum requires inputs.');
+  }
+
+  const length = inputs[0].dims.length;
+  for (let i = 1; i < inputs.length; i++) {
+    if (length !== inputs[i].dims.length) {
+      throw new Error('Input shapes are mismatched. Broadcasting is not supported yet.');
+    }
+
+    for (let j = 0; j < length; j++) {
+      if (inputs[0].dims[j] !== inputs[i].dims[j]) {
+        throw new Error('Input shapes are mismatched. Broadcasting is not supported yet.');
+      }
+    }
+  }
+
+  if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') {
+    throw new Error('Invalid input type.');
+  }
+  for (let i = 1; i < inputs.length; i++) {
+    if (inputs[0].type !== inputs[i].type) {
+      throw new Error('Input types are mismatched.');
+    }
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts b/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts
new file mode 100644
index 0000000000000..323e80bdb596a
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
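+//
+// Behavior sketch (illustrative, assuming ONNX Reshape semantics in
+// ShapeUtil.calculateReshapedDims): a -1 entry in the shape tensor is inferred
+// from the remaining dimensions, and the op only rewraps the existing data:
+//
+//   const dims = ShapeUtil.calculateReshapedDims([2, 3, 4], new Int32Array([-1, 6]));  // expected: [4, 6]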
+
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+
+export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  const shape = await inputs[1].getData();
+  const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array);
+  return [handler.reshape(inputs[0], reshapedDims)];
+};
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('Reshape requires 2 inputs.');
+  }
+  if (inputs[1].type !== 'int32') {
+    throw new Error('Invalid input type.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/shape.ts b/js/web/lib/wasm/jsep/webgpu/ops/shape.ts
new file mode 100644
index 0000000000000..94ba9293c457a
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/shape.ts
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Tensor} from '../../../tensor';
+import {WebGpuInferenceHandler} from '../inference-handler';
+
+export const shape = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  return [new Tensor([inputs[0].dims.length], 'int32', undefined, undefined, new Int32Array(inputs[0].dims))];
+};
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Shape requires 1 input.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts
new file mode 100644
index 0000000000000..fd5d6e2d2299e
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts
@@ -0,0 +1,180 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
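+//
+// Clamping sketch (illustrative): starts and ends are normalized per axis and
+// out-of-range values are clamped to the dimension, so for input dims [4, 5],
+// starts=[1], ends=[1000], axes=[1] the output shape becomes [4, 4]:
+//
+//   // outputShape[1] = clampedEnd - start = min(1000, 5) - 1 = 4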
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo} from '../types';
+
+import {WORKGROUP_SIZE} from './common';
+
+export interface SliceAttributes extends AttributeWithCacheKey {
+  readonly axes: number[];
+  readonly ends: number[];
+  readonly starts: number[];
+}
+
+const sliceProgramMetadata = {
+  name: 'Slice',
+  inputTypes: [GpuDataType.default]
+};
+
+export const slice: OperatorAsyncImplementation<SliceAttributes> = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  return inferenceHandler.run(
+      {
+        ...sliceProgramMetadata,
+        cacheHint: attributes.cacheKey,
+        get: () => createSliceProgramInfo(inputs[0], attributes)
+      },
+      inputs);
+};
+
+export const parseSliceAttributes: OperatorInitialization<SliceAttributes> = (node: Graph.Node): SliceAttributes => {
+  const starts = node.attributes.getInts('starts');
+  const ends = node.attributes.getInts('ends');
+  const axes = node.attributes.getInts('axes', []);
+  return createAttributeWithCacheKey({starts, ends, axes});
+};
+
+const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => {
+  const outputLines: string[] = [];
+
+  for (let i = 0; i < strides.length - 1; i++) {
+    outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`);
+    outputLines.push(`${offset}%=${strides[i]}u;`);
+  }
+  outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`);
+
+  return outputLines.join('\n');
+};
+
+const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => {
+  const outputLines: string[] = [];
+
+  for (let i = 0; i < strides.length - 1; i++) {
+    outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`);
+  }
+  outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`);
+
+  return outputLines.join('\n');
+};
+
+const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => {
+  const axes = (attributes.axes.length === 0) ? input.dims.slice(0).map((val, i) => i) : attributes.axes;
+  const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length);
+  const starts = attributes.starts.map((start, i) => {
+    if (start > input.dims[normalizedAxes[i]] - 1) {
+      return input.dims[normalizedAxes[i]];
+    }
+    return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]);
+  });
+  const ends = attributes.ends.map((end, i) => {
+    if (end > input.dims[normalizedAxes[i]] - 1) {
+      return input.dims[normalizedAxes[i]];
+    }
+    return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]);
+  });
+
+  const outputShape = input.dims.slice();
+
+  const sliceOps: string[] = [];
+  for (let i = 0; i < normalizedAxes.length; i++) {
+    outputShape[normalizedAxes[i]] = ends[i] - starts[i];
+    if (starts[i] > 0) {
+      sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`);
+    }  // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); }
+  }
+
+  const outputSize = ShapeUtil.size(outputShape);
+  const outputStrides = ShapeUtil.computeStrides(outputShape);
+  const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+  @group(0) @binding(0) var<storage, read> input : array<${dataType}>;
+  @group(0) @binding(1) var<storage, write> output : array<${dataType}>;
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    var offset = global_id.x;
+    ${offsetToIndices('offset', outputStrides, 'idx_')}
+    ${sliceOps.join('')}
+    var offsetInput = 0u;
+    ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')}
+    output[global_id.x] = input[offsetInput];
+  }`;
+  return {
+    ...sliceProgramMetadata,
+    outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}],
+    shaderSource,
+    dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+  };
+};
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Slice requires 1 input.');
+  }
+  if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) {
+    throw new Error('Invalid input type.');
+  }
+};
+
+export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  validateInputsV10(inputs);
+  const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs);
+  return inferenceHandler.run(
+      {
+        ...sliceProgramMetadata,
+        cacheHint: attributes.cacheKey,
+        get: () => createSliceProgramInfo(inputs[0], attributes)
+      },
+      [inputs[0]]);
+};
+
+const generateSliceAttributesFromInputs =
+    (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => {
+      if (!inferenceHandler.session.isInitializer(inputs[1].dataId) ||
+          !inferenceHandler.session.isInitializer(inputs[2].dataId) ||
+          (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) ||
+          (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) {
+        throw new Error('dynamic slice attributes are not allowed');
+      }
+
+      if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) {
+        throw new Error('currently non-1 steps is not supported for Slice');
+      }
+
+      const starts = Array.from(inputs[1].integerData);
+      const ends = Array.from(inputs[2].integerData);
+      const axes = inputs.length >= 4 ? Array.from(inputs[3].integerData) : [];
+      const cacheKey = `${axes};${starts};${ends}`;
+      return {starts, ends, axes, cacheKey};
+    };
+
+const validateInputsV10 = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length < 3 || inputs.length > 5) {
+    throw new Error('Invalid input number.');
+  }
+  if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) {
+    throw new Error('Invalid input type.');
+  }
+  if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) {
+    throw new Error('Invalid input type.');
+  }
+  if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) {
+    throw new Error('Invalid input type.');
+  }
+  if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) {
+    throw new Error('Invalid input type.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts
new file mode 100644
index 0000000000000..7cd85e6877b03
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Graph} from '../../../graph';
+import {OperatorImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+
+export const squeeze: OperatorImplementation<number[]> =
+    (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => {
+      validateInputs(inputs);
+      const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes);
+      const output = inferenceHandler.reshape(inputs[0], outputShape);
+      return [output];
+    };
+
+export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => {
+  validateInputsV13(inputs);
+  return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData));
+};
+
+export const parseSqueezeAttributes: OperatorInitialization<number[]> = (node: Graph.Node): number[] =>
+    node.attributes.getInts('axes');
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Squeeze requires 1 input.');
+  }
+
+  if (inputs[0].type === 'string') {
+    throw new Error('invalid input tensor types.');
+  }
+};
+
+const validateInputsV13 = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('Squeeze requires 2 inputs.');
+  }
+
+  if (inputs[1].type !== 'int32') {
+    throw new Error('Invalid input type.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
new file mode 100644
index 0000000000000..e83dd7fcbb0b9
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
@@ -0,0 +1,116 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
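+//
+// Permutation sketch (illustrative): for input shape [2, 3, 4] and
+// perm=[2, 0, 1] the output shape is [4, 2, 3], and the generated WGSL helper
+// below writes input indices from output indices, roughly:
+//
+//   fn perm(a: ptr<function, array<u32, 3>>, i: ptr<function, array<u32, 3>>) {
+//     (*a)[2] = (*i)[0]; (*a)[0] = (*i)[1]; (*a)[1] = (*i)[2];
+//   }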
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo} from '../types';
+
+import {createIndicesHelper, WORKGROUP_SIZE} from './common';
+
+export interface TransposeAttributes extends AttributeWithCacheKey {
+  readonly perm: number[];
+}
+
+const transposeProgramMetadata = {
+  name: 'Transpose',
+  inputTypes: [GpuDataType.default]
+};
+
+export const transpose: OperatorAsyncImplementation<TransposeAttributes> = async(
+    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise<Tensor[]> => {
+  validateInputs(inputs);
+  return inferenceHandler.run(
+      {
+        ...transposeProgramMetadata,
+        cacheHint: attributes.cacheKey,
+        get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm)
+      },
+      inputs);
+};
+
+export const parseTransposeAttributes: OperatorInitialization<TransposeAttributes> =
+    (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', [])});
+
+const createTransposeProgramInfo =
+    (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => {
+      const dataType = 'f32';  // TODO: support other data type
+      const inputShape = input.dims;
+      perm = getAdjustedPerm(inputShape, perm);
+      const outputShape = getOutputShape(inputShape, perm);
+      const rank = inputShape.length;
+      const outputSize = ShapeUtil.size(outputShape);
+      // A dims=[${inputs[0].dims.toString()}]
+      // out Dims=[${unpackedOutputShape.toString()}]
+      // based on perm=[${perm.toString()}]
+
+      const outputIndicesHelper = createIndicesHelper('output', outputShape);
+      const inputIndicesHelper = createIndicesHelper('a', inputShape);
+
+      const shaderSource = `
+      const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+      @group(0) @binding(0) var<storage, read> a : array<${dataType}>;
+      @group(0) @binding(1) var<storage, write> output : array<${dataType}>;
+
+      ${permFunctionBody(perm, rank)}
+      ${outputIndicesHelper.o2iImpl}
+      ${inputIndicesHelper.i2oImpl}
+
+      @compute @workgroup_size(WORKGROUP_SIZE)
+      fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+        // Guard against out-of-bounds work group sizes
+        if (global_id.x >= ${outputSize}u) {
+          return;
+        }
+
+        ${outputIndicesHelper.indicesVariableDeclaration('indices')}
+        ${outputIndicesHelper.o2iCall('global_id.x', 'indices')}
+        ${inputIndicesHelper.indicesVariableDeclaration('aIndices')}
+        perm(&aIndices, &indices);
+
+        output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}];
+      }`;
+      return {
+        ...transposeProgramMetadata,
+        outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => {
+  if (perm && perm.length !== inputShape.length) {
+    perm = [...(inputShape.keys())].reverse();
+  }
+  return perm;
+};
+
+const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => {
+  perm = getAdjustedPerm(inputShape, perm);
+  return ShapeUtil.sortBasedOnPerm(inputShape, perm);
+};
+
+const permFunctionBody = (perm: number[], rank: number): string => {
+  const reverseFunc = [];
+  reverseFunc.push(`fn perm(a: ptr<function, array<u32, ${rank}>>, i: ptr<function, array<u32, ${rank}>>) {`);
+  for (let i = 0; i < rank; ++i) {
+    reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`);
+  }
+  reverseFunc.push('\t}');
+  return reverseFunc.join('\n');
+};
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Transpose requires 1 input.');
+  }
+
+  if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') {
+    throw new Error('input should be float tensor');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts
new file mode 100644
index 0000000000000..54213cfdd2313
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts
@@ -0,0 +1,197 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
+import {Graph} from '../../../graph';
+import {Tensor} from '../../../tensor';
+import {MAX_CLIP, MIN_CLIP} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {WORKGROUP_SIZE} from './common';
+
+type BuiltinFunctionName = string;
+type ElementwiseCustomExpression = (expression: string) => string;
+type ElementwiseFunctionCall = BuiltinFunctionName|ElementwiseCustomExpression;
+
+const createElementwiseProgramShader =
+    (datasize: number, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): string => {
+      const vecSize = Math.ceil(datasize / 4);
+
+      let expression = '';
+      if (typeof funcCall === 'string') {
+        expression = `${funcCall}(a)`;
+      } else {
+        expression = funcCall('a');
+      }
+      return `
+      const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+
+      @group(0) @binding(0) var<storage, read> inputData : array<vec4<f32>>;
+      @group(0) @binding(1) var<storage, write> outputData : array<vec4<f32>>;
+
+      ${additionalImplementation ?? ''}
+
+      @compute @workgroup_size(WORKGROUP_SIZE)
+      fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+        // Guard against out-of-bounds work group sizes
+        if (global_id.x >= ${vecSize}u) {
+          return;
+        }
+
+        let a = inputData[global_id.x];
+        outputData[global_id.x] = ${expression};
+      }`;
+    };
+
+const createElementwiseProgramInfo =
+    (metadata: ProgramMetadata, input: Tensor, funcCall: ElementwiseFunctionCall, additionalImplementation?: string):
+        ProgramInfo => ({
+          ...metadata,
+          shaderSource: createElementwiseProgramShader(input.size, funcCall, additionalImplementation),
+          outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}],
+          dispatchGroup: (inputTensors) =>
+              ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)})
+        });
+
+const createElementwiseProgramInfoLoader =
+    (input: Tensor, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string,
+     cacheKey?: string): ProgramInfoLoader => {
+      const metadata: ProgramMetadata = {name, inputTypes: [GpuDataType.default], cacheHint: cacheKey};
+      return {
+        ...metadata,
+        get: () => createElementwiseProgramInfo(metadata, input, funcCall, additionalImplementation)
+      };
+    };
+
+export const abs = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Abs', 'abs'), inputs);
+
+export const acos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Acos', 'acos'), inputs);
+
+export const asin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Asin', 'asin'), inputs);
+
+export const atan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Atan', 'atan'), inputs);
+
+export interface ClipAttributes extends AttributeWithCacheKey {
+  readonly min: number;
+  readonly max: number;
+}
+
+export const clip = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ClipAttributes):
+    Promise<Tensor[]>=>handler.run(
+        createElementwiseProgramInfoLoader(
+            inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, `
+    let clip_min_: vec4<f32> = vec4(f32(${attributes.min}));
+    let clip_max_: vec4<f32> = vec4(f32(${attributes.max}));
+`,
+            attributes.cacheKey),
+        inputs);
+
+export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey(
+    {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)});
+
+const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => {
+  if (inputs.length >= 3 &&
+      (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) {
+    throw new Error('dynamic clip attributes are not allowed');
+  }
+
+  const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP;
+  const max = (inputs.length >= 3) ? inputs[2].numberData[0] : MAX_CLIP;
+  return createAttributeWithCacheKey({min, max});
+};
+
+export const clipV11 = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+  const attributes = generateClipAttributesFromInputs(handler, inputs);
+  return clip(handler, [inputs[0]], attributes);
+};
+
+export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs);
+
+export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs);
+
+export interface EluAttributes extends AttributeWithCacheKey {
+  readonly alpha: number;
+}
+
+export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes):
+    Promise<Tensor[]>=>handler.run(
+        createElementwiseProgramInfoLoader(
+            inputs[0], 'Elu', a => `elu_vf32(${a})`, `
+  let elu_alpha_: f32 = f32(${attributes.alpha});
+
+  fn elu_f32(a: f32) -> f32 {
+    return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0);
+  }
+
+  fn elu_vf32(v: vec4<f32>) -> vec4<f32> {
+    return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w));
+  }`,
+            attributes.cacheKey),
+        inputs);
+
+export const parseEluAttributes = (node: Graph.Node): EluAttributes =>
+    createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)});
+
+export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs);
+
+export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs);
+
+export interface LeakyReluAttributes extends AttributeWithCacheKey {
+  readonly alpha: number;
+}
+
+export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: LeakyReluAttributes):
+    Promise<Tensor[]>=>handler.run(
+        createElementwiseProgramInfoLoader(
+            inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, `
+  let leaky_relu_alpha_: f32 = f32(${attributes.alpha});
+
+  fn leaky_relu_f32(a: f32) -> f32 {
+    return select(a, a * leaky_relu_alpha_, a < 0.0);
+  }
+
+  fn leaky_relu_vf32(v: vec4<f32>) -> vec4<f32> {
+    return vec4(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w));
+  }`,
+            attributes.cacheKey),
+        inputs);
+
+export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes =>
+    createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)});
+
+export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs);
+
+export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs);
+
+// export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]):
+//     Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)];
+
+export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run(
+    createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs);
+
+export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run(
+    createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + exp(-${a})))`), inputs);
+
+export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs);
+
+export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs);
+
+export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs);
+
+export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
+    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts
new file mode 100644
index 0000000000000..8a099dc92cbd9
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts
@@ -0,0 +1,43 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {Graph} from '../../../graph';
+import {OperatorInitialization} from '../../../operators';
+import {Tensor} from '../../../tensor';
+import {ShapeUtil} from '../../../util';
+import {WebGpuInferenceHandler} from '../inference-handler';
+
+export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => {
+  validateInputs(inputs);
+  const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes);
+  const output = inferenceHandler.reshape(inputs[0], outputShape);
+  return [output];
+};
+
+export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => {
+  validateInputsV13(inputs);
+  return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData));
+};
+
+export const parseUnsqueezeAttributes: OperatorInitialization<number[]> = (node: Graph.Node): number[] =>
+    node.attributes.getInts('axes');
+
+const validateInputs = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 1) {
+    throw new Error('Unsqueeze requires 1 input.');
+  }
+
+  if (inputs[0].type === 'string') {
+    throw new Error('invalid input tensor types.');
+  }
+};
+
+const validateInputsV13 = (inputs: Tensor[]): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('Unsqueeze requires 2 inputs.');
+  }
+
+  if (inputs[1].type !== 'int32') {
+    throw new Error('Invalid input type.');
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
new file mode 100644
index 0000000000000..fdb917dc2e4d5
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
@@ -0,0 +1,75 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
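+//
+// Caching sketch (illustrative): callers look up a compiled artifact by the
+// program's unique key and build it only on a cache miss, roughly:
+//
+//   let artifact = programManager.getArtifact(key);
+//   if (!artifact) {
+//     artifact = programManager.build(programInfo);
+//     programManager.setArtifact(key, artifact);
+//   }
+//   programManager.run(artifact, inputs, outputs, programInfo.dispatchGroup(inputTensors));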
+
+import {env} from 'onnxruntime-common';
+
+import {WebGpuBackend} from '../backend-webgpu';
+
+import {Artifact, GpuData, ProgramInfo} from './types';
+
+/**
+ * ProgramManager is the main class behind running computations.
+ * It builds ProgramInfo's into Artifacts: a given ProgramInfo is compiled into
+ * a WebGPU compute pipeline (cached as an Artifact), and the artifact is then
+ * used to run the computation by encoding and dispatching a compute pass.
+ * ProgramManager binds the input and output GPU buffers to the corresponding
+ * bindings of the compute pipeline.
+ */
+export class ProgramManager {
+  repo: Map<unknown, Artifact>;  // this should be per-session object
+  attributesBound: boolean;
+
+  constructor(private backend: WebGpuBackend) {
+    this.repo = new Map();
+    this.attributesBound = false;
+  }
+  getArtifact(key: unknown): Artifact|undefined {
+    return this.repo.get(key);
+  }
+  setArtifact(key: unknown, artifact: Artifact): void {
+    this.repo.set(key, artifact);
+  }
+  run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[],
+      dispatchGroup: {x: number; y?: number; z?: number}): void {
+    const device = this.backend.device;
+
+    const computePassEncoder = this.backend.getComputePassEncoder();
+
+    computePassEncoder.setPipeline(buildArtifact.computePipeline);
+    const entries: GPUBindGroupEntry[] = [];
+    for (const input of inputs) {
+      entries.push({binding: entries.length, resource: {buffer: input.buffer}});
+    }
+    for (const output of outputs) {
+      entries.push({binding: entries.length, resource: {buffer: output.buffer}});
+    }
+    const bindGroup = device.createBindGroup({layout: buildArtifact.computePipeline.getBindGroupLayout(0), entries});
+    computePassEncoder.setBindGroup(0, bindGroup);
+
+    const {x, y, z} = dispatchGroup;
+    computePassEncoder.dispatch(x, y, z);
+
+    this.backend.pendingDispatchNumber++;
+
+    if (this.backend.pendingDispatchNumber >= 16) {
+      this.backend.flush();
+    }
+  }
+  dispose(): void {
+    // this.repo.forEach(a => this.glContext.deleteProgram(a.program));
+  }
+  build(programInfo: ProgramInfo): Artifact {
+    const device = this.backend.device;
+
+    const shaderModule = device.createShaderModule({code: programInfo.shaderSource});
+    if (env.debug) {
+      // eslint-disable-next-line no-console
+      console.log('WebGpuProgram: ' + programInfo.shaderSource);
+    }
+
+    const computePipeline = device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}});
+
+    return {programInfo, computePipeline};
+  }
+}
diff --git a/js/web/lib/wasm/jsep/webgpu/session-handler.ts b/js/web/lib/wasm/jsep/webgpu/session-handler.ts
new file mode 100644
index 0000000000000..1fe288c36dd1e
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/session-handler.ts
@@ -0,0 +1,47 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
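+//
+// Initializer-tracking sketch (illustrative): ops that fold attribute inputs,
+// such as sliceV10 and clipV11, rely on this handler to reject dynamic values:
+//
+//   if (!handler.session.isInitializer(inputs[1].dataId)) {
+//     throw new Error('dynamic slice attributes are not allowed');
+//   }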
+
+import {SessionHandler} from '../../backend';
+import {Graph} from '../../graph';
+import {Operator} from '../../operators';
+import {OpSet, resolveOperator} from '../../opset';
+import {Session} from '../../session';
+import {Tensor} from '../../tensor';
+import {WebGpuBackend} from '../backend-webgpu';
+
+import {WebGpuInferenceHandler} from './inference-handler';
+import {WEBGPU_OP_RESOLVE_RULES} from './op-resolve-rules';
+import {ProgramManager} from './program-manager';
+import {createTensorDataManager, TensorDataManager} from './tensor-data-manager';
+
+export class WebGpuSessionHandler implements SessionHandler {
+  private initializers: Set<Tensor.Id>;
+  readonly dataManager: TensorDataManager;
+  readonly programManager: ProgramManager;
+
+  constructor(public readonly backend: WebGpuBackend, public readonly context: Session.Context) {
+    this.dataManager = createTensorDataManager(this.backend.gpuDataManager);
+    this.programManager = new ProgramManager(this.backend);
+  }
+
+  createInferenceHandler() {
+    return new WebGpuInferenceHandler(this);
+  }
+  onGraphInitialized(graph: Graph): void {
+    const initializers = graph.getValues().filter(v => v.from === -1 && v.tensor).map(v => v.tensor!.dataId);
+    this.initializers = new Set(initializers);
+  }
+  isInitializer(tensorId: Tensor.Id): boolean {
+    return this.initializers ? this.initializers.has(tensorId) : false;
+  }
+  addInitializer(tensorId: Tensor.Id): void {
+    this.initializers.add(tensorId);
+  }
+  dispose(): void {
+    // TODO
+  }
+  resolve(node: Graph.Node, opsets: readonly OpSet[], graph: Graph): Operator {
+    const op = resolveOperator(node, opsets, WEBGPU_OP_RESOLVE_RULES);
+    return {impl: op.opImpl, context: op.opInit ? op.opInit(node, graph) : node};
+  }
+}
diff --git a/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts
new file mode 100644
index 0000000000000..0899ecba6e272
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts
@@ -0,0 +1,140 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {createView, Tensor} from '../tensor';
+
+import {GpuDataManager} from './gpu-data-manager';
+import {GpuData, GpuDataId, GpuDataType} from './types';
+
+/**
+ * manages the mapping from Tensor ID to GPU Data ID.
+ *
+ * A tensor ID is a unique ID representing a value (tensor), which is an input or output of a graph node.
+ * A GPU Data ID is a unique ID representing an abstract piece of data in GPU memory. Specifically, for the current
+ * WebGPU scenarios, GPU Data is a storage buffer, and a GPU Data ID is a handle to a storage buffer.
+ *
+ * - a value is different from an edge of the graph: if a node's output is consumed by 2 other downstream nodes, there
+ * are 2 edges, but only one value.
+ *
+ * - a tensor ID maps to 0 or 1 GPU Data IDs, depending on whether the data is available on GPU or not.
+ *
+ * - a GPU Data ID maps to 1 or more tensor IDs.
+ *
+ */
+export interface TensorDataManager {
+  /**
+   * upload a CPU tensor to GPU.
+   */
+  uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): GpuData;
+
+  /**
+   * create a new GPU tensor.
+   */
+  createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData];
+
+  /**
+   * check whether the tensor has GPU data.
+   */
+  hasGpuData(tensorId: Tensor.Id): boolean;
+
+  /**
+   * create a reference to the GPU data.
+   */
+  createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData];
+
+  /**
+   * release the GPU resources referred by the tensor.
+   */
+  releaseGpuTensor(tensorId: Tensor.Id): void;
+}
+
+class TensorDataManagerImpl implements TensorDataManager {
+  private map: Map<Tensor.Id, GpuDataId>;
+  private reverseMap: Map<GpuDataId, Set<Tensor.Id>>;
+
+  constructor(private gpuDataManager: GpuDataManager) {
+    this.map = new Map();
+    this.reverseMap = new Map();
+  }
+
+  private registerIdMapping(tensorId: Tensor.Id, gpuDataId: GpuDataId): void {
+    this.map.set(tensorId, gpuDataId);
+
+    let tensorIds = this.reverseMap.get(gpuDataId);
+    if (!tensorIds) {
+      tensorIds = new Set<Tensor.Id>();
+      this.reverseMap.set(gpuDataId, tensorIds);
+    }
+    tensorIds.add(tensorId);
+  }
+
+  uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): GpuData {
+    const gpuDataId = this.map.get(tensor.dataId);
+    if (gpuDataId) {
+      const gpuData = this.gpuDataManager.get(gpuDataId);
+      if (!gpuData) {
+        throw new Error('internal error. this should never happen');
+      }
+      return gpuData;
+    }
+
+    const gpuData = this.gpuDataManager.upload(tensor.numberData, gpuDataType);
+    this.registerIdMapping(tensor.dataId, gpuData.id);
+    return gpuData;
+  }
+
+  createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] {
+    const gpuData = this.gpuDataManager.create(type, dims, gpuDataType);
+    const tensor = new Tensor(dims, type, undefined, async () => {
+      const data = await this.gpuDataManager.download(gpuData.id);
+      return createView(data, type);
+    });
+
+    this.registerIdMapping(tensor.dataId, gpuData.id);
+    return [tensor, gpuData];
+  }
+
+  hasGpuData(tensorId: Tensor.Id): boolean {
+    return this.map.has(tensorId);
+  }
+
+  createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData] {
+    const gpuDataId = this.map.get(tensorId);
+    if (!gpuDataId) {
+      throw new Error('internal error. this should never happen');
+    }
+
+    const gpuData = this.gpuDataManager.get(gpuDataId);
+    if (!gpuData) {
+      throw new Error('internal error. this should never happen');
+    }
+
+    const tensor = new Tensor(dims, type, undefined, async () => {
+      const data = await this.gpuDataManager.download(gpuData.id);
+      return createView(data, type);
+    });
+
+    this.registerIdMapping(tensor.dataId, gpuData.id);
+    return [tensor, gpuData];
+  }
+
+  releaseGpuTensor(tensorId: Tensor.Id): void {
+    const gpuDataId = this.map.get(tensorId);
+    if (gpuDataId) {
+      this.map.delete(tensorId);
+
+      const tensorIds = this.reverseMap.get(gpuDataId);
+      if (!tensorIds) {
+        throw new Error('internal error. this should never happen');
+      }
+      tensorIds.delete(tensorId);
+      if (tensorIds.size === 0) {
+        this.gpuDataManager.release(gpuDataId);
+        this.reverseMap.delete(gpuDataId);
+      }
+    }
+  }
+}
+
+export const createTensorDataManager = (gpuDataManager: GpuDataManager): TensorDataManager =>
+    new TensorDataManagerImpl(gpuDataManager);
diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts
new file mode 100644
index 0000000000000..5d77af75cf427
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/types.ts
@@ -0,0 +1,94 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
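+//
+// Usage sketch (illustrative; buildProgramInfo is a hypothetical placeholder):
+// a ProgramInfoLoader carries only the metadata needed for an artifact-cache
+// lookup, so full shader codegen happens lazily on a cache miss:
+//
+//   const loader: ProgramInfoLoader = {...metadata, get: () => buildProgramInfo(metadata, inputs)};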
+ +import {Tensor} from '../tensor'; + +export enum GpuDataType { + default = 0 +} +export type GpuDataId = unknown; + +export interface GpuData { + type: GpuDataType; + id: GpuDataId; + buffer: GPUBuffer; +} + +export interface TensorInfo { + id?: Tensor.Id; + dims: readonly number[]; + type: Tensor.DataType; + gpuDataType: GpuDataType; +} + + +export interface ProgramVariable { + type: 'float'|'int'; + name: string; + arrayLength?: number; + data: number|number[]; +} + + +export interface ProgramMetadata { + /** + * the name of the program. used for debugging and profiling + */ + name: string; + + // inputLayouts: GPUBindGroupLayoutEntry[]; + // outputLayouts: GPUBindGroupLayoutEntry[]; + + /** + * gpu data types for each input + */ + inputTypes: GpuDataType[]; + /** + * an optional string as a cache hint in the artifact cache + */ + cacheHint?: string; +} + +/** + * A ProgramInfoLoader allows + */ +export interface ProgramInfoLoader extends ProgramMetadata { + /** + * a function to get the program info + */ + get(): ProgramInfo; +} + +/** + * A set of data that represent a shader program + */ +export interface ProgramInfo extends ProgramMetadata { + /** + * information of uniform variables + */ + variables?: ProgramVariable[]; + /** + * tensor info for outputs + */ + outputs: TensorInfo[]; + /** + * the shader's processing source code + */ + shaderSource: string; + /** + * default is "main" + */ + // entryPoint: string; + + dispatchGroup: (inputs: readonly Tensor[]) => { + x: number; + y?: number; + z?: number; + }; +} + +export interface Artifact { + programInfo: ProgramInfo; + computePipeline: GPUComputePipeline; + // attribLocations: {position: number; textureCoord: number}; +} diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 1e04fadd908b8..73aade5c1556a 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -3,9 +3,10 @@ import {env, InferenceSession} from 'onnxruntime-common'; +import {init} from './jsep/init'; import {OrtWasmMessage, SerializableModeldata, SerializableSessionMetadata, SerializableTensor} from './proxy-messages'; import * as core from './wasm-core-impl'; -import {initializeWebAssembly} from './wasm-factory'; +import {getInstance, initializeWebAssembly} from './wasm-factory'; const isProxy = (): boolean => !!env.wasm.proxy && typeof document !== 'undefined'; let proxyWorker: Worker|undefined; @@ -141,9 +142,14 @@ export const initOrt = async(numThreads: number, loggingLevel: number): Promise< initOrtCallbacks = [resolve, reject]; const message: OrtWasmMessage = {type: 'init-ort', in : {numThreads, loggingLevel}}; proxyWorker!.postMessage(message); + + // TODO: support JSEP in worker }); } else { core.initOrt(numThreads, loggingLevel); + + // init JSEP if available + await init(getInstance()); } }; diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index ab72fcc73f448..4f405705e0da2 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -20,9 +20,6 @@ export const initOrt = (numThreads: number, loggingLevel: number): void => { if (errorCode !== 0) { throw new Error(`Can't initialize onnxruntime. 
error code = ${errorCode}`); } - - // init JSEP if available - init(getInstance()); }; /** diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 9b242b036b691..5975ee57d8312 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -454,7 +454,7 @@ function run(config: Test.Config) { // STEP 5. use Karma to run test npmlog.info('TestRunnerCli.Run', '(5/5) Running karma to start test runner...'); const karmaCommand = path.join(npmBin, 'karma'); - const webgpu = args.backends.indexOf('webgpu') > -1; + const webgpu = args.backends.indexOf('webgpu') > -1 || args.backends.indexOf('js') > -1; const browser = getBrowserNameFromEnv( args.env, args.bundleMode === 'perf' ? 'perf' : diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index f7aa8d39ef219..814275cfc6c82 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -262,8 +262,8 @@ export class TensorResultValidator { this.absoluteThreshold = CPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = CPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'js') { - this.absoluteThreshold = WEBGL_THRESHOLD_ABSOLUTE_ERROR; - this.relativeThreshold = WEBGL_THRESHOLD_RELATIVE_ERROR; + this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; + this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'webgl') { if (TensorResultValidator.isHalfFloat === undefined) { TensorResultValidator.isHalfFloat = !createWebGLContext(ort.env.webgl.contextId).isRenderFloat32Supported; diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 9400145f554c0..41cf9b3d01e08 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -23,10 +23,10 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*un if (dst_device.Type() == OrtDevice::GPU) { // copy from CPU to GPU - EM_ASM({ Module.jsepUpload(); }); + EM_ASM({ Module.jsepUpload($0, $1, $2); }, src_data, dst_data, bytes); } else if (src_device.Type() == OrtDevice::GPU) { // copy from GPU to CPU - EM_ASM({ Module.jsepDownload(); }); + EM_ASM({ Module.jsepDownload($0, $1); }, src_data, dst_data); } else { // copy from CPU to CPU (don't think we ever get here) memcpy(dst_data, src_data, bytes); From af6bad6773ebe7e2c286d3f02ab70e4f7572c336 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 17 Oct 2022 12:58:54 -0700 Subject: [PATCH 04/81] 1 --- js/web/lib/wasm/binding/ort-wasm.d.ts | 2 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 11 ++++++----- js/web/lib/wasm/jsep/init.ts | 2 +- js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts | 4 ++-- js/web/lib/wasm/jsep/webgpu/types.ts | 6 ++++-- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index e2c12840e3e2f..87507e7a20b94 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -5,7 +5,7 @@ declare namespace JSEP { type BackendType = unknown; type AllocFunction = (size: number) => number; type FreeFunction = (size: number) => number; - type UploadFunction = (size: number) => number; + type UploadFunction = (dataOffset: number, gpuDataId: number, size: number) => void; type DownloadFunction = (size: number) => number; type RunFunction = (size: number) => number; } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts 
b/js/web/lib/wasm/jsep/backend-webgpu.ts index c4a9c05af26a7..4b878342cfdee 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -4,7 +4,6 @@ import {Tensor} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {ProgramManager} from './webgpu/program-manager'; -import {createTensorDataManager, TensorDataManager} from './webgpu/tensor-data-manager'; import {GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; const getProgramInfoUniqueKey = @@ -23,7 +22,6 @@ const getProgramInfoUniqueKey = export class WebGpuBackend { device: GPUDevice; gpuDataManager: GpuDataManager; - dataManager: TensorDataManager; programManager: ProgramManager; commandEncoder: GPUCommandEncoder|null = null; @@ -42,7 +40,6 @@ export class WebGpuBackend { } this.device = await adapter.requestDevice(); this.gpuDataManager = createGpuDataManager(this); - this.dataManager = createTensorDataManager(this.gpuDataManager); this.programManager = new ProgramManager(this); // TODO: set up flags @@ -88,7 +85,7 @@ export class WebGpuBackend { } private uploadGpuData(tensor: Tensor, textureType: GpuDataType): GpuData { - return this.dataManager.uploadTensorToGpu(tensor, textureType); + return this.gpuDataManager.upload(tensor, textureType); } private createGpuData(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { @@ -144,6 +141,10 @@ export class WebGpuBackend { } alloc(size: number): number { - throw new Error('Method not implemented.'); + return this.gpuDataManager.create(size).id; + } + + free(ptr: number): number { + return this.gpuDataManager.release(ptr); } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index c003cd01f99fa..b197490d64f17 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -27,7 +27,7 @@ export const init = async(module: OrtWasmModule): Promise => { (ptr: number) => { // eslint-disable-next-line no-console console.log(`jsepFree: ${ptr}`); - return backend.free(size); + return backend.free(ptr); }, // jsepUpload(src, dst, size) diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index 527219a97d210..4b7e30c518437 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -17,7 +17,7 @@ export interface GpuDataManager { /** * create new data on GPU. */ - create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData; + create(size: number): GpuData; /** * get GPU data by ID. */ @@ -25,7 +25,7 @@ export interface GpuDataManager { /** * release the data on GPU by ID. */ - release(id: GpuDataId): void; + release(id: GpuDataId): number; /** * download the data from GPU. 
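   *
   * usage sketch (hedged: `gpuDataManager` and `id` are placeholder names, and this assumes the
   * promise resolves to an ArrayBuffer holding a copy of the buffer contents, which is what the
   * implementation later in this series returns):
   *
   *   const raw = await gpuDataManager.download(id);
   *   const values = new Float32Array(raw);  // reinterpret per the tensor's element type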
*/ diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 5d77af75cf427..d7da66513e527 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -4,9 +4,11 @@ import {Tensor} from '../tensor'; export enum GpuDataType { - default = 0 + default = 0, + upload = 1, + profile = 2 } -export type GpuDataId = unknown; +export type GpuDataId = number; export interface GpuData { type: GpuDataType; From 41050d55e0d74271e052f1e5ec277818c2858561 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 17 Oct 2022 17:03:25 -0700 Subject: [PATCH 05/81] 2 --- js/web/lib/wasm/binding/ort-wasm.d.ts | 2 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 11 ++-- js/web/lib/wasm/jsep/init.ts | 30 ++++----- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 66 +++++++++++-------- 4 files changed, 57 insertions(+), 52 deletions(-) diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 87507e7a20b94..ccaa78ef54ce8 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -6,7 +6,7 @@ declare namespace JSEP { type AllocFunction = (size: number) => number; type FreeFunction = (size: number) => number; type UploadFunction = (dataOffset: number, gpuDataId: number, size: number) => void; - type DownloadFunction = (size: number) => number; + type DownloadFunction = (gpuDataId: number, dataOffset: number, size: number) => Promise; type RunFunction = (size: number) => number; } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 4b878342cfdee..b99bca8196b68 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -130,14 +130,13 @@ export class WebGpuBackend { return outputTensors; } - reshape(input: Tensor, reshapedDims: readonly number[]): Tensor { - return this.dataManager.hasGpuData(input.dataId) ? 
- this.dataManager.createGpuRef(input.dataId, input.type, reshapedDims)[0] : - new Tensor(reshapedDims, input.type, undefined, undefined, input.data); + upload(gpuDataId: number, data: Uint8Array) { + this.gpuDataManager.upload(gpuDataId, data); } - upload(dataOffset: number, data: Uint8Array, gpuDataId: number) { - throw new Error('Method not implemented.'); + async download(gpuDataId: number, data: Uint8Array) { + const arrayBuffer = await this.gpuDataManager.download(gpuDataId); + data.set(new Uint8Array(arrayBuffer)); } alloc(size: number): number { diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index b197490d64f17..dcbfa54ac0cb4 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -17,31 +17,29 @@ export const init = async(module: OrtWasmModule): Promise => { {backend}, // jsepAlloc() - (size: number) => { - // eslint-disable-next-line no-console - console.log(`jsepAlloc: ${size}`); - return backend.alloc(size); - }, + (size: number) => backend.alloc(size), // jsepFree() - (ptr: number) => { - // eslint-disable-next-line no-console - console.log(`jsepFree: ${ptr}`); - return backend.free(ptr); - }, + (ptr: number) => backend.free(ptr), // jsepUpload(src, dst, size) (dataOffset: number, gpuDataId: number, size: number) => { // eslint-disable-next-line no-console console.log('jsepUpload'); const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); - backend.upload(dataOffset, data, gpuDataId); - }, - (_src: number, _dst: number) => { - // eslint-disable-next-line no-console - console.log('jsepDownload'); - return 41; + backend.upload(gpuDataId, data); }, + + // jsepDownload(src, dst, size) + async(gpuDataId: number, dataOffset: number, size: number): + Promise => { + // eslint-disable-next-line no-console + console.log('jsepDownload'); + + const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); + await backend.download(gpuDataId, data); + }, + (_a: number) => { // eslint-disable-next-line no-console console.log('jsepRun'); diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index 4b7e30c518437..423725633627e 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -1,9 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {sizeof, Tensor} from '../tensor'; -import {ShapeUtil} from '../util'; import {WebGpuBackend} from '../backend-webgpu'; + import {GpuData, GpuDataId, GpuDataType} from './types'; /** @@ -11,9 +10,9 @@ import {GpuData, GpuDataId, GpuDataType} from './types'; */ export interface GpuDataManager { /** - * upload data to GPU. if the ID already exists in cache, returns the cached value without uploading anything. + * upload data to GPU. */ - upload(id: GpuDataId, data: Uint8Array, gpuDataType: GpuDataType): GpuData; + upload(id: GpuDataId, data: Uint8Array): void; /** * create new data on GPU. */ @@ -24,6 +23,8 @@ export interface GpuDataManager { get(id: GpuDataId): GpuData|undefined; /** * release the data on GPU by ID. 
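+   * (as the implementation below shows, releasing also drops any cached download entry for the same ID)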
+ * + * @return size of the data released */ release(id: GpuDataId): number; /** @@ -34,7 +35,7 @@ export interface GpuDataManager { interface StorageCacheValue { gpuData: GpuData; - size: number; + originalSize: number; } interface DownloadCacheValue { @@ -62,50 +63,53 @@ class GpuDataManagerImpl implements GpuDataManager { this.downloadCache = new Map(); } - upload(data: Tensor.NumberType, gpuDataType: GpuDataType): GpuData { - if (gpuDataType !== GpuDataType.default) { - throw new Error('we only support default GPU data type now'); - } - + upload(id: GpuDataId, data: Uint8Array): void { const srcArrayBuffer = data.buffer; const srcOffset = data.byteOffset; const srcLength = data.byteLength; const size = calcNormalizedBufferSize(srcLength); + // get destination gpu buffer + const gpuDataCache = this.storageCache.get(id); + if (!gpuDataCache) { + throw new Error('gpu data for uploading does not exist'); + } + if (gpuDataCache.originalSize !== srcLength) { + throw new Error(`inconsistent data size. gpu data size=${gpuDataCache.originalSize}, data size=${srcLength}`); + } + // create gpu buffer - const gpuBuffer = this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE}); + const gpuBufferForUploading = + this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE}); // copy (upload) data - const arrayBuffer = gpuBuffer.getMappedRange(); + const arrayBuffer = gpuBufferForUploading.getMappedRange(); new Uint8Array(arrayBuffer).set(new Uint8Array(srcArrayBuffer, srcOffset, srcLength)); - gpuBuffer.unmap(); + gpuBufferForUploading.unmap(); - const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; - this.storageCache.set(gpuData.id, {gpuData, size: srcLength}); - return gpuData; - } - create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData { - if (gpuDataType !== GpuDataType.default) { - throw new Error('we only support default GPU data type now'); - } + // GPU copy + this.backend.getCommandEncoder().copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size); + this.backend.flush(); + gpuBufferForUploading.destroy(); + } + + create(size: number): GpuData { // !!! // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. // !!! This need to be figured out by performance test results. // !!! 
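+  // (illustrative note, not authoritative) calcNormalizedBufferSize is assumed to round the
+  // requested byte length up to the alignment WebGPU needs, e.g. roughly:
+  //   const calcNormalizedBufferSize = (size: number) => Math.ceil(size / 16) * 16;
+  // the exact requested length is kept as `originalSize` in storageCache, so upload can
+  // validate sizes and download can read back exactly the original number of bytes.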
- const elemCount = ShapeUtil.size(dims); - const bufferLength = sizeof(type) * elemCount; - const size = calcNormalizedBufferSize(bufferLength); + const bufferSize = calcNormalizedBufferSize(size); // create gpu buffer const gpuBuffer = // eslint-disable-next-line no-bitwise - this.backend.device.createBuffer({size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC}); + this.backend.device.createBuffer({size: bufferSize, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC}); const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; - this.storageCache.set(gpuData.id, {gpuData, size: bufferLength}); + this.storageCache.set(gpuData.id, {gpuData, originalSize: size}); return gpuData; } @@ -113,7 +117,7 @@ class GpuDataManagerImpl implements GpuDataManager { return this.storageCache.get(id)?.gpuData; } - release(id: GpuDataId): void { + release(id: GpuDataId): number { const cachedData = this.storageCache.get(id); if (!cachedData) { throw new Error('releasing data does not exist'); @@ -129,6 +133,8 @@ class GpuDataManagerImpl implements GpuDataManager { }); this.downloadCache.delete(id); } + + return cachedData.originalSize; } async download(id: GpuDataId): Promise { @@ -146,15 +152,17 @@ class GpuDataManagerImpl implements GpuDataManager { this.backend.endComputePass(); const gpuReadBuffer = this.backend.device.createBuffer( // eslint-disable-next-line no-bitwise - {size: cachedData.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ}); + {size: cachedData.originalSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ}); commandEncoder.copyBufferToBuffer( cachedData.gpuData.buffer /* source buffer */, 0 /* source offset */, gpuReadBuffer /* destination buffer */, - 0 /* destination offset */, cachedData.size /* size */ + 0 /* destination offset */, cachedData.originalSize /* size */ ); this.backend.flush(); await gpuReadBuffer.mapAsync(GPUMapMode.READ); return gpuReadBuffer.getMappedRange(); + + // TODO: release gpuReadBuffer } } From 083d430002bb2ed26a69099ab7eacd1025fab357 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 17 Oct 2022 22:48:29 -0700 Subject: [PATCH 06/81] 3 --- js/web/lib/wasm/binding/ort-wasm.d.ts | 2 +- js/web/lib/wasm/jsep/init.ts | 2 +- onnxruntime/core/providers/js/js_export.cc | 0 onnxruntime/core/providers/js/js_export.h | 12 ++++++++++++ 4 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/js/js_export.cc create mode 100644 onnxruntime/core/providers/js/js_export.h diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index ccaa78ef54ce8..ec3b484e93d55 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -7,7 +7,7 @@ declare namespace JSEP { type FreeFunction = (size: number) => number; type UploadFunction = (dataOffset: number, gpuDataId: number, size: number) => void; type DownloadFunction = (gpuDataId: number, dataOffset: number, size: number) => Promise; - type RunFunction = (size: number) => number; + type RunFunction = (contextDataOffset: number, output: (index: number) => number) => number; } export interface OrtWasmModule extends EmscriptenModule { diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index dcbfa54ac0cb4..818f57650c4a9 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -40,7 +40,7 @@ export const init = async(module: OrtWasmModule): Promise => { await 
backend.download(gpuDataId, data); }, - (_a: number) => { + (contextDataOffset: number, output: (index: number) => number) => { // eslint-disable-next-line no-console console.log('jsepRun'); return 42; diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/onnxruntime/core/providers/js/js_export.h b/onnxruntime/core/providers/js/js_export.h new file mode 100644 index 0000000000000..7f3127df2fd27 --- /dev/null +++ b/onnxruntime/core/providers/js/js_export.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include + +extern "C" { +void * EMSCRIPTEN_KEEPALIVE JSEP_Output(void * context, int index, void * data); +}; From 65cc09b34b43cab3e1072d13174f76a7adb75101 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:18:14 -0700 Subject: [PATCH 07/81] 4 --- js/web/lib/wasm/binding/ort-wasm.d.ts | 8 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 24 ++++- js/web/lib/wasm/jsep/init.ts | 53 ++++++++++- js/web/lib/wasm/jsep/tensor.ts | 6 ++ .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 92 ++++++++++--------- js/web/lib/wasm/jsep/webgpu/types.ts | 8 +- onnxruntime/core/providers/js/js_export.cc | 20 ++++ onnxruntime/core/providers/js/js_export.h | 2 +- onnxruntime/core/providers/js/js_kernel.cc | 9 ++ onnxruntime/core/providers/js/js_kernel.h | 13 +++ .../core/providers/js/operators/unary.cc | 7 +- onnxruntime/wasm/js_internal_api.js | 4 +- 12 files changed, 190 insertions(+), 56 deletions(-) create mode 100644 onnxruntime/core/providers/js/js_kernel.cc diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index ec3b484e93d55..cee1e699ec9cc 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -7,7 +7,9 @@ declare namespace JSEP { type FreeFunction = (size: number) => number; type UploadFunction = (dataOffset: number, gpuDataId: number, size: number) => void; type DownloadFunction = (gpuDataId: number, dataOffset: number, size: number) => Promise; - type RunFunction = (contextDataOffset: number, output: (index: number) => number) => number; + type CreateKernelFunction = (name: string, kernel: number, attribute: unknown) => void; + type ReleaseKernelFunction = (kernel: number) => void; + type RunFunction = (kernel: number, contextDataOffset: number) => number; } export interface OrtWasmModule extends EmscriptenModule { @@ -64,7 +66,9 @@ export interface OrtWasmModule extends EmscriptenModule { // #region JSEP jsepInit? 
(backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, - download: JSEP.DownloadFunction, run: JSEP.RunFunction): void; + download: JSEP.DownloadFunction, createKernel: JSEP.CreateKernelFunction, + releaseKernel: JSEP.ReleaseKernelFunction, run: JSEP.RunFunction): void; + _JsepOutput(context: number, index: number, data: number): number; // #endregion } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index b99bca8196b68..95b47f91df426 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -3,8 +3,9 @@ import {Tensor} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; +import {WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; -import {GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; +import {ComputeContext, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; const getProgramInfoUniqueKey = (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly Tensor[], inputGpuDatas: readonly GpuData[]): @@ -24,6 +25,8 @@ export class WebGpuBackend { gpuDataManager: GpuDataManager; programManager: ProgramManager; + kernelAttributes: Map number, unknown]>; + commandEncoder: GPUCommandEncoder|null = null; computePassEncoder: GPUComputePassEncoder|null = null; pendingDispatchNumber = 0; @@ -41,6 +44,7 @@ export class WebGpuBackend { this.device = await adapter.requestDevice(); this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); + this.kernelAttributes = new Map(); // TODO: set up flags this.device.onuncapturederror = ev => { @@ -146,4 +150,22 @@ export class WebGpuBackend { free(ptr: number): number { return this.gpuDataManager.release(ptr); } + + createKernel(name: string, kernelId: number, attribute: unknown) { + const lookup = WEBGPU_OP_RESOLVE_RULES.get(name); + if (!lookup) { + throw new Error(`kernel not implemented: ${name}`); + } + + if (Array.isArray(lookup)) { + const init = lookup[1]; + } + this.kernelAttributes.set(kernelId) + } + + releaseKernel(kernelId: number) {} + + computeKernel(kernelId: number, context: ComputeContext): number { + throw new Error('Method not implemented.'); + } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 818f57650c4a9..c7e54d83f39c1 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -4,6 +4,47 @@ import {OrtWasmModule} from '../binding/ort-wasm'; import {WebGpuBackend} from './backend-webgpu'; +import {TensorView} from './tensor'; +import {ComputeContext} from './webgpu/types'; + +/* eslint-disable no-bitwise */ +const output = (module: OrtWasmModule, pointer: number, index: number, dims: readonly number[]): number => { + const stack = module.stackSave(); + try { + const data = module.stackAlloc((1 + dims.length) * 4 /* sizeof(size_t) */); + let offset = data >> 2; + module.HEAPU32[offset++] = dims.length; + for (let i = 0; i < dims.length; i++) { + module.HEAPU32[offset++] = dims[i]; + } + return module._JsepOutput(pointer, index, data); + } finally { + module.stackRestore(stack); + } +}; + +const makeContext = (module: OrtWasmModule, contextDataOffset: number): ComputeContext => { + const heapU32 = module.HEAPU32; + + // extract context data + let dataIndex = (contextDataOffset >> 2); + const pointer = heapU32[dataIndex++]; + const inputCount = 
heapU32[dataIndex++]; + + const inputs: TensorView[] = []; + for (let i = 0; i < inputCount; i++) { + const dataType = heapU32[dataIndex++]; + const data = heapU32[dataIndex++]; + const dim = heapU32[dataIndex++]; + const dims: number[] = []; + for (let d = 0; d < dim; d++) { + dims.push(heapU32[dataIndex++]); + } + inputs.push({dataType, data, dims}); + } + + return {pointer, inputs, output: (index: number, dims: readonly number[]) => output(module, pointer, index, dims)}; +}; export const init = async(module: OrtWasmModule): Promise => { // init JSEP if available @@ -40,10 +81,18 @@ export const init = async(module: OrtWasmModule): Promise => { await backend.download(gpuDataId, data); }, - (contextDataOffset: number, output: (index: number) => number) => { + // jsepCreateKernel + (name: string, kernel: number, attribute: unknown) => backend.createKernel(name, kernel, attribute), + + // jsepReleaseKernel + (kernel: number) => backend.releaseKernel(kernel), + + // jsepRun + (kernel: number, contextDataOffset: number) => { // eslint-disable-next-line no-console console.log('jsepRun'); - return 42; + const context = makeContext(module, contextDataOffset); + return backend.computeKernel(kernel, context); }); } }; diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts index 7dd23f4e7edc1..27575c31b7dc1 100644 --- a/js/web/lib/wasm/jsep/tensor.ts +++ b/js/web/lib/wasm/jsep/tensor.ts @@ -262,3 +262,9 @@ export class Tensor { return new Tensor(dims, type, undefined, undefined, data); } } + +export interface TensorView { + readonly data: number; + readonly dataType: number; + readonly dims: readonly number[]; +} diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 4adfb180893a6..6de6bedfa755b 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -1,61 +1,65 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
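+// shape of the new registry (illustrative; `runSomeOp` / `parseSomeOpAttributes` are
+// hypothetical names): entries are keyed by the kernel name that the C++ side passes to
+// Module.jsepCreateKernel, and an op with attributes would register a [run, init] pair:
+//   ['someop', [runSomeOp, parseSomeOpAttributes]],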
-import {OpSet} from '../../opset'; - -import * as binaryOps from './ops/binary-op'; -import {concat, parseConcatAttributes} from './ops/concat'; -import {conv, parseConvAttributes} from './ops/conv'; -import {gather, parseGatherAttributes} from './ops/gather'; -import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; -import {matMul, parseMatMulAttributes} from './ops/matmul'; -import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; -import {sum} from './ops/reduce-tensors'; -import {reshape} from './ops/reshape'; -import {shape} from './ops/shape'; -import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; -import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; -import {parseTransposeAttributes, transpose} from './ops/transpose'; +// import * as binaryOps from './ops/binary-op'; +// import {concat, parseConcatAttributes} from './ops/concat'; +// import {conv, parseConvAttributes} from './ops/conv'; +// import {gather, parseGatherAttributes} from './ops/gather'; +// import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; +// import {matMul, parseMatMulAttributes} from './ops/matmul'; +// import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, +// parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; import {sum} from +// './ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; +// import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; +// import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; +// import {parseTransposeAttributes, transpose} from './ops/transpose'; import * as unaryOps from './ops/unary-op'; -import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; +import {ComputeContext} from './types'; + +// import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; + +type RunFunction = (context: ComputeContext) => number; +type InitFunction = (attribute: unknown) => void; +type ResolveRule = RunFunction|[RunFunction, InitFunction]; -export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [ - ['Abs', '', '6+', unaryOps.abs], ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], +export const WEBGPU_OP_RESOLVE_RULES: Map = new Map([ + ['abs', [unaryOps.abs]], + //, ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], // ['And', '', '7+', binaryOps.and], - ['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan], + //['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan], // TODO: support new attributes for AveragePool-10 - ['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], + //['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], // ['Cast', '', '6+', cast, parseCastAttributes], - ['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], - ['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], - ['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], + //['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], + //['Clip', '', '11+', unaryOps.clipV11], 
['Concat', '', '4+', concat, parseConcatAttributes], + //['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], // ['Dropout', '', '7+', unaryOps.identity], // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], // ['Equal', '', '7+', binaryOps.equal], - ['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', '', '6+', unaryOps.exp], + //['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', '', '6+', unaryOps.exp], // ['Flatten', '', '1+', flatten, parseFlattenAttributes], - ['Floor', '', '6+', unaryOps.floor], + //['Floor', '', '6+', unaryOps.floor], // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], - ['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], - ['Gemm', '', '11+', gemm, parseGemmAttributesV11], - ['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes], - ['GlobalMaxPool', '', '1+', globalMaxPool], + //['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], + //['Gemm', '', '11+', gemm, parseGemmAttributesV11], + //['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes], + //['GlobalMaxPool', '', '1+', globalMaxPool], // ['Greater', '', '7+', binaryOps.greater], // ['Identity', '', '1+', unaryOps.identity], // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes], - ['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], + //['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], // ['Less', '', '7+', binaryOps.less], - ['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], + //['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], // TODO: support new attributes for MaxPool-8 and MaxPool-10 - ['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], - ['Neg', '', '6+', unaryOps.neg], + //['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], + //['Neg', '', '6+', unaryOps.neg], // ['Not', '', '1+', unaryOps.not], // ['Or', '', '7+', binaryOps.or], // ['Pad', '', '2-10', padV2, parsePadAttributesV2], // ['Pad', '', '11+', padV11, parsePadAttributesV11], - ['Pow', '', '7+', binaryOps.pow], + //['Pow', '', '7+', binaryOps.pow], // ['PRelu', '', '7+', binaryOps.pRelu], // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], @@ -64,12 +68,12 @@ export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [ // ['ReduceProd', '', '1+', reduceProd, parseReduceAttributes], // ['ReduceSum', '', '1-12', reduceSum, parseReduceAttributes], // ['ReduceSumSquare', '', '1+', reduceLogSumSquare, parseReduceAttributes], - ['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], + //['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], // ['Resize', '', '10', resize, parseResizeAttributesV10], // ['Resize', '', '11+', resize, parseResizeAttributesV11], - ['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], - ['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 - ['Slice', '', '1-9', slice, parseSliceAttributes], + //['Shape', '', '1+', shape], ['Sigmoid', '', '6+', 
unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], + //['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 + //['Slice', '', '1-9', slice, parseSliceAttributes], // // The "semantic" meaning of axis has changed in opset-13. // ['Softmax', '', '1-12', softmax, parseSoftmaxAttributes], // ['Softmax', '', '13+', softmaxV13, parseSoftmaxAttributesV13], @@ -78,13 +82,13 @@ export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [ // // When the attribute is missing, we need the count of number of outputs // // so that we can determine the 'split' attribute from the runtime input to the Operator // ['Split', '', '2-12', split, parseSplitAttributes], - ['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], - ['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], - ['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], + //['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], + //['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], + //['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], // ['Tile', '', '6+', tile], - ['Transpose', '', '1+', transpose, parseTransposeAttributes], + //['Transpose', '', '1+', transpose, parseTransposeAttributes], // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], - ['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], + //['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], // ['Xor', '', '7+', binaryOps.xor], -]; +]); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index d7da66513e527..b71749c8e5cab 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Tensor} from '../tensor'; +import {Tensor, TensorView} from '../tensor'; export enum GpuDataType { default = 0, @@ -94,3 +94,9 @@ export interface Artifact { computePipeline: GPUComputePipeline; // attribLocations: {position: number; textureCoord: number}; } + +export interface ComputeContext { + readonly pointer: number; + readonly inputs: readonly TensorView[]; + output(index: number, dims: readonly number[]): number; +} diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index e69de29bb2d1d..486fed6ceeb07 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
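+//
+// JsepOutput is the wasm-side half of the JS -> C++ "create output" bridge: the `output`
+// helper in js/web/lib/wasm/jsep/init.ts packs the requested shape into wasm stack memory as
+// consecutive uint32 values,
+//
+//   dims.length | dims[0] | dims[1] | ... | dims[dims.length - 1]
+//
+// then calls _JsepOutput(kernel_context, output_index, data_ptr); the function below decodes
+// that shape, asks the OpKernelContext for the corresponding output tensor, and hands the raw
+// data pointer back to JS.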
+ +#include "js_export.h" + +#include "core/framework/op_kernel.h" + +const void * JsepOutput(void * context, int index, void * data) { + uint32_t * data_offset = reinterpret_cast(data); + uint32_t dim = *data_offset++; + size_t dim_size = static_cast(dim); + std::vector dims; + dims.reserve(dim_size); + for (size_t i = 0; i < dim_size; i++) { + dims[i] = static_cast(*data_offset++); + } + + auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); + return output->DataRaw(); +} diff --git a/onnxruntime/core/providers/js/js_export.h b/onnxruntime/core/providers/js/js_export.h index 7f3127df2fd27..a178d4b37fe6e 100644 --- a/onnxruntime/core/providers/js/js_export.h +++ b/onnxruntime/core/providers/js/js_export.h @@ -8,5 +8,5 @@ #include extern "C" { -void * EMSCRIPTEN_KEEPALIVE JSEP_Output(void * context, int index, void * data); +const void * EMSCRIPTEN_KEEPALIVE JsepOutput(void * context, int index, void * data); }; diff --git a/onnxruntime/core/providers/js/js_kernel.cc b/onnxruntime/core/providers/js/js_kernel.cc new file mode 100644 index 0000000000000..34f592814c1e4 --- /dev/null +++ b/onnxruntime/core/providers/js/js_kernel.cc @@ -0,0 +1,9 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "js_kernel.h" + +namespace onnxruntime { +namespace js { +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 72aab01221899..967f1936b24bc 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -2,6 +2,9 @@ // Licensed under the MIT License. #pragma once + +#include + #include "core/framework/op_kernel.h" #include "core/providers/js/js_execution_provider.h" @@ -14,7 +17,17 @@ class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) : OpKernel(info) { + InitAttributes(); + } + virtual ~JsKernel() { + EM_ASM({ Module.jsepReleaseKernel($0); }, this); } + + protected: + virtual void InitAttributes() { + EM_ASM({ Module.jsepCreateKernel("abs", $0, undefined); }, this); + }; + }; } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index 361adab12e985..e43a66651ef60 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -2,15 +2,14 @@ // Licensed under the MIT License. 
#include "core/providers/js/js_execution_provider.h" - -#include "core/framework/op_kernel.h" +#include "core/providers/js/js_kernel.h" namespace onnxruntime { namespace js { -class AbsImpl : public OpKernel { +class AbsImpl : public JsKernel { public: - AbsImpl(const OpKernelInfo& info) : OpKernel(info) {} + AbsImpl(const OpKernelInfo& info) : JsKernel(info) {} Status Compute(OpKernelContext* context) const override { AllocatorPtr alloc; diff --git a/onnxruntime/wasm/js_internal_api.js b/onnxruntime/wasm/js_internal_api.js index 9fc4b297a2fac..77608029f8937 100644 --- a/onnxruntime/wasm/js_internal_api.js +++ b/onnxruntime/wasm/js_internal_api.js @@ -4,11 +4,13 @@ 'use strict'; // init JSEP -Module["jsepInit"] = function (backend, alloc, free, upload, download, run) { +Module["jsepInit"] = function (backend, alloc, free, upload, download, createKernel, releaseKernel, run) { Module.jsepBackend = backend; Module.jsepAlloc = alloc; Module.jsepFree = free; Module.jsepUpload = upload; Module.jsepDownload = download; + Module.jsepCreateKernel = createKernel; + Module.jsepReleaseKernel = releaseKernel; Module.jsepRun = run; }; From f6cd92ca926cc0f106b4fc4503b849e92d37f540 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 24 Oct 2022 10:03:14 -0700 Subject: [PATCH 08/81] 5 --- js/web/lib/wasm/jsep/backend-webgpu.ts | 26 ++-- js/web/lib/wasm/jsep/init.ts | 76 +++++----- js/web/lib/wasm/jsep/util.ts | 3 + .../jsep/webgpu/attribute-with-cache-key.ts | 24 +++ .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 6 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 29 ++-- .../lib/wasm/jsep/webgpu/session-handler.ts | 47 ------ .../wasm/jsep/webgpu/tensor-data-manager.ts | 140 ------------------ js/web/lib/wasm/jsep/webgpu/types.ts | 3 +- onnxruntime/core/providers/js/js_kernel.h | 55 ++++++- .../core/providers/js/operators/unary.cc | 39 ++--- 11 files changed, 170 insertions(+), 278 deletions(-) create mode 100644 js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/session-handler.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 95b47f91df426..99c5d229ace9d 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import {Tensor} from './tensor'; +import {Tensor, TensorView} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; @@ -25,7 +25,7 @@ export class WebGpuBackend { gpuDataManager: GpuDataManager; programManager: ProgramManager; - kernelAttributes: Map number, unknown]>; + kernels: Map number, unknown]>; commandEncoder: GPUCommandEncoder|null = null; computePassEncoder: GPUComputePassEncoder|null = null; @@ -44,7 +44,7 @@ export class WebGpuBackend { this.device = await adapter.requestDevice(); this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); - this.kernelAttributes = new Map(); + this.kernels = new Map(); // TODO: set up flags this.device.onuncapturederror = ev => { @@ -96,7 +96,7 @@ export class WebGpuBackend { return this.dataManager.createGpuTensor(type, dims, gpuDataType); } - run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly Tensor[]): Tensor[] { + run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[]): number { if (inputs.length !== program.inputTypes.length) { throw new Error(`Input size must be equal to ${program.inputTypes.length}.`); } @@ -151,21 +151,25 @@ export class WebGpuBackend { return this.gpuDataManager.release(ptr); } - createKernel(name: string, kernelId: number, attribute: unknown) { - const lookup = WEBGPU_OP_RESOLVE_RULES.get(name); - if (!lookup) { + createKernel(name: string, kernelId: number, attribute: unknown): void { + const op = WEBGPU_OP_RESOLVE_RULES.get(name); + if (!op) { throw new Error(`kernel not implemented: ${name}`); } - if (Array.isArray(lookup)) { - const init = lookup[1]; + let processedAttribute = attribute; + if (op.length > 1 && typeof op[1] !== 'undefined') { + processedAttribute = op[1](attribute); } - this.kernelAttributes.set(kernelId) + this.kernels.set(kernelId, [op[0], processedAttribute]); } - releaseKernel(kernelId: number) {} + releaseKernel(kernelId: number): void { + this.kernels.delete(kernelId); + } computeKernel(kernelId: number, context: ComputeContext): number { + const kernel = this.kernels throw new Error('Method not implemented.'); } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index c7e54d83f39c1..2f087e69cfdc6 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -5,46 +5,54 @@ import {OrtWasmModule} from '../binding/ort-wasm'; import {WebGpuBackend} from './backend-webgpu'; import {TensorView} from './tensor'; -import {ComputeContext} from './webgpu/types'; +import {ComputeContext, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; /* eslint-disable no-bitwise */ -const output = (module: OrtWasmModule, pointer: number, index: number, dims: readonly number[]): number => { - const stack = module.stackSave(); - try { - const data = module.stackAlloc((1 + dims.length) * 4 /* sizeof(size_t) */); - let offset = data >> 2; - module.HEAPU32[offset++] = dims.length; - for (let i = 0; i < dims.length; i++) { - module.HEAPU32[offset++] = dims[i]; + +class OpKernelContext implements ComputeContext { + readonly opKernelContext: number; + readonly inputs: readonly TensorView[]; + constructor(private module: OrtWasmModule, private backend: WebGpuBackend, contextDataOffset: number) { + const heapU32 = module.HEAPU32; + + // extract context data + let dataIndex = (contextDataOffset >> 2); + this.opKernelContext = heapU32[dataIndex++]; + 
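+    // layout of the serialized context written by JsKernel::Compute on the C++ side,
+    // every entry a uint32:
+    //   [0] OpKernelContext pointer  [1] input count
+    //   then per input: element type | data pointer | rank | dims[0] ... dims[rank-1]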
const inputCount = heapU32[dataIndex++]; + + const inputs: TensorView[] = []; + for (let i = 0; i < inputCount; i++) { + const dataType = heapU32[dataIndex++]; + const data = heapU32[dataIndex++]; + const dim = heapU32[dataIndex++]; + const dims: number[] = []; + for (let d = 0; d < dim; d++) { + dims.push(heapU32[dataIndex++]); + } + inputs.push({dataType, data, dims}); } - return module._JsepOutput(pointer, index, data); - } finally { - module.stackRestore(stack); + this.inputs = inputs; } -}; -const makeContext = (module: OrtWasmModule, contextDataOffset: number): ComputeContext => { - const heapU32 = module.HEAPU32; - - // extract context data - let dataIndex = (contextDataOffset >> 2); - const pointer = heapU32[dataIndex++]; - const inputCount = heapU32[dataIndex++]; - - const inputs: TensorView[] = []; - for (let i = 0; i < inputCount; i++) { - const dataType = heapU32[dataIndex++]; - const data = heapU32[dataIndex++]; - const dim = heapU32[dataIndex++]; - const dims: number[] = []; - for (let d = 0; d < dim; d++) { - dims.push(heapU32[dataIndex++]); - } - inputs.push({dataType, data, dims}); + compute(program: ProgramInfoLoader|ProgramInfo): number { + return this.backend.run(program, this.inputs); } - return {pointer, inputs, output: (index: number, dims: readonly number[]) => output(module, pointer, index, dims)}; -}; + output(index: number, dims: readonly number[]): number { + const stack = this.module.stackSave(); + try { + const data = this.module.stackAlloc((1 + dims.length) * 4 /* sizeof(size_t) */); + let offset = data >> 2; + this.module.HEAPU32[offset++] = dims.length; + for (let i = 0; i < dims.length; i++) { + this.module.HEAPU32[offset++] = dims[i]; + } + return this.module._JsepOutput(this.opKernelContext, index, data); + } finally { + this.module.stackRestore(stack); + } + } +} export const init = async(module: OrtWasmModule): Promise => { // init JSEP if available @@ -91,7 +99,7 @@ export const init = async(module: OrtWasmModule): Promise => { (kernel: number, contextDataOffset: number) => { // eslint-disable-next-line no-console console.log('jsepRun'); - const context = makeContext(module, contextDataOffset); + const context = new OpKernelContext(module, backend, contextDataOffset); return backend.computeKernel(kernel, context); }); } diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index 72f4ae13056b6..92f24d38f343f 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -509,3 +509,6 @@ export class ShapeUtil { return outputDims; } } + +export const MIN_CLIP = -3.4028234663852886e+38; +export const MAX_CLIP = 3.4028234663852886e+38; diff --git a/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts b/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts new file mode 100644 index 0000000000000..6608b00471e77 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
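+//
+// usage sketch (hypothetical attribute shape; the mechanism is the point): the returned object
+// exposes a `cacheKey` built from its own property values, sorted by property name and joined
+// with ';', so structurally identical attributes share one key in the program artifact cache:
+//
+//   const attr = createAttributeWithCacheKey({min: -1, max: 1});
+//   attr.cacheKey;  // -> '1;-1' ('max' sorts before 'min')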
+ +class AttributeWithCacheKeyImpl { + constructor(attribute: Record<string, unknown>) { + Object.assign(this, attribute); + } + + private _cacheKey: string; + public get cacheKey(): string { + if (!this._cacheKey) { + this._cacheKey = + Object.getOwnPropertyNames(this).sort().map(name => `${(this as Record<string, unknown>)[name]}`).join(';'); + } + return this._cacheKey; + } +} + +export interface AttributeWithCacheKey { + readonly cacheKey: string; +} + +export const createAttributeWithCacheKey = <T extends Record<string, unknown>>(attribute: T): T&AttributeWithCacheKey => + new AttributeWithCacheKeyImpl(attribute) as unknown as T & AttributeWithCacheKey; diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 6de6bedfa755b..511fab9635ffc 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -19,10 +19,10 @@ import {ComputeContext} from './types'; // import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; type RunFunction = (context: ComputeContext) => number; -type InitFunction = (attribute: unknown) => void; -type ResolveRule = RunFunction|[RunFunction, InitFunction]; +type ParseAttributeFunction = (attributeRaw: unknown) => unknown; +type OperatorImplementation = [RunFunction]|[RunFunction, ParseAttributeFunction]; -export const WEBGPU_OP_RESOLVE_RULES: Map<string, ResolveRule> = new Map([ +export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new Map([ ['abs', [unaryOps.abs]], //, ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], // ['And', '', '7+', binaryOps.and], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 54213cfdd2313..4e3468fc81cff 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -1,12 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
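// migration note for this hunk: TensorView (added earlier in this series) only carries
// {data, dataType, dims}, so the `input.type` and `inputTensors[0].size` references that
// remain in createElementwiseProgramInfo below still assume the old Tensor interface; the
// TensorView equivalents would be `input.dataType` and `ShapeUtil.size(input.dims)`.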
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {Tensor} from '../../../tensor'; -import {MAX_CLIP, MIN_CLIP} from '../../../util'; +import {TensorView} from '../../tensor'; +import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; import {WORKGROUP_SIZE} from './common'; @@ -46,14 +45,14 @@ const createElementwiseProgramShader = }; const createElementwiseProgramInfo = - (metadata: ProgramMetadata, input: Tensor, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): - ProgramInfo => ({ - ...metadata, - shaderSource: createElementwiseProgramShader(input.size, funcCall, additionalImplementation), - outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}], - dispatchGroup: (inputTensors) => - ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)}) - }); + (metadata: ProgramMetadata, input: TensorView, funcCall: ElementwiseFunctionCall, + additionalImplementation?: string): ProgramInfo => ({ + ...metadata, + shaderSource: createElementwiseProgramShader(ShapeUtil.size(input.dims), funcCall, additionalImplementation), + outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}], + dispatchGroup: (inputTensors) => + ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)}) + }); const createElementwiseProgramInfoLoader = (input: Tensor, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string, @@ -65,8 +64,8 @@ const createElementwiseProgramInfoLoader = }; }; -export const abs = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Abs', 'abs'), inputs); +export const abs = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Abs', 'abs')); export const acos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Acos', 'acos'), inputs); diff --git a/js/web/lib/wasm/jsep/webgpu/session-handler.ts b/js/web/lib/wasm/jsep/webgpu/session-handler.ts deleted file mode 100644 index 1fe288c36dd1e..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/session-handler.ts +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -import {SessionHandler} from '../../backend'; -import {Graph} from '../../graph'; -import {Operator} from '../../operators'; -import {OpSet, resolveOperator} from '../../opset'; -import {Session} from '../../session'; -import {Tensor} from '../../tensor'; -import {WebGpuBackend} from '../backend-webgpu'; - -import {WebGpuInferenceHandler} from './inference-handler'; -import {WEBGPU_OP_RESOLVE_RULES} from './op-resolve-rules'; -import {ProgramManager} from './program-manager'; -import {createTensorDataManager, TensorDataManager} from './tensor-data-manager'; - -export class WebGpuSessionHandler implements SessionHandler { - private initializers: Set; - readonly dataManager: TensorDataManager; - readonly programManager: ProgramManager; - - constructor(public readonly backend: WebGpuBackend, public readonly context: Session.Context) { - this.dataManager = createTensorDataManager(this.backend.gpuDataManager); - this.programManager = new ProgramManager(this.backend, this.context.profiler); - } - - createInferenceHandler() { - return new WebGpuInferenceHandler(this); - } - onGraphInitialized(graph: Graph): void { - const initializers = graph.getValues().filter(v => v.from === -1 && v.tensor).map(v => v.tensor!.dataId); - this.initializers = new Set(initializers); - } - isInitializer(tensorId: Tensor.Id): boolean { - return this.initializers ? this.initializers.has(tensorId) : false; - } - addInitializer(tensorId: Tensor.Id): void { - this.initializers.add(tensorId); - } - dispose(): void { - // TODO - } - resolve(node: Graph.Node, opsets: readonly OpSet[], graph: Graph): Operator { - const op = resolveOperator(node, opsets, WEBGPU_OP_RESOLVE_RULES); - return {impl: op.opImpl, context: op.opInit ? op.opInit(node, graph) : node}; - } -} diff --git a/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts deleted file mode 100644 index 0899ecba6e272..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/tensor-data-manager.ts +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {createView, Tensor} from '../tensor'; - -import {GpuDataManager} from './gpu-data-manager'; -import {GpuData, GpuDataId, GpuDataType} from './types'; - -/** - * manages Tensor ID -> Gpu Data ID - * - * A tensor ID is a unique ID representing a value(tensor), which is the graph's node's input or output. - * A GPU Data ID is a unique ID representing an abstract data on GPU memory. Specifically, for current WebGPU scenarios, - * GPU Data is a storage buffer, and GPU Data ID is a handle to a storage buffer. - * - * - a value is different to the graph's edge. if a node's output is consumed by 2 other downstream nodes, there are - * 2 edges, but only one value. - * - * - a tensor ID maps to 0 or 1 GPU Data ID, depending on whether the data is available on GPU or not. - * - * - a GPU Data ID maps to 1 or more tensor ID. - * - */ -export interface TensorDataManager { - /** - * upload a CPU tensor to GPU. - */ - uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): GpuData; - - /** - * create a new GPU tensor. - */ - createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData]; - - /** - * check whether the tensor has GPU data - */ - hasGpuData(tensorId: Tensor.Id): boolean; - - /** - * create a reference to the GPU data. 
- */ - createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData]; - - /** - * release the GPU resources referred by the tensor. - */ - releaseGpuTensor(tensorId: Tensor.Id): void; -} - -class TensorDataManagerImpl implements TensorDataManager { - private map: Map; - private reverseMap: Map>; - - constructor(private gpuDataManager: GpuDataManager) { - this.map = new Map(); - this.reverseMap = new Map(); - } - - private registerIdMapping(tensorId: Tensor.Id, gpuDataId: GpuDataId): void { - this.map.set(tensorId, gpuDataId); - - let tensorIds = this.reverseMap.get(gpuDataId); - if (!tensorIds) { - tensorIds = new Set(); - this.reverseMap.set(gpuDataId, tensorIds); - } - tensorIds.add(tensorId); - } - - uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): GpuData { - const gpuDataId = this.map.get(tensor.dataId); - if (gpuDataId) { - const gpuData = this.gpuDataManager.get(gpuDataId); - if (!gpuData) { - throw new Error('internal error. this should never happen'); - } - return gpuData; - } - - const gpuData = this.gpuDataManager.upload(tensor.numberData, gpuDataType); - this.registerIdMapping(tensor.dataId, gpuData.id); - return gpuData; - } - - createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { - const gpuData = this.gpuDataManager.create(type, dims, gpuDataType); - const tensor = new Tensor(dims, type, undefined, async () => { - const data = await this.gpuDataManager.download(gpuData.id); - return createView(data, type); - }); - - this.registerIdMapping(tensor.dataId, gpuData.id); - return [tensor, gpuData]; - } - - hasGpuData(tensorId: Tensor.Id): boolean { - return this.map.has(tensorId); - } - - createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData] { - const gpuDataId = this.map.get(tensorId); - if (!gpuDataId) { - throw new Error('internal error. this should never happen'); - } - - const gpuData = this.gpuDataManager.get(gpuDataId); - if (!gpuData) { - throw new Error('internal error. this should never happen'); - } - - const tensor = new Tensor(dims, type, undefined, async () => { - const data = await this.gpuDataManager.download(gpuData.id); - return createView(data, type); - }); - - this.registerIdMapping(tensor.dataId, gpuData.id); - return [tensor, gpuData]; - } - - releaseGpuTensor(tensorId: Tensor.Id): void { - const gpuDataId = this.map.get(tensorId); - if (gpuDataId) { - this.map.delete(tensorId); - - const tensorIds = this.reverseMap.get(gpuDataId); - if (!tensorIds) { - throw new Error('internal error. 
this should never happen'); - } - tensorIds.delete(tensorId); - if (tensorIds.size === 0) { - this.gpuDataManager.release(gpuDataId); - this.reverseMap.delete(gpuDataId); - } - } - } -} - -export const createTensorDataManager = (gpuDataManager: GpuDataManager): TensorDataManager => - new TensorDataManagerImpl(gpuDataManager); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index b71749c8e5cab..ea9ddfdaf46fc 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -96,7 +96,8 @@ export interface Artifact { } export interface ComputeContext { - readonly pointer: number; + readonly opKernelContext: number; readonly inputs: readonly TensorView[]; + compute(program: ProgramInfoLoader|ProgramInfo): number; output(index: number, dims: readonly number[]): number; } diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 967f1936b24bc..774bb437fcf3b 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -13,21 +13,60 @@ struct pthreadpool; namespace onnxruntime { namespace js { +#define JSEP_INIT_KERNEL(x) EM_ASM({ Module.jsepCreateKernel(#x, $0, undefined); }, this) +#define JSEP_INIT_KERNEL_ATTRIBUTE(x, a, ...) EM_ASM({ Module.jsepCreateKernel(#x, $0, a); }, this, __VA_ARGS__) + class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) - : OpKernel(info) { - InitAttributes(); - } + : OpKernel(info) {} virtual ~JsKernel() { - EM_ASM({ Module.jsepReleaseKernel($0); }, this); + EM_ASM({ Module.jsepReleaseKernel($0); }, this); } - protected: - virtual void InitAttributes() { - EM_ASM({ Module.jsepCreateKernel("abs", $0, undefined); }, this); - }; + Status Compute(OpKernelContext* context) const override { + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc)); + + // + // temp_data_format (every item is (u)int32_t): + // input_count | [input_data_0] ... [input_data_N-1] + // + // input_data_format: + // type | data_ptr | dim_size | dim[0] ... dim[N-1] + // + size_t temp_data_size = sizeof(size_t); + for (int i = 0; i < context->InputCount(); i++) { + temp_data_size += sizeof(size_t) * (3 + context->Input(i)->Shape().NumDimensions()); + } + uint32_t *p_inputs_data = reinterpret_cast(alloc->Alloc(temp_data_size)); + p_inputs_data[0] = reinterpret_cast(context); + p_inputs_data[1] = static_cast(context->InputCount()); + size_t index = 2; + for (int i = 0; i < context->InputCount(); i++) { + p_inputs_data[index++] = static_cast(context->Input(i)->GetElementType()); + p_inputs_data[index++] = reinterpret_cast(context->Input(i)->DataRaw()); + p_inputs_data[index++] = static_cast(context->Input(i)->Shape().NumDimensions()); + for (size_t d = 0; d < context->Input(i)->Shape().NumDimensions(); d++) { + p_inputs_data[index++] = static_cast(context->Input(i)->Shape()[d]); + } + } + + printf("temp data size: %zu. Data: ", temp_data_size); + for (int i=0; i < (int)temp_data_size/4;i++) {printf("%u ", p_inputs_data[i]); } + printf("\n"); + + int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_inputs_data); + + printf("outputs = %d. 
diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc
index e43a66651ef60..661490fd8684c 100644
--- a/onnxruntime/core/providers/js/operators/unary.cc
+++ b/onnxruntime/core/providers/js/operators/unary.cc
@@ -9,28 +9,29 @@ namespace js {
 
 class AbsImpl : public JsKernel {
  public:
-  AbsImpl(const OpKernelInfo& info) : JsKernel(info) {}
-
-  Status Compute(OpKernelContext* context) const override {
-    AllocatorPtr alloc;
-    ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc));
-    size_t temp_data_size = sizeof(size_t) * (1 + context->InputCount() * (3 + context->Input<Tensor>(0)->Shape().NumDimensions()));
-    printf("temp data size: %zu\n", temp_data_size);
-    void* p_inputs = alloc->Alloc(temp_data_size);
-
-    //
-    // type | data_ptr | dim_size | dim[0] ... dim[N-1]
-    //
-
-    Tensor* Y = context->Output(0, TensorShape(context->Input<Tensor>(0)->Shape()));
-    printf("Y.data=%zu\n", (size_t)(Y->DataRaw()));
-
-    alloc->Free(p_inputs);
-
-    return Status::OK();
+  AbsImpl(const OpKernelInfo& info) : JsKernel(info) {
+    JSEP_INIT_KERNEL(Abs);
   }
 };
+
+// class kJsExecutionProvider_Abs_kOnnxDomain_ver1_14;
+// template <> KernelCreateInfo BuildKernelCreateInfo<kJsExecutionProvider_Abs_kOnnxDomain_ver1_14>() {
+//   return KernelCreateInfo(
+//       KernelDefBuilder()
+//           .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
+//           .SetName("Abs")
+//           .SetDomain(kOnnxDomain)
+//           .SinceVersion(1, 14)
+//           .Provider(kJsExecutionProvider).Build(),
+//       static_cast<KernelCreatePtrFn>(
+//           [](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
+//             out = std::make_unique<AbsImpl>(info);
+//             return Status::OK();
+//           })
+//   );
+// }
+
 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
     Abs,
     kOnnxDomain,
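
`JSEP_INIT_KERNEL`, `~JsKernel()` and `Compute()` in this patch assume that `wasm/js_internal_api.js` installs `Module.jsepCreateKernel`, `Module.jsepReleaseKernel` and `Module.jsepRun`. A minimal sketch of the registry those hooks imply, keyed by the kernel's C++ `this` pointer (illustrative signatures, not this patch's actual implementation):

    // Sketch: Module-level kernel registry assumed by the EM_ASM calls in js_kernel.h.
    const kernels = new Map<number, {name: string; attributes: unknown}>();

    const jsepCreateKernel = (name: string, kernelPtr: number, attributes: unknown): void => {
      kernels.set(kernelPtr, {name, attributes});
    };

    const jsepReleaseKernel = (kernelPtr: number): void => {
      kernels.delete(kernelPtr);
    };

    const jsepRun = (kernelPtr: number, contextDataOffset: number): number => {
      const kernel = kernels.get(kernelPtr);
      if (!kernel) {
        return 1;  // non-zero becomes a failed Status in JsKernel::Compute()
      }
      // ...decode the packed context buffer and dispatch to the op implementation...
      return 0;    // 0 maps to Status::OK()
    };
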
From e5200235d28e96504cb6001089e1fc32ac66fb61 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Tue, 25 Oct 2022 12:02:32 -0700
Subject: [PATCH 09/81] 6

---
 cmake/onnxruntime_webassembly.cmake           |   3 +-
 js/web/lib/wasm/jsep/backend-webgpu.ts        |  79 +-
 js/web/lib/wasm/jsep/init.ts                  |   2 +-
 .../lib/wasm/jsep/webgpu/gpu-data-manager.ts  |   9 +-
 .../lib/wasm/jsep/webgpu/op-resolve-rules.ts  |   8 +-
 js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts  |  35 +-
 js/web/lib/wasm/jsep/webgpu/ops/concat.ts     | 117 ++-
 .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts  | 250 +++---
 js/web/lib/wasm/jsep/webgpu/ops/conv.ts       | 296 +++----
 js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts |  65 +-
 js/web/lib/wasm/jsep/webgpu/ops/gather.ts     | 257 +++---
 js/web/lib/wasm/jsep/webgpu/ops/gemm.ts       | 324 ++++----
 js/web/lib/wasm/jsep/webgpu/ops/matmul.ts     | 224 +++---
 js/web/lib/wasm/jsep/webgpu/ops/pool.ts       | 748 +++++++++---------
 .../wasm/jsep/webgpu/ops/reduce-tensors.ts    | 166 ++--
 js/web/lib/wasm/jsep/webgpu/ops/reshape.ts    |  34 +-
 js/web/lib/wasm/jsep/webgpu/ops/shape.ts      |  16 -
 js/web/lib/wasm/jsep/webgpu/ops/slice.ts      | 354 ++++-----
 js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts    |  82 +-
 js/web/lib/wasm/jsep/webgpu/ops/transpose.ts  | 228 +++---
 js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts   | 187 +++--
 js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts  |  80 +-
 js/web/lib/wasm/jsep/webgpu/types.ts          |   4 +-
 js/web/lib/wasm/wasm-core-impl.ts             |   2 +-
 24 files changed, 1781 insertions(+), 1789 deletions(-)
 delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/shape.ts

diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index cf410a3e4f18f..66a393e164da0 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -202,7 +202,7 @@ else()
   set_target_properties(onnxruntime_webassembly PROPERTIES LINK_FLAGS " \
     -s \"EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}\" \
-    -s \"EXPORTED_FUNCTIONS=_malloc,_free\" \
+    -s \"EXPORTED_FUNCTIONS=_malloc,_free,_JsepOutput\" \
     -s MAXIMUM_MEMORY=4294967296 \
     -s WASM=1 \
     -s NO_EXIT_RUNTIME=0 \
@@ -216,6 +216,7 @@ else()
       --no-entry")
 
   if (onnxruntime_USE_JS)
+    target_compile_definitions(onnxruntime_webassembly PRIVATE -DUSE_JS=1)
     set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\"")
   endif()
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index 99c5d229ace9d..e0f637ca53a2a 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -1,31 +1,31 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {Tensor, TensorView} from './tensor';
+import {TensorView} from './tensor';
 import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager';
-import {WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
+import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
 import {ProgramManager} from './webgpu/program-manager';
-import {ComputeContext, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
+import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
 
 const getProgramInfoUniqueKey =
-    (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly Tensor[], inputGpuDatas: readonly GpuData[]):
-        string => {
-          const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_');
-          const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_');
-          let key = programInfo.name;
-          if (programInfo.cacheHint) {
-            key += '[' + programInfo.cacheHint + ']';
-          }
-          key += ':' + inputTensorShapes + ';' + inputGpuDataTypes;
-          return key;
-        };
+    (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly TensorView[],
+     inputGpuDatas: readonly GpuData[]): string => {
+      const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_');
+      const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_');
+      let key = programInfo.name;
+      if (programInfo.cacheHint) {
+        key += '[' + programInfo.cacheHint + ']';
+      }
+      key += ':' + inputTensorShapes + ';' + inputGpuDataTypes;
+      return key;
+    };
 
 export class WebGpuBackend {
   device: GPUDevice;
   gpuDataManager: GpuDataManager;
   programManager: ProgramManager;
-  kernels: Map<number, [(context: ComputeContext) => number, unknown]>;
+  kernels: Map<number, [RunFunction, unknown]>;
 
   commandEncoder: GPUCommandEncoder|null = null;
   computePassEncoder: GPUComputePassEncoder|null = null;
@@ -88,23 +88,25 @@ export class WebGpuBackend {
     this.pendingDispatchNumber = 0;
   }
 
-  private uploadGpuData(tensor: Tensor, textureType: GpuDataType): GpuData {
-    return this.gpuDataManager.upload(tensor, textureType);
-  }
-
-  private createGpuData(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] {
-    return this.dataManager.createGpuTensor(type, dims, gpuDataType);
-  }
-
-  run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[]): number {
+  run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[],
+      createOutput: (index: number, dims: readonly number[]) => number): number {
     if (inputs.length !== program.inputTypes.length) {
       throw new Error(`Input size must be equal to ${program.inputTypes.length}.`);
     }
 
-    // create info for inputs
+    // // create info for inputs
+    // const inputDatas: GpuData[] = [];
+    // for (let i = 0; i < program.inputTypes.length; ++i) {
+    //   inputDatas[i] = this.uploadGpuData(inputs[i], program.inputTypes[i]);
+    // }
+
     const inputDatas: GpuData[] = [];
-    for (let i = 0; i < program.inputTypes.length; ++i) {
-      inputDatas[i] = this.uploadGpuData(inputs[i], program.inputTypes[i]);
+    for (let i = 0; i < inputs.length; ++i) {
+      const gpuData = this.gpuDataManager.get(inputs[i].data);
+      if (!gpuData) {
+        throw new Error(`no GPU data for ${inputs[i].data}`);
+      }
+      inputDatas[i] = gpuData;
     }
 
     const key = getProgramInfoUniqueKey(program, inputs, inputDatas);
@@ -116,11 +118,12 @@ export class WebGpuBackend {
 
     // create info for outputs
     const outputDatas: GpuData[] = [];
-    const outputTensors: Tensor[] = [];
     for (let i = 0; i < programInfo.outputs.length; ++i) {
-      const [tensor, gpuData] = this.createGpuData(
-          programInfo.outputs[i].type, programInfo.outputs[i].dims, programInfo.outputs[i].gpuDataType);
-      outputTensors.push(tensor);
+      const dataId = createOutput(i, programInfo.outputs[i].dims);
+      const gpuData = this.gpuDataManager.get(dataId);
+      if (!gpuData) {
+        throw new Error(`no GPU data for output: ${dataId}`);
+      }
       outputDatas.push(gpuData);
     }
 
@@ -131,14 +134,14 @@ export class WebGpuBackend {
 
     this.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs));
 
-    return outputTensors;
+    return 0;
   }
 
-  upload(gpuDataId: number, data: Uint8Array) {
+  upload(gpuDataId: number, data: Uint8Array): void {
     this.gpuDataManager.upload(gpuDataId, data);
   }
 
-  async download(gpuDataId: number, data: Uint8Array) {
+  async download(gpuDataId: number, data: Uint8Array): Promise<void> {
     const arrayBuffer = await this.gpuDataManager.download(gpuDataId);
     data.set(new Uint8Array(arrayBuffer));
   }
@@ -169,7 +172,11 @@ export class WebGpuBackend {
   }
 
   computeKernel(kernelId: number, context: ComputeContext): number {
-    const kernel = this.kernels
-    throw new Error('Method not implemented.');
+    const kernel = this.kernels.get(kernelId);
+    if (!kernel) {
+      throw new Error(`kernel not created: ${kernelId}`);
+    }
+    const [kernelEntry, attributes] = kernel;
+    return kernelEntry(context, attributes);
   }
 }
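
With `kernels: Map<number, [RunFunction, unknown]>` and `computeKernel` above, the op only needs to be resolved once and its attributes parsed once, at kernel-create time. A sketch of that create-time step (using a hypothetical `createKernel` helper; the actual wiring is not part of this hunk):

    // Sketch: resolve the operator and pre-parse attributes when a kernel is created,
    // so computeKernel() can dispatch with a single Map lookup.
    import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';

    const createKernel =
        (kernels: Map<number, [RunFunction, unknown]>, kernelPtr: number, opName: string,
         attributeRaw: unknown): void => {
          const op = WEBGPU_OP_RESOLVE_RULES.get(opName);
          if (!op) {
            throw new Error(`unsupported operator: ${opName}`);
          }
          const [runFunction, parseAttribute] = op;
          const attributes = parseAttribute ? parseAttribute(attributeRaw) : undefined;
          kernels.set(kernelPtr, [runFunction, attributes]);
        };
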
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index 2f087e69cfdc6..865d22873b4d3 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -35,7 +35,7 @@ class OpKernelContext implements ComputeContext {
   }
 
   compute(program: ProgramInfoLoader|ProgramInfo): number {
-    return this.backend.run(program, this.inputs);
+    return this.backend.run(program, this.inputs, this.output.bind(this));
   }
 
   output(index: number, dims: readonly number[]): number {
diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
index 423725633627e..2596eec46f6f9 100644
--- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
@@ -79,8 +79,9 @@ class GpuDataManagerImpl implements GpuDataManager {
     }
 
     // create gpu buffer
-    const gpuBufferForUploading =
-        this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE});
+    const gpuBufferForUploading = this.backend.device.createBuffer(
+        // eslint-disable-next-line no-bitwise
+        {mappedAtCreation: true, size, usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC});
 
     // copy (upload) data
     const arrayBuffer = gpuBufferForUploading.getMappedRange();
@@ -104,9 +105,9 @@
     const bufferSize = calcNormalizedBufferSize(size);
 
     // create gpu buffer
-    const gpuBuffer =
+    const gpuBuffer = this.backend.device.createBuffer(
         // eslint-disable-next-line no-bitwise
-        this.backend.device.createBuffer({size: bufferSize, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC});
+        {size: bufferSize, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST});
 
     const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer};
     this.storageCache.set(gpuData.id, {gpuData, originalSize: size});
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 511fab9635ffc..a65c162727ecf 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -18,12 +18,12 @@ import {ComputeContext} from './types';
 // import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze';
 
-type RunFunction = (context: ComputeContext) => number;
-type ParseAttributeFunction = (attributeRaw: unknown) => unknown;
-type OperatorImplementation = [RunFunction]|[RunFunction, ParseAttributeFunction];
+export type RunFunction = (context: ComputeContext, attribute?: unknown) => number;
+export type ParseAttributeFunction = (attributeRaw: unknown) => unknown;
+export type OperatorImplementation = [RunFunction]|[RunFunction, ParseAttributeFunction];
 
 export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new Map([
-  ['abs', [unaryOps.abs]],
+  ['Abs', [unaryOps.abs]],  //, ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add],
   // ['And', '', '7+', binaryOps.and], //['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan],
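
The `gpu-data-manager.ts` hunks above move `upload` onto the standard WebGPU staging pattern: CPU data can only be mapped into a `MAP_WRITE | COPY_SRC` buffer, which must then be copied on the GPU into the `STORAGE | COPY_DST` buffer that shaders bind. A self-contained sketch of that pattern using only standard WebGPU API (`uploadToGpu` is an illustrative helper, not this file's actual method):

    // Sketch: staging-buffer upload. data.byteLength is assumed 4-byte aligned,
    // as copyBufferToBuffer requires.
    const uploadToGpu = (device: GPUDevice, storageBuffer: GPUBuffer, data: Uint8Array): void => {
      const staging = device.createBuffer({
        mappedAtCreation: true,
        size: data.byteLength,
        // eslint-disable-next-line no-bitwise
        usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC,
      });
      new Uint8Array(staging.getMappedRange()).set(data);  // CPU write while mapped
      staging.unmap();

      const encoder = device.createCommandEncoder();
      encoder.copyBufferToBuffer(staging, 0, storageBuffer, 0, data.byteLength);  // GPU-side copy
      device.queue.submit([encoder.finish()]);
      staging.destroy();
    };
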
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
index 31642e47503c7..b723ba19558fc 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
@@ -2,10 +2,9 @@
 // Licensed under the MIT License.
 
 // import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
-import {WebGpuBackend} from '../../backend-webgpu';
-import {Tensor} from '../../tensor';
+import {TensorView} from '../../tensor';
 import {BroadcastUtil, ShapeUtil} from '../../util';
-import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
 
 import {createIndicesHelper, WORKGROUP_SIZE} from './common';
 
@@ -120,11 +119,11 @@ const createBinaryOpProgramShader =
     };
 
 const createBinaryOpProgramInfo =
-    (metadata: ProgramMetadata, a: Tensor, b: Tensor, funcCall: BinaryFunctionCall, additionalImplementation?: string,
-     outputTensorType: Tensor.DataType = a.type): ProgramInfo => {
+    (metadata: ProgramMetadata, a: TensorView, b: TensorView, funcCall: BinaryFunctionCall,
+     additionalImplementation?: string, outputDataType: number = a.dataType): ProgramInfo => {
       const isBroadcast = !ShapeUtil.areEqual(a.dims, b.dims);
       let outputShape = a.dims;
-      let outputSize = a.size;
+      let outputSize = ShapeUtil.size(a.dims);
 
       let vectorize = false;
 
@@ -163,14 +162,14 @@ const createBinaryOpProgramInfo =
         ...metadata,
         shaderSource: createBinaryOpProgramShader(
             a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation),
-        outputs: [{dims: outputShape, type: outputTensorType, gpuDataType: GpuDataType.default}],
+        outputs: [{dims: outputShape, dataType: outputDataType, gpuDataType: GpuDataType.default}],
         dispatchGroup: () =>
             ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)})
       };
     };
 
 const createBinaryOpProgramInfoLoader =
-    (inputs: Tensor[], name: string, funcCall: BinaryFunctionCall, additionalImplementation?: string,
+    (inputs: readonly TensorView[], name: string, funcCall: BinaryFunctionCall, additionalImplementation?: string,
      cacheKey?: string): ProgramInfoLoader => {
       const metadata: ProgramMetadata = {name, inputTypes: [GpuDataType.default, GpuDataType.default], cacheHint: cacheKey};
@@ -180,14 +179,14 @@ const createBinaryOpProgramInfoLoader =
     };
 
-export const add = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
-    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Add', (a, b) => `${a}+${b}`), inputs);
+export const add = (context: ComputeContext): number =>
+    context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Add', (a, b) => `${a}+${b}`));
 
 // export const and = (backend: WebGLInferenceHandler, inputs: Tensor[]):
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslAnd(), 'bool'), inputs)];
 
-export const div = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
-    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Div', (a, b) => `${a}/${b}`), inputs);
+export const div = (context: ComputeContext): number =>
+    context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Div', (a, b) => `${a}/${b}`));
 
 // export const equal = (backend: WebGLInferenceHandler, inputs: Tensor[]):
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslEqual(), 'bool'), inputs)];
 
@@ -198,20 +197,20 @@ export const div = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslLess(), 'bool'), inputs)];
 
-export const mul = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
-    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Mul', (a, b) => `${a}*${b}`), inputs);
+export const mul = (context: ComputeContext): number =>
+    context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Mul', (a, b) => `${a}*${b}`));
 
 // export const or = (backend: WebGLInferenceHandler, inputs: Tensor[]):
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslOr(), 'bool'), inputs)];
 
-export const pow = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
-    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Pow', 'pow'), inputs);
+export const pow = (context: ComputeContext): number =>
+    context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Pow', 'pow'));
 
 // export const pRelu = (backend: WebGLInferenceHandler, inputs: Tensor[]):
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslPRelu()), inputs)];
 
-export const sub = async(backend: WebGpuBackend, inputs: Tensor[]): Promise<Tensor[]> =>
-    backend.run(createBinaryOpProgramInfoLoader(inputs, 'Sub', (a, b) => `${a}-${b}`), inputs);
+export const sub = (context: ComputeContext): number =>
+    context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Sub', (a, b) => `${a}-${b}`));
 
 // export const xor = (backend: WebGLInferenceHandler, inputs: Tensor[]):
 //     Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslXor(), 'bool'), inputs)];
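
Every converted export above follows the same shape: a `RunFunction` closing over the operator name and a WGSL expression builder, with `additionalImplementation` available for helper functions. As an illustration only (a hypothetical operator, not added by this patch), a `pRelu` written in the new style could look like:

    // Sketch: a binary op whose funcCall delegates to a WGSL helper function.
    export const pRelu = (context: ComputeContext): number => context.compute(createBinaryOpProgramInfoLoader(
        context.inputs, 'PRelu', (a, b) => `prelu(${a}, ${b})`,
        'fn prelu(x: f32, slope: f32) -> f32 { return select(x * slope, x, x >= 0.0); }'));
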
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts
index 37fb8be4536fa..bc7f7107fb978 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts
@@ -1,13 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key';
-import {Graph} from '../../../graph';
-import {OperatorInitialization} from '../../../operators';
-import {Tensor} from '../../../tensor';
-import {ShapeUtil} from '../../../util';
-import {WebGpuInferenceHandler} from '../inference-handler';
-import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
+import {DataType} from '../../../wasm-core-impl';
+import {TensorView} from '../../tensor';
+import {ShapeUtil} from '../../util';
+import {AttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
 
 import {createIndicesHelper, IndicesHelper, WORKGROUP_SIZE} from './common';
 
@@ -15,22 +13,22 @@ export interface ConcatAttributes extends AttributeWithCacheKey {
   readonly axis: number;
 }
 
-const validateInputs = (inputs: Tensor[]): void => {
+const validateInputs = (inputs: readonly TensorView[]): void => {
   if (!inputs || inputs.length < 1) {
     throw new Error('too few inputs');
   }
 
-  const inputType = inputs[0].type;
+  const inputType = inputs[0].dataType;
   const inputDimensionality = inputs[0].dims.length;
 
   // TODO: Support string concat
-  if (inputType === 'string') {
+  if (inputType === DataType.string) {
     throw new Error('string tensor is not supported yet');
   }
 
   for (const input of inputs) {
     // make sure types of all inputs match
-    if (input.type !== inputType) {
+    if (input.dataType !== inputType) {
       throw new Error('input tensors should be one type');
     }
 
@@ -41,24 +39,47 @@
   }
 };
 
-export const concat = async(
-    inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConcatAttributes): Promise<Tensor[]> => {
-  validateInputs(inputs);
-  return inferenceHandler.run(createConcatProgramInfoLoader(inputs, attributes), inputs);
-};
 
 const createConcatProgramMetadata = (inputCount: number, cacheHint: string) =>
     ({name: 'Concat', inputTypes: Array(inputCount).fill(GpuDataType.default), cacheHint});
 
+const calculateInputIndexImpl = (numberOfTensors: number): string => `
+  fn calculateInputIndex(index: u32) -> u32 {
+    for (var i: u32 = 0u; i < ${numberOfTensors}u; i += 1u ) {
+      if (index < sizeInConcatAxis[i]) {
+        return i;
+      }
+    }
+    return ${numberOfTensors}u;
+  }`;
+
+const readBufferDataImpl = (indicesHelper: readonly IndicesHelper[], tensorRank: number, dataType: string) => {
+  const numberOfTensors = indicesHelper.length;
+  const codeLines: string[] = [];
+  for (let i = 0; i < numberOfTensors; ++i) {
+    const returnSnippet = `return input${i}[${indicesHelper[i].i2oExpression('indices', true)}];`;
+    if (numberOfTensors === 1) {
+      codeLines.push(returnSnippet);
+    } else if (i === 0) {
+      codeLines.push(`if (textureIndex == ${i}u) { ${returnSnippet} }`);
+    } else if (i === numberOfTensors - 1) {
+      codeLines.push(`else { ${returnSnippet} }`);
+    } else {
+      codeLines.push(`else if (textureIndex == ${i}) { ${returnSnippet} }`);
+    }
+  }
+  return `
+  fn readBufferData(textureIndex: u32, indices: ptr<function, array<u32, ${tensorRank}>>) -> ${dataType} {
+    ${codeLines.join('\n')}
+  }`;
+};
+
 const createConcatProgramInfo =
-    (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => {
+    (metadata: ProgramMetadata, inputs: readonly TensorView[], axis: number, dataType = 'f32'): ProgramInfo => {
      const inputShape = inputs[0].dims.slice();
      if (axis >= inputShape.length || axis < (-1 * inputShape.length)) {
        throw new Error('axis specified for concat doesn\'t match input dimensionality');
      }
-      if (axis < 0) {
-        axis = inputShape.length + axis;
-      }
+      const adjustedAxis = (axis < 0) ? inputShape.length + axis : axis;
      // ensure all of the non-concatenated axes match each other
      // calculate the shape of the output tensor while we do that
      const outputShape = inputShape.slice(0);
@@ -66,8 +87,8 @@
        const dataNShape = inputs[i].dims.slice();
        for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) {
          // add to the placeholder for computing output shape
-          if (axisIndex === axis) {
-            outputShape[axis] += dataNShape[axisIndex];
+          if (axisIndex === adjustedAxis) {
+            outputShape[adjustedAxis] += dataNShape[axisIndex];
          }
          // ensure all non-concatenated axes match each other
          else if (inputShape[axisIndex] !== dataNShape[axisIndex]) {
@@ -85,7 +106,7 @@
 
      let previousSum = 0;
      for (let i = 0; i < inputs.length; ++i) {
-        previousSum += inputs[i].dims[axis];
+        previousSum += inputs[i].dims[adjustedAxis];
        sizeInConcatAxis[i] = previousSum;
 
        inputStorageBuffersDeclarations[i] =
@@ -96,7 +117,7 @@
 
      const outputIndicesHelper = createIndicesHelper('output', outputShape);
 
-      const indicesAxis = rank < 2 ? 'indices' : `indices[${axis}]`;
+      const indicesAxis = rank < 2 ? 'indices' : `indices[${adjustedAxis}]`;
 
      const shaderSource = `
  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
@@ -130,47 +151,19 @@
      }`;
      return {
        ...metadata,
-        outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}],
+        outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
        shaderSource,
        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
      };
    };
 
-const createConcatProgramInfoLoader = (inputs: Tensor[], attributes: ConcatAttributes): ProgramInfoLoader => {
-  const metadata = createConcatProgramMetadata(inputs.length, attributes.cacheKey);
-  return {...metadata, get: () => createConcatProgramInfo(metadata, inputs, attributes.axis)};
-};
-
-const calculateInputIndexImpl = (numberOfTensors: number): string => `
-  fn calculateInputIndex(index: u32) -> u32 {
-    for (var i: u32 = 0u; i < ${numberOfTensors}u; i += 1u ) {
-      if (index < sizeInConcatAxis[i]) {
-        return i;
-      }
-    }
-    return ${numberOfTensors}u;
-  }`;
+const createConcatProgramInfoLoader =
+    (inputs: readonly TensorView[], attributes: ConcatAttributes): ProgramInfoLoader => {
+      const metadata = createConcatProgramMetadata(inputs.length, attributes.cacheKey);
+      return {...metadata, get: () => createConcatProgramInfo(metadata, inputs, attributes.axis)};
+    };
 
-const readBufferDataImpl = (indicesHelper: readonly IndicesHelper[], tensorRank: number, dataType: string) => {
-  const numberOfTensors = indicesHelper.length;
-  const codeLines: string[] = [];
-  for (let i = 0; i < numberOfTensors; ++i) {
-    const returnSnippet = `return input${i}[${indicesHelper[i].i2oExpression('indices', true)}];`;
-    if (numberOfTensors === 1) {
-      codeLines.push(returnSnippet);
-    } else if (i === 0) {
-      codeLines.push(`if (textureIndex == ${i}u) { ${returnSnippet} }`);
-    } else if (i === numberOfTensors - 1) {
-      codeLines.push(`else { ${returnSnippet} }`);
-    } else {
-      codeLines.push(`else if (textureIndex == ${i}) { ${returnSnippet} }`);
-    }
-  }
-  return `
-  fn readBufferData(textureIndex: u32, indices: ptr<function, array<u32, ${tensorRank}>>) -> ${dataType} {
-    ${codeLines.join('\n')}
-  }`;
+export const concat = (context: ComputeContext, attributes: ConcatAttributes): number => {
+  validateInputs(context.inputs);
+  return context.compute(createConcatProgramInfoLoader(context.inputs, attributes));
 };
-
-export const parseConcatAttributes: OperatorInitialization<ConcatAttributes> = (node: Graph.Node): ConcatAttributes =>
-    createAttributeWithCacheKey({axis: node.attributes.getInt('axis')});
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
index 570ec041a34fc..e9b8a64d707df 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
@@ -1,127 +1,129 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {Logger} from '../../../instrument';
-import {Tensor} from '../../../tensor';
-import {ShapeUtil} from '../../../util';
-import {WebGpuInferenceHandler} from '../inference-handler';
-import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
-
-import {createIndicesHelper, WORKGROUP_SIZE} from './common';
-import {calculateOutputShape, ConvAttributes} from './conv';
-import {getActicationSnippet} from './fuse-utils';
-
-const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({
-  name: 'GroupedConv',
-  inputTypes: hasBias ?
[GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint -}); - -const createGroupedConvProgramInfo = - (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata, - attributes: ConvAttributes): ProgramInfo => { - const hasBias = inputs.length > 2; - const processBias = hasBias ? 'value += b[output_channel];' : ''; - const xShape = inputs[0].dims; - const wShape = inputs[1].dims; - const outputChannelsPerGroup = wShape[0] / attributes.group; - - const dataType = 'f32'; // TODO: support other data type - const {activationFunction, applyActivation} = getActicationSnippet(attributes); - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var x : array<${dataType}>;`, - `@group(0) @binding(1) var w : array<${dataType}>;` - ]; - if (hasBias) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); - } - - Logger.verbose( - 'GroupedConv', - `autpPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, kernelShape:${ - attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`); - const outputShape = - calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const xIndicesHelper = createIndicesHelper('x', xShape); - const wIndicesHelper = createIndicesHelper('w', wShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); - const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); - - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputStorageBuffersDeclarations.length}) var output : array<${dataType}>; - - ${activationFunction} - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - ${wIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} - let batch: u32 = outputIndices[0]; - let output_channel: u32 = outputIndices[1]; - let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; - let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; - - var value: ${dataType} = ${dataType}(0); - for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { - let input_channel = group_id * ${wShape[1]}u + wInChannel; - for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { - let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; - - if (xHeight < 0u || xHeight >= ${xShape[2]}u) { - continue; - } - - for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { - let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; - if (xWidth < 0u || xWidth >= ${xShape[3]}u) { - continue; - } - - ${ - xIndicesHelper.indicesVariableDeclaration( - 'xIndices', - [ - 'batch', 'input_channel', 'xHeight', 'xWidth' - ])} - let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${ - wIndicesHelper.indicesVariableDeclaration('wIndices', [ - 'output_channel', 'wInChannel', 'wHeight', 'wWidth' - ])} - let wVal = 
w[${wIndicesHelper.i2oExpression('wIndices')}]; - value += xVal*wVal; - } - } - } - ${processBias} - ${applyActivation} - output[global_id.x] = value; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -export const createGroupedConvProgramInfoLoader = - (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): - ProgramInfoLoader => { - const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); - return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, attributes)}; - }; +// import {Logger} from '../../../instrument'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +// import {calculateOutputShape, ConvAttributes} from './conv'; +// import {getActicationSnippet} from './fuse-utils'; + +// const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({ +// name: 'GroupedConv', +// inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : +// [GpuDataType.default, GpuDataType.default], +// cacheHint +// }); + +// const createGroupedConvProgramInfo = +// (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata, +// attributes: ConvAttributes): ProgramInfo => { +// const hasBias = inputs.length > 2; +// const processBias = hasBias ? 
'value += b[output_channel];' : ''; +// const xShape = inputs[0].dims; +// const wShape = inputs[1].dims; +// const outputChannelsPerGroup = wShape[0] / attributes.group; + +// const dataType = 'f32'; // TODO: support other data type +// const {activationFunction, applyActivation} = getActicationSnippet(attributes); +// const inputStorageBuffersDeclarations = [ +// `@group(0) @binding(0) var x : array<${dataType}>;`, +// `@group(0) @binding(1) var w : array<${dataType}>;` +// ]; +// if (hasBias) { +// inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); +// } + +// Logger.verbose( +// 'GroupedConv', +// `autpPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, +// kernelShape:${ +// attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`); +// const outputShape = +// calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); +// const outputSize = ShapeUtil.size(outputShape); +// const outputIndicesHelper = createIndicesHelper('output', outputShape); +// const xIndicesHelper = createIndicesHelper('x', xShape); +// const wIndicesHelper = createIndicesHelper('w', wShape); + +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); +// const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); + +// ${inputStorageBuffersDeclarations.join('\n')} +// @group(0) @binding(${inputStorageBuffersDeclarations.length}) var output : array<${dataType}>; + +// ${activationFunction} +// ${outputIndicesHelper.o2iImpl} +// ${xIndicesHelper.i2oImpl} +// ${wIndicesHelper.i2oImpl} + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} +// let batch: u32 = outputIndices[0]; +// let output_channel: u32 = outputIndices[1]; +// let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; +// let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; + +// var value: ${dataType} = ${dataType}(0); +// for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { +// let input_channel = group_id * ${wShape[1]}u + wInChannel; +// for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { +// let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; + +// if (xHeight < 0u || xHeight >= ${xShape[2]}u) { +// continue; +// } + +// for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { +// let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; +// if (xWidth < 0u || xWidth >= ${xShape[3]}u) { +// continue; +// } + +// ${ +// xIndicesHelper.indicesVariableDeclaration( +// 'xIndices', +// [ +// 'batch', 'input_channel', 'xHeight', 'xWidth' +// ])} +// let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; +// ${ +// wIndicesHelper.indicesVariableDeclaration('wIndices', [ +// 'output_channel', 'wInChannel', 'wHeight', 'wWidth' +// ])} +// let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; +// value += xVal*wVal; +// } +// } +// } +// ${processBias} +// ${applyActivation} +// output[global_id.x] = value; +// }`; +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: 
inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// export const createGroupedConvProgramInfoLoader = +// (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): +// ProgramInfoLoader => { +// const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); +// return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, +// attributes)}; +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 644e9b08c7030..d68fae4152abb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -1,150 +1,152 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {InferenceHandler} from '../../../backend'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {PoolConvUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -import {createGroupedConvProgramInfoLoader} from './conv-grouped'; -// import {createDotProductProgramInfoLoader} from './dot-product'; -import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -// import {createIm2ColProgramInfoLoader} from './im2col'; -// import {createMatmulProgramInfoLoader} from './matmul'; - - -export const calculateOutputShape = - (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], - adjustPads: readonly number[], strides: readonly number[]): number[] => { - const batchSize = inputShape[0]; - const inputSpatialShape = inputShape.slice(2); - const spatialRank = inputSpatialShape.length; - const outChannels = kernelShape[0]; - const kernelSpatialShape = kernelShape.slice(2); - const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); - const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); - const outputSpatialShape = - inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); - const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); - return outputShape; - }; - -export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { - readonly autoPad: string; - readonly dilations: readonly number[]; - readonly group: number; - readonly kernelShape: readonly number[]; - readonly pads: readonly number[]; - readonly strides: readonly number[]; -} - -export const conv: OperatorAsyncImplementation = - async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { - validateInputs(inputs, attributes); // currently will fail if not conv2D - return conv2d(inferenceHandler, inputs, attributes); -}; - -const conv2d: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); - // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; - // if 
(adjustedAttributes.group > 1) { - return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), inputs); - // } else if (isPointwise) { - // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); - // } else { - // return conv2D(inferenceHandler, inputs, adjustedAttributes); - // } -}; - -const getAdjustedConvAttributes = (attributes: T, inputs: Tensor[]): T => { - const kernelShape = attributes.kernelShape.slice(); - // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims - if (attributes.kernelShape.length === 0) { - for (let i = 2; i < inputs[1].dims.length; ++i) { - kernelShape.push(inputs[1].dims[i]); - } - } - const pads = attributes.pads.slice(); - PoolConvUtil.adjustPadsBasedOnAutoPad( - inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); - - // always return a new object so does not modify the original attributes - const newAttributes: T = Object.assign({}, attributes); - Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); - return newAttributes; -}; - -export const parseConvAttributes: OperatorInitialization = (node: Graph.Node): ConvAttributes => { - const attributes = node.attributes; - const activationAttributes = parseInternalActivationAttributes(attributes); - // TODO : Make this generic enough to compute default attributes for multi-dimensional conv - const autoPad = attributes.getString('auto_pad', 'NOTSET'); - const dilations = attributes.getInts('dilations', [1, 1]); - const group = attributes.getInt('group', 1); - const kernelShape = attributes.getInts('kernel_shape', []); - const pads = attributes.getInts('pads', [0, 0, 0, 0]); - const strides = attributes.getInts('strides', [1, 1]); - - return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes}); -}; - -const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => { - // Refer to the below link for all input checks - // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv - if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { - throw new Error('Conv requires 2 or 3 inputs'); - } - - // TODO : Need to add support for multi-dimensional conv - if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { - throw new Error('currently only support 2-dimensional conv'); - } - - // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL - const dataChannel = inputs[0].dims[1]; - const filterInChannel = inputs[1].dims[1] * attributes.group; - if (dataChannel !== filterInChannel) { - throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); - } - - // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps - if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { - throw new Error('invalid bias'); - } - - const spatialRank = inputs[0].dims.length - 2; - // wrong dilations dimension - if (attributes.dilations.length !== spatialRank) { - throw new Error(`dilations should be ${spatialRank}D`); - } - - // Wrong strides dimension - if (attributes.strides.length !== spatialRank) { - throw new Error(`strides should be ${spatialRank}D`); - } - - // Wrong pads dimension - if (attributes.pads.length !== spatialRank * 2) { - throw new Error(`pads should be ${spatialRank * 2}D`); - } - - // if kernelShape is specified, it's data length must be 2 less than 
dims length of the weights tensor - // (the first 2 dims are batch_size and channels) - if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { - throw new Error('invalid kernel shape'); - } - - // TODO : Need to add support for float64 - if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') { - throw new Error('Conv input(X,W) should be float tensor'); - } - - if (inputs.length === 3 && inputs[2].type !== 'float32') { - throw new Error('Conv input(bias) should be float tensor'); - } -}; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {InferenceHandler} from '../../../backend'; +// import {Graph} from '../../../graph'; +// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {PoolConvUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; + +// import {createGroupedConvProgramInfoLoader} from './conv-grouped'; +// // import {createDotProductProgramInfoLoader} from './dot-product'; +// import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +// // import {createIm2ColProgramInfoLoader} from './im2col'; +// // import {createMatmulProgramInfoLoader} from './matmul'; + + +// export const calculateOutputShape = +// (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], +// adjustPads: readonly number[], strides: readonly number[]): number[] => { +// const batchSize = inputShape[0]; +// const inputSpatialShape = inputShape.slice(2); +// const spatialRank = inputSpatialShape.length; +// const outChannels = kernelShape[0]; +// const kernelSpatialShape = kernelShape.slice(2); +// const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); +// const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + +// spatialRank]); const outputSpatialShape = +// inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); +// const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); +// return outputShape; +// }; + +// export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { +// readonly autoPad: string; +// readonly dilations: readonly number[]; +// readonly group: number; +// readonly kernelShape: readonly number[]; +// readonly pads: readonly number[]; +// readonly strides: readonly number[]; +// } + +// export const conv: OperatorAsyncImplementation = +// async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { +// validateInputs(inputs, attributes); // currently will fail if not conv2D +// return conv2d(inferenceHandler, inputs, attributes); +// }; + +// const conv2d: OperatorAsyncImplementation = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { +// const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); +// // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; +// // if (adjustedAttributes.group > 1) { +// return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), +// inputs); +// // } else if (isPointwise) { +// // return conv2DPointwise(inferenceHandler, inputs, 
adjustedAttributes); +// // } else { +// // return conv2D(inferenceHandler, inputs, adjustedAttributes); +// // } +// }; + +// const getAdjustedConvAttributes = (attributes: T, inputs: Tensor[]): T => { +// const kernelShape = attributes.kernelShape.slice(); +// // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims +// if (attributes.kernelShape.length === 0) { +// for (let i = 2; i < inputs[1].dims.length; ++i) { +// kernelShape.push(inputs[1].dims[i]); +// } +// } +// const pads = attributes.pads.slice(); +// PoolConvUtil.adjustPadsBasedOnAutoPad( +// inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); + +// // always return a new object so does not modify the original attributes +// const newAttributes: T = Object.assign({}, attributes); +// Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); +// return newAttributes; +// }; + +// export const parseConvAttributes: OperatorInitialization = (node: Graph.Node): ConvAttributes => { +// const attributes = node.attributes; +// const activationAttributes = parseInternalActivationAttributes(attributes); +// // TODO : Make this generic enough to compute default attributes for multi-dimensional conv +// const autoPad = attributes.getString('auto_pad', 'NOTSET'); +// const dilations = attributes.getInts('dilations', [1, 1]); +// const group = attributes.getInt('group', 1); +// const kernelShape = attributes.getInts('kernel_shape', []); +// const pads = attributes.getInts('pads', [0, 0, 0, 0]); +// const strides = attributes.getInts('strides', [1, 1]); + +// return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, +// ...activationAttributes}); +// }; + +// const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => { +// // Refer to the below link for all input checks +// // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv +// if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { +// throw new Error('Conv requires 2 or 3 inputs'); +// } + +// // TODO : Need to add support for multi-dimensional conv +// if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { +// throw new Error('currently only support 2-dimensional conv'); +// } + +// // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL +// const dataChannel = inputs[0].dims[1]; +// const filterInChannel = inputs[1].dims[1] * attributes.group; +// if (dataChannel !== filterInChannel) { +// throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); +// } + +// // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps +// if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { +// throw new Error('invalid bias'); +// } + +// const spatialRank = inputs[0].dims.length - 2; +// // wrong dilations dimension +// if (attributes.dilations.length !== spatialRank) { +// throw new Error(`dilations should be ${spatialRank}D`); +// } + +// // Wrong strides dimension +// if (attributes.strides.length !== spatialRank) { +// throw new Error(`strides should be ${spatialRank}D`); +// } + +// // Wrong pads dimension +// if (attributes.pads.length !== spatialRank * 2) { +// throw new Error(`pads should be ${spatialRank * 2}D`); +// } + +// // if kernelShape is specified, it's data length must be 2 less than dims length of the weights tensor +// // (the first 2 dims are batch_size and channels) +// if 
(attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { +// throw new Error('invalid kernel shape'); +// } + +// // TODO : Need to add support for float64 +// if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') { +// throw new Error('Conv input(X,W) should be float tensor'); +// } + +// if (inputs.length === 3 && inputs[2].type !== 'float32') { +// throw new Error('Conv input(bias) should be float tensor'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index fae2c9fb6e9b2..1b403505a5962 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -1,39 +1,38 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Attribute} from '../../../attribute'; -import {MAX_CLIP, MIN_CLIP} from '../../../util'; +// export interface InternalActivationAttributes { +// readonly activation: string; +// readonly clipMin?: number; +// readonly clipMax?: number; +// readonly activationCacheKey: string; +// } -export interface InternalActivationAttributes { - readonly activation: string; - readonly clipMin?: number; - readonly clipMax?: number; - readonly activationCacheKey: string; -} +// export const getActicationSnippet = +// (attributes: InternalActivationAttributes): {activationFunction: string; applyActivation: string} => { +// switch (attributes.activation) { +// case 'Relu': +// return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; +// case 'Sigmoid': +// return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; +// case 'Clip': +// return { +// activationFunction: +// `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, +// applyActivation: 'value = clamp(value, clip_min_, clip_max_);' +// }; +// // TODO: adding other activations that can be fused. +// default: +// return {activationFunction: '', applyActivation: ''}; +// } +// }; -export function getActicationSnippet(attributes: InternalActivationAttributes) { - switch (attributes.activation) { - case 'Relu': - return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; - case 'Sigmoid': - return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; - case 'Clip': - return { - activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, - applyActivation: 'value = clamp(value, clip_min_, clip_max_);' - }; - // TODO: adding other activations that can be fused. 
- default: - return {activationFunction: '', applyActivation: ''}; - } -} +// export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => { +// const activation = attributes.getString('activation', ''); -export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => { - const activation = attributes.getString('activation', ''); - - if (activation === 'Clip') { - const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]); - return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; - } - return {activation, activationCacheKey: activation}; -}; +// if (activation === 'Clip') { +// const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]); +// return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; +// } +// return {activation, activationCacheKey: activation}; +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 65f679a2cea83..9f8a8e55417b5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -1,131 +1,132 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {NUMBER_TYPES, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -interface GatherAttributes extends AttributeWithCacheKey { - readonly axis: number; -} - -export const gather = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise => { - validateInputs(inputs, attributes.axis); - return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs); -}; - -export const parseGatherAttributes: OperatorInitialization = (node: Graph.Node): GatherAttributes => - createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)}); - -const gatherProgramMetadata = { - name: 'Gather', - inputTypes: [GpuDataType.default, GpuDataType.default] -}; - -const createGatherProgramInfo = - (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { - const dataShape = inputs[0].dims.slice(); - const indicesShape = inputs[1].dims.slice(); - const outputShape = new Array(dataShape.length + indicesShape.length - 1); - - axis = ShapeUtil.normalizeAxis(axis, dataShape.length); - const indexCopyOps: string[] = []; - if (indicesShape.length > 1) { - indexCopyOps.push('indicesIdx[0] = 0u;'); - } else { - indexCopyOps.push('indicesIdx = 0u;'); - } - for (let i = 0; i < outputShape.length; i++) { - // outputShape is divided into three parts: A, B, C - // |0 axis| axis + indicesShape.length | end| - // | A | B | C | - // - // dataIdx: [A, inputs[1][B], C] - const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx'; - if (i < axis) { // A - const dataIdxLValue = dataShape.length > 1 ? 
`dataIdx[${i}]` : 'dataIdx'; - outputShape[i] = dataShape[i]; - indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); - } else { - if (i < axis + indicesShape.length) { // B - const indicesIdxLValue = indicesShape.length > 1 ? `indicesIdx[${i - axis}]` : 'indicesIdx'; - outputShape[i] = indicesShape[i - axis]; - indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`); - } else { // C - const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx'; - outputShape[i] = dataShape[i - indicesShape.length + 1]; // skip 1 for axis - indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); - } - } - } - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const dataIndicesHelper = createIndicesHelper('data', dataShape); - const indicesIndicesHelper = createIndicesHelper('indices', indicesShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var data : array<${dataType}>; - @group(0) @binding(1) var indices : array; - @group(0) @binding(2) var output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${indicesIndicesHelper.i2oImpl} - ${dataIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')} - ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')} - ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')} - ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')} - ${indexCopyOps.join('\n ')} - let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}]; - dataIdx${dataShape.length > 1 ? 
`[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0)); - output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}]; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => { - const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey}; - return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)}; -}; - -const validateInputs = (inputs: Tensor[], axis: number): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Gather requires 2 inputs.'); - } - const tensorRank = inputs[0].dims.length; - if (tensorRank < 1) { - throw new Error('Invalid input shape.'); - } - if (axis < -tensorRank || axis > tensorRank - 1) { - throw new Error('Invalid axis.'); - } - if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { - throw new Error('Invaid input type.'); - } - if (inputs[1].type !== 'int32') { - throw new Error('Invaid input type.'); - } -}; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {Graph} from '../../../graph'; +// import {NUMBER_TYPES, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +// interface GatherAttributes extends AttributeWithCacheKey { +// readonly axis: number; +// } + +// export const gather = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise => { +// validateInputs(inputs, attributes.axis); +// return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs); +// }; + +// export const parseGatherAttributes: OperatorInitialization = (node: Graph.Node): GatherAttributes +// => +// createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)}); + +// const gatherProgramMetadata = { +// name: 'Gather', +// inputTypes: [GpuDataType.default, GpuDataType.default] +// }; + +// const createGatherProgramInfo = +// (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { +// const dataShape = inputs[0].dims.slice(); +// const indicesShape = inputs[1].dims.slice(); +// const outputShape = new Array(dataShape.length + indicesShape.length - 1); + +// axis = ShapeUtil.normalizeAxis(axis, dataShape.length); +// const indexCopyOps: string[] = []; +// if (indicesShape.length > 1) { +// indexCopyOps.push('indicesIdx[0] = 0u;'); +// } else { +// indexCopyOps.push('indicesIdx = 0u;'); +// } +// for (let i = 0; i < outputShape.length; i++) { +// // outputShape is divided into three parts: A, B, C +// // |0 axis| axis + indicesShape.length | end| +// // | A | B | C | +// // +// // dataIdx: [A, inputs[1][B], C] +// const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx'; +// if (i < axis) { // A +// const dataIdxLValue = dataShape.length > 1 ? 
`dataIdx[${i}]` : 'dataIdx'; +// outputShape[i] = dataShape[i]; +// indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); +// } else { +// if (i < axis + indicesShape.length) { // B +// const indicesIdxLValue = indicesShape.length > 1 ? `indicesIdx[${i - axis}]` : 'indicesIdx'; +// outputShape[i] = indicesShape[i - axis]; +// indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`); +// } else { // C +// const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx'; +// outputShape[i] = dataShape[i - indicesShape.length + 1]; // skip 1 for axis +// indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); +// } +// } +// } +// const outputSize = ShapeUtil.size(outputShape); +// const outputIndicesHelper = createIndicesHelper('output', outputShape); +// const dataIndicesHelper = createIndicesHelper('data', dataShape); +// const indicesIndicesHelper = createIndicesHelper('indices', indicesShape); + +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + +// @group(0) @binding(0) var data : array<${dataType}>; +// @group(0) @binding(1) var indices : array; +// @group(0) @binding(2) var output : array<${dataType}>; + +// ${outputIndicesHelper.o2iImpl} +// ${indicesIndicesHelper.i2oImpl} +// ${dataIndicesHelper.i2oImpl} + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')} +// ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')} +// ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')} +// ${indexCopyOps.join('\n ')} +// let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}]; +// dataIdx${dataShape.length > 1 ? `[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0)); +// output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}]; +// }`; +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => { +// const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey}; +// return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)}; +// }; + +// const validateInputs = (inputs: Tensor[], axis: number): void => { +// if (!inputs || inputs.length !== 2) { +// throw new Error('Gather requires 2 inputs.'); +// } +// const tensorRank = inputs[0].dims.length; +// if (tensorRank < 1) { +// throw new Error('Invalid input shape.'); +// } +// if (axis < -tensorRank || axis > tensorRank - 1) { +// throw new Error('Invalid axis.'); +// } +// if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { +// throw new Error('Invalid input type.'); +// } +// if (inputs[1].type !== 'int32') { +// throw new Error('Invalid input type.'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 3eeb49c91033a..49429a3c9f1ea 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,165 +1,165 @@ // Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {GemmUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export interface GemmAttributes extends AttributeWithCacheKey { - transA: boolean; - transB: boolean; - alpha: number; - beta: number; - isOptionalC: boolean; // in opset 11, C becomes optional -} - -export const gemm: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise => { - validateInputs(inputs, attributes); - return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs); -}; - -const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => { - const transA = node.attributes.getInt('transA', 0) !== 0; - const transB = node.attributes.getInt('transB', 0) !== 0; - const alpha = node.attributes.getFloat('alpha', 1.0); - const beta = node.attributes.getFloat('beta', 1.0); - return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC}); -}; - -export const parseGemmAttributesV7: OperatorInitialization = (node: Graph.Node): GemmAttributes => - parseGemmAttributes(node, false); - -export const parseGemmAttributesV11: OperatorInitialization = (node: Graph.Node): GemmAttributes => - parseGemmAttributes(node, true); - -const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => { - const metadata = { - name: 'Gemm', - inputTypes: inputs.length === 3 ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint: attributes.cacheKey - }; - - return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)}; -}; - -const offsetC = (m: number, n: number, dims: readonly number[]): string => { - const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); - const broadcastN = dims[dims.length - 1] !== n; - - let offset = '0u'; - if (!broadcastM) { - offset += `+ m * ${dims[dims.length - 1]}u`; - } - if (!broadcastN) { - offset += '+n'; - } - - return offset; -}; - -const createGemmProgramInfo = - (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => { - const aShape = inputs[0].dims.slice(); - const bShape = inputs[1].dims.slice(); - const [M, N, K] = GemmUtil.getShapeOfGemmResult( - aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? 
inputs[2].dims : undefined); - const outputShape = [M, N]; - if (!outputShape) { - throw new Error('Can\'t use gemm on the given tensors'); - } - const outputSize = ShapeUtil.size(outputShape); - let line = ''; - if (attributes.transA && attributes.transB) { - line = 'value += a[k * M + m] * b[n * K + k];'; - } else if (attributes.transA && !attributes.transB) { - line = 'value += a[k * M + m] * b[k * N + n];'; - } else if (!attributes.transA && attributes.transB) { - line = 'value += a[m * K + k] * b[n * K + k];'; - } else if (!attributes.transA && !attributes.transB) { - line = 'value += a[m * K + k] * b[k * N + n];'; - } - - const dataType = 'f32'; // TODO: support other data type - const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; - const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var a : array<${dataType}>;`, - `@group(0) @binding(1) var b : array<${dataType}>;` - ]; - if (inputs.length === 3) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); - } - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha = ${dataType}(${attributes.alpha}); - const beta = ${dataType}(${attributes.beta}); - - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let m = global_id.x / N; - let n = global_id.x % N; - - var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { - ${line} - } - - ${calculateAlpha} - ${calculateC} - output[global_id.x] = value; - - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => { - if (!inputs) { - throw new Error('Input is missing'); - } - if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) { - throw new Error('Invaid input shape.'); - } - if (!attributes.isOptionalC && inputs.length !== 3) { - throw new Error('Gemm requires 3 inputs'); - } - - // 'C' can be of dimensionality 1 or 2 only - if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) { - throw new Error('Invalid input shape of C'); - } - - if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || - (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') || - (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) { - throw new Error('Invalid input type.'); - } - - if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) { - throw new Error('Input types are mismatched'); - } -}; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {Graph} from '../../../graph'; +// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {GemmUtil, ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} 
from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +// import {WORKGROUP_SIZE} from './common'; + +// export interface GemmAttributes extends AttributeWithCacheKey { +// transA: boolean; +// transB: boolean; +// alpha: number; +// beta: number; +// isOptionalC: boolean; // in opset 11, C becomes optional +// } + +// export const gemm: OperatorAsyncImplementation = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise => { +// validateInputs(inputs, attributes); +// return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs); +// }; + +// const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => { +// const transA = node.attributes.getInt('transA', 0) !== 0; +// const transB = node.attributes.getInt('transB', 0) !== 0; +// const alpha = node.attributes.getFloat('alpha', 1.0); +// const beta = node.attributes.getFloat('beta', 1.0); +// return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC}); +// }; + +// export const parseGemmAttributesV7: OperatorInitialization = (node: Graph.Node): GemmAttributes => +// parseGemmAttributes(node, false); + +// export const parseGemmAttributesV11: OperatorInitialization = (node: Graph.Node): GemmAttributes => +// parseGemmAttributes(node, true); + +// const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => { +// const metadata = { +// name: 'Gemm', +// inputTypes: inputs.length === 3 ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : +// [GpuDataType.default, GpuDataType.default], +// cacheHint: attributes.cacheKey +// }; + +// return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)}; +// }; + +// const offsetC = (m: number, n: number, dims: readonly number[]): string => { +// const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); +// const broadcastN = dims[dims.length - 1] !== n; + +// let offset = '0u'; +// if (!broadcastM) { +// offset += `+ m * ${dims[dims.length - 1]}u`; +// } +// if (!broadcastN) { +// offset += '+n'; +// } + +// return offset; +// }; + +// const createGemmProgramInfo = +// (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => { +// const aShape = inputs[0].dims.slice(); +// const bShape = inputs[1].dims.slice(); +// const [M, N, K] = GemmUtil.getShapeOfGemmResult( +// aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? inputs[2].dims : undefined); +// const outputShape = [M, N]; +// if (!outputShape) { +// throw new Error('Can\'t use gemm on the given tensors'); +// } +// const outputSize = ShapeUtil.size(outputShape); +// let line = ''; +// if (attributes.transA && attributes.transB) { +// line = 'value += a[k * M + m] * b[n * K + k];'; +// } else if (attributes.transA && !attributes.transB) { +// line = 'value += a[k * M + m] * b[k * N + n];'; +// } else if (!attributes.transA && attributes.transB) { +// line = 'value += a[m * K + k] * b[n * K + k];'; +// } else if (!attributes.transA && !attributes.transB) { +// line = 'value += a[m * K + k] * b[k * N + n];'; +// } + +// const dataType = 'f32'; // TODO: support other data type +// const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; +// const calculateC = inputs.length === 3 ? 
`value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; +// const inputStorageBuffersDeclarations = [ +// `@group(0) @binding(0) var a : array<${dataType}>;`, +// `@group(0) @binding(1) var b : array<${dataType}>;` +// ]; +// if (inputs.length === 3) { +// inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); +// } +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// const M: u32 = ${M}u; +// const N: u32 = ${N}u; +// const K: u32 = ${K}u; +// const alpha = ${dataType}(${attributes.alpha}); +// const beta = ${dataType}(${attributes.beta}); + +// ${inputStorageBuffersDeclarations.join('\n')} +// @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// let m = global_id.x / N; +// let n = global_id.x % N; + +// var value = ${dataType}(0); +// for (var k: u32 = 0u; k<${K}u; k++) { +// ${line} +// } + +// ${calculateAlpha} +// ${calculateC} +// output[global_id.x] = value; + +// }`; +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => { +// if (!inputs) { +// throw new Error('Input is missing'); +// } +// if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) { +// throw new Error('Invalid input shape.'); +// } +// if (!attributes.isOptionalC && inputs.length !== 3) { +// throw new Error('Gemm requires 3 inputs'); +// } + +// // 'C' can be of dimensionality 1 or 2 only +// if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) { +// throw new Error('Invalid input shape of C'); +// } + +// if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || +// (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') || +// (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) { +// throw new Error('Invalid input type.'); +// } + +// if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) { +// throw new Error('Input types are mismatched'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index 5b8f0bf94733e..d6f63820eff04 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -1,115 +1,115 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
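
Note on the MatMul hunk that follows: the disabled shader computed a batched matrix product, one output element per invocation, with an optional fused activation appended via fuse-utils. As a reference for what the WGSL template generated, below is a minimal CPU sketch of the same indexing under stated assumptions: matMulReference is an illustrative name, not part of this patch, and both inputs are assumed to share the same batch ("stack") count, since the shader's TODO marks broadcasting as unsupported.

// A minimal CPU sketch (illustrative only) of the indexing used by the
// disabled MatMul shader: each invocation mapped a flat output offset to
// (stack, m, n) and accumulated over the shared dimension K.
const matMulReference =
    (a: Float32Array, b: Float32Array, stacks: number, M: number, K: number, N: number): Float32Array => {
      const output = new Float32Array(stacks * M * N);
      for (let x = 0; x < output.length; x++) {
        const stack = Math.floor(x / (M * N));
        const mn = x % (M * N);
        const m = Math.floor(mn / N);
        const n = mn % N;
        const offsetA = stack * M * K;  // mirrors `let offsetA = stack * (M * K);`
        const offsetB = stack * K * N;  // mirrors `let offsetB = stack * (K * N);`
        let value = 0;
        for (let k = 0; k < K; k++) {
          value += a[offsetA + m * K + k] * b[offsetB + k * N + n];
        }
        output[x] = value;  // the shader applied the fused activation at this point
      }
      return output;
    };

Read side by side with the commented-out shader, this makes the flat-offset decomposition into (stack, m, n) straightforward to verify.
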
-import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {BroadcastUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; -import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -export const matMul: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes): - Promise => { - validateInputs(inputs); - - return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs); - }; - -export const parseMatMulAttributes: OperatorInitialization = - (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes); - -const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ - name: 'MatMul', - inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint -}); - -function createMatmulProgramInfo( - metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo { - const aShape = inputs[0].dims; - const bShape = inputs[1].dims; - const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); - if (!outputShape) { - throw new Error('Can\'t use matmul on the given tensors'); - } - const outputSize = ShapeUtil.size(outputShape); - // TODO: support broadcasting - - const dataType = 'f32'; // TODO: support other data type - const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); - - const M = outputShape[outputShape.length - 2]; - const K = aShape[aShape.length - 1]; - const N = outputShape[outputShape.length - 1]; - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - - @group(0) @binding(0) var a : array<${dataType}>; - @group(0) @binding(1) var b : array<${dataType}>; - @group(0) @binding(2) var output : array<${dataType}>; - - ${activationFunction} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let stack = global_id.x / (M * N); - let mn = global_id.x % (M * N); - let n = global_id.x % N; - let m = mn / N; - - let offsetA = stack * (M * K); - let offsetB = stack * (K * N); - - var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { - value += a[offsetA + m * K + k] * b[offsetB + k * N + n]; - } - ${applyActivation} - output[global_id.x] = value; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; -} - -export function createMatmulProgramInfoLoader( - inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader { - const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); - return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; -} - -const validateInputs = (inputs: 
Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('MatMul requires 2 inputs.'); - } - - if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { - throw new Error('shared dimension does not match.'); - } - - if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || - (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) { - throw new Error('inputs should be float type'); - } - - if (inputs[0].type !== inputs[1].type) { - throw new Error('inputs types should match'); - } -}; +// import {Graph} from '../../../graph'; +// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {BroadcastUtil, ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +// import {WORKGROUP_SIZE} from './common'; +// import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +// export const matMul: OperatorAsyncImplementation = +// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes): +// Promise => { +// validateInputs(inputs); + +// return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs); +// }; + +// export const parseMatMulAttributes: OperatorInitialization = +// (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes); + +// const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ +// name: 'MatMul', +// inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : +// [GpuDataType.default, GpuDataType.default], +// cacheHint +// }); + +// function createMatmulProgramInfo( +// metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo { +// const aShape = inputs[0].dims; +// const bShape = inputs[1].dims; +// const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); +// if (!outputShape) { +// throw new Error('Can\'t use matmul on the given tensors'); +// } +// const outputSize = ShapeUtil.size(outputShape); +// // TODO: support broadcasting + +// const dataType = 'f32'; // TODO: support other data type +// const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); + +// const M = outputShape[outputShape.length - 2]; +// const K = aShape[aShape.length - 1]; +// const N = outputShape[outputShape.length - 1]; +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// const M: u32 = ${M}u; +// const N: u32 = ${N}u; +// const K: u32 = ${K}u; + +// @group(0) @binding(0) var a : array<${dataType}>; +// @group(0) @binding(1) var b : array<${dataType}>; +// @group(0) @binding(2) var output : array<${dataType}>; + +// ${activationFunction} + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// let stack = global_id.x / (M * N); +// let mn = global_id.x % (M * N); +// let n = global_id.x % N; +// let m = mn / N; + +// let offsetA = stack * (M * K); +// let offsetB = stack * (K * N); + +// var value = ${dataType}(0); +// for (var k: u32 = 0u; k<${K}u; k++) { +// value += a[offsetA + m * K + k] * 
b[offsetB + k * N + n]; +// } +// ${applyActivation} +// output[global_id.x] = value; +// }`; +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// } + +// export function createMatmulProgramInfoLoader( +// inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader { +// const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); +// return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; +// } + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 2) { +// throw new Error('MatMul requires 2 inputs.'); +// } + +// if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { +// throw new Error('shared dimension does not match.'); +// } + +// if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || +// (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) { +// throw new Error('inputs should be float type'); +// } + +// if (inputs[0].type !== inputs[1].type) { +// throw new Error('inputs types should match'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 0e92ff8cb906a..801064aef12d1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -1,376 +1,378 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {PoolConvUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -export interface AveragePoolAttributes extends AttributeWithCacheKey { - readonly autoPad: string; - readonly ceilMode: number; - readonly countIncludePad: boolean; - readonly kernelShape: readonly number[]; - readonly strides: readonly number[]; - readonly pads: readonly number[]; -} - -export const averagePool: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): - Promise => { - validateInputs(inputs); - const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return inferenceHandler.run( - {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs); - }; - -export const parseAveragePoolAttributes: OperatorInitialization = - (node: Graph.Node): AveragePoolAttributes => { - const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); - const ceilMode = node.attributes.getInt('ceil_mode', 0); - const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? 
false : true); - const kernelShape = node.attributes.getInts('kernel_shape'); - const strides = node.attributes.getInts('strides', []); - const pads = node.attributes.getInts('pads', []); - - // TODO: support attribute 'ceil_mode' - if (ceilMode !== 0) { - throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); - } - - return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads}); - }; - -const createAveragePoolProgramInfo = - (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, - attributes: AveragePoolAttributes): ProgramInfo => { - const [adjustedAttributes, outputShape] = - getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); - const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); - - const dataType = 'f32'; - - const op1 = 'value += x_val;'; - let op2 = ''; - if (adjustedAttributes.countIncludePad) { - op2 += `value /= ${dataType}(${kernelSize});`; - } else { - op2 += `value /= ${dataType}(${kernelSize} - pad);`; - } - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'), - dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) - }; - }; - -export const globalAveragePool: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): - Promise => { - validateInputs(inputs); - const metadata = { - name: 'GlobalAveragePool', - inputTypes: [GpuDataType.default], - cacheHint: `${attributes.countIncludePad}` - }; - return inferenceHandler.run( - {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs); - }; - -export const parseGlobalAveragePoolAttributes: OperatorInitialization = - (node: Graph.Node): AveragePoolAttributes => { - const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? 
false : true); - return createAttributeWithCacheKey( - {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []}); - }; - -export interface MaxPoolAttributes extends AveragePoolAttributes { - readonly storageOrder: number; - readonly dilations: number[]; -} - -export const maxPool: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise => { - validateInputs(inputs); - const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return inferenceHandler.run( - {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs); -}; - -export const parseMaxPoolAttributes: OperatorInitialization = - (node: Graph.Node): MaxPoolAttributes => { - const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); - const ceilMode = node.attributes.getInt('ceil_mode', 0); - const kernelShape = node.attributes.getInts('kernel_shape'); - const strides = node.attributes.getInts('strides', []); - const pads = node.attributes.getInts('pads', []); - const storageOrder = node.attributes.getInt('storage_order', 0); - const dilations = node.attributes.getInts('dilations', []); - - // TODO: support attribute 'ceil_mode' and 'storage_order' - if (storageOrder !== 0) { - throw new Error('column major storage order is not yet supported for MaxPool'); - } - if (ceilMode !== 0) { - throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); - } - - return createAttributeWithCacheKey( - {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations}); - }; - -const createMaxPoolProgramInfo = - (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes): - ProgramInfo => { - const [adjustedAttributes, outputShape] = - getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); - const op1 = ` - value = max(x_val, value); - `; - const op2 = ''; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), - dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) - }; - }; - -const getAdjustedPoolAttributesAndOutputShape = - (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean): - [AveragePoolAttributes|MaxPoolAttributes, number[]] => { - const inputShape = inputs[0].dims.slice(); - const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); - const kernelShape = attributes.kernelShape.slice(); - const strides = attributes.strides.slice(); - const dilations: number[] = hasDilations ? 
(attributes as MaxPoolAttributes).dilations.slice() : []; - const pads = attributes.pads.slice(); - PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); - - const outputShape = PoolConvUtil.computePoolOutputShape( - isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); - - const newAttributes = Object.assign({}, attributes); - if (hasDilations) { - Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); - } else { - Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); - } - return [newAttributes, outputShape]; - }; - -const globalMaxPoolAttributes = { - autoPad: '', - ceilMode: 0, - countIncludePad: false, - kernelShape: [], - strides: [], - pads: [], - storageOrder: 0, - dilations: [], - cacheKey: '' -}; - -const globalMaxPoolMetadata = { - name: 'GlobalMaxPool', - inputTypes: [GpuDataType.default] -}; - -export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...globalMaxPoolMetadata, - get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes) - }, - inputs); -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Pool ops requires 1 input.'); - } - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('Invalid input type.'); - } -}; - -const generatePoolingCode = - (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string, - op2: string, dataType: string, start: string): string => { - const rank = inputDims.length; - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const xIndicesHelper = createIndicesHelper('x', inputDims); - - if (attributes.kernelShape.length <= 2) { - const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; - const sw = attributes.strides[attributes.strides.length - 1]; - const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; - const pwEnd = attributes.pads[attributes.pads.length - 1]; - const dimW = inputDims[rank - 1]; - let codeW = ''; - let codeH = ''; - let codeHEnd = ''; - if (pwStart + pwEnd !== 0) { - codeW = ` - for (var i: u32 = 0u; i < ${kw}u; i++) { - xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; - if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { - pad++; - continue; - } - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } else { - codeW = ` - for (var i: u32 = 0u; i < ${kw}u; i++) { - xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } - - if (attributes.kernelShape.length === 2) { - const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; - const sh = attributes.strides[attributes.strides.length - 2]; - const phStart = attributes.pads[attributes.pads.length / 2 - 2]; - const phEnd = attributes.pads[attributes.pads.length - 2]; - const dimH = inputDims[rank - 2]; - if (phStart + phEnd !== 0) { - codeH = ` - for (var j: u32 = 0u; j < ${kh}u; j++) { - xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; - if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { - pad+= ${kw}; - 
continue; - } - `; - } else { - codeH = ` - for (var j: u32 = 0u; j < ${kh}u; j++) { - xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; - `; - } - codeHEnd = ` - } - `; - } - - const poolingCode = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - - var value: ${dataType} = ${dataType}(${start}); - var pad = 0; - ${codeH} - ${codeW} - ${codeHEnd} - ${op2} - - output[global_id.x] = value; - }`; - return poolingCode; - } else { - const kernelSize = ShapeUtil.size(attributes.kernelShape); - const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); - const stridesRank = kernelStrides.length; - const padsRank = attributes.pads.length; - const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); - let padCode = ''; - if (hasPads) { - padCode = ` - if (xIndices[j] >= inputDims[j]) { - pad++; - isPad = true; - break; - } - } - if (!isPad) { - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } else { - padCode = ` - } - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - `; - } - const poolingCode = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - - const pads = array(${attributes.pads.map(i => `${i}u`).join(',')}); - const inputDims = array(${inputDims.map(i => `${i}u`).join(',')}); - const kernelStrides = array(${kernelStrides.map(i => `${i}u`).join(',')}); - const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - - var offsets: array; - - var value = ${dataType}(${start}); - var pad = 0; - var isPad = false; - - for (var i: u32 = 0u; i < ${kernelSize}u; i++) { - var offset = i; - for (var j = 0u; j < ${stridesRank - 1}u; j++) { - offsets[j] = offset / kernelStrides[j]; - offset -= offsets[j] * kernelStrides[j]; - } - offsets[${stridesRank - 1}] = offset; - - isPad = false; - for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { - xIndices[j] = indices[j] * strides[j - ${rank - stridesRank}u] - + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; - ${padCode} - } - ${op2} - - output[global_id.x] = value; - }`; - return poolingCode; - } - }; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {Graph} from '../../../graph'; +// import {OperatorAsyncImplementation, 
OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {PoolConvUtil, ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; + +// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +// export interface AveragePoolAttributes extends AttributeWithCacheKey { +// readonly autoPad: string; +// readonly ceilMode: number; +// readonly countIncludePad: boolean; +// readonly kernelShape: readonly number[]; +// readonly strides: readonly number[]; +// readonly pads: readonly number[]; +// } + +// export const averagePool: OperatorAsyncImplementation = +// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): +// Promise => { +// validateInputs(inputs); +// const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; +// return inferenceHandler.run( +// {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs); +// }; + +// export const parseAveragePoolAttributes: OperatorInitialization = +// (node: Graph.Node): AveragePoolAttributes => { +// const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); +// const ceilMode = node.attributes.getInt('ceil_mode', 0); +// const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); +// const kernelShape = node.attributes.getInts('kernel_shape'); +// const strides = node.attributes.getInts('strides', []); +// const pads = node.attributes.getInts('pads', []); + +// // TODO: support attribute 'ceil_mode' +// if (ceilMode !== 0) { +// throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); +// } + +// return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads}); +// }; + +// const createAveragePoolProgramInfo = +// (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, +// attributes: AveragePoolAttributes): ProgramInfo => { +// const [adjustedAttributes, outputShape] = +// getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); +// const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); + +// const dataType = 'f32'; + +// const op1 = 'value += x_val;'; +// let op2 = ''; +// if (adjustedAttributes.countIncludePad) { +// op2 += `value /= ${dataType}(${kernelSize});`; +// } else { +// op2 += `value /= ${dataType}(${kernelSize} - pad);`; +// } +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, +// '0.0'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) +// }; +// }; + +// export const globalAveragePool: OperatorAsyncImplementation = +// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): +// Promise => { +// validateInputs(inputs); +// const metadata = { +// name: 'GlobalAveragePool', +// inputTypes: [GpuDataType.default], +// cacheHint: `${attributes.countIncludePad}` +// }; +// return inferenceHandler.run( +// {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs); +// }; + +// export const parseGlobalAveragePoolAttributes: OperatorInitialization = +// (node: Graph.Node): 
AveragePoolAttributes => { +// const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); +// return createAttributeWithCacheKey( +// {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []}); +// }; + +// export interface MaxPoolAttributes extends AveragePoolAttributes { +// readonly storageOrder: number; +// readonly dilations: number[]; +// } + +// export const maxPool: OperatorAsyncImplementation = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise +// => { +// validateInputs(inputs); +// const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; +// return inferenceHandler.run( +// {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs); +// }; + +// export const parseMaxPoolAttributes: OperatorInitialization = +// (node: Graph.Node): MaxPoolAttributes => { +// const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); +// const ceilMode = node.attributes.getInt('ceil_mode', 0); +// const kernelShape = node.attributes.getInts('kernel_shape'); +// const strides = node.attributes.getInts('strides', []); +// const pads = node.attributes.getInts('pads', []); +// const storageOrder = node.attributes.getInt('storage_order', 0); +// const dilations = node.attributes.getInts('dilations', []); + +// // TODO: support attribute 'ceil_mode' and 'storage_order' +// if (storageOrder !== 0) { +// throw new Error('column major storage order is not yet supported for MaxPool'); +// } +// if (ceilMode !== 0) { +// throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); +// } + +// return createAttributeWithCacheKey( +// {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations}); +// }; + +// const createMaxPoolProgramInfo = +// (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes): +// ProgramInfo => { +// const [adjustedAttributes, outputShape] = +// getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); +// const op1 = ` +// value = max(x_val, value); +// `; +// const op2 = ''; +// return { +// ...metadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', +// '-1e5'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) +// }; +// }; + +// const getAdjustedPoolAttributesAndOutputShape = +// (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean): +// [AveragePoolAttributes|MaxPoolAttributes, number[]] => { +// const inputShape = inputs[0].dims.slice(); +// const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); +// const kernelShape = attributes.kernelShape.slice(); +// const strides = attributes.strides.slice(); +// const dilations: number[] = hasDilations ? 
(attributes as MaxPoolAttributes).dilations.slice() : []; +// const pads = attributes.pads.slice(); +// PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); + +// const outputShape = PoolConvUtil.computePoolOutputShape( +// isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); + +// const newAttributes = Object.assign({}, attributes); +// if (hasDilations) { +// Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); +// } else { +// Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); +// } +// return [newAttributes, outputShape]; +// }; + +// const globalMaxPoolAttributes = { +// autoPad: '', +// ceilMode: 0, +// countIncludePad: false, +// kernelShape: [], +// strides: [], +// pads: [], +// storageOrder: 0, +// dilations: [], +// cacheKey: '' +// }; + +// const globalMaxPoolMetadata = { +// name: 'GlobalMaxPool', +// inputTypes: [GpuDataType.default] +// }; + +// export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise +// => { +// validateInputs(inputs); +// return inferenceHandler.run( +// { +// ...globalMaxPoolMetadata, +// get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes) +// }, +// inputs); +// }; + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 1) { +// throw new Error('Pool ops requires 1 input.'); +// } +// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { +// throw new Error('Invalid input type.'); +// } +// }; + +// const generatePoolingCode = +// (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string, +// op2: string, dataType: string, start: string): string => { +// const rank = inputDims.length; +// const outputSize = ShapeUtil.size(outputShape); +// const outputIndicesHelper = createIndicesHelper('output', outputShape); +// const xIndicesHelper = createIndicesHelper('x', inputDims); + +// if (attributes.kernelShape.length <= 2) { +// const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; +// const sw = attributes.strides[attributes.strides.length - 1]; +// const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; +// const pwEnd = attributes.pads[attributes.pads.length - 1]; +// const dimW = inputDims[rank - 1]; +// let codeW = ''; +// let codeH = ''; +// let codeHEnd = ''; +// if (pwStart + pwEnd !== 0) { +// codeW = ` +// for (var i: u32 = 0u; i < ${kw}u; i++) { +// xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; +// if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { +// pad++; +// continue; +// } +// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; +// ${op1} +// }`; +// } else { +// codeW = ` +// for (var i: u32 = 0u; i < ${kw}u; i++) { +// xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; +// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; +// ${op1} +// }`; +// } + +// if (attributes.kernelShape.length === 2) { +// const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; +// const sh = attributes.strides[attributes.strides.length - 2]; +// const phStart = attributes.pads[attributes.pads.length / 2 - 2]; +// const phEnd = attributes.pads[attributes.pads.length - 2]; +// const dimH = inputDims[rank - 2]; +// if (phStart + phEnd !== 0) { +// codeH = ` +// for (var j: u32 = 
0u; j < ${kh}u; j++) { +// xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; +// if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { +// pad+= ${kw}; +// continue; +// } +// `; +// } else { +// codeH = ` +// for (var j: u32 = 0u; j < ${kh}u; j++) { +// xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; +// `; +// } +// codeHEnd = ` +// } +// `; +// } + +// const poolingCode = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// @group(0) @binding(0) var x : array<${dataType}>; +// @group(0) @binding(1) var output : array<${dataType}>; + +// ${outputIndicesHelper.o2iImpl} +// ${xIndicesHelper.i2oImpl} + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// ${outputIndicesHelper.indicesVariableDeclaration('indices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} +// ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + +// var value: ${dataType} = ${dataType}(${start}); +// var pad = 0; +// ${codeH} +// ${codeW} +// ${codeHEnd} +// ${op2} + +// output[global_id.x] = value; +// }`; +// return poolingCode; +// } else { +// const kernelSize = ShapeUtil.size(attributes.kernelShape); +// const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); +// const stridesRank = kernelStrides.length; +// const padsRank = attributes.pads.length; +// const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); +// let padCode = ''; +// if (hasPads) { +// padCode = ` +// if (xIndices[j] >= inputDims[j]) { +// pad++; +// isPad = true; +// break; +// } +// } +// if (!isPad) { +// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; +// ${op1} +// }`; +// } else { +// padCode = ` +// } +// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; +// ${op1} +// `; +// } +// const poolingCode = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// @group(0) @binding(0) var x : array<${dataType}>; +// @group(0) @binding(1) var output : array<${dataType}>; + +// ${outputIndicesHelper.o2iImpl} +// ${xIndicesHelper.i2oImpl} + +// const pads = array(${attributes.pads.map(i => `${i}u`).join(',')}); +// const inputDims = array(${inputDims.map(i => `${i}u`).join(',')}); +// const kernelStrides = array(${kernelStrides.map(i => `${i}u`).join(',')}); +// const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// ${outputIndicesHelper.indicesVariableDeclaration('indices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} +// ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + +// var offsets: array; + +// var value = ${dataType}(${start}); +// var pad = 0; +// var isPad = false; + +// for (var i: u32 = 0u; i < ${kernelSize}u; i++) { +// var offset = i; +// for (var j = 0u; j < ${stridesRank - 1}u; j++) { +// offsets[j] = offset / kernelStrides[j]; +// offset -= offsets[j] * kernelStrides[j]; +// } +// offsets[${stridesRank - 1}] = offset; + +// isPad = false; +// for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { +// xIndices[j] = indices[j] * 
strides[j - ${rank - stridesRank}u] +// + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; +// ${padCode} +// } +// ${op2} + +// output[global_id.x] = value; +// }`; +// return poolingCode; +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts index 763a656d92abb..48c98766a1ee3 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts @@ -1,85 +1,87 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - - const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)}; - - return inferenceHandler.run( - {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, inputs); -}; - -const createSumProgramInfo = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo => { - const dataType = 'f32'; - const outputShape = inputs[0].dims; - const outputSize = ShapeUtil.size(outputShape); - - - const inputsDeclaration = - inputs.map((_, i) => `@group(0) @binding(${i}) var input${i} : array<${dataType}>;`); - const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+'); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - ${inputsDeclaration.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let offset = global_id.x; - - var value = ${dataType}(0); - value = ${sumLine}; - - output[offset] = value; - }`; - return { - ...sumProgramMetadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length === 0) { - throw new Error('Sum requires inputs.'); - } - - const length = inputs[0].dims.length; - for (let i = 1; i < inputs.length; i++) { - if (length !== inputs[i].dims.length) { - throw new Error('Input shapes are mismatched. broadcasting not supported yet'); - } - - for (let j = 0; j < length; j++) { - if (inputs[0].dims[j] !== inputs[i].dims[j]) { - throw new Error('Input shapes are not matched. 
broadcasting not supported yet'); - } - } - } - - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('Invalid input type.'); - } - for (let i = 1; i < inputs.length; i++) { - if (inputs[0].type !== inputs[i].type) { - throw new Error('Input types are not matched.'); - } - } -}; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; + +// import {WORKGROUP_SIZE} from './common'; + +// export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { +// validateInputs(inputs); + +// const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)}; + +// return inferenceHandler.run( +// {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, +// inputs); +// }; + +// const createSumProgramInfo = +// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo +// => { +// const dataType = 'f32'; +// const outputShape = inputs[0].dims; +// const outputSize = ShapeUtil.size(outputShape); + + +// const inputsDeclaration = +// inputs.map((_, i) => `@group(0) @binding(${i}) var input${i} : array<${dataType}>;`); +// const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+'); +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + +// ${inputsDeclaration.join('\n')} +// @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// let offset = global_id.x; + +// var value = ${dataType}(0); +// value = ${sumLine}; + +// output[offset] = value; +// }`; +// return { +// ...sumProgramMetadata, +// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length === 0) { +// throw new Error('Sum requires inputs.'); +// } + +// const length = inputs[0].dims.length; +// for (let i = 1; i < inputs.length; i++) { +// if (length !== inputs[i].dims.length) { +// throw new Error('Input shapes are mismatched. broadcasting not supported yet'); +// } + +// for (let j = 0; j < length; j++) { +// if (inputs[0].dims[j] !== inputs[i].dims[j]) { +// throw new Error('Input shapes are not matched. broadcasting not supported yet'); +// } +// } +// } + +// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { +// throw new Error('Invalid input type.'); +// } +// for (let i = 1; i < inputs.length; i++) { +// if (inputs[0].type !== inputs[i].type) { +// throw new Error('Input types are not matched.'); +// } +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts b/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts index 323e80bdb596a..0227ce5ae28eb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts @@ -1,22 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
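// For orientation: the commented-out Sum generator above splices its template
// strings into a single WGSL compute shader. A hand-expanded sketch for two
// f32 inputs of 4 elements (assuming WORKGROUP_SIZE = 64; the storage address
// spaces and the vec3<u32> builtin type are written out here so the sketch is
// valid WGSL, and are not literally present in the template above):
const sumShaderSketch = `
  const WORKGROUP_SIZE: u32 = 64u;

  @group(0) @binding(0) var<storage, read> input0 : array<f32>;
  @group(0) @binding(1) var<storage, read> input1 : array<f32>;
  @group(0) @binding(2) var<storage, read_write> output : array<f32>;

  @compute @workgroup_size(WORKGROUP_SIZE)
  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
    // Guard against out-of-bounds work group sizes
    if (global_id.x >= 4u) {
      return;
    }
    let offset = global_id.x;
    var value = f32(0);
    value = input0[offset]+input1[offset];
    output[offset] = value;
  }`;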
-import {Tensor} from '../../../tensor';
-import {ShapeUtil} from '../../../util';
-import {WebGpuInferenceHandler} from '../inference-handler';
+// import {Tensor} from '../../../tensor';
+// import {ShapeUtil} from '../../../util';
+// import {WebGpuInferenceHandler} from '../inference-handler';
 
-export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
-  validateInputs(inputs);
-  const shape = await inputs[1].getData();
-  const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array);
-  return [handler.reshape(inputs[0], reshapedDims)];
-};
+// export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
+//   validateInputs(inputs);
+//   const shape = await inputs[1].getData();
+//   const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array);
+//   return [handler.reshape(inputs[0], reshapedDims)];
+// };
 
-const validateInputs = (inputs: Tensor[]): void => {
-  if (!inputs || inputs.length !== 2) {
-    throw new Error('Reshape requires 2 inputs.');
-  }
-  if (inputs[1].type !== 'int32') {
-    throw new Error('Invalid input type.');
-  }
-};
+// const validateInputs = (inputs: Tensor[]): void => {
+//   if (!inputs || inputs.length !== 2) {
+//     throw new Error('Reshape requires 2 inputs.');
+//   }
+//   if (inputs[1].type !== 'int32') {
+//     throw new Error('Invalid input type.');
+//   }
+// };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/shape.ts b/js/web/lib/wasm/jsep/webgpu/ops/shape.ts
deleted file mode 100644
index 94ba9293c457a..0000000000000
--- a/js/web/lib/wasm/jsep/webgpu/ops/shape.ts
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-import {Tensor} from '../../../tensor';
-import {WebGpuInferenceHandler} from '../inference-handler';
-
-export const shape = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> => {
-  validateInputs(inputs);
-  return [new Tensor([inputs[0].dims.length], 'int32', undefined, undefined, new Int32Array(inputs[0].dims))];
-};
-
-const validateInputs = (inputs: Tensor[]): void => {
-  if (!inputs || inputs.length !== 1) {
-    throw new Error('Shape requires 1 input.');
-  }
-};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts
index fd5d6e2d2299e..c35bf970c5675 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts
@@ -1,180 +1,180 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
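// The Slice hunk below comments out two small code generators,
// offsetToIndices and indicesToOffset, which turn row-major strides into
// plain WGSL statements. A minimal standalone sketch of the forward
// direction and what it emits for strides [12, 4, 1] (names here are
// illustrative, not part of the patch):
const offsetToIndicesSketch = (offset: string, strides: readonly number[], indicesPrefix: string): string => {
  const lines: string[] = [];
  for (let i = 0; i < strides.length - 1; i++) {
    lines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`);  // integer divide gives index i
    lines.push(`${offset}%=${strides[i]}u;`);                          // keep the remainder for inner axes
  }
  lines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`);  // innermost axis: the remainder is the index
  return lines.join('\n');
};
// offsetToIndicesSketch('offset', [12, 4, 1], 'idx_') emits:
//   var idx_0=offset/12u;
//   offset%=12u;
//   var idx_1=offset/4u;
//   offset%=4u;
//   var idx_2=offset;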
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export interface SliceAttributes extends AttributeWithCacheKey { - readonly axes: number[]; - readonly ends: number[]; - readonly starts: number[]; -} - -const sliceProgramMetadata = { - name: 'Slice', - inputTypes: [GpuDataType.default] -}; - -export const slice: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...sliceProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createSliceProgramInfo(inputs[0], attributes) - }, - inputs); -}; - -export const parseSliceAttributes: OperatorInitialization = (node: Graph.Node): SliceAttributes => { - const starts = node.attributes.getInts('starts'); - const ends = node.attributes.getInts('ends'); - const axes = node.attributes.getInts('axes', []); - return createAttributeWithCacheKey({starts, ends, axes}); -}; - -const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => { - const outputLines: string[] = []; - - for (let i = 0; i < strides.length - 1; i++) { - outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`); - outputLines.push(`${offset}%=${strides[i]}u;`); - } - outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`); - - return outputLines.join('\n'); -}; - -const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => { - const outputLines: string[] = []; - - for (let i = 0; i < strides.length - 1; i++) { - outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`); - } - outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`); - - return outputLines.join('\n'); -}; - -const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => { - const axes = (attributes.axes.length === 0) ? 
input.dims.slice(0).map((val, i) => i) : attributes.axes; - const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length); - const starts = attributes.starts.map((start, i) => { - if (start > input.dims[normalizedAxes[i]] - 1) { - return input.dims[normalizedAxes[i]]; - } - return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]); - }); - const ends = attributes.ends.map((end, i) => { - if (end > input.dims[normalizedAxes[i]] - 1) { - return input.dims[normalizedAxes[i]]; - } - return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]); - }); - - const outputShape = input.dims.slice(); - - const sliceOps: string[] = []; - for (let i = 0; i < normalizedAxes.length; i++) { - outputShape[normalizedAxes[i]] = ends[i] - starts[i]; - if (starts[i] > 0) { - sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`); - } // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); } - } - - const outputSize = ShapeUtil.size(outputShape); - const outputStrides = ShapeUtil.computeStrides(outputShape); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var input : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - var offset = global_id.x; - ${offsetToIndices('offset', outputStrides, 'idx_')} - ${sliceOps.join('')} - var offsetInput = 0u; - ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')} - output[global_id.x] = input[offsetInput]; - }`; - return { - ...sliceProgramMetadata, - outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Slice requires 1 input.'); - } - if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { - throw new Error('Invalid input type.'); - } -}; - -export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputsV10(inputs); - const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs); - return inferenceHandler.run( - { - ...sliceProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createSliceProgramInfo(inputs[0], attributes) - }, - [inputs[0]]); -}; - -const generateSliceAttributesFromInputs = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => { - if (!inferenceHandler.session.isInitializer(inputs[1].dataId) || - !inferenceHandler.session.isInitializer(inputs[2].dataId) || - (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) || - (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) { - throw new Error('dynamic slice attributes are not allowed'); - } - - if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) { - throw new Error('currently non-1 steps is not supported for Slice'); - } - - const starts = Array.from(inputs[1].integerData); - const ends = Array.from(inputs[2].integerData); - const axes = inputs.length >= 4 ? 
Array.from(inputs[3].integerData) : []; - const cacheKey = `${axes};${starts};${ends}`; - return {starts, ends, axes, cacheKey}; - }; - -const validateInputsV10 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length < 3 || inputs.length > 5) { - throw new Error('Invalid input number.'); - } - if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) { - throw new Error('Invalid input type.'); - } - if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) { - throw new Error('Invalid input type.'); - } - if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) { - throw new Error('Invalid input type.'); - } - if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) { - throw new Error('Invalid input type.'); - } -}; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {Graph} from '../../../graph'; +// import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo} from '../types'; + +// import {WORKGROUP_SIZE} from './common'; + +// export interface SliceAttributes extends AttributeWithCacheKey { +// readonly axes: number[]; +// readonly ends: number[]; +// readonly starts: number[]; +// } + +// const sliceProgramMetadata = { +// name: 'Slice', +// inputTypes: [GpuDataType.default] +// }; + +// export const slice: OperatorAsyncImplementation = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise => { +// validateInputs(inputs); +// return inferenceHandler.run( +// { +// ...sliceProgramMetadata, +// cacheHint: attributes.cacheKey, +// get: () => createSliceProgramInfo(inputs[0], attributes) +// }, +// inputs); +// }; + +// export const parseSliceAttributes: OperatorInitialization = (node: Graph.Node): SliceAttributes => { +// const starts = node.attributes.getInts('starts'); +// const ends = node.attributes.getInts('ends'); +// const axes = node.attributes.getInts('axes', []); +// return createAttributeWithCacheKey({starts, ends, axes}); +// }; + +// const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => { +// const outputLines: string[] = []; + +// for (let i = 0; i < strides.length - 1; i++) { +// outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`); +// outputLines.push(`${offset}%=${strides[i]}u;`); +// } +// outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`); + +// return outputLines.join('\n'); +// }; + +// const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => { +// const outputLines: string[] = []; + +// for (let i = 0; i < strides.length - 1; i++) { +// outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`); +// } +// outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`); + +// return outputLines.join('\n'); +// }; + +// const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => { +// const axes = (attributes.axes.length === 0) ? 
input.dims.slice(0).map((val, i) => i) : attributes.axes; +// const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length); +// const starts = attributes.starts.map((start, i) => { +// if (start > input.dims[normalizedAxes[i]] - 1) { +// return input.dims[normalizedAxes[i]]; +// } +// return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]); +// }); +// const ends = attributes.ends.map((end, i) => { +// if (end > input.dims[normalizedAxes[i]] - 1) { +// return input.dims[normalizedAxes[i]]; +// } +// return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]); +// }); + +// const outputShape = input.dims.slice(); + +// const sliceOps: string[] = []; +// for (let i = 0; i < normalizedAxes.length; i++) { +// outputShape[normalizedAxes[i]] = ends[i] - starts[i]; +// if (starts[i] > 0) { +// sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`); +// } // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); } +// } + +// const outputSize = ShapeUtil.size(outputShape); +// const outputStrides = ShapeUtil.computeStrides(outputShape); +// const shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; +// @group(0) @binding(0) var input : array<${dataType}>; +// @group(0) @binding(1) var output : array<${dataType}>; + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// var offset = global_id.x; +// ${offsetToIndices('offset', outputStrides, 'idx_')} +// ${sliceOps.join('')} +// var offsetInput = 0u; +// ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')} +// output[global_id.x] = input[offsetInput]; +// }`; +// return { +// ...sliceProgramMetadata, +// outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 1) { +// throw new Error('Slice requires 1 input.'); +// } +// if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { +// throw new Error('Invalid input type.'); +// } +// }; + +// export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { +// validateInputsV10(inputs); +// const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs); +// return inferenceHandler.run( +// { +// ...sliceProgramMetadata, +// cacheHint: attributes.cacheKey, +// get: () => createSliceProgramInfo(inputs[0], attributes) +// }, +// [inputs[0]]); +// }; + +// const generateSliceAttributesFromInputs = +// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => { +// if (!inferenceHandler.session.isInitializer(inputs[1].dataId) || +// !inferenceHandler.session.isInitializer(inputs[2].dataId) || +// (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) || +// (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) { +// throw new Error('dynamic slice attributes are not allowed'); +// } + +// if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) { +// throw new Error('currently non-1 steps is not supported for Slice'); +// } + +// const starts = Array.from(inputs[1].integerData); +// const ends = Array.from(inputs[2].integerData); +// const axes = inputs.length >= 4 ? 
Array.from(inputs[3].integerData) : []; +// const cacheKey = `${axes};${starts};${ends}`; +// return {starts, ends, axes, cacheKey}; +// }; + +// const validateInputsV10 = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length < 3 || inputs.length > 5) { +// throw new Error('Invalid input number.'); +// } +// if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) { +// throw new Error('Invalid input type.'); +// } +// if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) { +// throw new Error('Invalid input type.'); +// } +// if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) { +// throw new Error('Invalid input type.'); +// } +// if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) { +// throw new Error('Invalid input type.'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts index 7cd85e6877b03..f0509c34a06a5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts @@ -1,44 +1,44 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Graph} from '../../../graph'; -import {OperatorImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const squeeze: OperatorImplementation = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { - validateInputs(inputs); - const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes); - const output = inferenceHandler.reshape(inputs[0], outputShape); - return [output]; - }; - -export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { - validateInputsV13(inputs); - return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -}; - -export const parseSqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => - node.attributes.getInts('axes'); - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Squeeze requires 1 input.'); - } - - if (inputs[0].type === 'string') { - throw new Error('invalid input tensor types.'); - } -}; - -const validateInputsV13 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Squeeze requires 2 inputs.'); - } - - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; +// import {Graph} from '../../../graph'; +// import {OperatorImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; + +// export const squeeze: OperatorImplementation = +// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { +// validateInputs(inputs); +// const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes); +// const output = inferenceHandler.reshape(inputs[0], outputShape); +// return [output]; +// }; + +// export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { +// validateInputsV13(inputs); +// return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); +// }; + +// export const parseSqueezeAttributes: 
OperatorInitialization = (node: Graph.Node): number[] => +// node.attributes.getInts('axes'); + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 1) { +// throw new Error('Squeeze requires 1 input.'); +// } + +// if (inputs[0].type === 'string') { +// throw new Error('invalid input tensor types.'); +// } +// }; + +// const validateInputsV13 = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 2) { +// throw new Error('Squeeze requires 2 inputs.'); +// } + +// if (inputs[1].type !== 'int32') { +// throw new Error('Invalid input type.'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index e83dd7fcbb0b9..8519f319df7f5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,116 +1,118 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -export interface TransposeAttributes extends AttributeWithCacheKey { - readonly perm: number[]; -} - -const transposeProgramMetadata = { - name: 'Transpose', - inputTypes: [GpuDataType.default] -}; - -export const transpose: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...transposeProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm) - }, - inputs); -}; - -export const parseTransposeAttributes: OperatorInitialization = - (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', [])}); - -const createTransposeProgramInfo = - (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => { - const dataType = 'f32'; // TODO: support other data type - const inputShape = input.dims; - perm = getAdjustedPerm(inputShape, perm); - const outputShape = getOutputShape(inputShape, perm); - const rank = inputShape.length; - const outputSize = ShapeUtil.size(outputShape); - // A dims=[${inputs[0].dims.toString()}] - // out Dims=[${unpackedOutputShape.toString()}] - // based on perm=[${perm.toString()}] - - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const inputIndicesHelper = createIndicesHelper('a', inputShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var a : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${permFunctionBody(perm, rank)} - ${outputIndicesHelper.o2iImpl} - ${inputIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - 
${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} - perm(&aIndices, &indices); - - output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; - }`; - return { - ...transposeProgramMetadata, - outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => { - if (perm && perm.length !== inputShape.length) { - perm = [...(inputShape.keys())].reverse(); - } - return perm; -}; - -const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => { - perm = getAdjustedPerm(inputShape, perm); - return ShapeUtil.sortBasedOnPerm(inputShape, perm); -}; - -const permFunctionBody = (perm: number[], rank: number): string => { - const reverseFunc = []; - reverseFunc.push(`fn perm(a: ptr>, i: ptr>) {`); - for (let i = 0; i < rank; ++i) { - reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); - } - reverseFunc.push('\t}'); - return reverseFunc.join('\n'); -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Transpose requires 1 input.'); - } - - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('input should be float tensor'); - } -}; +// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; +// import {Graph} from '../../../graph'; +// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; +// import {GpuDataType, ProgramInfo} from '../types'; + +// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +// export interface TransposeAttributes extends AttributeWithCacheKey { +// readonly perm: number[]; +// } + +// const transposeProgramMetadata = { +// name: 'Transpose', +// inputTypes: [GpuDataType.default] +// }; + +// export const transpose: OperatorAsyncImplementation = async( +// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise +// => { +// validateInputs(inputs); +// return inferenceHandler.run( +// { +// ...transposeProgramMetadata, +// cacheHint: attributes.cacheKey, +// get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm) +// }, +// inputs); +// }; + +// export const parseTransposeAttributes: OperatorInitialization = +// (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', +// [])}); + +// const createTransposeProgramInfo = +// (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => { +// const dataType = 'f32'; // TODO: support other data type +// const inputShape = input.dims; +// perm = getAdjustedPerm(inputShape, perm); +// const outputShape = getOutputShape(inputShape, perm); +// const rank = inputShape.length; +// const outputSize = ShapeUtil.size(outputShape); +// // A dims=[${inputs[0].dims.toString()}] +// // out Dims=[${unpackedOutputShape.toString()}] +// // based on perm=[${perm.toString()}] + +// const outputIndicesHelper = createIndicesHelper('output', outputShape); +// const inputIndicesHelper = createIndicesHelper('a', inputShape); + +// const 
shaderSource = ` +// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + +// @group(0) @binding(0) var a : array<${dataType}>; +// @group(0) @binding(1) var output : array<${dataType}>; + +// ${permFunctionBody(perm, rank)} +// ${outputIndicesHelper.o2iImpl} +// ${inputIndicesHelper.i2oImpl} + +// @compute @workgroup_size(WORKGROUP_SIZE) +// fn main(@builtin(global_invocation_id) global_id : vec3) { + +// // Guard against out-of-bounds work group sizes +// if (global_id.x >= ${outputSize}u) { +// return; +// } + +// ${outputIndicesHelper.indicesVariableDeclaration('indices')} +// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} +// ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} +// perm(&aIndices, &indices); + +// output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; +// }`; +// return { +// ...transposeProgramMetadata, +// outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], +// shaderSource, +// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) +// }; +// }; + +// const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => { +// if (perm && perm.length !== inputShape.length) { +// perm = [...(inputShape.keys())].reverse(); +// } +// return perm; +// }; + +// const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => { +// perm = getAdjustedPerm(inputShape, perm); +// return ShapeUtil.sortBasedOnPerm(inputShape, perm); +// }; + +// const permFunctionBody = (perm: number[], rank: number): string => { +// const reverseFunc = []; +// reverseFunc.push(`fn perm(a: ptr>, i: ptr>) {`); +// for (let i = 0; i < rank; ++i) { +// reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); +// } +// reverseFunc.push('\t}'); +// return reverseFunc.join('\n'); +// }; + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 1) { +// throw new Error('Transpose requires 1 input.'); +// } + +// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { +// throw new Error('input should be float tensor'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 4e3468fc81cff..d49e1a8acfa0d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -2,9 +2,8 @@ // Licensed under the MIT License. 
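// The elementwise ops in the hunk below share one shader template: each
// invocation handles a vec4, and a workgroup has 64 invocations, so the
// dispatch count is ceil(size / 64 / 4). A small sketch of that arithmetic
// (constant names here are illustrative):
const WORKGROUP = 64;  // invocations per workgroup, matching WORKGROUP_SIZE
const VEC_SIZE = 4;    // elements per invocation (one vec4)
const elementwiseDispatch = (tensorSize: number): {x: number} =>
    ({x: Math.ceil(tensorSize / WORKGROUP / VEC_SIZE)});
// e.g. a [2, 3, 224, 224] tensor has 301056 elements -> x = 1176 workgroups.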
 import {TensorView} from '../../tensor';
-import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {WebGpuInferenceHandler} from '../inference-handler';
+import {ShapeUtil} from '../../util';
+import {AttributeWithCacheKey} from '../attribute-with-cache-key';
 import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types';
 
 import {WORKGROUP_SIZE} from './common';
@@ -49,13 +48,13 @@ const createElementwiseProgramInfo =
      additionalImplementation?: string): ProgramInfo => ({
       ...metadata,
       shaderSource: createElementwiseProgramShader(ShapeUtil.size(input.dims), funcCall, additionalImplementation),
-      outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}],
+      outputs: [{dims: input.dims, dataType: input.dataType, gpuDataType: GpuDataType.default}],
       dispatchGroup: (inputTensors) =>
-          ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)})
+          ({x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)})
     });
 
 const createElementwiseProgramInfoLoader =
-    (input: Tensor, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string,
+    (input: TensorView, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string,
      cacheKey?: string): ProgramInfoLoader => {
       const metadata: ProgramMetadata = {name, inputTypes: [GpuDataType.default], cacheHint: cacheKey};
       return {
@@ -67,130 +66,128 @@ const createElementwiseProgramInfoLoader =
 export const abs = (context: ComputeContext): number =>
     context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Abs', 'abs'));
 
-export const acos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
-    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Acos', 'acos'), inputs);
+export const acos = (context: ComputeContext): number =>
+    context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acos', 'acos'));
 
-export const asin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
-    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Asin', 'asin'), inputs);
+export const asin = (context: ComputeContext): number =>
+    context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asin', 'asin'));
 
-export const atan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
-    handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Atan', 'atan'), inputs);
+export const atan = (context: ComputeContext): number =>
+    context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atan', 'atan'));
 
 export interface ClipAttributes extends AttributeWithCacheKey {
   readonly min: number;
   readonly max: number;
 }
 
-export const clip = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ClipAttributes):
-    Promise<Tensor[]>=>handler.run(
-        createElementwiseProgramInfoLoader(
-            inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, `
+export const clip = (context: ComputeContext, attributes: ClipAttributes): number =>
+    context.compute(createElementwiseProgramInfoLoader(
+        context.inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, `
    let clip_min_: vec4<f32> = vec4(f32(${attributes.min}));
    let clip_max_: vec4<f32> = vec4(f32(${attributes.max}));
 `,
-            attributes.cacheKey),
-        inputs);
+        attributes.cacheKey));
 
-export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey(
- {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)}); +// export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey( +// {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)}); -const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => { - if (inputs.length >= 3 && - (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) { - throw new Error('dynamic clip attributes are not allowed'); - } +// const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => { +// if (inputs.length >= 3 && +// (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) { +// throw new Error('dynamic clip attributes are not allowed'); +// } - const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP; - const max = (inputs.length >= 3) ? inputs[2].numberData[0] : MAX_CLIP; - return createAttributeWithCacheKey({min, max}); -}; +// const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP; +// const max = (inputs.length >= 3) ? inputs[2].numberData[0] : MAX_CLIP; +// return createAttributeWithCacheKey({min, max}); +// }; -export const clipV11 = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - const attributes = generateClipAttributesFromInputs(handler, inputs); - return clip(handler, [inputs[0]], attributes); -}; +// export const clipV11 = (context: ComputeContext ): number=> { +// const attributes = generateClipAttributesFromInputs(handler, inputs); +// return clip(handler, [inputs[0]], attributes); +// }; -export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs); +// export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs); -export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs); +// export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs); -export interface EluAttributes extends AttributeWithCacheKey { - readonly alpha: number; -} +// export interface EluAttributes extends AttributeWithCacheKey { +// readonly alpha: number; +// } -export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): - Promise=>handler.run( - createElementwiseProgramInfoLoader( - inputs[0], 'Elu', a => `elu_vf32(${a})`, ` - let elu_alpha_: f32 = f32(${attributes.alpha}); +// export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): +// Promise=>handler.run( +// createElementwiseProgramInfoLoader( +// inputs[0], 'Elu', a => `elu_vf32(${a})`, ` +// let elu_alpha_: f32 = f32(${attributes.alpha}); - fn elu_f32(a: f32) -> f32 { - return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); - } +// fn elu_f32(a: f32) -> f32 { +// return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); +// } - fn elu_vf32(v: vec4) -> vec4 { - return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); - }`, - attributes.cacheKey), - 
inputs); +// fn elu_vf32(v: vec4) -> vec4 { +// return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); +// }`, +// attributes.cacheKey), +// inputs); -export const parseEluAttributes = (node: Graph.Node): EluAttributes => - createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)}); +// export const parseEluAttributes = (node: Graph.Node): EluAttributes => +// createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)}); -export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); +// export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); -export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs); +// export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs); -export interface LeakyReluAttributes extends AttributeWithCacheKey { - readonly alpha: number; -} +// export interface LeakyReluAttributes extends AttributeWithCacheKey { +// readonly alpha: number; +// } -export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): - Promise=>handler.run( - createElementwiseProgramInfoLoader( - inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, ` - let leaky_relu_alpha_: f32 = f32(${attributes.alpha}); +// export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): +// Promise=>handler.run( +// createElementwiseProgramInfoLoader( +// inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, ` +// let leaky_relu_alpha_: f32 = f32(${attributes.alpha}); - fn leaky_relu_f32(a: f32) -> f32 { - return select(a, a * leaky_relu_alpha_, a < 0.0); - } +// fn leaky_relu_f32(a: f32) -> f32 { +// return select(a, a * leaky_relu_alpha_, a < 0.0); +// } - fn leaky_relu_vf32(v: vec4) -> vec4 { - return vec4(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w)); - }`, - attributes.cacheKey), - inputs); +// fn leaky_relu_vf32(v: vec4) -> vec4 { +// return vec4(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w)); +// }`, +// attributes.cacheKey), +// inputs); -export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes => - createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)}); +// export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes => +// createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)}); -export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); +// export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); -export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs); +// export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// 
handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs); -// export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; +// // export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): +// // Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; -export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( - createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); +// export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( +// createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); -export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( - createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + exp(-${a})))`), inputs); +// export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( +// createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + exp(-${a})))`), inputs); -export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs); +// export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs); -export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs); +// export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs); -export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs); +// export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs); -export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs); +// export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => +// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts index 8a099dc92cbd9..2cefbe72bc8a5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts @@ -1,43 +1,43 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
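// The ComputeContext pattern adopted above makes an elementwise kernel a
// single expression: a ProgramInfoLoader built from the input, a name, the
// per-vec4 function call, and optional extra WGSL. A hypothetical example
// following that pattern (Softsign is not part of this patch; it assumes the
// in-file helpers createElementwiseProgramInfoLoader and ComputeContext shown
// above):
export const softsignExample = (context: ComputeContext): number =>
    context.compute(createElementwiseProgramInfoLoader(
        context.inputs[0], 'Softsign', a => `softsign_vf32(${a})`, `
  fn softsign_vf32(v: vec4<f32>) -> vec4<f32> {
    // softsign(x) = x / (1 + |x|), applied per lane
    return v / (vec4<f32>(1.0) + abs(v));
  }`));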
-import {Graph} from '../../../graph'; -import {OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { - validateInputs(inputs); - const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes); - const output = inferenceHandler.reshape(inputs[0], outputShape); - return [output]; -}; - -export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { - validateInputsV13(inputs); - return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -}; - -export const parseUnsqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => - node.attributes.getInts('axes'); - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Unsqueeze requires 1 input.'); - } - - if (inputs[0].type === 'string') { - throw new Error('invalid input tensor types.'); - } -}; - -const validateInputsV13 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Unsqueeze requires 2 inputs.'); - } - - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; +// import {Graph} from '../../../graph'; +// import {OperatorInitialization} from '../../../operators'; +// import {Tensor} from '../../../tensor'; +// import {ShapeUtil} from '../../../util'; +// import {WebGpuInferenceHandler} from '../inference-handler'; + +// export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { +// validateInputs(inputs); +// const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes); +// const output = inferenceHandler.reshape(inputs[0], outputShape); +// return [output]; +// }; + +// export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { +// validateInputsV13(inputs); +// return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); +// }; + +// export const parseUnsqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => +// node.attributes.getInts('axes'); + +// const validateInputs = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 1) { +// throw new Error('Unsqueeze requires 1 input.'); +// } + +// if (inputs[0].type === 'string') { +// throw new Error('invalid input tensor types.'); +// } +// }; + +// const validateInputsV13 = (inputs: Tensor[]): void => { +// if (!inputs || inputs.length !== 2) { +// throw new Error('Unsqueeze requires 2 inputs.'); +// } + +// if (inputs[1].type !== 'int32') { +// throw new Error('Invalid input type.'); +// } +// }; diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index ea9ddfdaf46fc..c93529c44c722 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -19,7 +19,7 @@ export interface GpuData { export interface TensorInfo { id?: Tensor.Id; dims: readonly number[]; - type: Tensor.DataType; + dataType: number; gpuDataType: GpuDataType; } @@ -82,7 +82,7 @@ export interface ProgramInfo extends ProgramMetadata { */ // entryPoint: string; - dispatchGroup: (inputs: readonly Tensor[]) => { + dispatchGroup: (inputs: readonly TensorView[]) => { x: number; y?: number; z?: number; diff --git 
a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 4f405705e0da2..d4e8907f5f09d 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -119,7 +119,7 @@ export const releaseSession = (sessionId: number): void => { /** * Copied from ONNX definition. Use this to drop dependency 'onnx_proto' to decrease compiled .js file size. */ -const enum DataType { +export const enum DataType { undefined = 0, float = 1, uint8 = 2, From 2ec6178fa057b751e89a04a9b11ad60b4a0ac6f3 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:36:51 -0700 Subject: [PATCH 10/81] 7 --- cmake/onnxruntime_webassembly.cmake | 6 + js/web/lib/wasm/jsep/init.ts | 12 +- js/web/lib/wasm/proxy-worker/main.ts | 12 +- js/web/lib/wasm/wasm-core-impl.ts | 309 +++++++++--------- .../core/providers/js/data_transfer.cc | 8 +- onnxruntime/core/providers/js/js_export.cc | 3 + onnxruntime/wasm/api.cc | 11 +- 7 files changed, 204 insertions(+), 157 deletions(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 66a393e164da0..bdccef504b5c8 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -213,6 +213,12 @@ else() -s VERBOSE=0 \ -s NO_FILESYSTEM=1 \ ${WASM_API_EXCEPTION_CATCHING} \ + -s ASYNCIFY=1 \ + -s ASYNCIFY_STACK_SIZE=8192 \ + -s ASYNCIFY_ADVISE=1 \ + -s ASYNCIFY_DEBUG=2 \ + -s ASYNCIFY_IGNORE_INDIRECT=0 \ + -s ASYNCIFY_REMOVE=OrtInit \ --no-entry") if (onnxruntime_USE_JS) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 865d22873b4d3..bae2b46d033f3 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -82,11 +82,19 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepDownload(src, dst, size) async(gpuDataId: number, dataOffset: number, size: number): Promise => { + const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); + // eslint-disable-next-line no-console - console.log('jsepDownload'); + console.log(`jsepDownload: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); + + // eslint-disable-next-line no-console + console.log(`jsepDownload: before download: ${ + new Float32Array(data.buffer, data.byteOffset, data.byteLength).join(',')}`); - const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); await backend.download(gpuDataId, data); + // eslint-disable-next-line no-console + console.log(`jsepDownload: after download: ${ + new Float32Array(data.buffer, data.byteOffset, data.byteLength).join(',')}`); }, // jsepCreateKernel diff --git a/js/web/lib/wasm/proxy-worker/main.ts b/js/web/lib/wasm/proxy-worker/main.ts index b72bfe42c6812..ef44ce416f451 100644 --- a/js/web/lib/wasm/proxy-worker/main.ts +++ b/js/web/lib/wasm/proxy-worker/main.ts @@ -10,7 +10,7 @@ import {initializeWebAssembly} from '../wasm-factory'; self.onmessage = (ev: MessageEvent): void => { switch (ev.data.type) { case 'init-wasm': - initializeWebAssembly(ev.data.in) + initializeWebAssembly(ev.data.in!) 
.then( () => postMessage({type: 'init-wasm'} as OrtWasmMessage), err => postMessage({type: 'init-wasm', err} as OrtWasmMessage)); @@ -63,8 +63,14 @@ self.onmessage = (ev: MessageEvent): void => { case 'run': try { const {sessionId, inputIndices, inputs, outputIndices, options} = ev.data.in!; - const outputs = run(sessionId, inputIndices, inputs, outputIndices, options); - postMessage({type: 'run', out: outputs} as OrtWasmMessage, extractTransferableBuffers(outputs)); + run(sessionId, inputIndices, inputs, outputIndices, options) + .then( + outputs => { + postMessage({type: 'run', out: outputs} as OrtWasmMessage, extractTransferableBuffers(outputs)); + }, + err => { + postMessage({type: 'run', err} as OrtWasmMessage); + }); } catch (err) { postMessage({type: 'run', err} as OrtWasmMessage); } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index d4e8907f5f09d..af67d1e4e99a3 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -238,171 +238,180 @@ const numericTensorTypeToTypedArray = (type: Tensor.Type): Float32ArrayConstruct /** * perform inference run */ -export const run = - (sessionId: number, inputIndices: number[], inputs: SerializableTensor[], outputIndices: number[], - options: InferenceSession.RunOptions): SerializableTensor[] => { - const wasm = getInstance(); - const session = activeSessions.get(sessionId); - if (!session) { - throw new Error('invalid session id'); - } - const sessionHandle = session[0]; - const inputNamesUTF8Encoded = session[1]; - const outputNamesUTF8Encoded = session[2]; +export const run = async( + sessionId: number, inputIndices: number[], inputs: SerializableTensor[], outputIndices: number[], + options: InferenceSession.RunOptions): Promise => { + const wasm = getInstance(); + const session = activeSessions.get(sessionId); + if (!session) { + throw new Error('invalid session id'); + } + const sessionHandle = session[0]; + const inputNamesUTF8Encoded = session[1]; + const outputNamesUTF8Encoded = session[2]; - const inputCount = inputIndices.length; - const outputCount = outputIndices.length; + const inputCount = inputIndices.length; + const outputCount = outputIndices.length; - let runOptionsHandle = 0; - let runOptionsAllocs: number[] = []; + let runOptionsHandle = 0; + let runOptionsAllocs: number[] = []; - const inputValues: number[] = []; - const inputAllocs: number[] = []; + const inputValues: number[] = []; + const inputAllocs: number[] = []; - try { - [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); - - // create input tensors - for (let i = 0; i < inputCount; i++) { - const dataType = inputs[i][0]; - const dims = inputs[i][1]; - const data = inputs[i][2]; - - let dataOffset: number; - let dataByteLength: number; - - if (Array.isArray(data)) { - // string tensor - dataByteLength = 4 * data.length; - dataOffset = wasm._malloc(dataByteLength); - inputAllocs.push(dataOffset); - let dataIndex = dataOffset / 4; - for (let i = 0; i < data.length; i++) { - if (typeof data[i] !== 'string') { - throw new TypeError(`tensor data at index ${i} is not a string`); - } - wasm.HEAPU32[dataIndex++] = allocWasmString(data[i], inputAllocs); - } - } else { - dataByteLength = data.byteLength; - dataOffset = wasm._malloc(dataByteLength); - inputAllocs.push(dataOffset); - wasm.HEAPU8.set(new Uint8Array(data.buffer, data.byteOffset, dataByteLength), dataOffset); - } + try { + [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); - const stack = wasm.stackSave(); - const dimsOffset 
= wasm.stackAlloc(4 * dims.length); - try { - let dimIndex = dimsOffset / 4; - dims.forEach(d => wasm.HEAP32[dimIndex++] = d); - const tensor = wasm._OrtCreateTensor( - tensorDataTypeStringToEnum(dataType), dataOffset, dataByteLength, dimsOffset, dims.length); - if (tensor === 0) { - throw new Error('Can\'t create a tensor'); - } - inputValues.push(tensor); - } finally { - wasm.stackRestore(stack); + // create input tensors + for (let i = 0; i < inputCount; i++) { + const dataType = inputs[i][0]; + const dims = inputs[i][1]; + const data = inputs[i][2]; + + let dataOffset: number; + let dataByteLength: number; + + if (Array.isArray(data)) { + // string tensor + dataByteLength = 4 * data.length; + dataOffset = wasm._malloc(dataByteLength); + inputAllocs.push(dataOffset); + let dataIndex = dataOffset / 4; + for (let i = 0; i < data.length; i++) { + if (typeof data[i] !== 'string') { + throw new TypeError(`tensor data at index ${i} is not a string`); } + wasm.HEAPU32[dataIndex++] = allocWasmString(data[i], inputAllocs); } + } else { + dataByteLength = data.byteLength; + dataOffset = wasm._malloc(dataByteLength); + inputAllocs.push(dataOffset); + wasm.HEAPU8.set(new Uint8Array(data.buffer, data.byteOffset, dataByteLength), dataOffset); + } - const beforeRunStack = wasm.stackSave(); - const inputValuesOffset = wasm.stackAlloc(inputCount * 4); - const inputNamesOffset = wasm.stackAlloc(inputCount * 4); - const outputValuesOffset = wasm.stackAlloc(outputCount * 4); - const outputNamesOffset = wasm.stackAlloc(outputCount * 4); - - try { - let inputValuesIndex = inputValuesOffset / 4; - let inputNamesIndex = inputNamesOffset / 4; - let outputValuesIndex = outputValuesOffset / 4; - let outputNamesIndex = outputNamesOffset / 4; - for (let i = 0; i < inputCount; i++) { - wasm.HEAPU32[inputValuesIndex++] = inputValues[i]; - wasm.HEAPU32[inputNamesIndex++] = inputNamesUTF8Encoded[inputIndices[i]]; - } - for (let i = 0; i < outputCount; i++) { - wasm.HEAPU32[outputValuesIndex++] = 0; - wasm.HEAPU32[outputNamesIndex++] = outputNamesUTF8Encoded[outputIndices[i]]; - } + const stack = wasm.stackSave(); + const dimsOffset = wasm.stackAlloc(4 * dims.length); + try { + let dimIndex = dimsOffset / 4; + dims.forEach(d => wasm.HEAP32[dimIndex++] = d); + const tensor = wasm._OrtCreateTensor( + tensorDataTypeStringToEnum(dataType), dataOffset, dataByteLength, dimsOffset, dims.length); + if (tensor === 0) { + throw new Error('Can\'t create a tensor'); + } + inputValues.push(tensor); + } finally { + wasm.stackRestore(stack); + } + } - // support RunOptions - let errorCode = wasm._OrtRun( - sessionHandle, inputNamesOffset, inputValuesOffset, inputCount, outputNamesOffset, outputCount, - outputValuesOffset, runOptionsHandle); - - const output: SerializableTensor[] = []; - - if (errorCode === 0) { - for (let i = 0; i < outputCount; i++) { - const tensor = wasm.HEAPU32[outputValuesOffset / 4 + i]; - - const beforeGetTensorDataStack = wasm.stackSave(); - // stack allocate 4 pointer value - const tensorDataOffset = wasm.stackAlloc(4 * 4); - - let type: Tensor.Type|undefined, dataOffset = 0; - try { - errorCode = wasm._OrtGetTensorData( - tensor, tensorDataOffset, tensorDataOffset + 4, tensorDataOffset + 8, tensorDataOffset + 12); - if (errorCode !== 0) { - throw new Error(`Can't access output tensor data. 
error code = ${errorCode}`); - } - let tensorDataIndex = tensorDataOffset / 4; - const dataType = wasm.HEAPU32[tensorDataIndex++]; - dataOffset = wasm.HEAPU32[tensorDataIndex++]; - const dimsOffset = wasm.HEAPU32[tensorDataIndex++]; - const dimsLength = wasm.HEAPU32[tensorDataIndex++]; - const dims = []; - for (let i = 0; i < dimsLength; i++) { - dims.push(wasm.HEAPU32[dimsOffset / 4 + i]); - } - wasm._OrtFree(dimsOffset); - - const size = dims.length === 0 ? 1 : dims.reduce((a, b) => a * b); - type = tensorDataTypeEnumToString(dataType); - if (type === 'string') { - const stringData: string[] = []; - let dataIndex = dataOffset / 4; - for (let i = 0; i < size; i++) { - const offset = wasm.HEAPU32[dataIndex++]; - const maxBytesToRead = i === size - 1 ? undefined : wasm.HEAPU32[dataIndex] - offset; - stringData.push(wasm.UTF8ToString(offset, maxBytesToRead)); - } - output.push([type, dims, stringData]); - } else { - const typedArrayConstructor = numericTensorTypeToTypedArray(type); - const data = new typedArrayConstructor(size); - new Uint8Array(data.buffer, data.byteOffset, data.byteLength) - .set(wasm.HEAPU8.subarray(dataOffset, dataOffset + data.byteLength)); - output.push([type, dims, data]); - } - } finally { - wasm.stackRestore(beforeGetTensorDataStack); - if (type === 'string' && dataOffset) { - wasm._free(dataOffset); - } - wasm._OrtReleaseTensor(tensor); + const beforeRunStack = wasm.stackSave(); + const inputValuesOffset = wasm.stackAlloc(inputCount * 4); + const inputNamesOffset = wasm.stackAlloc(inputCount * 4); + const outputValuesOffset = wasm.stackAlloc(outputCount * 4); + const outputNamesOffset = wasm.stackAlloc(outputCount * 4); + + try { + let inputValuesIndex = inputValuesOffset / 4; + let inputNamesIndex = inputNamesOffset / 4; + let outputValuesIndex = outputValuesOffset / 4; + let outputNamesIndex = outputNamesOffset / 4; + for (let i = 0; i < inputCount; i++) { + wasm.HEAPU32[inputValuesIndex++] = inputValues[i]; + wasm.HEAPU32[inputNamesIndex++] = inputNamesUTF8Encoded[inputIndices[i]]; + } + for (let i = 0; i < outputCount; i++) { + wasm.HEAPU32[outputValuesIndex++] = 0; + wasm.HEAPU32[outputNamesIndex++] = outputNamesUTF8Encoded[outputIndices[i]]; + } + + // support RunOptions + let errorCode = wasm._OrtRun( + sessionHandle, inputNamesOffset, inputValuesOffset, inputCount, outputNamesOffset, outputCount, + outputValuesOffset, runOptionsHandle); + + // eslint-disable-next-line @typescript-eslint/naming-convention + const prom = (wasm as {OrtRunPromise?: Promise}).OrtRunPromise; + if (prom) { + await prom; + } + + // eslint-disable-next-line no-console + console.log(`OrtRun() errorcode=${errorCode}`); + + const output: SerializableTensor[] = []; + + if (errorCode === 0) { + for (let i = 0; i < outputCount; i++) { + const tensor = wasm.HEAPU32[outputValuesOffset / 4 + i]; + + const beforeGetTensorDataStack = wasm.stackSave(); + // stack allocate 4 pointer value + const tensorDataOffset = wasm.stackAlloc(4 * 4); + + let type: Tensor.Type|undefined, dataOffset = 0; + try { + errorCode = wasm._OrtGetTensorData( + tensor, tensorDataOffset, tensorDataOffset + 4, tensorDataOffset + 8, tensorDataOffset + 12); + if (errorCode !== 0) { + throw new Error(`Can't access output tensor data. 
error code = ${errorCode}`); + } + let tensorDataIndex = tensorDataOffset / 4; + const dataType = wasm.HEAPU32[tensorDataIndex++]; + dataOffset = wasm.HEAPU32[tensorDataIndex++]; + const dimsOffset = wasm.HEAPU32[tensorDataIndex++]; + const dimsLength = wasm.HEAPU32[tensorDataIndex++]; + const dims = []; + for (let i = 0; i < dimsLength; i++) { + dims.push(wasm.HEAPU32[dimsOffset / 4 + i]); + } + wasm._OrtFree(dimsOffset); + + const size = dims.length === 0 ? 1 : dims.reduce((a, b) => a * b); + type = tensorDataTypeEnumToString(dataType); + if (type === 'string') { + const stringData: string[] = []; + let dataIndex = dataOffset / 4; + for (let i = 0; i < size; i++) { + const offset = wasm.HEAPU32[dataIndex++]; + const maxBytesToRead = i === size - 1 ? undefined : wasm.HEAPU32[dataIndex] - offset; + stringData.push(wasm.UTF8ToString(offset, maxBytesToRead)); } + output.push([type, dims, stringData]); + } else { + const typedArrayConstructor = numericTensorTypeToTypedArray(type); + const data = new typedArrayConstructor(size); + new Uint8Array(data.buffer, data.byteOffset, data.byteLength) + .set(wasm.HEAPU8.subarray(dataOffset, dataOffset + data.byteLength)); + output.push([type, dims, data]); } + } finally { + wasm.stackRestore(beforeGetTensorDataStack); + if (type === 'string' && dataOffset) { + wasm._free(dataOffset); + } + wasm._OrtReleaseTensor(tensor); } - - if (errorCode === 0) { - return output; - } else { - throw new Error(`failed to call OrtRun(). error code = ${errorCode}.`); - } - } finally { - wasm.stackRestore(beforeRunStack); } - } finally { - inputValues.forEach(wasm._OrtReleaseTensor); - inputAllocs.forEach(wasm._free); + } - wasm._OrtReleaseRunOptions(runOptionsHandle); - runOptionsAllocs.forEach(wasm._free); + if (errorCode === 0) { + return output; + } else { + throw new Error(`failed to call OrtRun(). 
error code = ${errorCode}.`); } - }; + } finally { + wasm.stackRestore(beforeRunStack); + } + } finally { + inputValues.forEach(wasm._OrtReleaseTensor); + inputAllocs.forEach(wasm._free); + + wasm._OrtReleaseRunOptions(runOptionsHandle); + runOptionsAllocs.forEach(wasm._free); + } +}; /** * end profiling diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 41cf9b3d01e08..313e6660fc56c 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -5,6 +5,10 @@ #include "core/providers/js/data_transfer.h" +EM_ASYNC_JS(void, jsepDownload, (const void *src_data, void *dst_data, size_t bytes), { + await Module.jsepDownload(src_data, dst_data, bytes); +}); + namespace onnxruntime { namespace js { @@ -26,7 +30,9 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*un EM_ASM({ Module.jsepUpload($0, $1, $2); }, src_data, dst_data, bytes); } else if (src_device.Type() == OrtDevice::GPU) { // copy from GPU to CPU - EM_ASM({ Module.jsepDownload($0, $1); }, src_data, dst_data); + printf("DataTransfer::CopyTensor before jsepDownload\n"); + jsepDownload(src_data, dst_data, bytes); + printf("DataTransfer::CopyTensor after jsepDownload\n"); } else { // copy from CPU to CPU (don't think we ever get here) memcpy(dst_data, src_data, bytes); diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index 486fed6ceeb07..e83f994cca4c5 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -8,11 +8,14 @@ const void * JsepOutput(void * context, int index, void * data) { uint32_t * data_offset = reinterpret_cast(data); uint32_t dim = *data_offset++; + printf("JsepOutput: dim=%u\n",dim); size_t dim_size = static_cast(dim); std::vector dims; dims.reserve(dim_size); + dims.resize(dim_size); for (size_t i = 0; i < dim_size; i++) { dims[i] = static_cast(*data_offset++); + printf("dim[%zu]=%lld\n",i, dims[i]); } auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index fb7382680013e..e4fd968eed319 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -242,6 +242,7 @@ int OrtGetTensorData(OrtValue* tensor, int* data_type, void** data, size_t** dim } \ } while (false) +printf("OrtGetTensorData>>0\n"); OrtTensorTypeAndShapeInfo* info = nullptr; OrtAllocator* allocator = nullptr; size_t* p_dims = nullptr; @@ -250,9 +251,11 @@ int OrtGetTensorData(OrtValue* tensor, int* data_type, void** data, size_t** dim ONNXType tensor_type; RETURN_ERROR_CODE_IF_ERROR(GetValueType, tensor, &tensor_type); if ( tensor_type != ONNX_TYPE_TENSOR ) { +printf("OrtGetTensorData>> tensor_type=%d\n", (int)(tensor_type)); return ORT_FAIL; } +printf("OrtGetTensorData>>1\n"); RETURN_ERROR_CODE_IF_ERROR(GetTensorTypeAndShape, tensor, &info); size_t dims_len = 0; @@ -263,6 +266,7 @@ int OrtGetTensorData(OrtValue* tensor, int* data_type, void** data, size_t** dim RELEASE_AND_RETURN_ERROR_CODE_IF_ERROR(GetTensorMutableData, tensor, data); +printf("OrtGetTensorData>>2\n"); ONNXTensorElementDataType type; RELEASE_AND_RETURN_ERROR_CODE_IF_ERROR(GetTensorElementType, info, &type); *data_type = static_cast(type); @@ -358,7 +362,12 @@ int OrtRun(OrtSession* session, const char** input_names, const ort_tensor_handle_t* inputs, size_t input_count, const char** output_names, size_t output_count, ort_tensor_handle_t* 
outputs, OrtRunOptions* run_options) { - return CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); + EM_ASM({ Module["OrtRunPromise"] = new Promise((r) => {Module["OrtRunPromiseResolve"] = r;}); }); + printf("OrtRun start\n"); + auto status_code = CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); + printf("OrtRun end\n"); + EM_ASM({ Module["OrtRunPromiseResolve"](); }); + return status_code; } char* OrtEndProfiling(ort_session_handle_t session) { From c087b47f8d7b3febb87228247c9537e7c1aeceb3 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 27 Oct 2022 17:41:06 -0700 Subject: [PATCH 11/81] 8 --- br.bat | 6 ++++++ cmake/onnxruntime_webassembly.cmake | 3 ++- js/web/lib/wasm/binding/ort-wasm.d.ts | 3 +++ js/web/lib/wasm/jsep/backend-webgpu.ts | 10 +++++++--- js/web/lib/wasm/jsep/init.ts | 11 +---------- js/web/lib/wasm/wasm-core-impl.ts | 9 +++------ onnxruntime/core/providers/js/data_transfer.cc | 2 -- onnxruntime/core/providers/js/js_export.cc | 2 -- onnxruntime/core/providers/js/js_export.h | 2 ++ onnxruntime/wasm/api.cc | 10 ++-------- 10 files changed, 26 insertions(+), 32 deletions(-) create mode 100644 br.bat diff --git a/br.bat b/br.bat new file mode 100644 index 0000000000000..828acb95dd337 --- /dev/null +++ b/br.bat @@ -0,0 +1,6 @@ +call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly + +IF %ERRORLEVEL% == 0 ( +copy /Y .\build\Windows\Release\ort-wasm.js .\js\web\lib\wasm\binding\ +copy /Y .\build\Windows\Release\ort-wasm.wasm .\js\web\dist\ +) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index bdccef504b5c8..70bf05ba2073d 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -216,9 +216,10 @@ else() -s ASYNCIFY=1 \ -s ASYNCIFY_STACK_SIZE=8192 \ -s ASYNCIFY_ADVISE=1 \ - -s ASYNCIFY_DEBUG=2 \ + -s ASYNCIFY_DEBUG=0 \ -s ASYNCIFY_IGNORE_INDIRECT=0 \ -s ASYNCIFY_REMOVE=OrtInit \ + -s ASYNCIFY_ADD=OrtRun \ --no-entry") if (onnxruntime_USE_JS) diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index cee1e699ec9cc..efb73c9943518 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -68,7 +68,10 @@ export interface OrtWasmModule extends EmscriptenModule { (backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, download: JSEP.DownloadFunction, createKernel: JSEP.CreateKernelFunction, releaseKernel: JSEP.ReleaseKernelFunction, run: JSEP.RunFunction): void; + _JsepOutput(context: number, index: number, data: number): number; + + jsepRunPromise?: Promise; // #endregion } diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index e0f637ca53a2a..462660b6effc5 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -25,7 +25,8 @@ export class WebGpuBackend { gpuDataManager: GpuDataManager; programManager: ProgramManager; - kernels: Map; + // TODO: remove value[0]. 
the string is only for debug + kernels: Map; commandEncoder: GPUCommandEncoder|null = null; computePassEncoder: GPUComputePassEncoder|null = null; @@ -164,7 +165,7 @@ export class WebGpuBackend { if (op.length > 1 && typeof op[1] !== 'undefined') { processedAttribute = op[1](attribute); } - this.kernels.set(kernelId, [op[0], processedAttribute]); + this.kernels.set(kernelId, [name, op[0], processedAttribute]); } releaseKernel(kernelId: number): void { @@ -176,7 +177,10 @@ export class WebGpuBackend { if (!kernel) { throw new Error(`kernel not created: ${kernelId}`); } - const [kernelEntry, attributes] = kernel; + const [name, kernelEntry, attributes] = kernel; + + // eslint-disable-next-line no-console + console.log(`[JS] Start to run kernel "${name}"...`); return kernelEntry(context, attributes); } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index bae2b46d033f3..965bd2fa570bd 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -74,7 +74,7 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepUpload(src, dst, size) (dataOffset: number, gpuDataId: number, size: number) => { // eslint-disable-next-line no-console - console.log('jsepUpload'); + console.log(`jsepUpload: dataOffset=${dataOffset}, gpuDataId=${gpuDataId}, size=${size}`); const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); backend.upload(gpuDataId, data); }, @@ -87,14 +87,7 @@ export const init = async(module: OrtWasmModule): Promise => { // eslint-disable-next-line no-console console.log(`jsepDownload: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); - // eslint-disable-next-line no-console - console.log(`jsepDownload: before download: ${ - new Float32Array(data.buffer, data.byteOffset, data.byteLength).join(',')}`); - await backend.download(gpuDataId, data); - // eslint-disable-next-line no-console - console.log(`jsepDownload: after download: ${ - new Float32Array(data.buffer, data.byteOffset, data.byteLength).join(',')}`); }, // jsepCreateKernel @@ -105,8 +98,6 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepRun (kernel: number, contextDataOffset: number) => { - // eslint-disable-next-line no-console - console.log('jsepRun'); const context = new OpKernelContext(module, backend, contextDataOffset); return backend.computeKernel(kernel, context); }); diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index af67d1e4e99a3..a2e0efb65ad8c 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -332,14 +332,11 @@ export const run = async( outputValuesOffset, runOptionsHandle); // eslint-disable-next-line @typescript-eslint/naming-convention - const prom = (wasm as {OrtRunPromise?: Promise}).OrtRunPromise; - if (prom) { - await prom; + const runPromise = wasm.jsepRunPromise; + if (runPromise && typeof runPromise.then !== 'undefined') { + errorCode = await runPromise; } - // eslint-disable-next-line no-console - console.log(`OrtRun() errorcode=${errorCode}`); - const output: SerializableTensor[] = []; if (errorCode === 0) { diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 313e6660fc56c..8abbe1ad04f4a 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -30,9 +30,7 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*un EM_ASM({ Module.jsepUpload($0, $1, $2); }, src_data, dst_data, 
bytes); } else if (src_device.Type() == OrtDevice::GPU) { // copy from GPU to CPU - printf("DataTransfer::CopyTensor before jsepDownload\n"); jsepDownload(src_data, dst_data, bytes); - printf("DataTransfer::CopyTensor after jsepDownload\n"); } else { // copy from CPU to CPU (don't think we ever get here) memcpy(dst_data, src_data, bytes); diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index e83f994cca4c5..70e2157f0489b 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -8,14 +8,12 @@ const void * JsepOutput(void * context, int index, void * data) { uint32_t * data_offset = reinterpret_cast(data); uint32_t dim = *data_offset++; - printf("JsepOutput: dim=%u\n",dim); size_t dim_size = static_cast(dim); std::vector dims; dims.reserve(dim_size); dims.resize(dim_size); for (size_t i = 0; i < dim_size; i++) { dims[i] = static_cast(*data_offset++); - printf("dim[%zu]=%lld\n",i, dims[i]); } auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); diff --git a/onnxruntime/core/providers/js/js_export.h b/onnxruntime/core/providers/js/js_export.h index a178d4b37fe6e..fa6ec4f9e25f5 100644 --- a/onnxruntime/core/providers/js/js_export.h +++ b/onnxruntime/core/providers/js/js_export.h @@ -7,6 +7,8 @@ #include +// TODO: Move to api.h + extern "C" { const void * EMSCRIPTEN_KEEPALIVE JsepOutput(void * context, int index, void * data); }; diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index e4fd968eed319..d24cbd495d1fa 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -242,7 +242,6 @@ int OrtGetTensorData(OrtValue* tensor, int* data_type, void** data, size_t** dim } \ } while (false) -printf("OrtGetTensorData>>0\n"); OrtTensorTypeAndShapeInfo* info = nullptr; OrtAllocator* allocator = nullptr; size_t* p_dims = nullptr; @@ -251,11 +250,9 @@ printf("OrtGetTensorData>>0\n"); ONNXType tensor_type; RETURN_ERROR_CODE_IF_ERROR(GetValueType, tensor, &tensor_type); if ( tensor_type != ONNX_TYPE_TENSOR ) { -printf("OrtGetTensorData>> tensor_type=%d\n", (int)(tensor_type)); return ORT_FAIL; } -printf("OrtGetTensorData>>1\n"); RETURN_ERROR_CODE_IF_ERROR(GetTensorTypeAndShape, tensor, &info); size_t dims_len = 0; @@ -266,7 +263,6 @@ printf("OrtGetTensorData>>1\n"); RELEASE_AND_RETURN_ERROR_CODE_IF_ERROR(GetTensorMutableData, tensor, data); -printf("OrtGetTensorData>>2\n"); ONNXTensorElementDataType type; RELEASE_AND_RETURN_ERROR_CODE_IF_ERROR(GetTensorElementType, info, &type); *data_type = static_cast(type); @@ -362,11 +358,9 @@ int OrtRun(OrtSession* session, const char** input_names, const ort_tensor_handle_t* inputs, size_t input_count, const char** output_names, size_t output_count, ort_tensor_handle_t* outputs, OrtRunOptions* run_options) { - EM_ASM({ Module["OrtRunPromise"] = new Promise((r) => {Module["OrtRunPromiseResolve"] = r;}); }); - printf("OrtRun start\n"); + EM_ASM({ Module["jsepRunPromise"] = new Promise((r) => {Module.jsepRunPromiseResolve = r;}); }); auto status_code = CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); - printf("OrtRun end\n"); - EM_ASM({ Module["OrtRunPromiseResolve"](); }); + EM_ASM({ Module.jsepRunPromiseResolve($0); }, status_code); return status_code; } From 4697af7002561b227eaf5d09dec5423fd033280d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 31 Oct 2022 18:24:28 -0700 Subject: [PATCH 12/81] 9 
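Context recap before the Conv work in this patch: the two preceding patches converge on a promise-based handshake that lets the synchronous-looking `_OrtRun` export cooperate with the asynchronous WebGPU backend under Emscripten's Asyncify. On the C++ side, `OrtRun` creates `Module.jsepRunPromise` before calling `Run()` and resolves it with the status code afterwards; on the TypeScript side, the caller awaits that promise when it is present. A minimal sketch of the consumer side, assuming only the `jsepRunPromise` and `_OrtRun` names from the patches (the module type and the wrapper function are illustrative, not code from this series):

    // Sketch of the Asyncify handshake consumer. Only jsepRunPromise and
    // _OrtRun come from the patches; everything else is illustrative.
    interface AsyncifiedOrtModule {
      // Set by OrtRun (wasm/api.cc) before Run() and resolved with the
      // real status code once Run() completes.
      jsepRunPromise?: Promise<number>;
      // Asyncify-instrumented export; its direct return value can be
      // unreliable when the call suspends, so the promise carries the
      // authoritative status.
      _OrtRun(...args: number[]): number;
    }

    async function runAndAwaitStatus(wasm: AsyncifiedOrtModule, args: number[]): Promise<number> {
      let errorCode = wasm._OrtRun(...args);
      if (wasm.jsepRunPromise) {
        errorCode = await wasm.jsepRunPromise;
      }
      return errorCode;
    }

The design choice worth noting: `OrtRun` is added to `ASYNCIFY_ADD` so the wasm stack can unwind while a GPU readback (`jsepDownload`, declared with `EM_ASYNC_JS`) is awaited in the middle of `Run()`.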
--- bb.bat | 2 +- js/web/lib/wasm/jsep/util.ts | 202 ++++++++++++ .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 242 +++++++-------- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 292 +++++++++--------- js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts | 66 ++-- .../providers/js/js_execution_provider.cc | 9 + .../core/providers/js/operators/conv.cc | 52 ++++ .../core/providers/js/operators/conv.h | 57 ++++ .../core/providers/js/operators/unary.cc | 1 - 10 files changed, 616 insertions(+), 311 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/conv.cc create mode 100644 onnxruntime/core/providers/js/operators/conv.h diff --git a/bb.bat b/bb.bat index 97193fce3a950..80418a42c2da0 100644 --- a/bb.bat +++ b/bb.bat @@ -1,4 +1,4 @@ -call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly +call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 IF %ERRORLEVEL% == 0 ( copy /Y .\build\Windows\Debug\ort-wasm.js .\js\web\lib\wasm\binding\ diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index 92f24d38f343f..f4efee4164e93 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -510,5 +510,207 @@ export class ShapeUtil { } } +export class PoolConvUtil { + /** + * Adjust the kernel, strides, pads to correct rank. Set to default value if not present + * @param isGlobalOperator If true, perform global pooling. + * @param inputDims The input tensor dimension. + * @param kernelShape The size of the kernel along each axis. + * @param strides Stride along each axis. + * @param dilations Dilation along each axis. + * @param pads Padding for the beginning and ending along each axis. 
+ */ + static adjustPoolAttributes( + isGlobalOperator: boolean, inputDims: readonly number[], kernelShape: number[], strides: number[], + dilations: number[], pads: number[]): void { + if (!isGlobalOperator && kernelShape.length !== inputDims.length - 2) { + throw new Error('length of specified kernel shapes should be 2 less than length of input dimensions'); + } + + if (isGlobalOperator) { + // adjust kernel shape to cover the input dims + for (let dim = 0; dim < inputDims.length - 2; dim++) { + if (dim >= kernelShape.length) { + kernelShape.push(inputDims[dim + 2]); + } else { + kernelShape[dim] = inputDims[dim + 2]; + } + } + } + + // adjust strides length to match kernel shape length + for (let dim = 0; dim < kernelShape.length; dim++) { + if (dim < strides.length) { + if (strides[dim] < 0) { + throw new Error('strides should be greater than or equal to 1'); + } + } else { + strides.push(1); + } + } + + // adjust dilation value + for (let dim = 0; dim < kernelShape.length; dim++) { + if (dim < dilations.length) { + if (dilations[dim] < 0) { + throw new Error('dilations should be greater than or equal to 1'); + } + } else { + dilations.push(1); + } + } + + // adjust pads length to match 2 * kernel shape length + for (let dim = 0; dim < kernelShape.length * 2; dim++) { + if (dim < pads.length) { + if (pads[dim] < 0) { + throw new Error('pad should be greater than or equal to 1'); + } + } else { + pads.push(0); + } + } + + // sanity checks for values in kernel shapes and pads + for (let dim = 0; dim < kernelShape.length; dim++) { + if (kernelShape[dim] <= 0) { + throw new Error('kernel shapes need to be greater than 0'); + } + + if (pads[dim] >= kernelShape[dim] || pads[dim + kernelShape.length] >= kernelShape[dim]) { + throw new Error('pads should be smaller than kernel'); + } + } + } + + // adjust pad values based on 'autoPad' attribute + static adjustPadsBasedOnAutoPad( + inputDims: readonly number[], strides: readonly number[], dilations: readonly number[], + kernelShape: readonly number[], pads: number[], autoPad?: string): void { + if (!autoPad) { + return; + } + + if (pads.length !== 2 * (inputDims.length - 2)) { + throw new Error('length of pads should be twice the length of data dimensions'); + } + + if (strides.length !== (inputDims.length - 2)) { + throw new Error('length of strides should be the length of data dimensions'); + } + + if (kernelShape.length !== (inputDims.length - 2)) { + throw new Error('length of kernel shapes should be the length of data dimensions'); + } + + for (let dim = 0; dim < inputDims.length - 2; dim++) { + PoolConvUtil.adjustPadAndReturnShape( + inputDims[dim + 2], strides[dim], dilations[dim], kernelShape[dim], pads, dim, dim + inputDims.length - 2, + autoPad); + } + } + + /** + * Calculate the output shape for Pool ops based on input attributes. (Should be used only for Pool ops) + * @param isGlobalOperator If true, perform global pooling. + * @param inputDims The input tensor dimension. (inputs[0].dims) + * @param strides Stride along each axis. + * @param dilations Dilation along each axis. + * @param kernelShape The size of the kernel along each axis. + * @param pads Padding for the beginning and ending along each axis. + * @param autoPad DEPRECATED attribute supported for legacy models. Specifies how to implicitly calculate pads in each + * dimension. Can take values NOTSET, SAME_UPPER, SAME_LOWER, or VALID. 
+ */ + static computePoolOutputShape( + isGlobalOperator: boolean, inputDims: readonly number[], strides: number[], dilations: number[], + kernelShape: number[], pads: number[], autoPad?: string): number[] { + if (inputDims.length <= 0) { + throw new Error('input shape must be of size greater than 0'); + } + + // Add batch size and number of channels of output + const outputDims = [inputDims[0], inputDims[1]]; + + PoolConvUtil.computeShapeHelper( + isGlobalOperator, inputDims, outputDims, strides, dilations, kernelShape, pads, autoPad); + return outputDims; + } + + /** + * Calculate the output shape for Conv op based on input attributes. (Should be used only for Conv op) + * @param inputDims The input tensor dimension. (inputs[0].dims) + * @param filterDims The filter tensor dimension. (inputs[1].dims) + * @param strides Stride along each axis. + * @param kernelShape The size of the kernel along each axis. + * @param pads Padding for the beginning and ending along each axis. + * @param autoPad DEPRECATED attribute supported for legacy models. Specifies how to implicitly calculate pads in each + * dimension. Can take values NOTSET, SAME_UPPER, SAME_LOWER, or VALID. + */ + static computeConvOutputShape( + inputDims: readonly number[], filterDims: readonly number[], strides: number[], dilations: number[], + kernelShape: number[], pads: number[], autoPad?: string): number[] { + if (inputDims.length <= 0 || filterDims.length <= 0) { + throw new Error('invalid input tensor dims or invalid filter tensor dims'); + } + + // Add batch size and number of channels of output + const outputDims = [inputDims[0], filterDims[0]]; + + PoolConvUtil.computeShapeHelper(false, inputDims, outputDims, strides, dilations, kernelShape, pads, autoPad); + return outputDims; + } + + // will compute output shapes for data dimensions ONLY (i.e.) 
no batch size and channels + // called by computePoolOutputShape() and computeConvOutputShape() + // adjust pads based on 'autoPad' attribute prior to shape computation + private static computeShapeHelper( + isGlobalOperator: boolean, inputDims: readonly number[], outputDims: number[], strides: readonly number[], + dilations: readonly number[], kernelShape: readonly number[], pads: number[], autoPad?: string) { + if (isGlobalOperator) { + for (let dim = 0; dim < inputDims.length - 2; dim++) { + outputDims.push(1); + } + } else { + for (let dim = 0; dim < inputDims.length - 2; dim++) { + outputDims.push(PoolConvUtil.adjustPadAndReturnShape( + inputDims[dim + 2], strides[dim], dilations[dim], kernelShape[dim], pads, dim, dim + inputDims.length - 2, + autoPad)); + } + } + } + + // helper for computeShapeHelper() and adjustPadsBasedOnAutoPad() + // adjusts pad value for given 'autoPad' string and computes output shape along a particular dimension + private static adjustPadAndReturnShape( + inSize: number, stride: number, dilation: number, kernel: number, pads: number[], padHeadIndex: number, + padTailIndex: number, autoPad?: string): number { + const dkernel = dilation * (kernel - 1) + 1; + if (autoPad && autoPad !== 'NOTSET') { + switch (autoPad) { + case 'VALID': + pads[padHeadIndex] = 0; + pads[padTailIndex] = 0; + return Math.floor(((inSize - dkernel) / stride) + 1); + case 'SAME_LOWER': + case 'SAME_UPPER': + if (dilation !== 1) { + throw new Error('Dilation not supported for SAME_UPPER or SAME_LOWER'); + } else { + const legacyTargetSize = (inSize + stride - 1) / stride; + const padNeeded = (legacyTargetSize - 1) * stride + kernel - inSize; + pads[padHeadIndex] = + (autoPad === 'SAME_LOWER') ? Math.floor((padNeeded + 1) / 2) : Math.floor(padNeeded / 2); + pads[padTailIndex] = padNeeded - pads[padHeadIndex]; + return Math.floor(((inSize + padNeeded - kernel) / stride) + 1); + } + default: + throw new Error('Unsupported AutoPad type'); + } + } else { + return Math.floor(((inSize + pads[padHeadIndex] + pads[padTailIndex] - dkernel) / stride) + 1); + } + } +} + export const MIN_CLIP = -3.4028234663852886e+38; export const MAX_CLIP = 3.4028234663852886e+38; diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index a65c162727ecf..56e90f9761ce0 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -3,7 +3,7 @@ // import * as binaryOps from './ops/binary-op'; // import {concat, parseConcatAttributes} from './ops/concat'; -// import {conv, parseConvAttributes} from './ops/conv'; +import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; // import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; // import {matMul, parseMatMulAttributes} from './ops/matmul'; @@ -33,7 +33,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Cast', '', '6+', cast, parseCastAttributes], //['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], //['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], - //['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], + ['Conv', [conv, parseConvAttributes]], //['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], // ['Dropout', '', '7+', unaryOps.identity], // ['DepthToSpace', '', '1+', depthToSpace, 
parseDepthToSpaceAttributes], // ['Equal', '', '7+', binaryOps.equal], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index e9b8a64d707df..d29036a7313ff 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -1,129 +1,119 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// import {Logger} from '../../../instrument'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; -// import {calculateOutputShape, ConvAttributes} from './conv'; -// import {getActicationSnippet} from './fuse-utils'; - -// const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({ -// name: 'GroupedConv', -// inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : -// [GpuDataType.default, GpuDataType.default], -// cacheHint -// }); - -// const createGroupedConvProgramInfo = -// (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata, -// attributes: ConvAttributes): ProgramInfo => { -// const hasBias = inputs.length > 2; -// const processBias = hasBias ? 'value += b[output_channel];' : ''; -// const xShape = inputs[0].dims; -// const wShape = inputs[1].dims; -// const outputChannelsPerGroup = wShape[0] / attributes.group; - -// const dataType = 'f32'; // TODO: support other data type -// const {activationFunction, applyActivation} = getActicationSnippet(attributes); -// const inputStorageBuffersDeclarations = [ -// `@group(0) @binding(0) var x : array<${dataType}>;`, -// `@group(0) @binding(1) var w : array<${dataType}>;` -// ]; -// if (hasBias) { -// inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); -// } - -// Logger.verbose( -// 'GroupedConv', -// `autpPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, -// kernelShape:${ -// attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`); -// const outputShape = -// calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); -// const outputSize = ShapeUtil.size(outputShape); -// const outputIndicesHelper = createIndicesHelper('output', outputShape); -// const xIndicesHelper = createIndicesHelper('x', xShape); -// const wIndicesHelper = createIndicesHelper('w', wShape); - -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); -// const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); - -// ${inputStorageBuffersDeclarations.join('\n')} -// @group(0) @binding(${inputStorageBuffersDeclarations.length}) var output : array<${dataType}>; - -// ${activationFunction} -// ${outputIndicesHelper.o2iImpl} -// ${xIndicesHelper.i2oImpl} -// ${wIndicesHelper.i2oImpl} - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} -// 
${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} -// let batch: u32 = outputIndices[0]; -// let output_channel: u32 = outputIndices[1]; -// let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; -// let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; - -// var value: ${dataType} = ${dataType}(0); -// for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { -// let input_channel = group_id * ${wShape[1]}u + wInChannel; -// for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { -// let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; - -// if (xHeight < 0u || xHeight >= ${xShape[2]}u) { -// continue; -// } - -// for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { -// let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; -// if (xWidth < 0u || xWidth >= ${xShape[3]}u) { -// continue; -// } - -// ${ -// xIndicesHelper.indicesVariableDeclaration( -// 'xIndices', -// [ -// 'batch', 'input_channel', 'xHeight', 'xWidth' -// ])} -// let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; -// ${ -// wIndicesHelper.indicesVariableDeclaration('wIndices', [ -// 'output_channel', 'wInChannel', 'wHeight', 'wWidth' -// ])} -// let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; -// value += xVal*wVal; -// } -// } -// } -// ${processBias} -// ${applyActivation} -// output[global_id.x] = value; -// }`; -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// export const createGroupedConvProgramInfoLoader = -// (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): -// ProgramInfoLoader => { -// const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); -// return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, -// attributes)}; -// }; +import {TensorView} from '../../tensor'; +import {ShapeUtil} from '../../util'; +import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {calculateOutputShape, ConvAttributes} from './conv'; +import {getActicationSnippet} from './fuse-utils'; + +const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({ + name: 'GroupedConv', + inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : + [GpuDataType.default, GpuDataType.default], + cacheHint +}); + +const createGroupedConvProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes): ProgramInfo => { + const hasBias = inputs.length > 2; + const processBias = hasBias ? 
'value += b[output_channel];' : ''; + const xShape = inputs[0].dims; + const wShape = inputs[1].dims; + const outputChannelsPerGroup = wShape[0] / attributes.group; + + const dataType = 'f32'; // TODO: support other data type + const {activationFunction, applyActivation} = getActicationSnippet(attributes); + const inputStorageBuffersDeclarations = [ + `@group(0) @binding(0) var x : array<${dataType}>;`, + `@group(0) @binding(1) var w : array<${dataType}>;` + ]; + if (hasBias) { + inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); + } + + const outputShape = + calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); + const outputSize = ShapeUtil.size(outputShape); + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const xIndicesHelper = createIndicesHelper('x', xShape); + const wIndicesHelper = createIndicesHelper('w', wShape); + + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); + const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); + + ${inputStorageBuffersDeclarations.join('\n')} + @group(0) @binding(${inputStorageBuffersDeclarations.length}) var output : array<${dataType}>; + + ${activationFunction} + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + ${wIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3) { + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} + let batch: u32 = outputIndices[0]; + let output_channel: u32 = outputIndices[1]; + let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; + let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; + + var value: ${dataType} = ${dataType}(0); + for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { + let input_channel = group_id * ${wShape[1]}u + wInChannel; + for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { + let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; + + if (xHeight < 0u || xHeight >= ${xShape[2]}u) { + continue; + } + + for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { + let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; + if (xWidth < 0u || xWidth >= ${xShape[3]}u) { + continue; + } + + ${ + xIndicesHelper.indicesVariableDeclaration( + 'xIndices', + [ + 'batch', 'input_channel', 'xHeight', 'xWidth' + ])} + let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${ + wIndicesHelper.indicesVariableDeclaration('wIndices', [ + 'output_channel', 'wInChannel', 'wHeight', 'wWidth' + ])} + let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; + value += xVal*wVal; + } + } + } + ${processBias} + ${applyActivation} + output[global_id.x] = value; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +export const createGroupedConvProgramInfoLoader = + (inputs: readonly TensorView[], attributes: ConvAttributes): ProgramInfoLoader => { + const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); + return 
{...metadata, get: () => createGroupedConvProgramInfo(inputs, metadata, attributes)}; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index d68fae4152abb..d3a33e5b77dbe 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -1,152 +1,146 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {InferenceHandler} from '../../../backend'; -// import {Graph} from '../../../graph'; -// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {PoolConvUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; - -// import {createGroupedConvProgramInfoLoader} from './conv-grouped'; -// // import {createDotProductProgramInfoLoader} from './dot-product'; -// import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -// // import {createIm2ColProgramInfoLoader} from './im2col'; -// // import {createMatmulProgramInfoLoader} from './matmul'; - - -// export const calculateOutputShape = -// (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], -// adjustPads: readonly number[], strides: readonly number[]): number[] => { -// const batchSize = inputShape[0]; -// const inputSpatialShape = inputShape.slice(2); -// const spatialRank = inputSpatialShape.length; -// const outChannels = kernelShape[0]; -// const kernelSpatialShape = kernelShape.slice(2); -// const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); -// const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + -// spatialRank]); const outputSpatialShape = -// inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); -// const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); -// return outputShape; -// }; - -// export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { -// readonly autoPad: string; -// readonly dilations: readonly number[]; -// readonly group: number; -// readonly kernelShape: readonly number[]; -// readonly pads: readonly number[]; -// readonly strides: readonly number[]; -// } - -// export const conv: OperatorAsyncImplementation = -// async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { -// validateInputs(inputs, attributes); // currently will fail if not conv2D -// return conv2d(inferenceHandler, inputs, attributes); -// }; - -// const conv2d: OperatorAsyncImplementation = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { -// const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); -// // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; -// // if (adjustedAttributes.group > 1) { -// return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), -// inputs); -// // } else if (isPointwise) { -// // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); -// // } else { -// // return conv2D(inferenceHandler, inputs, adjustedAttributes); -// // } -// }; - -// 
const getAdjustedConvAttributes = (attributes: T, inputs: Tensor[]): T => { -// const kernelShape = attributes.kernelShape.slice(); -// // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims -// if (attributes.kernelShape.length === 0) { -// for (let i = 2; i < inputs[1].dims.length; ++i) { -// kernelShape.push(inputs[1].dims[i]); -// } -// } -// const pads = attributes.pads.slice(); -// PoolConvUtil.adjustPadsBasedOnAutoPad( -// inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); - -// // always return a new object so does not modify the original attributes -// const newAttributes: T = Object.assign({}, attributes); -// Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); -// return newAttributes; -// }; - -// export const parseConvAttributes: OperatorInitialization = (node: Graph.Node): ConvAttributes => { -// const attributes = node.attributes; -// const activationAttributes = parseInternalActivationAttributes(attributes); -// // TODO : Make this generic enough to compute default attributes for multi-dimensional conv -// const autoPad = attributes.getString('auto_pad', 'NOTSET'); -// const dilations = attributes.getInts('dilations', [1, 1]); -// const group = attributes.getInt('group', 1); -// const kernelShape = attributes.getInts('kernel_shape', []); -// const pads = attributes.getInts('pads', [0, 0, 0, 0]); -// const strides = attributes.getInts('strides', [1, 1]); - -// return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, -// ...activationAttributes}); -// }; - -// const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => { -// // Refer to the below link for all input checks -// // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv -// if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { -// throw new Error('Conv requires 2 or 3 inputs'); -// } - -// // TODO : Need to add support for multi-dimensional conv -// if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { -// throw new Error('currently only support 2-dimensional conv'); -// } - -// // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL -// const dataChannel = inputs[0].dims[1]; -// const filterInChannel = inputs[1].dims[1] * attributes.group; -// if (dataChannel !== filterInChannel) { -// throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); -// } - -// // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps -// if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { -// throw new Error('invalid bias'); -// } - -// const spatialRank = inputs[0].dims.length - 2; -// // wrong dilations dimension -// if (attributes.dilations.length !== spatialRank) { -// throw new Error(`dilations should be ${spatialRank}D`); -// } - -// // Wrong strides dimension -// if (attributes.strides.length !== spatialRank) { -// throw new Error(`strides should be ${spatialRank}D`); -// } - -// // Wrong pads dimension -// if (attributes.pads.length !== spatialRank * 2) { -// throw new Error(`pads should be ${spatialRank * 2}D`); -// } - -// // if kernelShape is specified, it's data length must be 2 less than dims length of the weights tensor -// // (the first 2 dims are batch_size and channels) -// if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { -// throw new 
Error('invalid kernel shape'); -// } - -// // TODO : Need to add support for float64 -// if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') { -// throw new Error('Conv input(X,W) should be float tensor'); -// } - -// if (inputs.length === 3 && inputs[2].type !== 'float32') { -// throw new Error('Conv input(bias) should be float tensor'); -// } -// }; +import {DataType} from '../../../wasm-core-impl'; +import {TensorView} from '../../tensor'; +import {PoolConvUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext} from '../types'; + +import {createGroupedConvProgramInfoLoader} from './conv-grouped'; +// import {createDotProductProgramInfoLoader} from './dot-product'; +import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; + +// import {createIm2ColProgramInfoLoader} from './im2col'; +// import {createMatmulProgramInfoLoader} from './matmul'; + + +export const calculateOutputShape = + (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], + adjustPads: readonly number[], strides: readonly number[]): number[] => { + const batchSize = inputShape[0]; + const inputSpatialShape = inputShape.slice(2); + const spatialRank = inputSpatialShape.length; + const outChannels = kernelShape[0]; + const kernelSpatialShape = kernelShape.slice(2); + const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); + const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); + const outputSpatialShape = + inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); + const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); + return outputShape; + }; + +export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { + readonly autoPad: string; + readonly dilations: readonly number[]; + readonly group: number; + readonly kernelShape: readonly number[]; + readonly pads: readonly number[]; + readonly strides: readonly number[]; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: ConvAttributes): void => { + // Refer to the below link for all input checks + // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv + if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { + throw new Error('Conv requires 2 or 3 inputs'); + } + + // TODO : Need to add support for multi-dimensional conv + if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { + throw new Error('currently only support 2-dimensional conv'); + } + + // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL + const dataChannel = inputs[0].dims[1]; + const filterInChannel = inputs[1].dims[1] * attributes.group; + if (dataChannel !== filterInChannel) { + throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); + } + + // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps + if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { + throw new Error('invalid bias'); + } + + const spatialRank = inputs[0].dims.length - 2; + // wrong dilations dimension + if (attributes.dilations.length !== spatialRank) { + throw new Error(`dilations should be ${spatialRank}D`); + } + + // Wrong strides dimension + if (attributes.strides.length !== spatialRank) { + 
throw new Error(`strides should be ${spatialRank}D`); + } + + // Wrong pads dimension + if (attributes.pads.length !== spatialRank * 2) { + throw new Error(`pads should be ${spatialRank * 2}D`); + } + + // if kernelShape is specified, it's data length must be 2 less than dims length of the weights tensor + // (the first 2 dims are batch_size and channels) + if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { + throw new Error('invalid kernel shape'); + } + + // TODO : Need to add support for float64 + if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) { + throw new Error('Conv input(X,W) should be float tensor'); + } + + if (inputs.length === 3 && inputs[2].dataType !== DataType.float) { + throw new Error('Conv input(bias) should be float tensor'); + } +}; + +const getAdjustedConvAttributes = (attributes: T, inputs: readonly TensorView[]): T => { + const kernelShape = attributes.kernelShape.slice(); + // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims + for (let i = 2; i < inputs[1].dims.length; ++i) { + if (kernelShape[i - 2] === 0) { + kernelShape[i - 2] = inputs[1].dims[i]; + } + } + const pads = attributes.pads.slice(); + PoolConvUtil.adjustPadsBasedOnAutoPad( + inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); + + // always return a new object so does not modify the original attributes + const newAttributes: T = Object.assign({}, attributes); + Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); + return newAttributes; +}; + +export const parseConvAttributes = (attributes: Record): ConvAttributes => { + const activationAttributes = parseInternalActivationAttributes(attributes); + // TODO : Make this generic enough to compute default attributes for multi-dimensional conv + const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number]; + const dilations = [attributes.dilation0 as number, attributes.dilation1 as number]; + const group = attributes.group as number; + const kernelShape = [attributes.kernelshape0 as number, attributes.kernelshape1 as number]; + const pads = + [attributes.pad0 as number, attributes.pad1 as number, attributes.pad2 as number, attributes.pad3 as number]; + const strides = [attributes.stride0 as number, attributes.stride1 as number]; + + return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes}); +}; + +const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => { + const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); + // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; + // if (adjustedAttributes.group > 1) { + return context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + // } else if (isPointwise) { + // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); + // } else { + // return conv2D(inferenceHandler, inputs, adjustedAttributes); + // } +}; + +export const conv = (context: ComputeContext, attributes: ConvAttributes): number => { + validateInputs(context.inputs, attributes); // currently will fail if not conv2D + return conv2d(context, attributes); +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index 1b403505a5962..05785ba72ac0e 100644 
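A note on the Conv attribute plumbing above: the C++ kernel (see conv.h later in this patch) flattens `ConvAttributes` into a plain numeric record via `JSEP_INIT_KERNEL_ATTRIBUTE`, and `parseConvAttributes` rebuilds the structured form on the TypeScript side. A sketch of that mapping, assuming only the flat key names that appear in the two files (the helper itself is illustrative). One apparent inconsistency: conv.h emits the auto-pad value under the key "autopad" while `parseConvAttributes` reads "auto_pad", so one of the two sides presumably still needs to be aligned.

    // Illustrative only: how the flat record produced by
    // JSEP_INIT_KERNEL_ATTRIBUTE maps onto the structured attributes
    // consumed by parseConvAttributes.
    const AUTO_PAD = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'] as const;

    const rebuildConvAttributes = (attributes: Record<string, unknown>) => ({
      autoPad: AUTO_PAD[attributes.auto_pad as number],
      dilations: [attributes.dilation0 as number, attributes.dilation1 as number],
      group: attributes.group as number,
      // 0 means "not specified"; getAdjustedConvAttributes then infers the
      // kernel shape from the weight tensor dims (inputs[1].dims[2..]).
      kernelShape: [attributes.kernelshape0 as number, attributes.kernelshape1 as number],
      pads: [
        attributes.pad0 as number, attributes.pad1 as number,
        attributes.pad2 as number, attributes.pad3 as number,
      ],
      strides: [attributes.stride0 as number, attributes.stride1 as number],
    });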
--- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -1,38 +1,40 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// export interface InternalActivationAttributes { -// readonly activation: string; -// readonly clipMin?: number; -// readonly clipMax?: number; -// readonly activationCacheKey: string; -// } +import {MAX_CLIP, MIN_CLIP} from '../../util'; -// export const getActicationSnippet = -// (attributes: InternalActivationAttributes): {activationFunction: string; applyActivation: string} => { -// switch (attributes.activation) { -// case 'Relu': -// return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; -// case 'Sigmoid': -// return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; -// case 'Clip': -// return { -// activationFunction: -// `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, -// applyActivation: 'value = clamp(value, clip_min_, clip_max_);' -// }; -// // TODO: adding other activations that can be fused. -// default: -// return {activationFunction: '', applyActivation: ''}; -// } -// }; +export interface InternalActivationAttributes { + readonly activation: string; + readonly clipMin?: number; + readonly clipMax?: number; + readonly activationCacheKey: string; +} -// export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => { -// const activation = attributes.getString('activation', ''); +export const getActicationSnippet = + (attributes: InternalActivationAttributes): {activationFunction: string; applyActivation: string} => { + switch (attributes.activation) { + case 'Relu': + return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; + case 'Sigmoid': + return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; + case 'Clip': + return { + activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, + applyActivation: 'value = clamp(value, clip_min_, clip_max_);' + }; + // TODO: adding other activations that can be fused. 
+ default: + return {activationFunction: '', applyActivation: ''}; + } + }; -// if (activation === 'Clip') { -// const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]); -// return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; -// } -// return {activation, activationCacheKey: activation}; -// }; +export const parseInternalActivationAttributes = + (attributes: Record|undefined): InternalActivationAttributes => { + const activation = attributes?.activation as string || ''; + + if (activation === 'Clip') { + const [clipMin, clipMax] = attributes?.activation_params as [number, number] || [MIN_CLIP, MAX_CLIP]; + return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; + } + return {activation, activationCacheKey: activation}; + }; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index b57697de818ce..7173344653287 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -38,6 +38,11 @@ KernelCreateInfo BuildKernelCreateInfo() { ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, type, Op)> class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 14, Abs); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); + +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); // class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); // class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); @@ -58,6 +63,10 @@ std::unique_ptr RegisterKernels() { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // KERNEL_CREATE_INFO(11, Conv), // KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), // KERNEL_CREATE_INFO(12, MaxPool), diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc new file mode 100644 index 0000000000000..018bdcf76bf11 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/nn/conv_attributes.h" + +#include "conv.h" + +namespace onnxruntime { +namespace js { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Conv, \ + kMSInternalNHWCDomain, \ + 1, 10, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Conv, \ + kMSInternalNHWCDomain, \ + 11, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); + +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( + Conv, + kMSInternalNHWCDomain, + 1, 10, + T, + kOnnxDomain, + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); +ONNX_OPERATOR_TYPED_KERNEL_EX( + Conv, + kMSInternalNHWCDomain, + 11, + T, + kOnnxDomain, + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + + + +REGISTER_KERNEL_TYPED(float) + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h new file mode 100644 index 0000000000000..90f79de92e31b --- /dev/null +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/nn/conv_attributes.h" + +namespace onnxruntime { +namespace js { + +template +class Conv : public JsKernel { + public: + Conv(const OpKernelInfo& info) : JsKernel(info), conv_attrs_(info) { + + TensorShapeVector kernel_shape; + if (conv_attrs_.kernel_shape_specified) { + ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK()); + } + + // currently only support Conv2D. TODO: support other + JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ + "autopad": $0, + "dilation0": $1, + "dilation1": $2, + "group": $3, + "kernelshape0": $4, + "kernelshape1": $5, + "pad0": $6, + "pad1": $7, + "pad2": $8, + "pad3": $9, + "stride0": $10, + "stride1": $11, + }), + conv_attrs_.auto_pad, + conv_attrs_.dilations[0], + conv_attrs_.dilations[1], + conv_attrs_.group, + conv_attrs_.kernel_shape_specified ? kernel_shape[0] : 0, + conv_attrs_.kernel_shape_specified ? kernel_shape[1] : 0, + conv_attrs_.pads[0], + conv_attrs_.pads[1], + conv_attrs_.pads[2], + conv_attrs_.pads[3], + conv_attrs_.strides[0], + conv_attrs_.strides[1] + ); + } + + protected: + ConvAttributes conv_attrs_; +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index 661490fd8684c..fcb58c078eb28 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
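For reference while reading the EM_ASM literal in conv.h above: a hypothetical TypeScript mirror of the attribute object the JS side receives from JSEP_INIT_KERNEL_ATTRIBUTE. Field names are verbatim from the literal; the autopad mapping follows the conv.ts parser later in this series.

// Shape of the raw Conv attribute record passed to Module.jsepCreateKernel
// (illustrative type only; not part of the patch).
interface ConvRawAttributes {
  autopad: number;       // index into ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER']
  dilation0: number;
  dilation1: number;
  group: number;
  kernelshape0: number;  // 0 when kernel_shape is not specified
  kernelshape1: number;
  pad0: number;
  pad1: number;
  pad2: number;
  pad3: number;
  stride0: number;
  stride1: number;
}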
-#include "core/providers/js/js_execution_provider.h" #include "core/providers/js/js_kernel.h" namespace onnxruntime { From e5dc7f7cddcb020e94fde684e32010e6d3e8fd7f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 1 Nov 2022 12:53:51 -0700 Subject: [PATCH 13/81] 10 --- .../providers/js/js_execution_provider.cc | 17 +++++--- onnxruntime/core/providers/js/js_kernel.h | 16 +++++++ .../core/providers/js/operators/conv.cc | 33 ++++++-------- .../core/providers/js/operators/unary.cc | 43 ++++++------------- 4 files changed, 51 insertions(+), 58 deletions(-) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 7173344653287..8089a63b2c3b8 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -27,18 +27,20 @@ KernelCreateInfo BuildKernelCreateInfo() { #define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op) \ BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, End, Op)> + ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, Start, End, Op)> #define KERNEL_CREATE_INFO(Start, Op) \ BuildKernelCreateInfo< \ - ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, Op)> + ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, Start, Op)> #define KERNEL_CREATE_INFO_TYPED(Start, type, Op) \ BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, Start, type, Op)> + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, Start, type, Op)> -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 14, Abs); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 13, Abs); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Abs); + +//class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); @@ -62,8 +64,9 @@ std::unique_ptr RegisterKernels() { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, + KERNEL_CREATE_INFO_VERSIONED(1, 13, Abs), + KERNEL_CREATE_INFO(14, Abs), + //BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 774bb437fcf3b..041512953f839 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -16,6 +16,22 @@ namespace js { #define JSEP_INIT_KERNEL(x) EM_ASM({ Module.jsepCreateKernel(#x, $0, undefined); }, this) #define JSEP_INIT_KERNEL_ATTRIBUTE(x, a, ...) 
EM_ASM({ Module.jsepCreateKernel(#x, $0, a); }, this, __VA_ARGS__) +#define JSEP_KERNEL_IMPL(classname, x) \ +class classname : public JsKernel { \ +public: \ + classname(const OpKernelInfo& info) : JsKernel(info) { \ + JSEP_INIT_KERNEL(x); \ + } \ +}; + +#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, a, ...) \ +class classname : public JsKernel { \ +public: \ + classname(const OpKernelInfo& info) : JsKernel(info) { \ + JSEP_INIT_KERNEL_ATTRIBUTE(x, a, __VA_ARGS__); \ + } \ +}; + class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index 018bdcf76bf11..e10471acdc04f 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -10,40 +10,31 @@ namespace onnxruntime { namespace js { #define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Conv, \ + kMSInternalNHWCDomain, \ + 11, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ - kMSInternalNHWCDomain, \ + kOnnxDomain, \ 1, 10, \ T, \ - kJsExecutionProvider, \ + kJsExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ - kMSInternalNHWCDomain, \ + kOnnxDomain, \ 11, \ T, \ - kJsExecutionProvider, \ + kJsExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Conv); -ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( - Conv, - kMSInternalNHWCDomain, - 1, 10, - T, - kOnnxDomain, - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); -ONNX_OPERATOR_TYPED_KERNEL_EX( - Conv, - kMSInternalNHWCDomain, - 11, - T, - kOnnxDomain, - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); - REGISTER_KERNEL_TYPED(float) diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index fcb58c078eb28..f8fdf3ec30ba5 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -6,39 +6,22 @@ namespace onnxruntime { namespace js { -class AbsImpl : public JsKernel { -public: - AbsImpl(const OpKernelInfo& info) : JsKernel(info) { - JSEP_INIT_KERNEL(Abs); - } -}; +#define JSEP_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ + ONNX_OPERATOR_KERNEL_EX( \ + OP_TYPE, kOnnxDomain, VERSION, kJsExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); +#define JSEP_ELEMENTWISE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, TYPE, KERNEL_CLASS) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + OP_TYPE, kOnnxDomain, VERSION_FROM, VERSION_TO, kJsExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); -// class kJsExecutionProvider_Abs_kOnnxDomain_ver1_14; -// template <> KernelCreateInfo BuildKernelCreateInfo() { -// return KernelCreateInfo( -// KernelDefBuilder() -// .TypeConstraint("T", DataTypeImpl::GetTensorType()) -// .SetName("Abs") -// .SetDomain(kOnnxDomain) -// .SinceVersion(1, 14) -// .Provider(kJsExecutionProvider).Build(), -// static_cast( -// [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { -// out = std::make_unique(info); -// return Status::OK(); -// 
}) -// ); -// } -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Abs, - kOnnxDomain, - 1, - 14, - kJsExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - AbsImpl); +JSEP_KERNEL_IMPL(Abs, Abs) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Abs, 1, 13, float, Abs) +JSEP_ELEMENTWISE_KERNEL(Abs, 14, float, Abs) } // namespace js } // namespace onnxruntime From 9e81a0438e67ac79a5119bef2af7fdce3538d24b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 7 Nov 2022 14:22:49 -0800 Subject: [PATCH 14/81] 11 --- cmake/onnxruntime_webassembly.cmake | 2 +- js/web/lib/wasm/jsep/init.ts | 2 +- js/web/lib/wasm/jsep/util.ts | 6 +- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 10 +++ .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 21 ++++--- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 21 ++++--- .../core/framework/execution_provider.cc | 1 + .../core/framework/graph_partitioner.cc | 4 ++ onnxruntime/core/framework/kernel_lookup.h | 3 + onnxruntime/core/framework/kernel_registry.cc | 4 ++ .../providers/js/js_execution_provider.cc | 61 +++++++++++++++++-- .../core/providers/js/operators/conv.cc | 17 +++--- .../core/providers/js/operators/conv.h | 49 +++++++-------- 13 files changed, 143 insertions(+), 58 deletions(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 70bf05ba2073d..a157faf811279 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -214,7 +214,7 @@ else() -s NO_FILESYSTEM=1 \ ${WASM_API_EXCEPTION_CATCHING} \ -s ASYNCIFY=1 \ - -s ASYNCIFY_STACK_SIZE=8192 \ + -s ASYNCIFY_STACK_SIZE=16384 \ -s ASYNCIFY_ADVISE=1 \ -s ASYNCIFY_DEBUG=0 \ -s ASYNCIFY_IGNORE_INDIRECT=0 \ diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 965bd2fa570bd..91e96c950de22 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -57,7 +57,7 @@ class OpKernelContext implements ComputeContext { export const init = async(module: OrtWasmModule): Promise => { // init JSEP if available const init = module.jsepInit; - if (init) { + if (init && navigator.gpu) { const backend = new WebGpuBackend(); await backend.initialize(); diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index f4efee4164e93..228d4d3d0a1da 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -586,7 +586,7 @@ export class PoolConvUtil { // adjust pad values based on 'autoPad' attribute static adjustPadsBasedOnAutoPad( inputDims: readonly number[], strides: readonly number[], dilations: readonly number[], - kernelShape: readonly number[], pads: number[], autoPad?: string): void { + kernelShape: readonly number[], pads: number[], isChannelLast: boolean, autoPad?: string): void { if (!autoPad) { return; } @@ -605,8 +605,8 @@ export class PoolConvUtil { for (let dim = 0; dim < inputDims.length - 2; dim++) { PoolConvUtil.adjustPadAndReturnShape( - inputDims[dim + 2], strides[dim], dilations[dim], kernelShape[dim], pads, dim, dim + inputDims.length - 2, - autoPad); + inputDims[dim + (isChannelLast ? 
1 : 2)], strides[dim], dilations[dim], kernelShape[dim], pads, dim, + dim + inputDims.length - 2, autoPad); } } diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index 2596eec46f6f9..b2ccb9b4ec0d5 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -93,6 +93,10 @@ class GpuDataManagerImpl implements GpuDataManager { this.backend.getCommandEncoder().copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size); this.backend.flush(); + + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.upload(id=${id})`); + gpuBufferForUploading.destroy(); } @@ -111,6 +115,9 @@ class GpuDataManagerImpl implements GpuDataManager { const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; this.storageCache.set(gpuData.id, {gpuData, originalSize: size}); + + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.create(size=${size}) => id=${gpuData.id}`); return gpuData; } @@ -124,6 +131,9 @@ class GpuDataManagerImpl implements GpuDataManager { throw new Error('releasing data does not exist'); } + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`); + this.storageCache.delete(id); cachedData.gpuData.buffer.destroy(); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index d29036a7313ff..fb9a57a10b755 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -34,8 +34,9 @@ const createGroupedConvProgramInfo = inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); } - const outputShape = - calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); + const isChannelLast = attributes.format === 'NHWC'; + const outputShape = calculateOutputShape( + xShape, wShape, attributes.dilations, attributes.pads, attributes.strides, isChannelLast); const outputSize = ShapeUtil.size(outputShape); const outputIndicesHelper = createIndicesHelper('output', outputShape); const xIndicesHelper = createIndicesHelper('x', xShape); @@ -64,8 +65,9 @@ const createGroupedConvProgramInfo = ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} let batch: u32 = outputIndices[0]; - let output_channel: u32 = outputIndices[1]; - let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; + let output_channel: u32 = outputIndices[${isChannelLast ? 3 : 1}]; + let xRCCorner: vec2 = vec2(outputIndices[${isChannelLast ? 1 : 2}], outputIndices[${ + isChannelLast ? 2 : 3}]) * strides - pads; let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; var value: ${dataType} = ${dataType}(0); @@ -74,22 +76,23 @@ const createGroupedConvProgramInfo = for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; - if (xHeight < 0u || xHeight >= ${xShape[2]}u) { + if (xHeight < 0u || xHeight >= ${xShape[isChannelLast ? 1 : 2]}u) { continue; } for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; - if (xWidth < 0u || xWidth >= ${xShape[3]}u) { + if (xWidth < 0u || xWidth >= ${xShape[isChannelLast ? 
2 : 3]}u) { continue; } ${ xIndicesHelper.indicesVariableDeclaration( 'xIndices', - [ - 'batch', 'input_channel', 'xHeight', 'xWidth' - ])} + isChannelLast ? ['batch', 'xHeight', 'xWidth', 'input_channel'] : + [ + 'batch', 'input_channel', 'xHeight', 'xWidth' + ])} let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; ${ wIndicesHelper.indicesVariableDeclaration('wIndices', [ diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index d3a33e5b77dbe..47e8f68f5a0f0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -17,23 +17,25 @@ import {InternalActivationAttributes, parseInternalActivationAttributes} from '. export const calculateOutputShape = (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], - adjustPads: readonly number[], strides: readonly number[]): number[] => { + adjustPads: readonly number[], strides: readonly number[], isChannelLast: boolean): number[] => { const batchSize = inputShape[0]; - const inputSpatialShape = inputShape.slice(2); + const inputSpatialShape = inputShape.slice(isChannelLast ? 1 : 2, isChannelLast ? 3 : 4); const spatialRank = inputSpatialShape.length; const outChannels = kernelShape[0]; const kernelSpatialShape = kernelShape.slice(2); const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); - const outputSpatialShape = + const outputShape = inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); - const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); + outputShape.splice(0, 0, batchSize); + outputShape.splice(isChannelLast ? 3 : 1, 0, outChannels); return outputShape; }; export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { readonly autoPad: string; readonly dilations: readonly number[]; + readonly format: 'NHWC'|'NCHW'; readonly group: number; readonly kernelShape: readonly number[]; readonly pads: readonly number[]; @@ -53,7 +55,7 @@ const validateInputs = (inputs: readonly TensorView[], attributes: ConvAttribute } // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL - const dataChannel = inputs[0].dims[1]; + const dataChannel = inputs[0].dims[attributes.format === 'NHWC' ? 
3 : 1]; const filterInChannel = inputs[1].dims[1] * attributes.group; if (dataChannel !== filterInChannel) { throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); @@ -106,7 +108,8 @@ const getAdjustedConvAttributes = (attributes: T, inpu } const pads = attributes.pads.slice(); PoolConvUtil.adjustPadsBasedOnAutoPad( - inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); + inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.format === 'NHWC', + attributes.autoPad); // always return a new object so does not modify the original attributes const newAttributes: T = Object.assign({}, attributes); @@ -117,7 +120,8 @@ const getAdjustedConvAttributes = (attributes: T, inpu export const parseConvAttributes = (attributes: Record): ConvAttributes => { const activationAttributes = parseInternalActivationAttributes(attributes); // TODO : Make this generic enough to compute default attributes for multi-dimensional conv - const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number]; + const format = attributes.format as 'NHWC' | 'NCHW'; + const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.autopad as number]; const dilations = [attributes.dilation0 as number, attributes.dilation1 as number]; const group = attributes.group as number; const kernelShape = [attributes.kernelshape0 as number, attributes.kernelshape1 as number]; @@ -125,7 +129,8 @@ export const parseConvAttributes = (attributes: Record): ConvAt [attributes.pad0 as number, attributes.pad1 as number, attributes.pad2 as number, attributes.pad3 as number]; const strides = [attributes.stride0 as number, attributes.stride1 as number]; - return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes}); + return createAttributeWithCacheKey( + {autoPad, format, dilations, group, kernelShape, pads, strides, ...activationAttributes}); }; const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => { diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index 5bc5dcdbd7696..5564012221fec 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -32,6 +32,7 @@ IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup) const { std::vector> result; for (const auto& node : graph.Nodes()) { + printf("IExecutionProvider::GetCapability() calling on node: [%s][%s][%s]\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str()); if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); kernel_create_info != nullptr) { std::unique_ptr sub_graph = std::make_unique(); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index d34f685415f27..15bb6ea120f7f 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -106,6 +106,10 @@ static bool TryAssignSingleNode(Graph& graph, assert(indexed_sub_graph.GetMetaDef() == nullptr && indexed_sub_graph.nodes.size() == 1); auto* node = graph.GetNode(indexed_sub_graph.nodes[0]); + // if (node->Domain() == kMSInternalNHWCDomain && node->Op() == nullptr ) { + // printf("TryAssignSingleNode() calling SetOpSchemaFromRegistryForNode()\n"); + // graph.SetOpSchemaFromRegistryForNode(*node); + // } 
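To make the NHWC shape arithmetic in the calculateOutputShape change above concrete, a worked example (values are illustrative):

// NHWC input [1, 224, 224, 3], 64 3x3 kernels, stride 2, pad 1 on all sides.
const outShape = calculateOutputShape(
    [1, 224, 224, 3],  // inputShape (NHWC)
    [64, 3, 3, 3],     // kernelShape [outChannels, inChannels/group, kH, kW]
    [1, 1],            // dilations
    [1, 1, 1, 1],      // adjustPads
    [2, 2],            // strides
    true);             // isChannelLast
// -> [1, 112, 112, 64]: each spatial dim is floor((224 + 1 + 1 - 3 + 2) / 2) = 112,
//    and outChannels is spliced in at index 3 because the layout is NHWC.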
if (nullptr != node && node->GetExecutionProviderType().empty()) { // The node was not fused or assigned. Assign it to . node->SetExecutionProviderType(provider_type); diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h index 2b4d3ce81623a..933aed4542c06 100644 --- a/onnxruntime/core/framework/kernel_lookup.h +++ b/onnxruntime/core/framework/kernel_lookup.h @@ -30,14 +30,17 @@ class KernelLookup final : public IExecutionProvider::IKernelLookup { const KernelCreateInfo* LookUpKernel(const Node& node) const override { const KernelCreateInfo* kernel_create_info{}; + printf(" LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str()); for (const auto& registry : kernel_registries_) { const auto lookup_status = registry->TryFindKernel(node, provider_type_, kernel_type_str_resolver_, &kernel_create_info); if (lookup_status.IsOK() && kernel_create_info != nullptr) { + printf(" - found\n"); return kernel_create_info; } } + printf(" - not found\n"); return nullptr; } diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index e2bc7c3e3ce6f..652e2a8860e17 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -166,6 +166,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, const auto& node_provider = node.GetExecutionProviderType(); const auto& expected_provider = (node_provider.empty() ? exec_provider : node_provider); + printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); auto range = kernel_creator_fn_map_.equal_range(GetMapKey(node.OpType(), node.Domain(), expected_provider)); if (out) *out = nullptr; @@ -175,6 +176,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, std::string error_str; if (VerifyKernelDef(node, *i->second.kernel_def, kernel_type_str_resolver, error_str)) { if (out) *out = &i->second; + printf(" KernelRegistry::TryFindKernel() OK\n"); return Status::OK(); } verify_kernel_def_error_strs.push_back(error_str); @@ -191,9 +193,11 @@ Status KernelRegistry::TryFindKernel(const Node& node, oss << ")"; VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); + printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } + printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); return Status(common::ONNXRUNTIME, common::FAIL, "Kernel not found"); } diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 8089a63b2c3b8..0079253e64fa2 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -10,9 +10,9 @@ #include "core/graph/function_utils.h" #include "core/framework/compute_capability.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/kernel_registry.h" #include "core/providers/shared/node_unit/node_unit.h" - #include "allocator.h" #include "data_transfer.h" @@ -25,6 +25,43 @@ KernelCreateInfo BuildKernelCreateInfo() { return info; } +class Memcpy final : public OpKernel { + public: + Memcpy(const OpKernelInfo& info) : OpKernel(info) {} + + Status Compute(OpKernelContext* ctx) const override { + const auto* X 
= ctx->Input(0); + Tensor* Y = ctx->Output(0, X->Shape()); + Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId()); + return retval; + } +}; + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, MemcpyFromHost); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, MemcpyToHost); + +ONNX_OPERATOR_KERNEL_EX( + MemcpyFromHost, + kOnnxDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 0) + .ExecQueueId(0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Memcpy); + +ONNX_OPERATOR_KERNEL_EX( + MemcpyToHost, + kOnnxDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .OutputMemoryType(OrtMemTypeCPUOutput, 0) + .ExecQueueId(1) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Memcpy); + #define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op) \ BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, Start, End, Op)> @@ -43,7 +80,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Abs //class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); +// class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); // class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); @@ -64,11 +101,13 @@ std::unique_ptr RegisterKernels() { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, KERNEL_CREATE_INFO_VERSIONED(1, 13, Abs), KERNEL_CREATE_INFO(14, Abs), //BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + //BuildKernelCreateInfo, BuildKernelCreateInfo, // KERNEL_CREATE_INFO(11, Conv), // KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), @@ -126,7 +165,21 @@ std::vector> JsExecutionProvider::GetCapabili const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup) const { - return IExecutionProvider::GetCapability(graph, kernel_lookup); + auto list = IExecutionProvider::GetCapability(graph, kernel_lookup); + printf("JsExecutionProvider::GetCapability() results:\n"); + + for (size_t i=0; i < list.size(); i++) { + printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); + for (size_t j=0;jsub_graph->nodes.size();j++) { + auto node_index = list[i]->sub_graph->nodes[j]; + auto *node = graph.GetNode(node_index); + //auto *kernel_info = kernel_lookup.LookUpKernel(&node); + + printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); + } + } + + return list; } std::shared_ptr JsExecutionProvider::GetKernelRegistry() const { diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index e10471acdc04f..1916d84eb2720 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -18,14 +18,6 @@ namespace js { 
kJsExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Conv); \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - Conv, \ - kOnnxDomain, \ - 1, 10, \ - T, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ kOnnxDomain, \ @@ -35,6 +27,15 @@ namespace js { (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Conv); +// ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( +// Conv, +// kOnnxDomain, +// 1, 10, +// T, +// kJsExecutionProvider, +// (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), +// Conv); + REGISTER_KERNEL_TYPED(float) diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 90f79de92e31b..96137b29803db 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -21,31 +21,32 @@ class Conv : public JsKernel { // currently only support Conv2D. TODO: support other JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ - "autopad": $0, - "dilation0": $1, - "dilation1": $2, - "group": $3, - "kernelshape0": $4, - "kernelshape1": $5, - "pad0": $6, - "pad1": $7, - "pad2": $8, - "pad3": $9, - "stride0": $10, - "stride1": $11, + "format": "NHWC", + "autopad": $1, + "dilation0": $2, + "dilation1": $3, + "group": $4, + "kernelshape0": $5, + "kernelshape1": $6, + "pad0": $7, + "pad1": $8, + "pad2": $9, + "pad3": $10, + "stride0": $11, + "stride1": $12, }), - conv_attrs_.auto_pad, - conv_attrs_.dilations[0], - conv_attrs_.dilations[1], - conv_attrs_.group, - conv_attrs_.kernel_shape_specified ? kernel_shape[0] : 0, - conv_attrs_.kernel_shape_specified ? kernel_shape[1] : 0, - conv_attrs_.pads[0], - conv_attrs_.pads[1], - conv_attrs_.pads[2], - conv_attrs_.pads[3], - conv_attrs_.strides[0], - conv_attrs_.strides[1] + static_cast(conv_attrs_.auto_pad), + static_cast(conv_attrs_.dilations[0]), + static_cast(conv_attrs_.dilations[1]), + static_cast(conv_attrs_.group), + static_cast(conv_attrs_.kernel_shape_specified ? kernel_shape[0] : 0), + static_cast(conv_attrs_.kernel_shape_specified ? 
kernel_shape[1] : 0), + static_cast(conv_attrs_.pads[0]), + static_cast(conv_attrs_.pads[1]), + static_cast(conv_attrs_.pads[2]), + static_cast(conv_attrs_.pads[3]), + static_cast(conv_attrs_.strides[0]), + static_cast(conv_attrs_.strides[1]) ); } From c9b36ea3b437da1b9450c0c044426854c73e7e72 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 14 Nov 2022 15:52:18 -0800 Subject: [PATCH 15/81] 12 --- js/web/lib/build-def.d.ts | 4 + js/web/lib/index.ts | 9 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 3 +- js/web/lib/wasm/jsep/init.ts | 4 +- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 42 +- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 34 +- js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 9 +- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 12 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 86 +- js/web/lib/wasm/session-options.ts | 2 +- js/web/script/test-runner-cli-args.ts | 6 +- js/web/script/test-runner-cli.ts | 12 +- js/web/test/suite-test-list.jsonc | 1099 ++++++++++++++++- js/web/test/test-runner.ts | 5 +- js/web/webpack.config.js | 1 + .../core/framework/execution_provider.cc | 2 +- onnxruntime/core/framework/kernel_lookup.h | 6 +- onnxruntime/core/framework/kernel_registry.cc | 8 +- onnxruntime/core/providers/js/allocator.cc | 1 - .../providers/js/js_execution_provider.cc | 120 +- .../core/providers/js/js_execution_provider.h | 2 +- onnxruntime/core/providers/js/js_export.cc | 1 + onnxruntime/core/providers/js/js_kernel.h | 39 +- .../core/providers/js/operators/binary.cc | 55 + .../core/providers/js/operators/conv.cc | 25 +- .../core/providers/js/operators/conv.h | 7 +- .../core/providers/js/operators/unary.cc | 72 +- 27 files changed, 1505 insertions(+), 161 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/binary.cc diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 687b5aefcfccf..2049b2663ead3 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -14,6 +14,10 @@ interface BuildDefinitions { * defines whether to disable the whole WebGL backend in the build. */ DISABLE_WEBGL: boolean; + /** + * defines whether to disable the whole WebGpu backend in the build. + */ + DISABLE_WEBGPU: boolean; /** * defines whether to disable the whole WebAssembly backend in the build. 
*/ diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index eefdbcfb63b05..708c92a261fde 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -12,11 +12,18 @@ import {registerBackend} from 'onnxruntime-common'; if (!BUILD_DEFS.DISABLE_WEBGL) { const onnxjsBackend = require('./backend-onnxjs').onnxjsBackend; registerBackend('webgl', onnxjsBackend, -10); +} + +if (!BUILD_DEFS.DISABLE_WEBGPU) { + const onnxjsBackend = require('./backend-onnxjs').onnxjsBackend; registerBackend('webgpu', onnxjsBackend, 999); // set to 999 as the highest priority } + if (!BUILD_DEFS.DISABLE_WASM) { const wasmBackend = require('./backend-wasm').wasmBackend; - registerBackend('js', wasmBackend, 11); + if (!BUILD_DEFS.DISABLE_WEBGPU) { + registerBackend('jsep-webgpu', wasmBackend, 11); + } registerBackend('cpu', wasmBackend, 10); registerBackend('wasm', wasmBackend, 10); registerBackend('xnnpack', wasmBackend, 9); diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 462660b6effc5..21461ed865865 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -85,6 +85,7 @@ export class WebGpuBackend { flush(): void { this.endComputePass(); this.device.queue.submit([this.getCommandEncoder().finish()]); + this.gpuDataManager.refreshPendingBuffers(); this.commandEncoder = null; this.pendingDispatchNumber = 0; } @@ -180,7 +181,7 @@ export class WebGpuBackend { const [name, kernelEntry, attributes] = kernel; // eslint-disable-next-line no-console - console.log(`[JS] Start to run kernel "${name}"...`); + console.log(`[js] Start to run kernel "${name}"...`); return kernelEntry(context, attributes); } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 91e96c950de22..988e04cd41e00 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -74,7 +74,7 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepUpload(src, dst, size) (dataOffset: number, gpuDataId: number, size: number) => { // eslint-disable-next-line no-console - console.log(`jsepUpload: dataOffset=${dataOffset}, gpuDataId=${gpuDataId}, size=${size}`); + console.log(`[js] jsepUpload: dataOffset=${dataOffset}, gpuDataId=${gpuDataId}, size=${size}`); const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); backend.upload(gpuDataId, data); }, @@ -85,7 +85,7 @@ export const init = async(module: OrtWasmModule): Promise => { const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); // eslint-disable-next-line no-console - console.log(`jsepDownload: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); + console.log(`[js] jsepDownload: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); await backend.download(gpuDataId, data); }, diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index b2ccb9b4ec0d5..d4466f1a0ba66 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -31,6 +31,8 @@ export interface GpuDataManager { * download the data from GPU. 
*/ download(id: GpuDataId): Promise; + + refreshPendingBuffers(): void; } interface StorageCacheValue { @@ -39,7 +41,6 @@ interface StorageCacheValue { } interface DownloadCacheValue { - gpuData: GpuData; data: Promise; } @@ -58,9 +59,16 @@ class GpuDataManagerImpl implements GpuDataManager { // GPU Data ID => GPU Data ( read buffer ) downloadCache: Map; + private buffersForUploadingPending: GPUBuffer[]; + // private buffersForDownloadingPending: GPUBuffer[]; + private buffersPending: GPUBuffer[]; + constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) { this.storageCache = new Map(); this.downloadCache = new Map(); + this.buffersForUploadingPending = []; + // this.buffersForDownloadingPending = []; + this.buffersPending = []; } upload(id: GpuDataId, data: Uint8Array): void { @@ -91,13 +99,11 @@ class GpuDataManagerImpl implements GpuDataManager { // GPU copy this.backend.getCommandEncoder().copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size); - this.backend.flush(); - // eslint-disable-next-line no-console console.log(`[js] GpuDataManager.upload(id=${id})`); - gpuBufferForUploading.destroy(); + this.buffersForUploadingPending.push(gpuBufferForUploading); } create(size: number): GpuData { @@ -135,13 +141,11 @@ class GpuDataManagerImpl implements GpuDataManager { console.log(`[js] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`); this.storageCache.delete(id); - cachedData.gpuData.buffer.destroy(); + this.buffersPending.push(cachedData.gpuData.buffer); + // cachedData.gpuData.buffer.destroy(); const downloadingData = this.downloadCache.get(id); if (downloadingData) { - void downloadingData.data.then(() => { - downloadingData.gpuData.buffer.destroy(); - }); this.downloadCache.delete(id); } @@ -170,10 +174,26 @@ class GpuDataManagerImpl implements GpuDataManager { ); this.backend.flush(); - await gpuReadBuffer.mapAsync(GPUMapMode.READ); - return gpuReadBuffer.getMappedRange(); + const readDataPromise = new Promise((resolve) => { + gpuReadBuffer.mapAsync(GPUMapMode.READ).then(() => { + const data = gpuReadBuffer.getMappedRange().slice(0); + gpuReadBuffer.destroy(); + resolve(data); + }); + }); + + this.downloadCache.set(id, {data: readDataPromise}); + + return readDataPromise; + } - // TODO: release gpuReadBuffer + refreshPendingBuffers(): void { + for (const buffer of this.buffersForUploadingPending) { + buffer.destroy(); + } + for (const buffer of this.buffersPending) { + buffer.destroy(); + } } } diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 56e90f9761ce0..9429eb05d1c92 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
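The gpu-data-manager changes above defer buffer destruction until after queue submission and cache in-flight downloads. A condensed, self-contained sketch of the read-back half, using only standard WebGPU API calls (`downloadOnce` is a hypothetical name, not part of the patch):

const downloadOnce = async (device: GPUDevice, src: GPUBuffer, size: number): Promise<ArrayBuffer> => {
  // Stage the copy into a mappable read-back buffer.
  const readBuffer = device.createBuffer({size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});
  const encoder = device.createCommandEncoder();
  encoder.copyBufferToBuffer(src, 0, readBuffer, 0, size);
  device.queue.submit([encoder.finish()]);
  // Map, copy the contents out, then destroy — mirroring the slice(0) +
  // destroy() sequence in the diff above.
  await readBuffer.mapAsync(GPUMapMode.READ);
  const data = readBuffer.getMappedRange().slice(0);
  readBuffer.destroy();
  return data;
};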
-// import * as binaryOps from './ops/binary-op'; +import * as binaryOps from './ops/binary-op'; // import {concat, parseConcatAttributes} from './ops/concat'; import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; @@ -23,23 +23,23 @@ export type ParseAttributeFunction = (attributeRaw: unknown) => unknown; export type OperatorImplementation = [RunFunction]|[RunFunction, ParseAttributeFunction]; export const WEBGPU_OP_RESOLVE_RULES: Map = new Map([ - ['Abs', [unaryOps.abs]], - //, ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], + ['Abs', [unaryOps.abs]], ['Acos', [unaryOps.acos]], ['Acosh', [unaryOps.acosh]], ['Add', [binaryOps.add]], // ['And', '', '7+', binaryOps.and], - //['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan], + ['Asin', [unaryOps.asin]], ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], // TODO: support new attributes for AveragePool-10 //['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], // ['Cast', '', '6+', cast, parseCastAttributes], - //['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], + ['Ceil', [unaryOps.ceil]], + // ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], //['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], - ['Conv', [conv, parseConvAttributes]], //['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], + ['Conv', [conv, parseConvAttributes]], ['Cos', [unaryOps.cos]], ['Cosh', [unaryOps.cosh]], ['Div', [binaryOps.div]], // ['Dropout', '', '7+', unaryOps.identity], // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], // ['Equal', '', '7+', binaryOps.equal], - //['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', '', '6+', unaryOps.exp], + ['Elu', [unaryOps.elu, unaryOps.parseEluAttributes]], //['Exp', [unaryOps.exp]], // ['Flatten', '', '1+', flatten, parseFlattenAttributes], - //['Floor', '', '6+', unaryOps.floor], + ['Floor', [unaryOps.floor]], // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], //['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], //['Gemm', '', '11+', gemm, parseGemmAttributesV11], @@ -53,14 +53,15 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Less', '', '7+', binaryOps.less], //['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], // TODO: support new attributes for MaxPool-8 and MaxPool-10 - //['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], - //['Neg', '', '6+', unaryOps.neg], + //['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], + ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], // ['Not', '', '1+', unaryOps.not], // ['Or', '', '7+', binaryOps.or], // ['Pad', '', '2-10', padV2, parsePadAttributesV2], // ['Pad', '', '11+', padV11, parsePadAttributesV11], - //['Pow', '', '7+', binaryOps.pow], + ['Pow', [binaryOps.pow]], // ['PRelu', '', '7+', binaryOps.pRelu], + ['Reciprocal', [unaryOps.reciprocal]], // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], // ['ReduceMean', '', '1+', reduceMean, parseReduceAttributes], @@ -71,7 +72,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new //['Relu', '', '6+', 
unaryOps.relu], ['Reshape', '', '5+', reshape], // ['Resize', '', '10', resize, parseResizeAttributesV10], // ['Resize', '', '11+', resize, parseResizeAttributesV11], - //['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], + //['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], + ['Sin', [unaryOps.sin]], ['Sinh', [unaryOps.sinh]], //['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 //['Slice', '', '1-9', slice, parseSliceAttributes], // // The "semantic" meaning of axis has changed in opset-13. @@ -82,9 +84,11 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // // When the attribute is missing, we need the count of number of outputs // // so that we can determine the 'split' attribute from the runtime input to the Operator // ['Split', '', '2-12', split, parseSplitAttributes], - //['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], - //['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], - //['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], + ['Sqrt', [unaryOps.sqrt]], + // ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], + //['Squeeze', '', '13+', squeezeV13], + ['Sub', [binaryOps.sub]], // ['Sum', '', '6+', sum], + ['Tan', [unaryOps.tan]], ['Tanh', [unaryOps.tanh]], // ['Tile', '', '6+', tile], //['Transpose', '', '1+', transpose, parseTransposeAttributes], // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index b723ba19558fc..512ff09c93881 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -40,7 +40,10 @@ const createBinaryOpProgramShader = const strides = ShapeUtil.computeStrides(dims); const offsets: string[] = []; for (let i = dims.length - 1; i >= 0; i--) { - offsets.push(`${strides[i]}u * ((*outputIndices)[${i + dimsOutput.length - dims.length}] % ${dims[i]}u)`); + const idx = dimsOutput.length === 0 ? '0u' : + (dimsOutput.length === 1) ? '(*outputIndices)' : + `(*outputIndices)[${i + dimsOutput.length - dims.length}]`; + offsets.push(`${strides[i]}u * (${idx} % ${dims[i]}u)`); } return offsets.length > 0 ? offsets.join('+') : '0u'; }; @@ -48,11 +51,11 @@ const createBinaryOpProgramShader = broadcastImpl = ` ${outputIndicesHelper.o2iImpl} - fn calcOffsetA(outputIndices: ptr>) -> u32 { + fn calcOffsetA(outputIndices: ptr) -> u32 { return ${calcOffsetImpl(dimsA)}; } - fn calcOffsetB(outputIndices: ptr>) -> u32 { + fn calcOffsetB(outputIndices: ptr) -> u32 { return ${calcOffsetImpl(dimsB)}; } `; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index f006d175da0ed..dfe4f3c8106e1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -72,13 +72,19 @@ export const createIndicesHelper = (name: string, shape: readonly number[]): Ind shape.length < 2 ? 
`${varIndices}=${varOffset};` : `ih_o2i_${name}(${varOffset}, &${varIndices});`; const offsets: string[] = []; - for (let i = shape.length - 1; i >= 0; i--) { - offsets.push(`${strides[i]}u * ((*indices)[${i}])`); + if (shape.length === 0) { + offsets.push('0u'); + } else if (shape.length < 2) { + offsets.push('(*indices)'); + } else { + for (let i = shape.length - 1; i >= 0; i--) { + offsets.push(`${strides[i]}u * ((*indices)[${i}])`); + } } const i2oImpl = shape.length < 2 ? '' : ` fn ih_i2o_${name}(indices: ptr) -> u32 { - return ${offsets.length > 0 ? offsets.join('+') : '0u'}; + return ${offsets.join('+')}; }`; const i2oExpression = (varIndices: string, isPtr?: boolean) => diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index d49e1a8acfa0d..0e9c5a372660a 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -3,7 +3,7 @@ import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey} from '../attribute-with-cache-key'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; import {WORKGROUP_SIZE} from './common'; @@ -69,12 +69,21 @@ export const abs = (context: ComputeContext): number => export const acos = (context: ComputeContext): number => context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acos', 'acos')); +export const acosh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acosh', 'acosh')); + export const asin = (context: ComputeContext): number => context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asin', 'asin')); +export const asinh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asinh', 'asinh')); + export const atan = (context: ComputeContext): number => context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atan', 'atan')); +export const atanh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atanh', 'atanh')); + export interface ClipAttributes extends AttributeWithCacheKey { readonly min: number; readonly max: number; @@ -107,40 +116,41 @@ export const clip = (context: ComputeContext, attributes: ClipAttributes): numbe // return clip(handler, [inputs[0]], attributes); // }; -// export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs); +export const ceil = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Ceil', 'ceil')); -// export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs); +export const cos = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cos', 'cos')); -// export interface EluAttributes extends AttributeWithCacheKey { -// readonly alpha: number; -// } +export const cosh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cosh', 'cosh')); -// export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], 
attributes: EluAttributes): -// Promise=>handler.run( -// createElementwiseProgramInfoLoader( -// inputs[0], 'Elu', a => `elu_vf32(${a})`, ` -// let elu_alpha_: f32 = f32(${attributes.alpha}); +export interface EluAttributes extends AttributeWithCacheKey { + readonly alpha: number; +} -// fn elu_f32(a: f32) -> f32 { -// return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); -// } +export const elu = (context: ComputeContext, attributes: EluAttributes): number => + context.compute(createElementwiseProgramInfoLoader( + context.inputs[0], 'Elu', a => `elu_vf32(${a})`, ` + const elu_alpha_: f32 = f32(${attributes.alpha}); -// fn elu_vf32(v: vec4) -> vec4 { -// return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); -// }`, -// attributes.cacheKey), -// inputs); + fn elu_f32(a: f32) -> f32 { + return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); + } + + fn elu_vf32(v: vec4) -> vec4 { + return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); + }`, + attributes.cacheKey)); -// export const parseEluAttributes = (node: Graph.Node): EluAttributes => -// createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)}); +export const parseEluAttributes = (attributes: Record): EluAttributes => + createAttributeWithCacheKey(attributes as {alpha: number}); // export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => // handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); -// export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs); +export const floor = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Floor', 'floor')); // export interface LeakyReluAttributes extends AttributeWithCacheKey { // readonly alpha: number; @@ -168,26 +178,32 @@ export const clip = (context: ComputeContext, attributes: ClipAttributes): numbe // export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => // handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); -// export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs); +export const neg = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Neg', a => `-${a}`)); // // export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): // // Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; +export const reciprocal = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Reciprocal', a => `1.0/${a}`)); + // export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( // createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); // export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( // createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + exp(-${a})))`), inputs); -// export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs); +export const sin = (context: ComputeContext): number => + 
context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sin', 'sin')); + +export const sinh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sinh', 'sinh')); -// export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs); +export const sqrt = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sqrt', 'sqrt')); -// export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs); +export const tan = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tan', 'tan')); -// export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs); +export const tanh = (context: ComputeContext): number => + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tanh', 'tanh')); diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index f9612f26c9fb8..82a06f0f3cf5a 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -58,7 +58,7 @@ const setExecutionProviders = case 'xnnpack': epName = 'XNNPACK'; break; - case 'js': + case 'jsep-webgpu': epName = 'JS'; break; case 'wasm': diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index c0921ff65470e..33494c714b98c 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -37,7 +37,7 @@ Options: webgpu wasm xnnpack - js + jsep-webgpu -e=<...>, --env=<...> Specify the environment to run the test. Should be one of the following: chrome (default) edge (Windows only) @@ -100,7 +100,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'js'; + type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'jsep-webgpu'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'prod'|'dev'|'perf'; } @@ -336,7 +336,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'js']; + const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'jsep-webgpu']; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 5975ee57d8312..ee32a58ea9c57 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -50,8 +50,10 @@ if (shouldLoadSuiteTestData) { // The default backends and opset version lists. Those will be used in suite tests. const DEFAULT_BACKENDS: readonly TestRunnerCliArgs.Backend[] = - args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu']; -const DEFAULT_OPSET_VERSIONS: readonly number[] = [13, 12, 11, 10, 9, 8, 7]; + args.env === 'node' ? 
['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu', 'jsep-webgpu']; +const DEFAULT_OPSET_VERSIONS = fs.readdirSync(TEST_DATA_MODEL_NODE_ROOT, {withFileTypes: true}) + .filter(dir => dir.isDirectory() && dir.name.startsWith('opset')) + .map(dir => dir.name.slice(5)); const FILE_CACHE_ENABLED = args.fileCache; // whether to enable file cache const FILE_CACHE_MAX_FILE_SIZE = 1 * 1024 * 1024; // The max size of the file that will be put into file cache @@ -198,7 +200,7 @@ function validateTestList() { } } -function loadNodeTests(backend: string, version: number): Test.ModelTestGroup { +function loadNodeTests(backend: string, version: string): Test.ModelTestGroup { return suiteFromFolder( `node-opset_v${version}-${backend}`, path.join(TEST_DATA_MODEL_NODE_ROOT, `opset${version}`), backend, testlist[backend].node); @@ -323,7 +325,7 @@ function tryLocateModelTestFolder(searchPattern: string): string { const globbyPattern = [searchPattern, path.join(TEST_DATA_MODEL_NODE_ROOT, '**', searchPattern).replace(/\\/g, '/')]; // 4 - check the globby result of NODE root combined with opset versions and searchPattern globbyPattern.push(...DEFAULT_OPSET_VERSIONS.map( - v => path.join(TEST_DATA_MODEL_NODE_ROOT, `v${v}`, '**', searchPattern).replace(/\\/g, '/'))); + v => path.join(TEST_DATA_MODEL_NODE_ROOT, `opset${v}`, '**', searchPattern).replace(/\\/g, '/'))); folderCandidates.push(...globby.sync(globbyPattern, {onlyDirectories: true, absolute: true})); @@ -454,7 +456,7 @@ function run(config: Test.Config) { // STEP 5. use Karma to run test npmlog.info('TestRunnerCli.Run', '(5/5) Running karma to start test runner...'); const karmaCommand = path.join(npmBin, 'karma'); - const webgpu = args.backends.indexOf('webgpu') > -1 || args.backends.indexOf('js') > -1; + const webgpu = args.backends.indexOf('webgpu') > -1 || args.backends.indexOf('jsep-webgpu') > -1; const browser = getBrowserNameFromEnv( args.env, args.bundleMode === 'perf' ? 
'perf' : diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index eb84e2babc754..9f87c905ac86f 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -318,14 +318,14 @@ // "test_batchnorm_example", // "test_cast_DOUBLE_to_FLOAT", // "test_cast_FLOAT_to_DOUBLE", - "v{7,8,9,10}/test_clip_splitbounds", - "v{7,8,9,10}/test_clip_outbounds", - "v{7,8,9,10}/test_clip_inbounds", - "v{7,8,9,10}/test_clip_example", - "v{7,8,9,10}/test_clip_default_min", - "v{7,8,9,10}/test_clip_default_max", - "v{7,8,9,10}/test_clip_default_inbounds", - "v{7,8,9,10}/test_clip", + "opset{7,8,9,10}/test_clip_splitbounds", + "opset{7,8,9,10}/test_clip_outbounds", + "opset{7,8,9,10}/test_clip_inbounds", + "opset{7,8,9,10}/test_clip_example", + "opset{7,8,9,10}/test_clip_default_min", + "opset{7,8,9,10}/test_clip_default_max", + "opset{7,8,9,10}/test_clip_default_inbounds", + "opset{7,8,9,10}/test_clip", "test_concat_1d_axis_0", "test_concat_2d_axis_0", "test_concat_2d_axis_1", @@ -460,28 +460,28 @@ // "test_reduce_prod_do_not_keepdims_random", // "test_reduce_prod_keepdims_example", // "test_reduce_prod_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_random", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_example", - // "v{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_random", - // "v{7,8,9,10,11,12}/test_split_variable_parts_default_axis", - // "v{7,8,9,10,11,12}/test_split_variable_parts_1d", - // "v{7,8,9,10,11,12}/test_split_variable_parts_2d", - // "v{7,8,9,10,11,12}/test_split_equal_parts_default_axis", - // "v{7,8,9,10,11,12}/test_split_equal_parts_1d", - // "v{7,8,9,10,11,12}/test_split_equal_parts_2d", - "v{7,8,9}/test_slice", - "v{7,8,9}/test_slice_default_axes", - "v{7,8,9}/test_slice_end_out_of_bounds", - "v{7,8,9}/test_slice_neg", + // "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_random", + // "opset{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_random", + // "opset{7,8,9,10,11,12}/test_reduce_sum_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_keepdims_random", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_random", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_random", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_example", + // "opset{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_random", + // "opset{7,8,9,10,11,12}/test_split_variable_parts_default_axis", + // "opset{7,8,9,10,11,12}/test_split_variable_parts_1d", + // "opset{7,8,9,10,11,12}/test_split_variable_parts_2d", + // 
"opset{7,8,9,10,11,12}/test_split_equal_parts_default_axis", + // "opset{7,8,9,10,11,12}/test_split_equal_parts_1d", + // "opset{7,8,9,10,11,12}/test_split_equal_parts_2d", + "opset{7,8,9}/test_slice", + "opset{7,8,9}/test_slice_default_axes", + "opset{7,8,9}/test_slice_end_out_of_bounds", + "opset{7,8,9}/test_slice_neg", // "test_slice_start_out_of_bounds", // tensor shape of 0 // "test_squeeze", "test_tan_example", @@ -650,5 +650,1044 @@ "test_instancenorm_example" ], "ops": [] + }, + "jsep-webgpu": { + "onnx": [], + "node": [ + "test_abs", + "test_acos_example", + "test_acos", + "test_acosh_example", + "test_acosh", + // // "test_adagrad_multiple", + // // "test_adagrad", + // // "test_adam_multiple", + // // "test_adam", + "test_add_bcast", + // "test_add_uint8", + "test_add", + // "test_and_bcast3v1d", + // "test_and_bcast3v2d", + // "test_and_bcast4v2d", + // "test_and_bcast4v3d", + // "test_and_bcast4v4d", + // "test_and2d", + // "test_and3d", + // "test_and4d", + // // "test_argmax_default_axis_example_select_last_index", + // // "test_argmax_default_axis_example", + // // "test_argmax_default_axis_random_select_last_index", + // // "test_argmax_default_axis_random", + // // "test_argmax_keepdims_example_select_last_index", + // // "test_argmax_keepdims_example", + // // "test_argmax_keepdims_random_select_last_index", + // // "test_argmax_keepdims_random", + // // "test_argmax_negative_axis_keepdims_example_select_last_index", + // // "test_argmax_negative_axis_keepdims_example", + // // "test_argmax_negative_axis_keepdims_random_select_last_index", + // // "test_argmax_negative_axis_keepdims_random", + // // "test_argmax_no_keepdims_example_select_last_index", + // // "test_argmax_no_keepdims_example", + // // "test_argmax_no_keepdims_random_select_last_index", + // // "test_argmax_no_keepdims_random", + // // "test_argmin_default_axis_example_select_last_index", + // // "test_argmin_default_axis_example", + // // "test_argmin_default_axis_random_select_last_index", + // // "test_argmin_default_axis_random", + // // "test_argmin_keepdims_example_select_last_index", + // // "test_argmin_keepdims_example", + // // "test_argmin_keepdims_random_select_last_index", + // // "test_argmin_keepdims_random", + // // "test_argmin_negative_axis_keepdims_example_select_last_index", + // // "test_argmin_negative_axis_keepdims_example", + // // "test_argmin_negative_axis_keepdims_random_select_last_index", + // // "test_argmin_negative_axis_keepdims_random", + // // "test_argmin_no_keepdims_example_select_last_index", + // // "test_argmin_no_keepdims_example", + // // "test_argmin_no_keepdims_random_select_last_index", + // // "test_argmin_no_keepdims_random", + "test_asin_example", + "test_asin", + "test_asinh_example", + "test_asinh", + "test_atan_example", + "test_atan", + "test_atanh_example", + "test_atanh", + // "test_averagepool_1d_default", + // "test_averagepool_2d_ceil", + // "test_averagepool_2d_default", + // "test_averagepool_2d_pads_count_include_pad", + // "test_averagepool_2d_pads", + // "test_averagepool_2d_precomputed_pads_count_include_pad", + // "test_averagepool_2d_precomputed_pads", + // "test_averagepool_2d_precomputed_same_upper", + // "test_averagepool_2d_precomputed_strides", + // "test_averagepool_2d_same_lower", + // "test_averagepool_2d_same_upper", + // "test_averagepool_2d_strides", + // "test_averagepool_3d_default", + "test_basic_conv_with_padding", + "test_basic_conv_without_padding", + // "test_basic_convinteger", + "test_batchnorm_epsilon_training_mode", + 
"test_batchnorm_epsilon", + "test_batchnorm_example_training_mode", + "test_batchnorm_example", + // // "test_bernoulli_double_expanded", + // // "test_bernoulli_double", + // // "test_bernoulli_expanded", + // // "test_bernoulli_seed_expanded", + // // "test_bernoulli_seed", + // // "test_bernoulli", + // // "test_bitshift_left_uint16", + // // "test_bitshift_left_uint32", + // // "test_bitshift_left_uint64", + // // "test_bitshift_left_uint8", + // // "test_bitshift_right_uint16", + // // "test_bitshift_right_uint32", + // // "test_bitshift_right_uint64", + // // "test_bitshift_right_uint8", + // // "test_blackmanwindow_expanded", + // // "test_blackmanwindow_symmetric_expanded", + // // "test_blackmanwindow_symmetric", + // // "test_blackmanwindow", + // // "test_cast_BFLOAT16_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT16", + // // "test_cast_FLOAT_to_BFLOAT16", + // // "test_cast_FLOAT_to_DOUBLE", + // // "test_cast_FLOAT_to_FLOAT16", + // // "test_cast_FLOAT_to_STRING", + // // "test_cast_FLOAT16_to_DOUBLE", + // // "test_cast_FLOAT16_to_FLOAT", + // // "test_cast_STRING_to_FLOAT", + // // "test_castlike_BFLOAT16_to_FLOAT_expanded", + // // "test_castlike_BFLOAT16_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT_expanded", + // // "test_castlike_DOUBLE_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT16_expanded", + // // "test_castlike_DOUBLE_to_FLOAT16", + // // "test_castlike_FLOAT_to_BFLOAT16_expanded", + // // "test_castlike_FLOAT_to_BFLOAT16", + // // "test_castlike_FLOAT_to_DOUBLE_expanded", + // // "test_castlike_FLOAT_to_DOUBLE", + // // "test_castlike_FLOAT_to_FLOAT16_expanded", + // // "test_castlike_FLOAT_to_FLOAT16", + // // "test_castlike_FLOAT_to_STRING_expanded", + // // "test_castlike_FLOAT_to_STRING", + // // "test_castlike_FLOAT16_to_DOUBLE_expanded", + // // "test_castlike_FLOAT16_to_DOUBLE", + // // "test_castlike_FLOAT16_to_FLOAT_expanded", + // // "test_castlike_FLOAT16_to_FLOAT", + // // "test_castlike_STRING_to_FLOAT_expanded", + // // "test_castlike_STRING_to_FLOAT", + "test_ceil_example", + "test_ceil", + // "test_celu_expanded", + // "test_celu", + // "test_clip_default_inbounds", + // "test_clip_default_int8_inbounds", + // "test_clip_default_int8_max", + // "test_clip_default_int8_min", + // "test_clip_default_max", + // "test_clip_default_min", + // "test_clip_example", + // "test_clip_inbounds", + // "test_clip_outbounds", + // "test_clip_splitbounds", + // "test_clip", + // // "test_compress_0", + // // "test_compress_1", + // // "test_compress_default_axis", + // // "test_compress_negative_axis", + // "test_concat_1d_axis_0", + // "test_concat_1d_axis_negative_1", + // "test_concat_2d_axis_0", + // "test_concat_2d_axis_1", + // "test_concat_2d_axis_negative_1", + // "test_concat_2d_axis_negative_2", + // "test_concat_3d_axis_0", + // "test_concat_3d_axis_1", + // "test_concat_3d_axis_2", + // "test_concat_3d_axis_negative_1", + // "test_concat_3d_axis_negative_2", + // "test_concat_3d_axis_negative_3", + "test_conv_with_autopad_same", + "test_conv_with_strides_and_asymmetric_padding", + "test_conv_with_strides_no_padding", + "test_conv_with_strides_padding", + // // "test_convinteger_with_padding", + // // "test_convinteger_without_padding", + // // "test_convtranspose_1d", + // // "test_convtranspose_3d", + // // "test_convtranspose_autopad_same", + // // "test_convtranspose_dilations", + // // "test_convtranspose_kernel_shape", + // // "test_convtranspose_output_shape", + // // "test_convtranspose_pad", + // 
// "test_convtranspose_pads", + // // "test_convtranspose_with_kernel", + // // "test_convtranspose", + "test_cos_example", + "test_cos", + "test_cosh_example", + "test_cosh", + // "test_cumsum_1d_exclusive", + // "test_cumsum_1d_reverse_exclusive", + // "test_cumsum_1d_reverse", + // "test_cumsum_1d", + // "test_cumsum_2d_axis_0", + // "test_cumsum_2d_axis_1", + // "test_cumsum_2d_negative_axis", + // "test_depthtospace_crd_mode_example", + // "test_depthtospace_crd_mode", + // "test_depthtospace_dcr_mode", + // "test_depthtospace_example", + // "test_depthtospace", + // // "test_dequantizelinear_axis", + // // "test_dequantizelinear", + // // "test_det_2d", + // // "test_det_nd", + // // "test_dft_axis", + // // "test_dft_inverse", + // // "test_dft", + "test_div_bcast", + "test_div_example", + // "test_div_uint8", + "test_div", + // // "test_dropout_default_mask_ratio", + // // "test_dropout_default_mask", + // // "test_dropout_default_old", + // // "test_dropout_default_ratio", + // // "test_dropout_default", + // // "test_dropout_random_old", + // // "test_dropout_random", + // // "test_dynamic_slice_default_axes", + // // "test_dynamic_slice_end_out_of_bounds", + // // "test_dynamic_slice_neg", + // // "test_dynamic_slice_start_out_of_bounds", + // // "test_dynamic_slice", + // // "test_dynamicquantizelinear_expanded", + // // "test_dynamicquantizelinear_max_adjusted_expanded", + // // "test_dynamicquantizelinear_max_adjusted", + // // "test_dynamicquantizelinear_min_adjusted_expanded", + // // "test_dynamicquantizelinear_min_adjusted", + // // "test_dynamicquantizelinear", + // // "test_edge_pad", + // "test_einsum_batch_diagonal", + // "test_einsum_batch_matmul", + // "test_einsum_inner_prod", + // "test_einsum_sum", + // "test_einsum_transpose", + "test_elu_default", + "test_elu_example", + "test_elu", + // "test_equal_bcast", + // "test_equal", + // "test_erf", + // "test_exp_example", + // "test_exp", + // "test_expand_dim_changed", + // "test_expand_dim_unchanged", + // "test_eyelike_populate_off_main_diagonal", + // "test_eyelike_with_dtype", + // "test_eyelike_without_dtype", + // "test_flatten_axis0", + // "test_flatten_axis1", + // "test_flatten_axis2", + // "test_flatten_axis3", + // "test_flatten_default_axis", + // "test_flatten_negative_axis1", + // "test_flatten_negative_axis2", + // "test_flatten_negative_axis3", + // "test_flatten_negative_axis4", + "test_floor_example", + "test_floor", + // "test_gather_0", + // "test_gather_1", + // "test_gather_2d_indices", + // "test_gather_elements_0", + // "test_gather_elements_1", + // "test_gather_elements_negative_indices", + // "test_gather_negative_indices", + // // "test_gathernd_example_float32", + // // "test_gathernd_example_int32_batch_dim1", + // // "test_gathernd_example_int32", + "test_gemm_all_attributes", + "test_gemm_alpha", + "test_gemm_beta", + "test_gemm_broadcast", + "test_gemm_default_matrix_bias", + "test_gemm_default_no_bias", + "test_gemm_default_scalar_bias", + "test_gemm_default_single_elem_vector_bias", + "test_gemm_default_vector_bias", + "test_gemm_default_zero_bias", + "test_gemm_nobroadcast", + "test_gemm_transposeA", + "test_gemm_transposeB", + // "test_globalaveragepool_precomputed", + // "test_globalaveragepool", + // "test_globalmaxpool_precomputed", + // "test_globalmaxpool", + // "test_greater_bcast", + // "test_greater_equal_bcast_expanded", + // "test_greater_equal_bcast", + // "test_greater_equal_expanded", + // "test_greater_equal", + // "test_greater", + // // 
"test_gridsample_aligncorners_true", + // // "test_gridsample_bicubic", + // // "test_gridsample_bilinear", + // // "test_gridsample_border_padding", + // // "test_gridsample_nearest", + // // "test_gridsample_reflection_padding", + // // "test_gridsample_zeros_padding", + // // "test_gridsample", + // // "test_gru_batchwise", + // // "test_gru_defaults", + // // "test_gru_seq_length", + // // "test_gru_with_initial_bias", + // // "test_hammingwindow_expanded", + // // "test_hammingwindow_symmetric_expanded", + // // "test_hammingwindow_symmetric", + // // "test_hammingwindow", + // // "test_hannwindow_expanded", + // // "test_hannwindow_symmetric_expanded", + // // "test_hannwindow_symmetric", + // // "test_hannwindow", + // // "test_hardmax_axis_0", + // // "test_hardmax_axis_1", + // // "test_hardmax_axis_2", + // // "test_hardmax_default_axis", + // // "test_hardmax_example", + // // "test_hardmax_negative_axis", + // // "test_hardmax_one_hot", + // // "test_hardsigmoid_default", + // // "test_hardsigmoid_example", + // // "test_hardsigmoid", + // // "test_hardswish_expanded", + // // "test_hardswish", + // // "test_instancenorm_epsilon", + // // "test_instancenorm_example", + // "test_isinf_negative", + // "test_isinf_positive", + // "test_isinf", + // "test_isnan", + // // "test_layer_normalization_2d_axis_negative_1_expanded", + // // "test_layer_normalization_2d_axis_negative_1", + // // "test_layer_normalization_2d_axis_negative_2_expanded", + // // "test_layer_normalization_2d_axis_negative_2", + // // "test_layer_normalization_2d_axis0_expanded", + // // "test_layer_normalization_2d_axis0", + // // "test_layer_normalization_2d_axis1_expanded", + // // "test_layer_normalization_2d_axis1", + // // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded", + // // "test_layer_normalization_3d_axis_negative_1_epsilon", + // // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded", + // // "test_layer_normalization_3d_axis_negative_2_epsilon", + // // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded", + // // "test_layer_normalization_3d_axis_negative_3_epsilon", + // // "test_layer_normalization_3d_axis0_epsilon_expanded", + // // "test_layer_normalization_3d_axis0_epsilon", + // // "test_layer_normalization_3d_axis1_epsilon_expanded", + // // "test_layer_normalization_3d_axis1_epsilon", + // // "test_layer_normalization_3d_axis2_epsilon_expanded", + // // "test_layer_normalization_3d_axis2_epsilon", + // // "test_layer_normalization_4d_axis_negative_1_expanded", + // // "test_layer_normalization_4d_axis_negative_1", + // // "test_layer_normalization_4d_axis_negative_2_expanded", + // // "test_layer_normalization_4d_axis_negative_2", + // // "test_layer_normalization_4d_axis_negative_3_expanded", + // // "test_layer_normalization_4d_axis_negative_3", + // // "test_layer_normalization_4d_axis_negative_4_expanded", + // // "test_layer_normalization_4d_axis_negative_4", + // // "test_layer_normalization_4d_axis0_expanded", + // // "test_layer_normalization_4d_axis0", + // // "test_layer_normalization_4d_axis1_expanded", + // // "test_layer_normalization_4d_axis1", + // // "test_layer_normalization_4d_axis2_expanded", + // // "test_layer_normalization_4d_axis2", + // // "test_layer_normalization_4d_axis3_expanded", + // // "test_layer_normalization_4d_axis3", + // // "test_layer_normalization_default_axis_expanded", + // // "test_layer_normalization_default_axis", + // "test_leakyrelu_default", + // "test_leakyrelu_example", + // "test_leakyrelu", + // 
"test_less_bcast", + // "test_less_equal_bcast_expanded", + // "test_less_equal_bcast", + // "test_less_equal_expanded", + // "test_less_equal", + // "test_less", + "test_log_example", + "test_log", + // // "test_logsoftmax_axis_0_expanded", + // // "test_logsoftmax_axis_0", + // // "test_logsoftmax_axis_1_expanded", + // // "test_logsoftmax_axis_1", + // // "test_logsoftmax_axis_2_expanded", + // // "test_logsoftmax_axis_2", + // // "test_logsoftmax_default_axis_expanded", + // // "test_logsoftmax_default_axis", + // // "test_logsoftmax_example_1_expanded", + // // "test_logsoftmax_example_1", + // // "test_logsoftmax_large_number_expanded", + // // "test_logsoftmax_large_number", + // // "test_logsoftmax_negative_axis_expanded", + // // "test_logsoftmax_negative_axis", + // "test_lrn_default", + // "test_lrn", + // // "test_lstm_batchwise", + // // "test_lstm_defaults", + // // "test_lstm_with_initial_bias", + // // "test_lstm_with_peepholes", + "test_matmul_2d", + "test_matmul_3d", + "test_matmul_4d", + // // "test_matmulinteger", + // "test_max_example", + // "test_max_float16", + // "test_max_float32", + // "test_max_float64", + // "test_max_int16", + // "test_max_int32", + // "test_max_int64", + // "test_max_int8", + // "test_max_one_input", + // "test_max_two_inputs", + // "test_max_uint16", + // "test_max_uint32", + // "test_max_uint64", + // "test_max_uint8", + // "test_maxpool_1d_default", + // "test_maxpool_2d_ceil", + // "test_maxpool_2d_default", + // "test_maxpool_2d_dilations", + // "test_maxpool_2d_pads", + // "test_maxpool_2d_precomputed_pads", + // "test_maxpool_2d_precomputed_same_upper", + // "test_maxpool_2d_precomputed_strides", + // "test_maxpool_2d_same_lower", + // "test_maxpool_2d_same_upper", + // "test_maxpool_2d_strides", + // "test_maxpool_2d_uint8", + // "test_maxpool_3d_default", + // "test_maxpool_with_argmax_2d_precomputed_pads", + // "test_maxpool_with_argmax_2d_precomputed_strides", + // // "test_maxunpool_export_with_output_shape", + // // "test_maxunpool_export_without_output_shape", + // // "test_mean_example", + // // "test_mean_one_input", + // // "test_mean_two_inputs", + // // "test_melweightmatrix", + // "test_min_example", + // "test_min_float16", + // "test_min_float32", + // "test_min_float64", + // "test_min_int16", + // "test_min_int32", + // "test_min_int64", + // "test_min_int8", + // "test_min_one_input", + // "test_min_two_inputs", + // "test_min_uint16", + // "test_min_uint32", + // "test_min_uint64", + // "test_min_uint8", + // "test_mod_bcast", + // "test_mod_broadcast", + // "test_mod_float_mixed_sign_example", + // "test_mod_fmod_mixed_sign_example", + // "test_mod_int64_fmod", + // "test_mod_int64_mixed_sign_example", + // "test_mod_mixed_sign_float16", + // "test_mod_mixed_sign_float32", + // "test_mod_mixed_sign_float64", + // "test_mod_mixed_sign_int16", + // "test_mod_mixed_sign_int32", + // "test_mod_mixed_sign_int64", + // "test_mod_mixed_sign_int8", + // "test_mod_uint16", + // "test_mod_uint32", + // "test_mod_uint64", + // "test_mod_uint8", + // // "test_momentum_multiple", + // // "test_momentum", + "test_mul_bcast", + "test_mul_example", + // "test_mul_uint8", + "test_mul", + // "test_mvn_expanded", + // "test_mvn", + "test_neg_example", + "test_neg", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded", + // // 
"test_negative_log_likelihood_loss_input_shape_is_NC", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // // "test_nesterov_momentum", + // // "test_nllloss_NC_expanded", + // // "test_nllloss_NC", + // // "test_nllloss_NCd1_expanded", + // // "test_nllloss_NCd1_ii_expanded", + // // "test_nllloss_NCd1_ii", + // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded", + // // "test_nllloss_NCd1_mean_weight_negative_ii", + // // "test_nllloss_NCd1_weight_expanded", + // // "test_nllloss_NCd1_weight_ii_expanded", + // // "test_nllloss_NCd1_weight_ii", + // // "test_nllloss_NCd1_weight", + // // "test_nllloss_NCd1", + // // "test_nllloss_NCd1d2_expanded", 
+ // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii", + // // "test_nllloss_NCd1d2_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_reduction_mean", + // // "test_nllloss_NCd1d2_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight", + // // "test_nllloss_NCd1d2", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight", + // "test_nonmaxsuppression_center_point_box_format", + // "test_nonmaxsuppression_flipped_coordinates", + // "test_nonmaxsuppression_identical_boxes", + // "test_nonmaxsuppression_limit_output_size", + // "test_nonmaxsuppression_single_box", + // "test_nonmaxsuppression_suppress_by_IOU_and_scores", + // "test_nonmaxsuppression_suppress_by_IOU", + // "test_nonmaxsuppression_two_batches", + // "test_nonmaxsuppression_two_classes", + // "test_nonzero_example", + // "test_not_2d", + // "test_not_3d", + // "test_not_4d", + // // "test_onehot_negative_indices", + // // "test_onehot_with_axis", + // // "test_onehot_with_negative_axis", + // // "test_onehot_without_axis", + // // "test_optional_get_element_sequence", + // // "test_optional_get_element", + // // "test_optional_has_element_empty", + // // "test_optional_has_element", + // "test_or_bcast3v1d", + // "test_or_bcast3v2d", + // "test_or_bcast4v2d", + // "test_or_bcast4v3d", + // "test_or_bcast4v4d", + // "test_or2d", + // "test_or3d", + // "test_or4d", + "test_pow_bcast_array", + "test_pow_bcast_scalar", + "test_pow_example", + // "test_pow_types_float", + // "test_pow_types_float32_int32", + // "test_pow_types_float32_int64", + // "test_pow_types_float32_uint32", + // "test_pow_types_float32_uint64", + // "test_pow_types_int", + // "test_pow_types_int32_float32", + // "test_pow_types_int32_int32", + // "test_pow_types_int64_float32", + // "test_pow_types_int64_int64", + "test_pow", + // "test_prelu_broadcast", + // "test_prelu_example", + // // "test_qlinearconv", + // // "test_qlinearmatmul_2D", + // // "test_qlinearmatmul_3D", + // // "test_quantizelinear_axis", + // // "test_quantizelinear", + // "test_range_float_type_positive_delta_expanded", + // "test_range_float_type_positive_delta", + // "test_range_int32_type_negative_delta_expanded", + // "test_range_int32_type_negative_delta", + "test_reciprocal_example", + "test_reciprocal", + // "test_reduce_l1_default_axes_keepdims_example", + // "test_reduce_l1_default_axes_keepdims_random", + // "test_reduce_l1_do_not_keepdims_example", + // "test_reduce_l1_do_not_keepdims_random", + // "test_reduce_l1_keep_dims_example", + // "test_reduce_l1_keep_dims_random", + // "test_reduce_l1_negative_axes_keep_dims_example", + 
// "test_reduce_l1_negative_axes_keep_dims_random", + // "test_reduce_l2_default_axes_keepdims_example", + // "test_reduce_l2_default_axes_keepdims_random", + // "test_reduce_l2_do_not_keepdims_example", + // "test_reduce_l2_do_not_keepdims_random", + // "test_reduce_l2_keep_dims_example", + // "test_reduce_l2_keep_dims_random", + // "test_reduce_l2_negative_axes_keep_dims_example", + // "test_reduce_l2_negative_axes_keep_dims_random", + // "test_reduce_log_sum_asc_axes", + // "test_reduce_log_sum_default", + // "test_reduce_log_sum_desc_axes", + // "test_reduce_log_sum_exp_default_axes_keepdims_example", + // "test_reduce_log_sum_exp_default_axes_keepdims_random", + // "test_reduce_log_sum_exp_do_not_keepdims_example", + // "test_reduce_log_sum_exp_do_not_keepdims_random", + // "test_reduce_log_sum_exp_keepdims_example", + // "test_reduce_log_sum_exp_keepdims_random", + // "test_reduce_log_sum_exp_negative_axes_keepdims_example", + // "test_reduce_log_sum_exp_negative_axes_keepdims_random", + // "test_reduce_log_sum_negative_axes", + // "test_reduce_log_sum", + // "test_reduce_max_default_axes_keepdim_example", + // "test_reduce_max_default_axes_keepdims_random", + // "test_reduce_max_do_not_keepdims_example", + // "test_reduce_max_do_not_keepdims_random", + // "test_reduce_max_keepdims_example", + // "test_reduce_max_keepdims_random", + // "test_reduce_max_negative_axes_keepdims_example", + // "test_reduce_max_negative_axes_keepdims_random", + // "test_reduce_mean_default_axes_keepdims_example", + // "test_reduce_mean_default_axes_keepdims_random", + // "test_reduce_mean_do_not_keepdims_example", + // "test_reduce_mean_do_not_keepdims_random", + // "test_reduce_mean_keepdims_example", + // "test_reduce_mean_keepdims_random", + // "test_reduce_mean_negative_axes_keepdims_example", + // "test_reduce_mean_negative_axes_keepdims_random", + // "test_reduce_min_default_axes_keepdims_example", + // "test_reduce_min_default_axes_keepdims_random", + // "test_reduce_min_do_not_keepdims_example", + // "test_reduce_min_do_not_keepdims_random", + // "test_reduce_min_keepdims_example", + // "test_reduce_min_keepdims_random", + // "test_reduce_min_negative_axes_keepdims_example", + // "test_reduce_min_negative_axes_keepdims_random", + // "test_reduce_prod_default_axes_keepdims_example", + // "test_reduce_prod_default_axes_keepdims_random", + // "test_reduce_prod_do_not_keepdims_example", + // "test_reduce_prod_do_not_keepdims_random", + // "test_reduce_prod_keepdims_example", + // "test_reduce_prod_keepdims_random", + // "test_reduce_prod_negative_axes_keepdims_example", + // "test_reduce_prod_negative_axes_keepdims_random", + // "test_reduce_sum_default_axes_keepdims_example", + // "test_reduce_sum_default_axes_keepdims_random", + // "test_reduce_sum_do_not_keepdims_example", + // "test_reduce_sum_do_not_keepdims_random", + // "test_reduce_sum_empty_axes_input_noop_example", + // "test_reduce_sum_empty_axes_input_noop_random", + // "test_reduce_sum_keepdims_example", + // "test_reduce_sum_keepdims_random", + // "test_reduce_sum_negative_axes_keepdims_example", + // "test_reduce_sum_negative_axes_keepdims_random", + // "test_reduce_sum_square_default_axes_keepdims_example", + // "test_reduce_sum_square_default_axes_keepdims_random", + // "test_reduce_sum_square_do_not_keepdims_example", + // "test_reduce_sum_square_do_not_keepdims_random", + // "test_reduce_sum_square_keepdims_example", + // "test_reduce_sum_square_keepdims_random", + // "test_reduce_sum_square_negative_axes_keepdims_example", + // 
"test_reduce_sum_square_negative_axes_keepdims_random", + // // "test_reflect_pad", + "test_relu", + // "test_reshape_allowzero_reordered", + // "test_reshape_extended_dims", + // "test_reshape_negative_dim", + // "test_reshape_negative_extended_dims", + // "test_reshape_one_dim", + // "test_reshape_reduced_dims", + // "test_reshape_reordered_all_dims", + // "test_reshape_reordered_dims", + // "test_reshape_reordered_last_dims", + // "test_reshape_zero_and_negative_dim", + // "test_reshape_zero_dim", + // "test_resize_downsample_linear", + // "test_resize_downsample_nearest", + // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_downsample_scales_cubic_align_corners", + // "test_resize_downsample_scales_cubic", + // "test_resize_downsample_scales_linear_align_corners", + // "test_resize_downsample_scales_linear", + // "test_resize_downsample_scales_nearest", + // "test_resize_downsample_sizes_cubic", + // "test_resize_downsample_sizes_linear_pytorch_half_pixel", + // "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn", + // "test_resize_downsample_sizes_nearest", + // "test_resize_nearest", + // "test_resize_tf_crop_and_resize", + // "test_resize_upsample_linear", + // "test_resize_upsample_nearest", + // "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_upsample_scales_cubic_align_corners", + // "test_resize_upsample_scales_cubic_asymmetric", + // "test_resize_upsample_scales_cubic", + // "test_resize_upsample_scales_linear_align_corners", + // "test_resize_upsample_scales_linear", + // "test_resize_upsample_scales_nearest", + // "test_resize_upsample_sizes_cubic", + // "test_resize_upsample_sizes_nearest_ceil_half_pixel", + // "test_resize_upsample_sizes_nearest_floor_align_corners", + // "test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", + // "test_resize_upsample_sizes_nearest", + // // "test_reversesequence_batch", + // // "test_reversesequence_time", + // // "test_rnn_seq_length", + // // "test_roialign_aligned_false", + // // "test_roialign_aligned_true", + // // "test_roialign", + // // "test_round", + // // "test_scan_sum", + // // "test_scan9_sum", + // // "test_scatter_elements_with_axis", + // // "test_scatter_elements_with_duplicate_indices", + // // "test_scatter_elements_with_negative_indices", + // // "test_scatter_elements_without_axis", + // // "test_scatter_with_axis", + // // "test_scatter_without_axis", + // // "test_scatternd_add", + // // "test_scatternd_multiply", + // // "test_scatternd", + // // "test_sce_mean_3d_expanded", + // // "test_sce_mean_3d_log_prob_expanded", + // // "test_sce_mean_3d_log_prob", + // // "test_sce_mean_3d", + // // "test_sce_mean_expanded", + // // "test_sce_mean_log_prob_expanded", + // // "test_sce_mean_log_prob", + // // "test_sce_mean_no_weight_ii_3d_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob", + // // "test_sce_mean_no_weight_ii_3d", + // // "test_sce_mean_no_weight_ii_4d_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob", + // // "test_sce_mean_no_weight_ii_4d", + // // "test_sce_mean_no_weight_ii_expanded", + // // "test_sce_mean_no_weight_ii_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_log_prob", + // // "test_sce_mean_no_weight_ii", + // // "test_sce_mean_weight_expanded", + // // "test_sce_mean_weight_ii_3d_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob_expanded", + // // 
"test_sce_mean_weight_ii_3d_log_prob", + // // "test_sce_mean_weight_ii_3d", + // // "test_sce_mean_weight_ii_4d_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob", + // // "test_sce_mean_weight_ii_4d", + // // "test_sce_mean_weight_ii_expanded", + // // "test_sce_mean_weight_ii_log_prob_expanded", + // // "test_sce_mean_weight_ii_log_prob", + // // "test_sce_mean_weight_ii", + // // "test_sce_mean_weight_log_prob_expanded", + // // "test_sce_mean_weight_log_prob", + // // "test_sce_mean_weight", + // // "test_sce_mean", + // // "test_sce_NCd1_mean_weight_negative_ii_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob", + // // "test_sce_NCd1_mean_weight_negative_ii", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob", + // // "test_sce_NCd1d2d3_sum_weight_high_ii", + // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_mean_weight", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_none_no_weight", + // // "test_sce_none_expanded", + // // "test_sce_none_log_prob_expanded", + // // "test_sce_none_log_prob", + // // "test_sce_none_weights_expanded", + // // "test_sce_none_weights_log_prob_expanded", + // // "test_sce_none_weights_log_prob", + // // "test_sce_none_weights", + // // "test_sce_none", + // // "test_sce_sum_expanded", + // // "test_sce_sum_log_prob_expanded", + // // "test_sce_sum_log_prob", + // // "test_sce_sum", + // "test_selu_default", + // "test_selu_example", + // "test_selu", + // // "test_sequence_insert_at_back", + // // "test_sequence_insert_at_front", + // // "test_sequence_map_add_1_sequence_1_tensor_expanded", + // // "test_sequence_map_add_1_sequence_1_tensor", + // // "test_sequence_map_add_2_sequences_expanded", + // // "test_sequence_map_add_2_sequences", + // // "test_sequence_map_extract_shapes_expanded", + // // "test_sequence_map_extract_shapes", + // // "test_sequence_map_identity_1_sequence_1_tensor_expanded", + // // "test_sequence_map_identity_1_sequence_1_tensor", + // // "test_sequence_map_identity_1_sequence_expanded", + // // "test_sequence_map_identity_1_sequence", + // // "test_sequence_map_identity_2_sequences_expanded", + // // "test_sequence_map_identity_2_sequences", + // "test_shrink_hard", + // "test_shrink_soft", + "test_sigmoid_example", + "test_sigmoid", + // "test_sign", + // "test_simple_rnn_batchwise", + // "test_simple_rnn_defaults", + // "test_simple_rnn_with_initial_bias", + "test_sin_example", + "test_sin", + "test_sinh_example", + "test_sinh", + // // "test_size_example", + // // "test_size", + // "test_slice_default_axes", + // "test_slice_default_steps", + // "test_slice_end_out_of_bounds", + // "test_slice_neg_steps", + // "test_slice_neg", + // "test_slice_negative_axes", + // 
"test_slice_start_out_of_bounds", + // "test_slice", + // "test_softmax_axis_0_expanded", + // "test_softmax_axis_0", + // "test_softmax_axis_1_expanded", + // "test_softmax_axis_1", + // "test_softmax_axis_2_expanded", + // "test_softmax_axis_2", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // "test_softmax_cross_entropy_mean_3d_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob", + // "test_softmax_cross_entropy_mean_3d", + // "test_softmax_cross_entropy_mean_expanded", + // "test_softmax_cross_entropy_mean_log_prob_expanded", + // "test_softmax_cross_entropy_mean_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded", + // 
"test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_log_prob", + // "test_softmax_cross_entropy_mean_weight", + // "test_softmax_cross_entropy_mean", + // "test_softmax_cross_entropy_none_expanded", + // "test_softmax_cross_entropy_none_log_prob_expanded", + // "test_softmax_cross_entropy_none_log_prob", + // "test_softmax_cross_entropy_none_weights_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob", + // "test_softmax_cross_entropy_none_weights", + // "test_softmax_cross_entropy_none", + // "test_softmax_cross_entropy_sum_expanded", + // "test_softmax_cross_entropy_sum_log_prob_expanded", + // "test_softmax_cross_entropy_sum_log_prob", + // "test_softmax_cross_entropy_sum", + // "test_softmax_default_axis_expanded", + // "test_softmax_default_axis", + // "test_softmax_example_expanded", + // "test_softmax_example", + // "test_softmax_large_number_expanded", + // "test_softmax_large_number", + // "test_softmax_negative_axis_expanded", + // "test_softmax_negative_axis", + // // "test_softplus_example", + // // "test_softplus", + // // "test_softsign_example", + // // "test_softsign", + // "test_spacetodepth_example", + // "test_spacetodepth", + // // "test_split_equal_parts_1d", + // // "test_split_equal_parts_2d", + // // "test_split_equal_parts_default_axis", + // // "test_split_variable_parts_1d", + // // "test_split_variable_parts_2d", + // // "test_split_variable_parts_default_axis", + // // "test_split_zero_size_splits", + "test_sqrt_example", + "test_sqrt", + // "test_squeeze_negative_axes", + // "test_squeeze", + // // "test_stft_with_window", + // // "test_stft", + // // "test_strnormalizer_export_monday_casesensintive_lower", + // // "test_strnormalizer_export_monday_casesensintive_nochangecase", + // // "test_strnormalizer_export_monday_casesensintive_upper", + // // "test_strnormalizer_export_monday_empty_output", + // // "test_strnormalizer_export_monday_insensintive_upper_twodim", + // // "test_strnormalizer_nostopwords_nochangecase", + "test_sub_bcast", + "test_sub_example", + // "test_sub_uint8", + "test_sub", + // "test_sum_example", + // "test_sum_one_input", + // "test_sum_two_inputs", + "test_tan_example", + "test_tan", + "test_tanh_example", + "test_tanh", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", + // // "test_tfidfvectorizer_tf_only_bigrams_skip0", + // // "test_tfidfvectorizer_tf_onlybigrams_levelempty", + // // "test_tfidfvectorizer_tf_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_uniandbigrams_skip5", + // 
"test_thresholdedrelu_default", + // "test_thresholdedrelu_example", + // "test_thresholdedrelu", + // // "test_tile_precomputed", + // // "test_tile", + // // "test_top_k_negative_axis", + // // "test_top_k_smallest", + // // "test_top_k", + // // "test_training_dropout_default_mask", + // // "test_training_dropout_default", + // // "test_training_dropout_mask", + // // "test_training_dropout_zero_ratio_mask", + // // "test_training_dropout_zero_ratio", + // // "test_training_dropout", + "test_transpose_all_permutations_0", + "test_transpose_all_permutations_1", + "test_transpose_all_permutations_2", + "test_transpose_all_permutations_3", + "test_transpose_all_permutations_4", + "test_transpose_all_permutations_5", + "test_transpose_default" + // "test_tril_neg", + // "test_tril_one_row_neg", + // "test_tril_out_neg", + // "test_tril_out_pos", + // "test_tril_pos", + // "test_tril_square_neg", + // "test_tril_square", + // "test_tril_zero", + // "test_tril", + // "test_triu_neg", + // "test_triu_one_row", + // "test_triu_out_neg_out", + // "test_triu_out_pos", + // "test_triu_pos", + // "test_triu_square_neg", + // "test_triu_square", + // "test_triu_zero", + // "test_triu", + // // "test_unique_not_sorted_without_axis", + // // "test_unique_sorted_with_axis_3d", + // // "test_unique_sorted_with_axis", + // // "test_unique_sorted_with_negative_axis", + // // "test_unique_sorted_without_axis", + // "test_unsqueeze_axis_0", + // "test_unsqueeze_axis_1", + // "test_unsqueeze_axis_2", + // "test_unsqueeze_axis_3", + // "test_unsqueeze_negative_axes", + // "test_unsqueeze_three_axes", + // "test_unsqueeze_two_axes", + // "test_unsqueeze_unsorted_axes", + // "test_unsqueeze", + // "test_upsample_nearest", + // "test_where_example", + // "test_where_long_example", + // "test_xor_bcast3v1d", + // "test_xor_bcast3v2d", + // "test_xor_bcast4v2d", + // "test_xor_bcast4v3d", + // "test_xor_bcast4v4d", + // "test_xor2d", + // "test_xor3d", + // "test_xor4d" + ], + "ops": [] } } diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 814275cfc6c82..4b4ed56ff6d5a 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -261,9 +261,6 @@ export class TensorResultValidator { if (backend === 'cpu') { this.absoluteThreshold = CPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = CPU_THRESHOLD_RELATIVE_ERROR; - } else if (backend === 'js') { - this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; - this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'webgl') { if (TensorResultValidator.isHalfFloat === undefined) { TensorResultValidator.isHalfFloat = !createWebGLContext(ort.env.webgl.contextId).isRenderFloat32Supported; @@ -276,7 +273,7 @@ export class TensorResultValidator { this.absoluteThreshold = WEBGL_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGL_THRESHOLD_RELATIVE_ERROR; } - } else if (backend === 'webgpu') { + } else if (backend === 'webgpu' || backend === 'jsep-webgpu') { this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'wasm' || backend === 'xnnpack') { diff --git a/js/web/webpack.config.js b/js/web/webpack.config.js index d69c6e3b94060..1c842ddced25a 100644 --- a/js/web/webpack.config.js +++ b/js/web/webpack.config.js @@ -57,6 +57,7 @@ function defaultTerserPluginOptions(target) { const DEFAULT_BUILD_DEFS = { DISABLE_WEBGL: false, + DISABLE_WEBGPU: false, DISABLE_WASM: false, DISABLE_WASM_PROXY: false, DISABLE_WASM_THREAD: false, 
diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc
index 5564012221fec..23dd9176cede9 100644
--- a/onnxruntime/core/framework/execution_provider.cc
+++ b/onnxruntime/core/framework/execution_provider.cc
@@ -32,7 +32,7 @@ IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   const IKernelLookup& kernel_lookup) const {
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (const auto& node : graph.Nodes()) {
-    printf("IExecutionProvider::GetCapability() calling on node: [%s][%s][%s]\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str());
+    //printf("IExecutionProvider::GetCapability() calling on node: [%s][%s][%s]\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str());
     if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node);
         kernel_create_info != nullptr) {
       std::unique_ptr<IndexedSubGraph> sub_graph = std::make_unique<IndexedSubGraph>();
diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h
index 933aed4542c06..30f89940dbd1d 100644
--- a/onnxruntime/core/framework/kernel_lookup.h
+++ b/onnxruntime/core/framework/kernel_lookup.h
@@ -30,17 +30,17 @@ class KernelLookup final : public IExecutionProvider::IKernelLookup {
   const KernelCreateInfo* LookUpKernel(const Node& node) const override {
     const KernelCreateInfo* kernel_create_info{};
-    printf("  LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str());
+    //printf("  LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str());
     for (const auto& registry : kernel_registries_) {
       const auto lookup_status =
           registry->TryFindKernel(node, provider_type_, kernel_type_str_resolver_, &kernel_create_info);
       if (lookup_status.IsOK() && kernel_create_info != nullptr) {
-        printf("  - found\n");
+        //printf("  - found\n");
         return kernel_create_info;
       }
     }
 
-    printf("  - not found\n");
+    //printf("  - not found\n");
     return nullptr;
   }
diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc
index 652e2a8860e17..6b2a3d09c20b5 100644
--- a/onnxruntime/core/framework/kernel_registry.cc
+++ b/onnxruntime/core/framework/kernel_registry.cc
@@ -166,7 +166,7 @@ Status KernelRegistry::TryFindKernel(const Node& node,
   const auto& node_provider = node.GetExecutionProviderType();
   const auto& expected_provider = (node_provider.empty() ?
exec_provider : node_provider); - printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); + //printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); auto range = kernel_creator_fn_map_.equal_range(GetMapKey(node.OpType(), node.Domain(), expected_provider)); if (out) *out = nullptr; @@ -176,7 +176,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, std::string error_str; if (VerifyKernelDef(node, *i->second.kernel_def, kernel_type_str_resolver, error_str)) { if (out) *out = &i->second; - printf(" KernelRegistry::TryFindKernel() OK\n"); + //printf(" KernelRegistry::TryFindKernel() OK\n"); return Status::OK(); } verify_kernel_def_error_strs.push_back(error_str); @@ -193,11 +193,11 @@ Status KernelRegistry::TryFindKernel(const Node& node, oss << ")"; VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); - printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); + //printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } - printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); + //printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); return Status(common::ONNXRUNTIME, common::FAIL, "Kernel not found"); } diff --git a/onnxruntime/core/providers/js/allocator.cc b/onnxruntime/core/providers/js/allocator.cc index aed52855c421f..6345ef5b335f1 100644 --- a/onnxruntime/core/providers/js/allocator.cc +++ b/onnxruntime/core/providers/js/allocator.cc @@ -10,7 +10,6 @@ namespace onnxruntime { namespace js { void* JsCustomAllocator::Alloc(size_t size) { - printf("JsCustomAllocator::Alloc(%zu)\n", size); void* p = EM_ASM_PTR({return Module.jsepAlloc($0);}, size); stats_.num_allocs++; stats_.bytes_in_use += size; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 0079253e64fa2..c08dabe246585 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -74,13 +74,58 @@ ONNX_OPERATOR_KERNEL_EX( BuildKernelCreateInfo< \ ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, Start, type, Op)> -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 13, Abs); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Abs); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Abs); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Abs); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Neg); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Neg); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Floor); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Floor); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Ceil); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Ceil); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Reciprocal); +class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Reciprocal); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Sqrt); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Sqrt); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Exp); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Exp); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 12, Erf); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Erf); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Sin); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Cos); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Tan); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Asin); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Acos); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Atan); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Sinh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Cosh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Asinh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Acosh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Atanh); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, Elu); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 12, Add); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Add); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Add); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 12, Sub); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Sub); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Sub); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 12, Mul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Mul); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Mul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 12, Div); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Div); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Div); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 11, Pow); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, Pow); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 14, Pow); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 15, Pow); //class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); -// class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, 
kOnnxDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); // class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); @@ -103,11 +148,63 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing BuildKernelCreateInfo, BuildKernelCreateInfo, - KERNEL_CREATE_INFO_VERSIONED(1, 13, Abs), - KERNEL_CREATE_INFO(14, Abs), + + // element-wise operators + // unary - math + KERNEL_CREATE_INFO_VERSIONED(6, 12, Abs), + KERNEL_CREATE_INFO(13, Abs), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Neg), + KERNEL_CREATE_INFO(13, Neg), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Floor), + KERNEL_CREATE_INFO(13, Floor), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Ceil), + KERNEL_CREATE_INFO(13, Ceil), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Reciprocal), + KERNEL_CREATE_INFO(13, Reciprocal), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Sqrt), + KERNEL_CREATE_INFO(13, Sqrt), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Exp), + KERNEL_CREATE_INFO(13, Exp), + KERNEL_CREATE_INFO_VERSIONED(9, 12, Erf), + KERNEL_CREATE_INFO(13, Erf), + + KERNEL_CREATE_INFO(7, Sin), + KERNEL_CREATE_INFO(7, Cos), + KERNEL_CREATE_INFO(7, Tan), + KERNEL_CREATE_INFO(7, Asin), + KERNEL_CREATE_INFO(7, Acos), + KERNEL_CREATE_INFO(7, Atan), + KERNEL_CREATE_INFO(9, Sinh), + KERNEL_CREATE_INFO(9, Cosh), + KERNEL_CREATE_INFO(9, Asinh), + KERNEL_CREATE_INFO(9, Acosh), + KERNEL_CREATE_INFO(9, Atanh), + + // activations + KERNEL_CREATE_INFO(6, Elu), + + // binary - math + KERNEL_CREATE_INFO_VERSIONED(7, 12, Add), + KERNEL_CREATE_INFO_VERSIONED(13, 13, Add), + KERNEL_CREATE_INFO(14, Add), + KERNEL_CREATE_INFO_VERSIONED(7, 12, Sub), + KERNEL_CREATE_INFO_VERSIONED(13, 13, Sub), + KERNEL_CREATE_INFO(14, Sub), + KERNEL_CREATE_INFO_VERSIONED(7, 12, Mul), + KERNEL_CREATE_INFO_VERSIONED(13, 13, Mul), + KERNEL_CREATE_INFO(14, Mul), + KERNEL_CREATE_INFO_VERSIONED(7, 12, Div), + KERNEL_CREATE_INFO_VERSIONED(13, 13, Div), + KERNEL_CREATE_INFO(14, Div), + KERNEL_CREATE_INFO_VERSIONED(7, 11, Pow), + KERNEL_CREATE_INFO_VERSIONED(12, 12, Pow), + KERNEL_CREATE_INFO_VERSIONED(13, 14, Pow), + KERNEL_CREATE_INFO(15, Pow), + + //BuildKernelCreateInfo, BuildKernelCreateInfo, - //BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, // KERNEL_CREATE_INFO(11, Conv), // KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), @@ -166,16 +263,19 @@ std::vector> JsExecutionProvider::GetCapabili const IKernelLookup& kernel_lookup) const { auto list = IExecutionProvider::GetCapability(graph, kernel_lookup); - printf("JsExecutionProvider::GetCapability() results:\n"); + //printf("JsExecutionProvider::GetCapability() results:\n"); for (size_t i=0; i < list.size(); i++) { - printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); + //printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); for (size_t j=0;jsub_graph->nodes.size();j++) { auto node_index = list[i]->sub_graph->nodes[j]; auto *node = graph.GetNode(node_index); - //auto *kernel_info = kernel_lookup.LookUpKernel(&node); + auto *kernel_info = kernel_lookup.LookUpKernel(*node); - printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); + (void)(node_index); + (void)(node); + (void)(kernel_info); + //printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); } } diff --git 
a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h
index ac5f20f185288..9c5e653006ce9 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.h
+++ b/onnxruntime/core/providers/js/js_execution_provider.h
@@ -42,7 +42,7 @@ class JsExecutionProvider : public IExecutionProvider {

   void RegisterAllocator(AllocatorManager& /*allocator_manager*/) override;

-  DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; }
+  //DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; }

   FusionStyle GetFusionStyle() const override { return FusionStyle::FilteredGraphViewer; }

diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc
index 70e2157f0489b..7e36c2e38c37d 100644
--- a/onnxruntime/core/providers/js/js_export.cc
+++ b/onnxruntime/core/providers/js/js_export.cc
@@ -16,6 +16,7 @@ const void * JsepOutput(void * context, int index, void * data) {
     dims[i] = static_cast<int64_t>(*data_offset++);
   }

+  printf("JsepOutput(%d, %s)\n", index, onnxruntime::TensorShape(dims).ToString().c_str());
   auto output = reinterpret_cast<onnxruntime::OpKernelContext*>(context)->Output(index, onnxruntime::TensorShape(dims));
   return output->DataRaw();
 }
diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h
index 041512953f839..57052335aa1ce 100644
--- a/onnxruntime/core/providers/js/js_kernel.h
+++ b/onnxruntime/core/providers/js/js_kernel.h
@@ -14,7 +14,7 @@ namespace onnxruntime {
 namespace js {

 #define JSEP_INIT_KERNEL(x) EM_ASM({ Module.jsepCreateKernel(#x, $0, undefined); }, this)
-#define JSEP_INIT_KERNEL_ATTRIBUTE(x, a, ...) EM_ASM({ Module.jsepCreateKernel(#x, $0, a); }, this, __VA_ARGS__)
+#define JSEP_INIT_KERNEL_ATTRIBUTE(x, attr, ...) EM_ASM({ Module.jsepCreateKernel(#x, $0, attr); }, this, __VA_ARGS__)

 #define JSEP_KERNEL_IMPL(classname, x) \
 class classname : public JsKernel { \
@@ -24,14 +24,33 @@ public: \
   } \
 };

-#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, a, ...) \
+#define JSEP_KERNEL_TYPED_IMPL(classname, x) \
+template <typename T> \
 class classname : public JsKernel { \
 public: \
   classname(const OpKernelInfo& info) : JsKernel(info) { \
-    JSEP_INIT_KERNEL_ATTRIBUTE(x, a, __VA_ARGS__); \
+    JSEP_INIT_KERNEL(x); \
   } \
 };

+#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, attr_pre, attr, ...) \
+class classname : public JsKernel { \
+public: \
+  classname(const OpKernelInfo& info) : JsKernel(info) { \
+    attr_pre \
+    JSEP_INIT_KERNEL_ATTRIBUTE(x, attr, __VA_ARGS__); \
+  } \
+};
+
+#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(classname, x, attr_name, default_value, ...) \
+  JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, , ({#attr_name:$1}), static_cast(info.GetAttrOrDefault(#attr_name, 1.0)))
+
+#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT(classname, x, attr_name, ...) \
+  JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, \
+    float value; \
+    ORT_ENFORCE(info.GetAttr(#attr_name, &value)); , \
+    , ({#attr_name:$1}), static_cast(value))
+
 class JsKernel : public OpKernel {
  public:
   explicit JsKernel(const OpKernelInfo& info)
@@ -46,12 +65,12 @@ class JsKernel : public OpKernel {
     //
     // temp_data_format (every item is (u)int32_t):
-    //   input_count | [input_data_0] ... [input_data_N-1]
+    //   context_ptr | input_count | [input_data_0] ... [input_data_N-1]
     //
     // input_data_format:
     //   type | data_ptr | dim_size | dim[0] ...
dim[N-1] // - size_t temp_data_size = sizeof(size_t); + size_t temp_data_size = sizeof(size_t) * 2; for (int i = 0; i < context->InputCount(); i++) { temp_data_size += sizeof(size_t) * (3 + context->Input(i)->Shape().NumDimensions()); } @@ -68,13 +87,15 @@ class JsKernel : public OpKernel { } } - printf("temp data size: %zu. Data: ", temp_data_size); - for (int i=0; i < (int)temp_data_size/4;i++) {printf("%u ", p_inputs_data[i]); } - printf("\n"); + // printf("temp data size: %zu. Data: ", temp_data_size); + // for (int i=0; i < (int)temp_data_size/4;i++) { + // printf("%u ", p_inputs_data[i]); + // } + // printf("\n"); int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_inputs_data); - printf("outputs = %d. Y.data=%zu\n", context->OutputCount(), (size_t)(context->Output(0)->DataRaw())); + // printf("outputs = %d. Y.data=%zu\n", context->OutputCount(), (size_t)(context->Output(0)->DataRaw())); alloc->Free(p_inputs_data); if (status == 0) { diff --git a/onnxruntime/core/providers/js/operators/binary.cc b/onnxruntime/core/providers/js/operators/binary.cc new file mode 100644 index 0000000000000..030b2803e717f --- /dev/null +++ b/onnxruntime/core/providers/js/operators/binary.cc @@ -0,0 +1,55 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +#define REG_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ + ONNX_OPERATOR_KERNEL_EX( \ + OP_TYPE, \ + kOnnxDomain, \ + VERSION, \ + kJsExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); + +#define REG_ELEMENTWISE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, TYPE, KERNEL_CLASS) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + OP_TYPE, \ + kOnnxDomain, \ + VERSION_FROM, VERSION_TO, \ + kJsExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); + + +JSEP_KERNEL_IMPL(Add, Add) +REG_ELEMENTWISE_VERSIONED_KERNEL(Add, 7, 12, float, Add); +REG_ELEMENTWISE_VERSIONED_KERNEL(Add, 13, 13, float, Add); +REG_ELEMENTWISE_KERNEL(Add, 14, float, Add); + +JSEP_KERNEL_IMPL(Sub, Sub) +REG_ELEMENTWISE_VERSIONED_KERNEL(Sub, 7, 12, float, Sub); +REG_ELEMENTWISE_VERSIONED_KERNEL(Sub, 13, 13, float, Sub); +REG_ELEMENTWISE_KERNEL(Sub, 14, float, Sub); + +JSEP_KERNEL_IMPL(Mul, Mul) +REG_ELEMENTWISE_VERSIONED_KERNEL(Mul, 7, 12, float, Mul); +REG_ELEMENTWISE_VERSIONED_KERNEL(Mul, 13, 13, float, Mul); +REG_ELEMENTWISE_KERNEL(Mul, 14, float, Mul); + +JSEP_KERNEL_IMPL(Div, Div) +REG_ELEMENTWISE_VERSIONED_KERNEL(Div, 7, 12, float, Div); +REG_ELEMENTWISE_VERSIONED_KERNEL(Div, 13, 13, float, Div); +REG_ELEMENTWISE_KERNEL(Div, 14, float, Div); + +JSEP_KERNEL_IMPL(Pow, Pow) +REG_ELEMENTWISE_VERSIONED_KERNEL(Pow, 7, 11, float, Pow); +REG_ELEMENTWISE_VERSIONED_KERNEL(Pow, 12, 12, float, Pow); +REG_ELEMENTWISE_VERSIONED_KERNEL(Pow, 13, 14, float, Pow); +REG_ELEMENTWISE_KERNEL(Pow, 15, float, Pow); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index 1916d84eb2720..78c1385f6ff56 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -17,24 +17,23 @@ namespace js { T, \ kJsExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); \ + Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ - 
kOnnxDomain, \ + kOnnxDomain, \ 11, \ T, \ - kJsExecutionProvider, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Conv, \ + kOnnxDomain, \ + 1, 10, \ + T, \ + kJsExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); - -// ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( -// Conv, -// kOnnxDomain, -// 1, 10, -// T, -// kJsExecutionProvider, -// (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), -// Conv); + Conv); diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 96137b29803db..aad7daa152a4c 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace js { -template +template class Conv : public JsKernel { public: Conv(const OpKernelInfo& info) : JsKernel(info), conv_attrs_(info) { @@ -21,7 +21,7 @@ class Conv : public JsKernel { // currently only support Conv2D. TODO: support other JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ - "format": "NHWC", + "format": $13 ? "NHWC" : "NCHW", "autopad": $1, "dilation0": $2, "dilation1": $3, @@ -46,7 +46,8 @@ class Conv : public JsKernel { static_cast(conv_attrs_.pads[2]), static_cast(conv_attrs_.pads[3]), static_cast(conv_attrs_.strides[0]), - static_cast(conv_attrs_.strides[1]) + static_cast(conv_attrs_.strides[1]), + static_cast(is_channels_last) ); } diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index f8fdf3ec30ba5..83c20e2d06fc2 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -18,10 +18,78 @@ namespace js { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ KERNEL_CLASS); +// math JSEP_KERNEL_IMPL(Abs, Abs) -JSEP_ELEMENTWISE_VERSIONED_KERNEL(Abs, 1, 13, float, Abs) -JSEP_ELEMENTWISE_KERNEL(Abs, 14, float, Abs) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Abs, 6, 12, float, Abs) +JSEP_ELEMENTWISE_KERNEL(Abs, 13, float, Abs) + +JSEP_KERNEL_IMPL(Neg, Neg) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Neg, 6, 12, float, Neg) +JSEP_ELEMENTWISE_KERNEL(Neg, 13, float, Neg) + +JSEP_KERNEL_IMPL(Floor, Floor) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Floor, 6, 12, float, Floor) +JSEP_ELEMENTWISE_KERNEL(Floor, 13, float, Floor) + +JSEP_KERNEL_IMPL(Ceil, Ceil) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Ceil, 6, 12, float, Ceil) +JSEP_ELEMENTWISE_KERNEL(Ceil, 13, float, Ceil) + +JSEP_KERNEL_IMPL(Reciprocal, Reciprocal) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Reciprocal, 6, 12, float, Reciprocal) +JSEP_ELEMENTWISE_KERNEL(Reciprocal, 13, float, Reciprocal) + +JSEP_KERNEL_IMPL(Sqrt, Sqrt) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Sqrt, 6, 12, float, Sqrt) +JSEP_ELEMENTWISE_KERNEL(Sqrt, 13, float, Sqrt) + +JSEP_KERNEL_IMPL(Exp, Exp) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Exp, 6, 12, float, Exp) +JSEP_ELEMENTWISE_KERNEL(Exp, 13, float, Exp) + +JSEP_KERNEL_IMPL(Erf, Erf) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Erf, 9, 12, float, Erf) +JSEP_ELEMENTWISE_KERNEL(Erf, 13, float, Erf) + +JSEP_KERNEL_IMPL(Sin, Sin) +JSEP_ELEMENTWISE_KERNEL(Sin, 7, float, Sin) + +JSEP_KERNEL_IMPL(Cos, Cos) +JSEP_ELEMENTWISE_KERNEL(Cos, 7, float, Cos) + +JSEP_KERNEL_IMPL(Tan, Tan) +JSEP_ELEMENTWISE_KERNEL(Tan, 7, float, Tan) + +JSEP_KERNEL_IMPL(Asin, Asin) +JSEP_ELEMENTWISE_KERNEL(Asin, 7, float, Asin) + +JSEP_KERNEL_IMPL(Acos, 
Acos) +JSEP_ELEMENTWISE_KERNEL(Acos, 7, float, Acos) + +JSEP_KERNEL_IMPL(Atan, Atan) +JSEP_ELEMENTWISE_KERNEL(Atan, 7, float, Atan) + +JSEP_KERNEL_IMPL(Sinh, Sinh) +JSEP_ELEMENTWISE_KERNEL(Sinh, 9, float, Sinh) + +JSEP_KERNEL_IMPL(Cosh, Cosh) +JSEP_ELEMENTWISE_KERNEL(Cosh, 9, float, Cosh) + +JSEP_KERNEL_IMPL(Asinh, Asinh) +JSEP_ELEMENTWISE_KERNEL(Asinh, 9, float, Asinh) + +JSEP_KERNEL_IMPL(Acosh, Acosh) +JSEP_ELEMENTWISE_KERNEL(Acosh, 9, float, Acosh) + +JSEP_KERNEL_IMPL(Atanh, Atanh) +JSEP_ELEMENTWISE_KERNEL(Atanh, 9, float, Atanh) + +// activation + +JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(Elu, Elu, alpha, 1.0) +JSEP_ELEMENTWISE_KERNEL(Elu, 6, float, Elu) + } // namespace js } // namespace onnxruntime From f75ffeb01893c29f3ef10494b3ecabfc294c1c2d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 18 Nov 2022 16:48:51 -0800 Subject: [PATCH 16/81] 13 --- js/web/lib/wasm/jsep/init.ts | 18 +++++-- js/web/lib/wasm/jsep/tensor.ts | 5 ++ .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 5 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 43 +++++++-------- js/web/lib/wasm/jsep/webgpu/types.ts | 2 +- onnxruntime/core/framework/kernel_lookup.h | 6 +-- onnxruntime/core/framework/kernel_registry.cc | 8 +-- onnxruntime/core/providers/js/allocator.h | 15 ++++-- .../providers/js/js_execution_provider.cc | 52 +++++++++++++++---- onnxruntime/core/providers/js/js_kernel.h | 45 +++++++++------- .../core/providers/js/js_kernel_lookup.cc | 18 +++++++ .../core/providers/js/js_kernel_lookup.h | 22 ++++++++ .../core/providers/js/operators/unary.cc | 22 ++++++++ 13 files changed, 189 insertions(+), 72 deletions(-) create mode 100644 onnxruntime/core/providers/js/js_kernel_lookup.cc create mode 100644 onnxruntime/core/providers/js/js_kernel_lookup.h diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 988e04cd41e00..7797a16ac087b 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -5,10 +5,21 @@ import {OrtWasmModule} from '../binding/ort-wasm'; import {WebGpuBackend} from './backend-webgpu'; import {TensorView} from './tensor'; +import {ShapeUtil} from './util'; import {ComputeContext, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; /* eslint-disable no-bitwise */ +class TensorViewImpl implements TensorView { + constructor( + private module: OrtWasmModule, public readonly dataType: number, public readonly data: number, + public readonly dims: readonly number[]) {} + + getFloat32Array(): Float32Array { + return new Float32Array(this.module.HEAP8.buffer, this.data, ShapeUtil.size(this.dims)); + } +} + class OpKernelContext implements ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; @@ -29,13 +40,14 @@ class OpKernelContext implements ComputeContext { for (let d = 0; d < dim; d++) { dims.push(heapU32[dataIndex++]); } - inputs.push({dataType, data, dims}); + inputs.push(new TensorViewImpl(module, dataType, data, dims)); } this.inputs = inputs; } - compute(program: ProgramInfoLoader|ProgramInfo): number { - return this.backend.run(program, this.inputs, this.output.bind(this)); + compute(program: ProgramInfoLoader|ProgramInfo, inputIndices?: readonly number[]): number { + const mappedInputs = inputIndices?.map(i => this.inputs[i]) ?? 
this.inputs; + return this.backend.run(program, mappedInputs, this.output.bind(this)); } output(index: number, dims: readonly number[]): number { diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts index 27575c31b7dc1..384fa509e8c86 100644 --- a/js/web/lib/wasm/jsep/tensor.ts +++ b/js/web/lib/wasm/jsep/tensor.ts @@ -267,4 +267,9 @@ export interface TensorView { readonly data: number; readonly dataType: number; readonly dims: readonly number[]; + + /** + * get a Float32Array data view of the tensor data. tensor data must be on CPU. + */ + getFloat32Array(): Float32Array; } diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 9429eb05d1c92..a9eaf6251e386 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -30,9 +30,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new //['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], // ['Cast', '', '6+', cast, parseCastAttributes], - ['Ceil', [unaryOps.ceil]], - // ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], - //['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], + ['Ceil', [unaryOps.ceil]], ['ClipV10', [unaryOps.clip]], + ['Clip', [unaryOps.clipV11]], // ['Concat', '', '4+', concat, parseConcatAttributes], ['Conv', [conv, parseConvAttributes]], ['Cos', [unaryOps.cos]], ['Cosh', [unaryOps.cosh]], ['Div', [binaryOps.div]], // ['Dropout', '', '7+', unaryOps.identity], // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 0e9c5a372660a..e83e9c6a510d1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
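
// A minimal sketch, assuming the temp_data layout documented in js_kernel.h
// (context_ptr | input_count | then per input: type | data_ptr | dim_size |
// dim[0] ... dim[N-1]), of how the OpKernelContext constructor in init.ts
// above decodes the buffer serialized by JsKernel::Compute() on the C++ side.
// The function and field names here are illustrative only, not actual JSEP API:
const decodeKernelRunData = (heapU32: Uint32Array, dataByteOffset: number) => {
  let p = dataByteOffset >>> 2;            // every item is a 32-bit word
  const kernelContext = heapU32[p++];      // OpKernelContext* (context_ptr)
  const inputCount = heapU32[p++];
  const inputs: Array<{dataType: number; data: number; dims: number[]}> = [];
  for (let i = 0; i < inputCount; i++) {
    const dataType = heapU32[p++];         // ONNX element type enum
    const data = heapU32[p++];             // pointer into the wasm heap
    const rank = heapU32[p++];             // dim_size
    const dims: number[] = [];
    for (let d = 0; d < rank; d++) {
      dims.push(heapU32[p++]);
    }
    inputs.push({dataType, data, dims});
  }
  return {kernelContext, inputs};
};
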
import {TensorView} from '../../tensor'; -import {ShapeUtil} from '../../util'; +import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; @@ -89,32 +89,25 @@ export interface ClipAttributes extends AttributeWithCacheKey { readonly max: number; } -export const clip = (context: ComputeContext, attributes: ClipAttributes): number => - context.compute(createElementwiseProgramInfoLoader( +export const clip = (context: ComputeContext, attributes: ClipAttributes): number => context.compute( + createElementwiseProgramInfoLoader( context.inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` - let clip_min_: vec4 = vec4(f32(${attributes.min})); - let clip_max_: vec4 = vec4(f32(${attributes.max})); + const clip_min_: vec4 = vec4(f32(${attributes.min})); + const clip_max_: vec4 = vec4(f32(${attributes.max})); `, - attributes.cacheKey)); - -// export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey( -// {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)}); - -// const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => { -// if (inputs.length >= 3 && -// (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) { -// throw new Error('dynamic clip attributes are not allowed'); -// } - -// const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP; -// const max = (inputs.length >= 3) ? inputs[2].numberData[0] : MAX_CLIP; -// return createAttributeWithCacheKey({min, max}); -// }; - -// export const clipV11 = (context: ComputeContext ): number=> { -// const attributes = generateClipAttributesFromInputs(handler, inputs); -// return clip(handler, [inputs[0]], attributes); -// }; + attributes.cacheKey), + [0]); + +const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => { + const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP; + const max = (inputs.length >= 3) ? 
inputs[2].getFloat32Array()[0] : MAX_CLIP; + return createAttributeWithCacheKey({min, max}); +}; + +export const clipV11 = (context: ComputeContext): number => { + const attributes = generateClipAttributesFromInputs(context.inputs); + return clip(context, attributes); +}; export const ceil = (context: ComputeContext): number => context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Ceil', 'ceil')); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index c93529c44c722..e0790030f7502 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -98,6 +98,6 @@ export interface Artifact { export interface ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; - compute(program: ProgramInfoLoader|ProgramInfo): number; + compute(program: ProgramInfoLoader|ProgramInfo, inputIndices?: readonly number[]): number; output(index: number, dims: readonly number[]): number; } diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h index 30f89940dbd1d..933aed4542c06 100644 --- a/onnxruntime/core/framework/kernel_lookup.h +++ b/onnxruntime/core/framework/kernel_lookup.h @@ -30,17 +30,17 @@ class KernelLookup final : public IExecutionProvider::IKernelLookup { const KernelCreateInfo* LookUpKernel(const Node& node) const override { const KernelCreateInfo* kernel_create_info{}; - //printf(" LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str()); + printf(" LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str()); for (const auto& registry : kernel_registries_) { const auto lookup_status = registry->TryFindKernel(node, provider_type_, kernel_type_str_resolver_, &kernel_create_info); if (lookup_status.IsOK() && kernel_create_info != nullptr) { - //printf(" - found\n"); + printf(" - found\n"); return kernel_create_info; } } - //printf(" - not found\n"); + printf(" - not found\n"); return nullptr; } diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 6b2a3d09c20b5..652e2a8860e17 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -166,7 +166,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, const auto& node_provider = node.GetExecutionProviderType(); const auto& expected_provider = (node_provider.empty() ? 
exec_provider : node_provider); - //printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); + printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); auto range = kernel_creator_fn_map_.equal_range(GetMapKey(node.OpType(), node.Domain(), expected_provider)); if (out) *out = nullptr; @@ -176,7 +176,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, std::string error_str; if (VerifyKernelDef(node, *i->second.kernel_def, kernel_type_str_resolver, error_str)) { if (out) *out = &i->second; - //printf(" KernelRegistry::TryFindKernel() OK\n"); + printf(" KernelRegistry::TryFindKernel() OK\n"); return Status::OK(); } verify_kernel_def_error_strs.push_back(error_str); @@ -193,11 +193,11 @@ Status KernelRegistry::TryFindKernel(const Node& node, oss << ")"; VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); - //printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); + printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } - //printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); + printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); return Status(common::ONNXRUNTIME, common::FAIL, "Kernel not found"); } diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h index 6a6663c4c3e31..5f7a6aabf4984 100644 --- a/onnxruntime/core/providers/js/allocator.h +++ b/onnxruntime/core/providers/js/allocator.h @@ -9,11 +9,20 @@ namespace onnxruntime { namespace js { -class JsCPUAllocator : public CPUAllocator { +class JsCPUInputAllocator : public CPUAllocator { public: - JsCPUAllocator() + JsCPUInputAllocator() : CPUAllocator( - OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator, + OrtMemoryInfo("JsCPUInputAllocator", OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), + 0, OrtMemTypeCPUInput)){}; +}; + +class JsCPUOutputAllocator : public CPUAllocator { + public: + JsCPUOutputAllocator() + : CPUAllocator( + OrtMemoryInfo("JsCPUOutputAllocator", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), 0, OrtMemTypeCPUOutput)){}; }; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index c08dabe246585..a41235d310538 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -15,6 +15,7 @@ #include "core/providers/shared/node_unit/node_unit.h" #include "allocator.h" #include "data_transfer.h" +#include "js_kernel_lookup.h" namespace onnxruntime { @@ -103,6 +104,10 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Asin class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Acosh); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, Atanh); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 10, Clip); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, Clip); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, Clip); +class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Clip); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, Elu); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 12, Add); @@ -181,6 +186,10 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(9, Atanh), // activations + KERNEL_CREATE_INFO_VERSIONED(6, 10, Clip), + KERNEL_CREATE_INFO_VERSIONED(11, 11, Clip), + KERNEL_CREATE_INFO_VERSIONED(12, 12, Clip), + KERNEL_CREATE_INFO(13, Clip), KERNEL_CREATE_INFO(6, Elu), // binary - math @@ -244,10 +253,18 @@ JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) // implement RegisterAllocator to test/validate sharing the CPU EP's allocator void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { - AllocatorCreationInfo default_memory_info([&](int) { return std::make_unique(); }); - AllocatorPtr default_allocator = CreateAllocator(default_memory_info); - InsertAllocator(default_allocator); + printf("JsExecutionProvider::RegisterAllocator() \n"); + + AllocatorCreationInfo cpuInputAllocatorCreationInfo([&](int) { + return std::make_unique(); + }); + InsertAllocator(CreateAllocator(cpuInputAllocatorCreationInfo)); + + AllocatorCreationInfo cpuOutputAllocatorCreationInfo([&](int) { + return std::make_unique(); + }); + InsertAllocator(CreateAllocator(cpuOutputAllocatorCreationInfo)); // use_arena might have some issue, for this to work need to change // https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/framework/execution_frame.cc#L507 @@ -262,20 +279,33 @@ std::vector> JsExecutionProvider::GetCapabili const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup) const { - auto list = IExecutionProvider::GetCapability(graph, kernel_lookup); - //printf("JsExecutionProvider::GetCapability() results:\n"); + auto lookup = JsKernelLookup{kernel_lookup}; + auto list = IExecutionProvider::GetCapability(graph, lookup); + printf("JsExecutionProvider::GetCapability() results:\n"); - for (size_t i=0; i < list.size(); i++) { - //printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); - for (size_t j=0;jsub_graph->nodes.size();j++) { - auto node_index = list[i]->sub_graph->nodes[j]; + for (size_t i = 0; i < list.size(); i++) { + auto &nodes = list[i]->sub_graph->nodes; + printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); + for (size_t j = 0; j < nodes.size(); j++) { + auto node_index = nodes[j]; auto *node = graph.GetNode(node_index); - auto *kernel_info = kernel_lookup.LookUpKernel(*node); + auto *kernel_info = lookup.LookUpKernel(*node); (void)(node_index); (void)(node); (void)(kernel_info); - //printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); + printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); + + // if (node->OpType() == "Clip" && node->InputDefs().size() == 3) { + // printf("Clip node: [%s] %s, %s\n", node->Name().c_str(), node->InputDefs()[1]->Name().c_str(), node->InputDefs()[2]->Name().c_str()); + // if (!graph.IsConstantInitializer(node->InputDefs()[1]->Name(), true) || + // !graph.IsConstantInitializer(node->InputDefs()[2]->Name(), true)) { + // printf("--erasing\n"); + // nodes.erase(nodes.begin() + j); + // j--; + // continue; + // } + // } } } diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 
57052335aa1ce..7f7485ed7e719 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -13,44 +13,51 @@ struct pthreadpool; namespace onnxruntime { namespace js { -#define JSEP_INIT_KERNEL(x) EM_ASM({ Module.jsepCreateKernel(#x, $0, undefined); }, this) -#define JSEP_INIT_KERNEL_ATTRIBUTE(x, attr, ...) EM_ASM({ Module.jsepCreateKernel(#x, $0, attr); }, this, __VA_ARGS__) +#define JSEP_INIT_KERNEL(optype) EM_ASM({ Module.jsepCreateKernel(#optype, $0, undefined); }, this) +#define JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, ...) EM_ASM({ Module.jsepCreateKernel(#optype, $0, attr); }, this, __VA_ARGS__) -#define JSEP_KERNEL_IMPL(classname, x) \ +#define JSEP_KERNEL_IMPL(classname, optype) \ class classname : public JsKernel { \ public: \ classname(const OpKernelInfo& info) : JsKernel(info) { \ - JSEP_INIT_KERNEL(x); \ + JSEP_INIT_KERNEL(optype); \ } \ }; -#define JSEP_KERNEL_TYPED_IMPL(classname, x) \ +#define JSEP_KERNEL_TYPED_IMPL(classname, optype) \ template \ class classname : public JsKernel { \ public: \ classname(const OpKernelInfo& info) : JsKernel(info) { \ - JSEP_INIT_KERNEL(x); \ + JSEP_INIT_KERNEL(optype); \ } \ }; -#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, attr_pre, attr, ...) \ +#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, attr_pre, attr, ...) \ class classname : public JsKernel { \ public: \ classname(const OpKernelInfo& info) : JsKernel(info) { \ attr_pre \ - JSEP_INIT_KERNEL_ATTRIBUTE(x, attr, __VA_ARGS__); \ + JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, __VA_ARGS__); \ } \ }; -#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(classname, x, attr_name, default_value, ...) \ - JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, , ({#attr_name:$1}), static_cast(info.GetAttrOrDefault(#attr_name, 1.0))) +#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(classname, optype, attr_name, default_value, ...) \ + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name:$1}), static_cast(info.GetAttrOrDefault(#attr_name, default_value))) -#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT(classname, x, attr_name, ...) \ - JSEP_CLASS_IMPL_ATTRIBUTE(classname, x, \ - float value; \ - ORT_ENFORCE(info.GetAttr(#attr_name, &value)); , \ +#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_2_DEFAULT(classname, optype, attr_name_1, default_value_1, attr_name_2, default_value_2, ...) \ + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name_1:$1, #attr_name_2:$2}), \ + static_cast(info.GetAttrOrDefault(#attr_name_1, default_value_1)), \ + static_cast(info.GetAttrOrDefault(#attr_name_2, default_value_2))) + + +#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT(classname, optype, attr_name, ...) \ + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, \ + float value; \ + ORT_ENFORCE(info.GetAttr(#attr_name, &value)); , \ , ({#attr_name:$1}), static_cast(value)) + class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) @@ -87,11 +94,11 @@ class JsKernel : public OpKernel { } } - // printf("temp data size: %zu. Data: ", temp_data_size); - // for (int i=0; i < (int)temp_data_size/4;i++) { - // printf("%u ", p_inputs_data[i]); - // } - // printf("\n"); + printf("temp data size: %zu. 
Data: ", temp_data_size); + for (int i=0; i < (int)temp_data_size/4;i++) { + printf("%u ", p_inputs_data[i]); + } + printf("\n"); int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_inputs_data); diff --git a/onnxruntime/core/providers/js/js_kernel_lookup.cc b/onnxruntime/core/providers/js/js_kernel_lookup.cc new file mode 100644 index 0000000000000..18108589bfcd1 --- /dev/null +++ b/onnxruntime/core/providers/js/js_kernel_lookup.cc @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "js_kernel_lookup.h" + +namespace onnxruntime { +namespace js { + +const KernelCreateInfo* JsKernelLookup::LookUpKernel(const Node& node) const { + // if (node.OpType() == "Clip") { + // node. + // } + + return orig_.LookUpKernel(node); +} + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_kernel_lookup.h b/onnxruntime/core/providers/js/js_kernel_lookup.h new file mode 100644 index 0000000000000..aad3da26778fe --- /dev/null +++ b/onnxruntime/core/providers/js/js_kernel_lookup.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/execution_provider.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace js { + +class JsKernelLookup : public IExecutionProvider::IKernelLookup { + public: + JsKernelLookup(const IKernelLookup& orig): orig_(orig) { + } + const KernelCreateInfo* LookUpKernel(const Node& node) const override; + private: + const IKernelLookup& orig_; +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index 83c20e2d06fc2..06f8b82755410 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -87,6 +87,28 @@ JSEP_ELEMENTWISE_KERNEL(Atanh, 9, float, Atanh) // activation +JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_2_DEFAULT(ClipV10, ClipV10, min, 3.402823e+38f, max, -3.402823e+38f) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Clip, 6, 10, float, ClipV10) +JSEP_KERNEL_IMPL(Clip, Clip) +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 11, 11, kJsExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2), + Clip); +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 12, 12, kJsExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2), + Clip); +ONNX_OPERATOR_KERNEL_EX(Clip, kOnnxDomain, 13, kJsExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2), + Clip); + JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(Elu, Elu, alpha, 1.0) JSEP_ELEMENTWISE_KERNEL(Elu, 6, float, Elu) From 7e3a412bfc0f03fd1e126c17f1869fac302f669d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 18 Nov 2022 17:44:58 -0800 Subject: [PATCH 17/81] w --- bb.bat | 20 +++++++++++++++++++- br.bat | 20 +++++++++++++++++++- js/web/lib/wasm/wasm-core-impl.ts | 1 - 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/bb.bat b/bb.bat index 80418a42c2da0..0256f9ae2f848 100644 --- a/bb.bat +++ b/bb.bat @@ -1,6 +1,24 @@ -call 
.\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 +@echo off + +if ["%~1"]==["--clean"] ( + if exist "%~dp0build\Windows\Debug" ( + rd /s /q %~dp0build\Windows\Debug + ) +) + +setlocal + +if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( + set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe +) else ( + set protoc_path_flag= +) + +call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 IF %ERRORLEVEL% == 0 ( copy /Y .\build\Windows\Debug\ort-wasm.js .\js\web\lib\wasm\binding\ copy /Y .\build\Windows\Debug\ort-wasm.wasm .\js\web\dist\ ) + +endlocal diff --git a/br.bat b/br.bat index 828acb95dd337..6dc696596ce1d 100644 --- a/br.bat +++ b/br.bat @@ -1,6 +1,24 @@ -call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" --target onnxruntime_webassembly +@echo off + +if ["%~1"]==["--clean"] ( + if exist "%~dp0build\Windows\Release" ( + rd /s /q %~dp0build\Windows\Release + ) +) + +setlocal + +if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( + set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe +) else ( + set protoc_path_flag= +) + +call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly IF %ERRORLEVEL% == 0 ( copy /Y .\build\Windows\Release\ort-wasm.js .\js\web\lib\wasm\binding\ copy /Y .\build\Windows\Release\ort-wasm.wasm .\js\web\dist\ ) + +endlocal diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index a2e0efb65ad8c..b0778cc6060e0 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -3,7 +3,6 @@ import {InferenceSession, Tensor} from 'onnxruntime-common'; -import {init} from './jsep'; import {SerializableModeldata, SerializableSessionMetadata, SerializableTensor} from './proxy-messages'; import {setRunOptions} from './run-options'; import {setSessionOptions} from './session-options'; From 764715b1ba43548190a20ca600785b6b6ad26ccb Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 21 Nov 2022 18:38:06 -0800 Subject: [PATCH 18/81] w2 --- bb.bat | 6 +- br.bat | 6 +- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 12 +- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 11 +- js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 736 +++++++++--------- js/web/test/suite-test-list.jsonc | 44 +- .../providers/js/js_execution_provider.cc | 27 +- .../core/providers/js/operators/conv.h | 36 +- .../core/providers/js/operators/pool.cc | 71 ++ .../core/providers/js/operators/pool.h | 71 ++ 10 files changed, 576 insertions(+), 444 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/pool.cc create mode 100644 onnxruntime/core/providers/js/operators/pool.h diff --git a/bb.bat b/bb.bat index 0256f9ae2f848..70400d2630eaa 100644 --- a/bb.bat +++ b/bb.bat @@ -14,11 +14,11 @@ if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( set 
protoc_path_flag= ) -call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 +call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_xnnpack --enable_wasm_simd --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 IF %ERRORLEVEL% == 0 ( -copy /Y .\build\Windows\Debug\ort-wasm.js .\js\web\lib\wasm\binding\ -copy /Y .\build\Windows\Debug\ort-wasm.wasm .\js\web\dist\ +copy /Y .\build\Windows\Debug\ort-wasm-simd.js .\js\web\lib\wasm\binding\ort-wasm.js +copy /Y .\build\Windows\Debug\ort-wasm-simd.wasm .\js\web\dist\ ) endlocal diff --git a/br.bat b/br.bat index 6dc696596ce1d..fef438314e676 100644 --- a/br.bat +++ b/br.bat @@ -14,11 +14,11 @@ if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( set protoc_path_flag= ) -call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly +call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_xnnpack --enable_wasm_simd --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly IF %ERRORLEVEL% == 0 ( -copy /Y .\build\Windows\Release\ort-wasm.js .\js\web\lib\wasm\binding\ -copy /Y .\build\Windows\Release\ort-wasm.wasm .\js\web\dist\ +copy /Y .\build\Windows\Release\ort-wasm-simd.js .\js\web\lib\wasm\binding\ +copy /Y .\build\Windows\Release\ort-wasm-simd.wasm .\js\web\dist\ ) endlocal diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index a9eaf6251e386..082fdc7a3c2ab 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -7,8 +7,8 @@ import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; // import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; // import {matMul, parseMatMulAttributes} from './ops/matmul'; -// import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, -// parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; import {sum} from +import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; +// import {sum} from // './ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; // import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; // import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; @@ -27,7 +27,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['And', '', '7+', binaryOps.and], ['Asin', [unaryOps.asin]], ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], // TODO: support new attributes for AveragePool-10 - //['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], + ['AveragePool', [averagePool, parseAveragePoolAttributes]], // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], // ['Cast', '', '6+', cast, 
parseCastAttributes], ['Ceil', [unaryOps.ceil]], ['ClipV10', [unaryOps.clip]], @@ -42,8 +42,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], //['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], //['Gemm', '', '11+', gemm, parseGemmAttributesV11], - //['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes], - //['GlobalMaxPool', '', '1+', globalMaxPool], + ['GlobalAveragePool', [globalAveragePool]], ['GlobalMaxPool', [globalMaxPool]], // ['Greater', '', '7+', binaryOps.greater], // ['Identity', '', '1+', unaryOps.identity], // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], @@ -52,8 +51,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Less', '', '7+', binaryOps.less], //['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], // TODO: support new attributes for MaxPool-8 and MaxPool-10 - //['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], - ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], + ['MaxPool', [maxPool, parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], // ['Not', '', '1+', unaryOps.not], // ['Or', '', '7+', binaryOps.or], // ['Pad', '', '2-10', padV2, parsePadAttributesV2], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 47e8f68f5a0f0..79d8c7ce38976 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -121,13 +121,12 @@ export const parseConvAttributes = (attributes: Record): ConvAt const activationAttributes = parseInternalActivationAttributes(attributes); // TODO : Make this generic enough to compute default attributes for multi-dimensional conv const format = attributes.format as 'NHWC' | 'NCHW'; - const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.autopad as number]; - const dilations = [attributes.dilation0 as number, attributes.dilation1 as number]; + const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number]; + const dilations = attributes.dilations as [number, number]; const group = attributes.group as number; - const kernelShape = [attributes.kernelshape0 as number, attributes.kernelshape1 as number]; - const pads = - [attributes.pad0 as number, attributes.pad1 as number, attributes.pad2 as number, attributes.pad3 as number]; - const strides = [attributes.stride0 as number, attributes.stride1 as number]; + const kernelShape = attributes.kernel_shape as [number, number]; + const pads = attributes.pads as [number, number, number, number]; + const strides = attributes.strides as [number, number]; return createAttributeWithCacheKey( {autoPad, format, dilations, group, kernelShape, pads, strides, ...activationAttributes}); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 801064aef12d1..c620038f1397d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -1,378 +1,364 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
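
// The pool ops re-enabled in op-resolve-rules.ts above follow the same pattern
// as parseConvAttributes in conv.ts: attributes now arrive as a plain record
// passed up from the C++ kernel rather than a Graph.Node. A rough sketch of
// what such a parser looks like, assuming spec-named snake_case keys; the
// exact record shape and the helper name are assumptions, not this patch's
// actual code:
export const parseAveragePoolAttributesSketch = (attributes: Record<string, unknown>) => {
  const autoPad = ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number];
  const ceilMode = attributes.ceil_mode as number;
  const countIncludePad = (attributes.count_include_pad as number) !== 0;
  const kernelShape = attributes.kernel_shape as number[];
  const strides = attributes.strides as number[];
  const pads = attributes.pads as number[];
  // ceil_mode is not supported yet, matching the old implementation removed below.
  if (ceilMode !== 0) {
    throw new Error('using ceil() in shape computation is not yet supported for AveragePool');
  }
  return {autoPad, ceilMode, countIncludePad, kernelShape, strides, pads};
};
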
-// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {Graph} from '../../../graph'; -// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {PoolConvUtil, ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -// export interface AveragePoolAttributes extends AttributeWithCacheKey { -// readonly autoPad: string; -// readonly ceilMode: number; -// readonly countIncludePad: boolean; -// readonly kernelShape: readonly number[]; -// readonly strides: readonly number[]; -// readonly pads: readonly number[]; -// } - -// export const averagePool: OperatorAsyncImplementation = -// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): -// Promise => { -// validateInputs(inputs); -// const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; -// return inferenceHandler.run( -// {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs); -// }; - -// export const parseAveragePoolAttributes: OperatorInitialization = -// (node: Graph.Node): AveragePoolAttributes => { -// const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); -// const ceilMode = node.attributes.getInt('ceil_mode', 0); -// const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); -// const kernelShape = node.attributes.getInts('kernel_shape'); -// const strides = node.attributes.getInts('strides', []); -// const pads = node.attributes.getInts('pads', []); - -// // TODO: support attribute 'ceil_mode' -// if (ceilMode !== 0) { -// throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); -// } - -// return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads}); -// }; - -// const createAveragePoolProgramInfo = -// (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, -// attributes: AveragePoolAttributes): ProgramInfo => { -// const [adjustedAttributes, outputShape] = -// getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); -// const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); - -// const dataType = 'f32'; - -// const op1 = 'value += x_val;'; -// let op2 = ''; -// if (adjustedAttributes.countIncludePad) { -// op2 += `value /= ${dataType}(${kernelSize});`; -// } else { -// op2 += `value /= ${dataType}(${kernelSize} - pad);`; -// } -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, -// '0.0'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) -// }; -// }; - -// export const globalAveragePool: OperatorAsyncImplementation = -// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): -// Promise => { -// validateInputs(inputs); -// const metadata = { -// name: 'GlobalAveragePool', -// inputTypes: [GpuDataType.default], -// cacheHint: `${attributes.countIncludePad}` -// }; -// return inferenceHandler.run( -// {...metadata, get: () => 
createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs); -// }; - -// export const parseGlobalAveragePoolAttributes: OperatorInitialization = -// (node: Graph.Node): AveragePoolAttributes => { -// const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? false : true); -// return createAttributeWithCacheKey( -// {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []}); -// }; - -// export interface MaxPoolAttributes extends AveragePoolAttributes { -// readonly storageOrder: number; -// readonly dilations: number[]; -// } - -// export const maxPool: OperatorAsyncImplementation = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise -// => { -// validateInputs(inputs); -// const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; -// return inferenceHandler.run( -// {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs); -// }; - -// export const parseMaxPoolAttributes: OperatorInitialization = -// (node: Graph.Node): MaxPoolAttributes => { -// const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); -// const ceilMode = node.attributes.getInt('ceil_mode', 0); -// const kernelShape = node.attributes.getInts('kernel_shape'); -// const strides = node.attributes.getInts('strides', []); -// const pads = node.attributes.getInts('pads', []); -// const storageOrder = node.attributes.getInt('storage_order', 0); -// const dilations = node.attributes.getInts('dilations', []); - -// // TODO: support attribute 'ceil_mode' and 'storage_order' -// if (storageOrder !== 0) { -// throw new Error('column major storage order is not yet supported for MaxPool'); -// } -// if (ceilMode !== 0) { -// throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); -// } - -// return createAttributeWithCacheKey( -// {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations}); -// }; - -// const createMaxPoolProgramInfo = -// (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes): -// ProgramInfo => { -// const [adjustedAttributes, outputShape] = -// getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); -// const op1 = ` -// value = max(x_val, value); -// `; -// const op2 = ''; -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', -// '-1e5'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) -// }; -// }; - -// const getAdjustedPoolAttributesAndOutputShape = -// (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean): -// [AveragePoolAttributes|MaxPoolAttributes, number[]] => { -// const inputShape = inputs[0].dims.slice(); -// const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); -// const kernelShape = attributes.kernelShape.slice(); -// const strides = attributes.strides.slice(); -// const dilations: number[] = hasDilations ? 
(attributes as MaxPoolAttributes).dilations.slice() : []; -// const pads = attributes.pads.slice(); -// PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); - -// const outputShape = PoolConvUtil.computePoolOutputShape( -// isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); - -// const newAttributes = Object.assign({}, attributes); -// if (hasDilations) { -// Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); -// } else { -// Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); -// } -// return [newAttributes, outputShape]; -// }; - -// const globalMaxPoolAttributes = { -// autoPad: '', -// ceilMode: 0, -// countIncludePad: false, -// kernelShape: [], -// strides: [], -// pads: [], -// storageOrder: 0, -// dilations: [], -// cacheKey: '' -// }; - -// const globalMaxPoolMetadata = { -// name: 'GlobalMaxPool', -// inputTypes: [GpuDataType.default] -// }; - -// export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise -// => { -// validateInputs(inputs); -// return inferenceHandler.run( -// { -// ...globalMaxPoolMetadata, -// get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes) -// }, -// inputs); -// }; - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 1) { -// throw new Error('Pool ops requires 1 input.'); -// } -// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { -// throw new Error('Invalid input type.'); -// } -// }; - -// const generatePoolingCode = -// (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string, -// op2: string, dataType: string, start: string): string => { -// const rank = inputDims.length; -// const outputSize = ShapeUtil.size(outputShape); -// const outputIndicesHelper = createIndicesHelper('output', outputShape); -// const xIndicesHelper = createIndicesHelper('x', inputDims); - -// if (attributes.kernelShape.length <= 2) { -// const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; -// const sw = attributes.strides[attributes.strides.length - 1]; -// const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; -// const pwEnd = attributes.pads[attributes.pads.length - 1]; -// const dimW = inputDims[rank - 1]; -// let codeW = ''; -// let codeH = ''; -// let codeHEnd = ''; -// if (pwStart + pwEnd !== 0) { -// codeW = ` -// for (var i: u32 = 0u; i < ${kw}u; i++) { -// xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; -// if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { -// pad++; -// continue; -// } -// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; -// ${op1} -// }`; -// } else { -// codeW = ` -// for (var i: u32 = 0u; i < ${kw}u; i++) { -// xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; -// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; -// ${op1} -// }`; -// } - -// if (attributes.kernelShape.length === 2) { -// const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; -// const sh = attributes.strides[attributes.strides.length - 2]; -// const phStart = attributes.pads[attributes.pads.length / 2 - 2]; -// const phEnd = attributes.pads[attributes.pads.length - 2]; -// const dimH = inputDims[rank - 2]; -// if (phStart + phEnd !== 0) { -// codeH = ` -// for (var j: u32 = 
0u; j < ${kh}u; j++) { -// xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; -// if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { -// pad+= ${kw}; -// continue; -// } -// `; -// } else { -// codeH = ` -// for (var j: u32 = 0u; j < ${kh}u; j++) { -// xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; -// `; -// } -// codeHEnd = ` -// } -// `; -// } - -// const poolingCode = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// @group(0) @binding(0) var x : array<${dataType}>; -// @group(0) @binding(1) var output : array<${dataType}>; - -// ${outputIndicesHelper.o2iImpl} -// ${xIndicesHelper.i2oImpl} - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// ${outputIndicesHelper.indicesVariableDeclaration('indices')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} -// ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - -// var value: ${dataType} = ${dataType}(${start}); -// var pad = 0; -// ${codeH} -// ${codeW} -// ${codeHEnd} -// ${op2} - -// output[global_id.x] = value; -// }`; -// return poolingCode; -// } else { -// const kernelSize = ShapeUtil.size(attributes.kernelShape); -// const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); -// const stridesRank = kernelStrides.length; -// const padsRank = attributes.pads.length; -// const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); -// let padCode = ''; -// if (hasPads) { -// padCode = ` -// if (xIndices[j] >= inputDims[j]) { -// pad++; -// isPad = true; -// break; -// } -// } -// if (!isPad) { -// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; -// ${op1} -// }`; -// } else { -// padCode = ` -// } -// let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; -// ${op1} -// `; -// } -// const poolingCode = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// @group(0) @binding(0) var x : array<${dataType}>; -// @group(0) @binding(1) var output : array<${dataType}>; - -// ${outputIndicesHelper.o2iImpl} -// ${xIndicesHelper.i2oImpl} - -// const pads = array(${attributes.pads.map(i => `${i}u`).join(',')}); -// const inputDims = array(${inputDims.map(i => `${i}u`).join(',')}); -// const kernelStrides = array(${kernelStrides.map(i => `${i}u`).join(',')}); -// const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// ${outputIndicesHelper.indicesVariableDeclaration('indices')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} -// ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - -// var offsets: array; - -// var value = ${dataType}(${start}); -// var pad = 0; -// var isPad = false; - -// for (var i: u32 = 0u; i < ${kernelSize}u; i++) { -// var offset = i; -// for (var j = 0u; j < ${stridesRank - 1}u; j++) { -// offsets[j] = offset / kernelStrides[j]; -// offset -= offsets[j] * kernelStrides[j]; -// } -// offsets[${stridesRank - 1}] = offset; - -// isPad = false; -// for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { -// xIndices[j] = indices[j] * 
strides[j - ${rank - stridesRank}u] -// + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; -// ${padCode} -// } -// ${op2} - -// output[global_id.x] = value; -// }`; -// return poolingCode; -// } -// }; +import {DataType} from '../../../wasm-core-impl'; +import {TensorView} from '../../tensor'; +import {PoolConvUtil, ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +// TODO: support: +// - ceil_mode "test_maxpool_2d_ceil" +// - storage_order "test_maxpool_with_argmax_2d_precomputed_strides" +// - [MaxPool] dilations "test_maxpool_2d_dilations" +// - [MaxPool] output[1] "test_maxpool_with_argmax_2d_precomputed_pads" + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Pool ops require 1 input.'); + } + if (inputs[0].dims.length !== 4) { + throw new Error('Pool ops support 2-D inputs only for now.'); + } + if (inputs[0].dataType !== DataType.float) { + throw new Error('Invalid input type.'); + } +}; + +const getAdjustedPoolAttributesAndOutputShape = <AttributeType extends AveragePoolAttributes|MaxPoolAttributes>( + inputs: readonly TensorView[], attributes: AttributeType, isGlobalOperator: boolean): [AttributeType, number[]] => { + const inputShape = inputs[0].dims.slice(); + const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); + const kernelShape = attributes.kernelShape.slice(); + const strides = attributes.strides.slice(); + const dilations: number[] = hasDilations ? (attributes as MaxPoolAttributes).dilations.slice() : []; + const pads = attributes.pads.slice(); + PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); + + const outputShape = PoolConvUtil.computePoolOutputShape( + isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); + + const newAttributes = Object.assign({}, attributes); + if (hasDilations) { + Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); + } else { + Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); + } + return [newAttributes, outputShape]; +}; + +const generatePoolingCode = <AttributeType extends AveragePoolAttributes|MaxPoolAttributes>( + inputDims: readonly number[], outputShape: readonly number[], attributes: AttributeType, op1: string, op2: string, + dataType: string, start: string): string => { + const rank = inputDims.length; + const outputSize = ShapeUtil.size(outputShape); + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const xIndicesHelper = createIndicesHelper('x', inputDims); + + if (attributes.kernelShape.length <= 2) { + const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; + const sw = attributes.strides[attributes.strides.length - 1]; + const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; + const pwEnd = attributes.pads[attributes.pads.length - 1]; + const dimW = inputDims[rank - 1]; + let codeW = ''; + let codeH = ''; + let codeHEnd = ''; + if (pwStart + pwEnd !== 0) { + codeW = ` + for (var i: u32 = 0u; i < ${kw}u; i++) { + xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; + if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { + pad++; + continue; + } + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } else { + codeW = ` + for (var i: u32 = 0u; i < 
${kw}u; i++) { + xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } + + if (attributes.kernelShape.length === 2) { + const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; + const sh = attributes.strides[attributes.strides.length - 2]; + const phStart = attributes.pads[attributes.pads.length / 2 - 2]; + const phEnd = attributes.pads[attributes.pads.length - 2]; + const dimH = inputDims[rank - 2]; + if (phStart + phEnd !== 0) { + codeH = ` + for (var j: u32 = 0u; j < ${kh}u; j++) { + xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; + if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { + pad+= ${kw}; + continue; + } + `; + } else { + codeH = ` + for (var j: u32 = 0u; j < ${kh}u; j++) { + xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; + `; + } + codeHEnd = ` + } + `; + } + + const poolingCode = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + @group(0) @binding(0) var<storage, read> x : array<${dataType}>; + @group(0) @binding(1) var<storage, read_write> output : array<${dataType}>; + + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + + var value: ${dataType} = ${dataType}(${start}); + var pad = 0; + ${codeH} + ${codeW} + ${codeHEnd} + ${op2} + + output[global_id.x] = value; + }`; + return poolingCode; + } else { + const kernelSize = ShapeUtil.size(attributes.kernelShape); + const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); + const stridesRank = kernelStrides.length; + const padsRank = attributes.pads.length; + const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); + let padCode = ''; + if (hasPads) { + padCode = ` + if (xIndices[j] >= inputDims[j]) { + pad++; + isPad = true; + break; + } + } + if (!isPad) { + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + }`; + } else { + padCode = ` + } + let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; + ${op1} + `; + } + const poolingCode = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + @group(0) @binding(0) var<storage, read> x : array<${dataType}>; + @group(0) @binding(1) var<storage, read_write> output : array<${dataType}>; + + ${outputIndicesHelper.o2iImpl} + ${xIndicesHelper.i2oImpl} + + const pads = array<u32, ${padsRank}>(${attributes.pads.map(i => `${i}u`).join(',')}); + const inputDims = array<u32, ${rank}>(${inputDims.map(i => `${i}u`).join(',')}); + const kernelStrides = array<u32, ${stridesRank}>(${kernelStrides.map(i => `${i}u`).join(',')}); + const strides = array<u32, ${stridesRank}>(${attributes.strides.map(i => `${i}u`).join(',')}); + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + + var offsets: array<u32, ${stridesRank}>; + + var value = ${dataType}(${start}); + var pad = 0; + var 
isPad = false; + + for (var i: u32 = 0u; i < ${kernelSize}u; i++) { + var offset = i; + for (var j = 0u; j < ${stridesRank - 1}u; j++) { + offsets[j] = offset / kernelStrides[j]; + offset -= offsets[j] * kernelStrides[j]; + } + offsets[${stridesRank - 1}] = offset; + + isPad = false; + for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { + xIndices[j] = indices[j] * strides[j - ${rank - stridesRank}u] + + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; + ${padCode} + } + ${op2} + + output[global_id.x] = value; + }`; + return poolingCode; + } +}; + +export interface PoolCommonAttributes { + readonly autoPad: string; + readonly ceilMode: number; + readonly kernelShape: readonly number[]; + readonly strides: readonly number[]; + readonly pads: readonly number[]; +} + +const parsePoolCommonAttributes = (attributes: Record<string, unknown>): PoolCommonAttributes => ({ + autoPad: ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number], + ceilMode: attributes.ceil_mode as number, + kernelShape: attributes.kernel_shape as [number, number], + strides: attributes.strides as [number, number], + pads: attributes.pads as [number, number, number, number] +}); + +export interface AveragePoolAttributes extends PoolCommonAttributes, AttributeWithCacheKey { + readonly countIncludePad: boolean; +} + +const createAveragePoolProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, isGlobalOperator: boolean, + attributes: AveragePoolAttributes): ProgramInfo => { + const [adjustedAttributes, outputShape] = + getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); + const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); + + const dataType = 'f32'; + + const op1 = 'value += x_val;'; + let op2 = ''; + if (adjustedAttributes.countIncludePad) { + op2 += `value /= ${dataType}(${kernelSize});`; + } else { + op2 += `value /= ${dataType}(${kernelSize} - pad);`; + } + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) + }; + }; + +export const parseAveragePoolAttributes = (attributes: Record<string, unknown>): AveragePoolAttributes => { + const countIncludePad = (attributes.count_include_pad as number) === 0 ? 
false : true; + + const attr = parsePoolCommonAttributes(attributes); + // TODO: support attribute 'ceil_mode' + if (attr.ceilMode !== 0) { + throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); + } + + return createAttributeWithCacheKey({countIncludePad, ...attr}); +}; + +export const averagePool = (context: ComputeContext, attributes: AveragePoolAttributes): number => { + validateInputs(context.inputs); + const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; + return context.compute( + {...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, false, attributes)}); +}; + +const globalPoolAttributes = { + autoPad: '', + ceilMode: 0, + countIncludePad: false, + kernelShape: [], + strides: [], + pads: [], + storageOrder: 0, + dilations: [], + cacheKey: '' +}; + +export const globalAveragePool = (context: ComputeContext): number => { + validateInputs(context.inputs); + const metadata = {name: 'GlobalAveragePool', inputTypes: [GpuDataType.default]}; + return context.compute( + {...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, true, globalPoolAttributes)}); +}; + +export interface MaxPoolAttributes extends PoolCommonAttributes, AttributeWithCacheKey { + readonly storageOrder: number; + readonly dilations: number[]; +} + +const createMaxPoolProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, isGlobalOperator: boolean, + attributes: MaxPoolAttributes): ProgramInfo => { + const [adjustedAttributes, outputShape] = + getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); + const op1 = ` + value = max(x_val, value); + `; + const op2 = ''; + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) + }; + }; + +export const maxPool = (context: ComputeContext, attributes: MaxPoolAttributes): number => { + validateInputs(context.inputs); + const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; + return context.compute( + {...metadata, get: () => createMaxPoolProgramInfo(context.inputs, metadata, false, attributes)}); +}; + +export const parseMaxPoolAttributes = (attributes: Record<string, unknown>): MaxPoolAttributes => { + const storageOrder = attributes.storage_order as number; + const dilations = attributes.dilations as [number, number]; + + const attr = parsePoolCommonAttributes(attributes); + // TODO: support attribute 'ceil_mode' and 'storage_order' + if (storageOrder !== 0) { + throw new Error('column major storage order is not yet supported for MaxPool'); + } + if (attr.ceilMode !== 0) { + throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); + } + + return createAttributeWithCacheKey({storageOrder, dilations, ...attr}); +}; + +const globalMaxPoolMetadata = { + name: 'GlobalMaxPool', + inputTypes: [GpuDataType.default] +}; + +export const globalMaxPool = (context: ComputeContext): number => { + validateInputs(context.inputs); + return context.compute({ + ...globalMaxPoolMetadata, + get: () => createMaxPoolProgramInfo(context.inputs, globalMaxPoolMetadata, true, globalPoolAttributes) + }); +}; diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 
9f87c905ac86f..9cdc1d1c00f5c 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -716,16 +716,16 @@ "test_atanh", // "test_averagepool_1d_default", // "test_averagepool_2d_ceil", - // "test_averagepool_2d_default", - // "test_averagepool_2d_pads_count_include_pad", - // "test_averagepool_2d_pads", - // "test_averagepool_2d_precomputed_pads_count_include_pad", - // "test_averagepool_2d_precomputed_pads", - // "test_averagepool_2d_precomputed_same_upper", - // "test_averagepool_2d_precomputed_strides", - // "test_averagepool_2d_same_lower", - // "test_averagepool_2d_same_upper", - // "test_averagepool_2d_strides", + "test_averagepool_2d_default", + "test_averagepool_2d_pads_count_include_pad", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads_count_include_pad", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_strides", // "test_averagepool_3d_default", "test_basic_conv_with_padding", "test_basic_conv_without_padding", @@ -927,10 +927,10 @@ "test_gemm_nobroadcast", "test_gemm_transposeA", "test_gemm_transposeB", - // "test_globalaveragepool_precomputed", - // "test_globalaveragepool", - // "test_globalmaxpool_precomputed", - // "test_globalmaxpool", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + "test_globalmaxpool_precomputed", + "test_globalmaxpool", // "test_greater_bcast", // "test_greater_equal_bcast_expanded", // "test_greater_equal_bcast", @@ -1064,15 +1064,15 @@ // "test_max_uint8", // "test_maxpool_1d_default", // "test_maxpool_2d_ceil", - // "test_maxpool_2d_default", + "test_maxpool_2d_default", // "test_maxpool_2d_dilations", - // "test_maxpool_2d_pads", - // "test_maxpool_2d_precomputed_pads", - // "test_maxpool_2d_precomputed_same_upper", - // "test_maxpool_2d_precomputed_strides", - // "test_maxpool_2d_same_lower", - // "test_maxpool_2d_same_upper", - // "test_maxpool_2d_strides", + "test_maxpool_2d_pads", + "test_maxpool_2d_precomputed_pads", + "test_maxpool_2d_precomputed_same_upper", + "test_maxpool_2d_precomputed_strides", + "test_maxpool_2d_same_lower", + "test_maxpool_2d_same_upper", + "test_maxpool_2d_strides", // "test_maxpool_2d_uint8", // "test_maxpool_3d_default", // "test_maxpool_with_argmax_2d_precomputed_pads", diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index a41235d310538..0adec04528f7a 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -133,10 +133,16 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHW class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); -// class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, 
float, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalAveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, float, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, float, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool); // class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Softmax); // class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax); @@ -216,9 +222,16 @@ std::unique_ptr<KernelRegistry> RegisterKernels() { BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv)>, BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv)>, // KERNEL_CREATE_INFO(11, Conv), - // KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), - // KERNEL_CREATE_INFO(12, MaxPool), - // KERNEL_CREATE_INFO(11, AveragePool), + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, AveragePool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalAveragePool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, float, MaxPool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, float, MaxPool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool)>, + BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool)>, // // layout insensitive, use ONNX-domain directly // BuildKernelCreateInfo< // ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax)>, diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index aad7daa152a4c..91ba2f085243f 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -22,31 +22,25 @@ class Conv : public JsKernel { // currently only support Conv2D. TODO: support other JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format": $13 ? "NHWC" : "NCHW", - "autopad": $1, - "dilation0": $2, - "dilation1": $3, + "auto_pad": $1, + "dilations": [$2, $3], "group": $4, - "kernelshape0": $5, - "kernelshape1": $6, - "pad0": $7, - "pad1": $8, - "pad2": $9, - "pad3": $10, - "stride0": $11, - "stride1": $12, + "kernel_shape": [$5, $6], + "pads": [$7, $8, $9, $10], + "strides": [$11, $12] }), static_cast<int32_t>(conv_attrs_.auto_pad), - static_cast<int32_t>(conv_attrs_.dilations[0]), - static_cast<int32_t>(conv_attrs_.dilations[1]), + static_cast<int32_t>(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), + static_cast<int32_t>(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0), static_cast<int32_t>(conv_attrs_.group), - static_cast<int32_t>(conv_attrs_.kernel_shape_specified ? kernel_shape[0] : 0), - static_cast<int32_t>(conv_attrs_.kernel_shape_specified ? kernel_shape[1] : 0), - static_cast<int32_t>(conv_attrs_.pads[0]), - static_cast<int32_t>(conv_attrs_.pads[1]), - static_cast<int32_t>(conv_attrs_.pads[2]), - static_cast<int32_t>(conv_attrs_.pads[3]), - static_cast<int32_t>(conv_attrs_.strides[0]), - static_cast<int32_t>(conv_attrs_.strides[1]), + static_cast<int32_t>(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), + static_cast<int32_t>(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? 
kernel_shape[1] : 0), + static_cast<int32_t>(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), + static_cast<int32_t>(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), + static_cast<int32_t>(conv_attrs_.pads.size() > 2 ? conv_attrs_.pads[2] : 0), + static_cast<int32_t>(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), + static_cast<int32_t>(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), + static_cast<int32_t>(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), static_cast<int32_t>(is_channels_last) ); } diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc new file mode 100644 index 0000000000000..0bcc34a210009 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -0,0 +1,71 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +#include "pool.h" + +namespace onnxruntime { +namespace js { + +#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + op_name, \ + kOnnxDomain, \ + since_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \ + Pool<data_type, pool_type>); + +#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, \ + kOnnxDomain, \ + since_version, \ + end_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \ + Pool<data_type, pool_type>); + +#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + op_name, \ + kOnnxDomain, \ + since_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \ + Pool<data_type, pool_type>); + +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, \ + kOnnxDomain, \ + since_version, \ + end_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \ + Pool<data_type, pool_type>); + +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10) +POOLING_KERNEL(AveragePool, float, AveragePool, 11) +POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1) + +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11) +POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12) +POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pool.h b/onnxruntime/core/providers/js/operators/pool.h new file mode 100644 index 0000000000000..b2ec8947e25b8 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pool.h @@ -0,0 +1,71 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
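+// +// The templated Pool kernel below bridges ONNX pooling attributes to the JavaScript side: +// POOL_ATTRIBUTES_JS_OBJ_MAPPING spells the attributes out as a JS object literal, and +// POOL_ATTRIBUTES_PARAM_LIST supplies the fourteen matching int32 arguments with +// size-guarded indexing, so optional attributes that were not specified fall back to 0.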
+ +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/nn/pool_base.h" + +namespace onnxruntime { +namespace js { + +#define POOL_ATTRIBUTES_JS_OBJ_MAPPING ({ \ + "auto_pad": $1, \ + "ceil_mode": $2, \ + "count_include_pad": $3, \ + "storage_order": $4, \ + "dilations": [$5, $6], \ + "kernel_shape": [$7, $8], \ + "pads": [$9, $10, $11, $12], \ + "strides": [$13, $14] \ + }) + +#define POOL_ATTRIBUTES_PARAM_LIST \ + static_cast<int32_t>(pool_attrs_.auto_pad), \ + static_cast<int32_t>(pool_attrs_.ceil_mode), \ + static_cast<int32_t>(pool_attrs_.count_include_pad), \ + static_cast<int32_t>(pool_attrs_.storage_order), \ + static_cast<int32_t>(pool_attrs_.dilations.size() > 0 ? pool_attrs_.dilations[0] : 0), \ + static_cast<int32_t>(pool_attrs_.dilations.size() > 1 ? pool_attrs_.dilations[1] : 0), \ + static_cast<int32_t>(pool_attrs_.kernel_shape.size() > 0 ? pool_attrs_.kernel_shape[0] : 0), \ + static_cast<int32_t>(pool_attrs_.kernel_shape.size() > 1 ? pool_attrs_.kernel_shape[1] : 0), \ + static_cast<int32_t>(pool_attrs_.pads.size() > 0 ? pool_attrs_.pads[0] : 0), \ + static_cast<int32_t>(pool_attrs_.pads.size() > 1 ? pool_attrs_.pads[1] : 0), \ + static_cast<int32_t>(pool_attrs_.pads.size() > 2 ? pool_attrs_.pads[2] : 0), \ + static_cast<int32_t>(pool_attrs_.pads.size() > 3 ? pool_attrs_.pads[3] : 0), \ + static_cast<int32_t>(pool_attrs_.strides.size() > 0 ? pool_attrs_.strides[0] : 0), \ + static_cast<int32_t>(pool_attrs_.strides.size() > 1 ? pool_attrs_.strides[1] : 0) + + +template <typename T, typename PoolType> +class Pool : public JsKernel, public PoolBase { + public: + Pool(const OpKernelInfo& info) : JsKernel(info), PoolBase(info) { + if (pool_attrs_.global_pooling) { + if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) { + JSEP_INIT_KERNEL(GlobalAveragePool); + } else if constexpr (PoolType::type == onnxruntime::PoolType::kMaxPool) { + JSEP_INIT_KERNEL(GlobalMaxPool); + } else { + // TODO: GlobalLpPool + } + } else { + if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) { + JSEP_INIT_KERNEL_ATTRIBUTE(AveragePool, POOL_ATTRIBUTES_JS_OBJ_MAPPING, POOL_ATTRIBUTES_PARAM_LIST); + } else if constexpr (PoolType::type == onnxruntime::PoolType::kMaxPool) { + JSEP_INIT_KERNEL_ATTRIBUTE(MaxPool, POOL_ATTRIBUTES_JS_OBJ_MAPPING, POOL_ATTRIBUTES_PARAM_LIST); + } else { + // TODO: LpPool + } + } + } +}; + +template <typename T> +class Pool<T, MaxPool<8>> final : public Pool<T, MaxPool<1>> { + public: + Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>>(info) {} +}; + +} // namespace js +} // namespace onnxruntime From 81c0c048544644727a4bc6955fd6235fcf6a888b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 7 Dec 2022 18:23:43 -0800 Subject: [PATCH 19/81] w3 --- br.bat | 2 +- .../lib/onnxjs/backends/webgpu/ops/concat.ts | 2 +- .../onnxjs/backends/webgpu/ops/fuse-utils.ts | 2 +- .../onnxjs/backends/webgpu/ops/unary-op.ts | 4 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 16 +- js/web/lib/wasm/jsep/init.ts | 36 +- js/web/lib/wasm/jsep/util.ts | 50 +++ .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 52 ++- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 7 +- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts | 3 +- js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 310 +++++++++--------- js/web/lib/wasm/session-options.ts | 6 + js/web/test/suite-test-list.jsonc | 20 +- js/web/test/test-runner.ts | 47 ++- onnxruntime/core/framework/execution_frame.cc | 6 + onnxruntime/core/framework/kernel_lookup.h | 6 + onnxruntime/core/framework/kernel_registry.cc | 8 + .../core/providers/js/data_transfer.cc | 17 +- .../providers/js/js_execution_provider.cc | 23 +- 
onnxruntime/core/providers/js/js_export.cc | 10 +- onnxruntime/core/providers/js/js_kernel.h | 48 ++- .../core/providers/js/operators/gemm.cc | 42 +++ .../core/providers/js/operators/gemm.h | 38 +++ .../core/providers/js/operators/reshape.cc | 46 +++ .../core/providers/js/operators/reshape.h | 48 +++ .../core/providers/js/operators/shape_op.cc | 47 +++ onnxruntime/wasm/js_internal_api.js | 6 +- 28 files changed, 664 insertions(+), 240 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/gemm.cc create mode 100644 onnxruntime/core/providers/js/operators/gemm.h create mode 100644 onnxruntime/core/providers/js/operators/reshape.cc create mode 100644 onnxruntime/core/providers/js/operators/reshape.h create mode 100644 onnxruntime/core/providers/js/operators/shape_op.cc diff --git a/br.bat b/br.bat index fef438314e676..bd491a9b71767 100644 --- a/br.bat +++ b/br.bat @@ -17,7 +17,7 @@ if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_xnnpack --enable_wasm_simd --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly IF %ERRORLEVEL% == 0 ( -copy /Y .\build\Windows\Release\ort-wasm-simd.js .\js\web\lib\wasm\binding\ +copy /Y .\build\Windows\Release\ort-wasm-simd.js .\js\web\lib\wasm\binding\ort-wasm.js copy /Y .\build\Windows\Release\ort-wasm-simd.wasm .\js\web\dist\ ) diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts b/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts index 874aef1e44bff..9616ca7ae5196 100644 --- a/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts +++ b/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts @@ -80,7 +80,7 @@ const createConcatProgramInfo = ${inputIndicesHelpers.map(i => i.i2oImpl).join('\n')} ${outputIndicesHelper.o2iImpl} - let sizeInConcatAxis = array(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); + const sizeInConcatAxis = array(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); ${calculateInputIndexImpl(sizeInConcatAxis.length)} ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)} diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts b/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts index fae2c9fb6e9b2..355685b55ad6a 100644 --- a/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts @@ -19,7 +19,7 @@ export function getActicationSnippet(attributes: InternalActivationAttributes) { return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; case 'Clip': return { - activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, + activationFunction: `const clip_min_=f32(${attributes.clipMin!});const clip_max_=f32(${attributes.clipMax!});`, applyActivation: 'value = clamp(value, clip_min_, clip_max_);' }; // TODO: adding other activations that can be fused. 
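For context: the `let` to `const` switches in the generated WGSL above and below track the WGSL spec change that dropped module-scope `let` in favor of creation-time `const`; the clip bounds are baked into the shader string as compile-time constants. As a minimal sketch, with hypothetical clipMin/clipMax values of -1 and 1 the patched Clip snippet expands to:

    const clip_min_=f32(-1);const clip_max_=f32(1);
    value = clamp(value, clip_min_, clip_max_);
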
diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts b/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts index 54213cfdd2313..82c58f78e232d 100644 --- a/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts +++ b/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts @@ -86,8 +86,8 @@ export const clip = async(handler: WebGpuInferenceHandler, inputs: Tensor[], att Promise=>handler.run( createElementwiseProgramInfoLoader( inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` - let clip_min_: vec4 = vec4(f32(${attributes.min})); - let clip_max_: vec4 = vec4(f32(${attributes.max})); + const clip_min_: vec4 = vec4(f32(${attributes.min})); + const clip_max_: vec4 = vec4(f32(${attributes.max})); `, attributes.cacheKey), inputs); diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 21461ed865865..3575d5779d195 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {env} from 'onnxruntime-common'; + import {TensorView} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; @@ -106,7 +108,7 @@ export class WebGpuBackend { for (let i = 0; i < inputs.length; ++i) { const gpuData = this.gpuDataManager.get(inputs[i].data); if (!gpuData) { - throw new Error(`no GPU data for ${inputs[i].data}`); + throw new Error(`no GPU data for input: ${inputs[i].data}`); } inputDatas[i] = gpuData; } @@ -124,7 +126,7 @@ export class WebGpuBackend { const dataId = createOutput(i, programInfo.outputs[i].dims); const gpuData = this.gpuDataManager.get(dataId); if (!gpuData) { - throw new Error(`no GPU data for ${inputs[i].data}`); + throw new Error(`no GPU data for output: ${dataId}`); } outputDatas.push(gpuData); } @@ -143,6 +145,10 @@ export class WebGpuBackend { this.gpuDataManager.upload(gpuDataId, data); } + memcpy(src: number, dst: number): void { + this.gpuDataManager.memcpy(src, dst); + } + async download(gpuDataId: number, data: Uint8Array): Promise { const arrayBuffer = await this.gpuDataManager.download(gpuDataId); data.set(new Uint8Array(arrayBuffer)); @@ -180,8 +186,10 @@ export class WebGpuBackend { } const [name, kernelEntry, attributes] = kernel; - // eslint-disable-next-line no-console - console.log(`[js] Start to run kernel "${name}"...`); + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] Start to run kernel "${name}"...`); + } return kernelEntry(context, attributes); } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 7797a16ac087b..403961245d104 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+import {env} from 'onnxruntime-common'; + import {OrtWasmModule} from '../binding/ort-wasm'; import {WebGpuBackend} from './backend-webgpu'; @@ -83,21 +85,33 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepFree() (ptr: number) => backend.free(ptr), - // jsepUpload(src, dst, size) - (dataOffset: number, gpuDataId: number, size: number) => { - // eslint-disable-next-line no-console - console.log(`[js] jsepUpload: dataOffset=${dataOffset}, gpuDataId=${gpuDataId}, size=${size}`); - const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); - backend.upload(gpuDataId, data); + // jsepCopy(src, dst, size, isSourceGpu) + (src: number, dst: number, size: number, isSourceGpu = false) => { + if (isSourceGpu) { + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] jsepCopyGpuToGpu: src=${src}, dst=${dst}, size=${size}`); + } + backend.memcpy(src, dst); + } else { + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); + } + const data = module.HEAPU8.subarray(src, src + size); + backend.upload(dst, data); + } }, - // jsepDownload(src, dst, size) + // jsepCopyAsync(src, dst, size) async(gpuDataId: number, dataOffset: number, size: number): Promise => { const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); - // eslint-disable-next-line no-console - console.log(`[js] jsepDownload: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); + } await backend.download(gpuDataId, data); }, @@ -110,6 +124,10 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepRun (kernel: number, contextDataOffset: number) => { + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] jsepRun on ${contextDataOffset}`); + } const context = new OpKernelContext(module, backend, contextDataOffset); return backend.computeKernel(kernel, context); }); diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index 228d4d3d0a1da..b3ac9c9e8e7ed 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -712,5 +712,55 @@ export class PoolConvUtil { } } +export class GemmUtil { + // will make sure input shapes are compatible for this op + // and return back the shape of the output in the form of a tuple + // will throw exception if the input shapes are not compatible + static getShapeOfGemmResult( + leftShape: readonly number[], transLeft: boolean, rightShape: readonly number[], transRight: boolean, + biasShape?: readonly number[]): readonly number[] { + if (leftShape.length !== 2 || rightShape.length !== 2) { + throw new Error('shape need to be of size 2'); + } + + let M: number; + let K: number; + let N: number; + + if (transLeft) { + M = leftShape[1]; + K = leftShape[0]; + } else { + M = leftShape[0]; + K = leftShape[1]; + } + + let kDim = -1; + + if (transRight) { + N = rightShape[0]; + kDim = 1; + } else { + N = rightShape[1]; + kDim = 0; + } + + if (rightShape[kDim] !== K) { + throw new Error('dimension mismatch'); + } + + if (M <= 0 || N <= 0 || K <= 0) { + throw new Error('invalid shape specified'); + } + + if (biasShape && !BroadcastUtil.isValidBroadcast(biasShape, [M, N])) { + throw new Error('gemm: invalid bias shape for broadcast'); + } + + return [M, N, K]; + } +} + + export const MIN_CLIP = 
-3.4028234663852886e+38; export const MAX_CLIP = 3.4028234663852886e+38; diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index d4466f1a0ba66..f4cac6be224ec 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {env} from 'onnxruntime-common'; + import {WebGpuBackend} from '../backend-webgpu'; import {GpuData, GpuDataId, GpuDataType} from './types'; @@ -10,9 +12,13 @@ import {GpuData, GpuDataId, GpuDataType} from './types'; */ export interface GpuDataManager { /** - * upload data to GPU. + * copy data from CPU to GPU. */ upload(id: GpuDataId, data: Uint8Array): void; + /** + * copy data from GPU to GPU. + */ + memcpy(sourceId: GpuDataId, destinationId: GpuDataId): void; /** * create new data on GPU. */ @@ -28,7 +34,7 @@ export interface GpuDataManager { */ release(id: GpuDataId): number; /** - * download the data from GPU. + * copy data from GPU to CPU. */ download(id: GpuDataId): Promise; @@ -98,14 +104,38 @@ class GpuDataManagerImpl implements GpuDataManager { // GPU copy - this.backend.getCommandEncoder().copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size); + const commandEncoder = this.backend.getCommandEncoder(); + this.backend.endComputePass(); + commandEncoder.copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size); - // eslint-disable-next-line no-console - console.log(`[js] GpuDataManager.upload(id=${id})`); + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.upload(id=${id})`); + } this.buffersForUploadingPending.push(gpuBufferForUploading); } + memcpy(sourceId: GpuDataId, destinationId: GpuDataId): void { + // get source gpu buffer + const sourceGpuDataCache = this.storageCache.get(sourceId); + if (!sourceGpuDataCache) { + throw new Error('source gpu data for memcpy does not exist'); + } + // get destination gpu buffer + const destinationGpuDataCache = this.storageCache.get(destinationId); + if (!destinationGpuDataCache) { + throw new Error('destination gpu data for memcpy does not exist'); + } + if (sourceGpuDataCache.originalSize !== destinationGpuDataCache.originalSize) { + throw new Error('inconsistent source and destination gpu data size'); + } + const size = calcNormalizedBufferSize(sourceGpuDataCache.originalSize); + // GPU copy + this.backend.getCommandEncoder().copyBufferToBuffer( + sourceGpuDataCache.gpuData.buffer, 0, destinationGpuDataCache.gpuData.buffer, 0, size); + } + create(size: number): GpuData { // !!! // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. 
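The new memcpy path records a GPU-to-GPU copyBufferToBuffer on the backend's command encoder instead of staging the data through the CPU heap. A minimal usage sketch against the GpuDataManager interface above (assuming a `manager` obtained from createGpuDataManager(backend)):

    const src = manager.create(256);  // 256-byte GPU storage buffer
    const dst = manager.create(256);  // must match the source's original size, or memcpy throws
    manager.memcpy(src.id, dst.id);   // enqueues the copy on the current command encoder
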
@@ -122,8 +152,10 @@ class GpuDataManagerImpl implements GpuDataManager { const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; this.storageCache.set(gpuData.id, {gpuData, originalSize: size}); - // eslint-disable-next-line no-console - console.log(`[js] GpuDataManager.create(size=${size}) => id=${gpuData.id}`); + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.create(size=${size}) => id=${gpuData.id}`); + } return gpuData; } @@ -137,8 +169,10 @@ class GpuDataManagerImpl implements GpuDataManager { throw new Error('releasing data does not exist'); } - // eslint-disable-next-line no-console - console.log(`[js] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`); + if (env.debug) { + // eslint-disable-next-line no-console + console.log(`[js] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`); + } this.storageCache.delete(id); this.buffersPending.push(cachedData.gpuData.buffer); diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 082fdc7a3c2ab..34dc221d25396 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -5,7 +5,7 @@ import * as binaryOps from './ops/binary-op'; // import {concat, parseConcatAttributes} from './ops/concat'; import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; -// import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; +import {gemm, parseGemmAttributes} from './ops/gemm'; // import {matMul, parseMatMulAttributes} from './ops/matmul'; import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; // import {sum} from @@ -40,9 +40,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Flatten', '', '1+', flatten, parseFlattenAttributes], ['Floor', [unaryOps.floor]], // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], - //['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], - //['Gemm', '', '11+', gemm, parseGemmAttributesV11], - ['GlobalAveragePool', [globalAveragePool]], ['GlobalMaxPool', [globalMaxPool]], + //['Gather', '', '1+', gather, parseGatherAttributes], + ['Gemm', [gemm, parseGemmAttributes]], ['GlobalAveragePool', [globalAveragePool]], ['GlobalMaxPool', [globalMaxPool]], // ['Greater', '', '7+', binaryOps.greater], // ['Identity', '', '1+', unaryOps.identity], // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index bc7f7107fb978..0bf656c84d98e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -127,7 +127,7 @@ const createConcatProgramInfo = ${inputIndicesHelpers.map(i => i.i2oImpl).join('\n')} ${outputIndicesHelper.o2iImpl} - let sizeInConcatAxis = array(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); + const sizeInConcatAxis = array(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); ${calculateInputIndexImpl(sizeInConcatAxis.length)} ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)} diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index 05785ba72ac0e..92105859a8c0e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ 
b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -19,7 +19,8 @@ export const getActicationSnippet = return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; case 'Clip': return { - activationFunction: `let clip_min_=f32(${attributes.clipMin!});let clip_max_=f32(${attributes.clipMax!});`, + activationFunction: + `const clip_min_=f32(${attributes.clipMin!});const clip_max_=f32(${attributes.clipMax!});`, applyActivation: 'value = clamp(value, clip_min_, clip_max_);' }; // TODO: adding other activations that can be fused. diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 49429a3c9f1ea..b19f173e4fbd7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,165 +1,151 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {Graph} from '../../../graph'; -// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {GemmUtil, ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -// import {WORKGROUP_SIZE} from './common'; - -// export interface GemmAttributes extends AttributeWithCacheKey { -// transA: boolean; -// transB: boolean; -// alpha: number; -// beta: number; -// isOptionalC: boolean; // in opset 11, C becomes optional -// } - -// export const gemm: OperatorAsyncImplementation = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise => { -// validateInputs(inputs, attributes); -// return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs); -// }; - -// const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => { -// const transA = node.attributes.getInt('transA', 0) !== 0; -// const transB = node.attributes.getInt('transB', 0) !== 0; -// const alpha = node.attributes.getFloat('alpha', 1.0); -// const beta = node.attributes.getFloat('beta', 1.0); -// return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC}); -// }; - -// export const parseGemmAttributesV7: OperatorInitialization = (node: Graph.Node): GemmAttributes => -// parseGemmAttributes(node, false); - -// export const parseGemmAttributesV11: OperatorInitialization = (node: Graph.Node): GemmAttributes => -// parseGemmAttributes(node, true); - -// const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => { -// const metadata = { -// name: 'Gemm', -// inputTypes: inputs.length === 3 ? 
[GpuDataType.default, GpuDataType.default, GpuDataType.default] : -// [GpuDataType.default, GpuDataType.default], -// cacheHint: attributes.cacheKey -// }; - -// return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)}; -// }; - -// const offsetC = (m: number, n: number, dims: readonly number[]): string => { -// const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); -// const broadcastN = dims[dims.length - 1] !== n; - -// let offset = '0u'; -// if (!broadcastM) { -// offset += `+ m * ${dims[dims.length - 1]}u`; -// } -// if (!broadcastN) { -// offset += '+n'; -// } - -// return offset; -// }; - -// const createGemmProgramInfo = -// (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => { -// const aShape = inputs[0].dims.slice(); -// const bShape = inputs[1].dims.slice(); -// const [M, N, K] = GemmUtil.getShapeOfGemmResult( -// aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? inputs[2].dims : undefined); -// const outputShape = [M, N]; -// if (!outputShape) { -// throw new Error('Can\'t use gemm on the given tensors'); -// } -// const outputSize = ShapeUtil.size(outputShape); -// let line = ''; -// if (attributes.transA && attributes.transB) { -// line = 'value += a[k * M + m] * b[n * K + k];'; -// } else if (attributes.transA && !attributes.transB) { -// line = 'value += a[k * M + m] * b[k * N + n];'; -// } else if (!attributes.transA && attributes.transB) { -// line = 'value += a[m * K + k] * b[n * K + k];'; -// } else if (!attributes.transA && !attributes.transB) { -// line = 'value += a[m * K + k] * b[k * N + n];'; -// } - -// const dataType = 'f32'; // TODO: support other data type -// const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; -// const calculateC = inputs.length === 3 ? 
`value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; -// const inputStorageBuffersDeclarations = [ -// `@group(0) @binding(0) var a : array<${dataType}>;`, -// `@group(0) @binding(1) var b : array<${dataType}>;` -// ]; -// if (inputs.length === 3) { -// inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); -// } -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// const M: u32 = ${M}u; -// const N: u32 = ${N}u; -// const K: u32 = ${K}u; -// const alpha = ${dataType}(${attributes.alpha}); -// const beta = ${dataType}(${attributes.beta}); - -// ${inputStorageBuffersDeclarations.join('\n')} -// @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// let m = global_id.x / N; -// let n = global_id.x % N; - -// var value = ${dataType}(0); -// for (var k: u32 = 0u; k<${K}u; k++) { -// ${line} -// } - -// ${calculateAlpha} -// ${calculateC} -// output[global_id.x] = value; - -// }`; -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => { -// if (!inputs) { -// throw new Error('Input is missing'); -// } -// if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) { -// throw new Error('Invaid input shape.'); -// } -// if (!attributes.isOptionalC && inputs.length !== 3) { -// throw new Error('Gemm requires 3 inputs'); -// } - -// // 'C' can be of dimensionality 1 or 2 only -// if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) { -// throw new Error('Invalid input shape of C'); -// } - -// if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || -// (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') || -// (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) { -// throw new Error('Invalid input type.'); -// } - -// if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) { -// throw new Error('Input types are mismatched'); -// } -// }; +import {DataType} from '../../../wasm-core-impl'; +import {TensorView} from '../../tensor'; +import {GemmUtil, ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {WORKGROUP_SIZE} from './common'; + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs) { + throw new Error('Input is missing'); + } + if (inputs.length < 2 || inputs.length > 3) { + throw new Error('Invaid input number.'); + } + + // 'C' can be of dimensionality 0, 1 or 2 only + if (inputs.length === 3 && inputs[2].dims.length > 2) { + throw new Error('Invalid input shape of C'); + } + + if ((inputs[0].dataType !== DataType.float) || (inputs[1].dataType !== DataType.float) || + (inputs.length === 3 && inputs[2].dataType !== DataType.float)) { + throw new Error('Invalid input type.'); + } + + if ((inputs[0].dataType !== inputs[1].dataType) || + (inputs.length === 3 && 
+      (inputs.length === 3 && inputs[0].dataType !== inputs[2].dataType)) {
+    throw new Error('Input types are mismatched');
+  }
+};
+
+export interface GemmAttributes extends AttributeWithCacheKey {
+  transA: boolean;
+  transB: boolean;
+  alpha: number;
+  beta: number;
+}
+
+const offsetC = (m: number, n: number, dims: readonly number[]): string => {
+  if (dims.length === 0) {
+    return '0u';
+  }
+
+  const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m);
+  const broadcastN = dims[dims.length - 1] !== n;
+
+  let offset = '0u';
+  if (!broadcastM) {
+    offset += `+ m * ${dims[dims.length - 1]}u`;
+  }
+  if (!broadcastN) {
+    offset += '+n';
+  }
+
+  return offset;
+};
+
+const createGemmProgramInfo =
+    (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => {
+      const aShape = inputs[0].dims.slice();
+      const bShape = inputs[1].dims.slice();
+      const [M, N, K] = GemmUtil.getShapeOfGemmResult(
+          aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? inputs[2].dims : undefined);
+      const outputShape = [M, N];
+      if (!outputShape) {
+        throw new Error('Can\'t use gemm on the given tensors');
+      }
+      const outputSize = ShapeUtil.size(outputShape);
+      let line = '';
+      if (attributes.transA && attributes.transB) {
+        line = 'value += a[k * M + m] * b[n * K + k];';
+      } else if (attributes.transA && !attributes.transB) {
+        line = 'value += a[k * M + m] * b[k * N + n];';
+      } else if (!attributes.transA && attributes.transB) {
+        line = 'value += a[m * K + k] * b[n * K + k];';
+      } else if (!attributes.transA && !attributes.transB) {
+        line = 'value += a[m * K + k] * b[k * N + n];';
+      }
+
+      const dataType = 'f32';  // TODO: support other data type
+      const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;';
+      const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : '';
+      const inputStorageBuffersDeclarations = [
+        `@group(0) @binding(0) var<storage, read> a : array<${dataType}>;`,
+        `@group(0) @binding(1) var<storage, read> b : array<${dataType}>;`
+      ];
+      if (inputs.length === 3) {
+        inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> c : array<${dataType}>;`);
+      }
+      const shaderSource = `
+  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
+  const M: u32 = ${M}u;
+  const N: u32 = ${N}u;
+  const K: u32 = ${K}u;
+  const alpha = ${dataType}(${attributes.alpha});
+  const beta = ${dataType}(${attributes.beta});
+
+  ${inputStorageBuffersDeclarations.join('\n')}
+  @group(0) @binding(${inputs.length}) var<storage, read_write> output : array<${dataType}>;
+
+  @compute @workgroup_size(WORKGROUP_SIZE)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+
+    // Guard against out-of-bounds work group sizes
+    if (global_id.x >= ${outputSize}u) {
+      return;
+    }
+
+    let m = global_id.x / N;
+    let n = global_id.x % N;
+
+    var value = ${dataType}(0);
+    for (var k: u32 = 0u; k<${K}u; k++) {
+      ${line}
+    }
+
+    ${calculateAlpha}
+    ${calculateC}
+    output[global_id.x] = value;
+
+  }`;
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
+        shaderSource,
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
+      };
+    };
+
+const createGemmProgramInfoLoader = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfoLoader => {
+  const metadata = {
+    name: 'Gemm',
+    inputTypes: inputs.length === 3 ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+        [GpuDataType.default, GpuDataType.default],
+    cacheHint: attributes.cacheKey
+  };
+
+  return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)};
+};
+
+export const gemm = (context: ComputeContext, attributes: GemmAttributes): number => {
+  validateInputs(context.inputs);
+  return context.compute(createGemmProgramInfoLoader(context.inputs, attributes));
+};
+
+export const parseGemmAttributes = (attributes: Record<string, unknown>): GemmAttributes =>
+    createAttributeWithCacheKey(attributes as Omit<GemmAttributes, keyof AttributeWithCacheKey>);
diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts
index 82a06f0f3cf5a..8bf293e2a2e0a 100644
--- a/js/web/lib/wasm/session-options.ts
+++ b/js/web/lib/wasm/session-options.ts
@@ -45,6 +45,12 @@ const appendDefaultOptions = (options: InferenceSession.SessionOptions): void =>
     // eslint-disable-next-line camelcase
     session.use_ort_model_bytes_directly = '1';
   }
+
+  // if using JSEP with WebGPU, always disable memory pattern
+  if (options.executionProviders &&
+      options.executionProviders.some(ep => ['jsep-webgpu'].includes(typeof ep === 'string' ? ep : ep.name))) {
+    options.enableMemPattern = false;
+  }
 };
 
 const setExecutionProviders =
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 9cdc1d1c00f5c..ef14eafae0792 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -1335,16 +1335,16 @@
       // // "test_reflect_pad",
       "test_relu",
       // "test_reshape_allowzero_reordered",
-      // "test_reshape_extended_dims",
-      // "test_reshape_negative_dim",
-      // "test_reshape_negative_extended_dims",
-      // "test_reshape_one_dim",
-      // "test_reshape_reduced_dims",
-      // "test_reshape_reordered_all_dims",
-      // "test_reshape_reordered_dims",
-      // "test_reshape_reordered_last_dims",
-      // "test_reshape_zero_and_negative_dim",
-      // "test_reshape_zero_dim",
+      "test_reshape_extended_dims",
+      "test_reshape_negative_dim",
+      "test_reshape_negative_extended_dims",
+      "test_reshape_one_dim",
+      "test_reshape_reduced_dims",
+      "test_reshape_reordered_all_dims",
+      "test_reshape_reordered_dims",
+      "test_reshape_reordered_last_dims",
+      "test_reshape_zero_and_negative_dim",
+      "test_reshape_zero_dim",
       // "test_resize_downsample_linear",
       // "test_resize_downsample_nearest",
       // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside",
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index 4b4ed56ff6d5a..8dcc64025a7cf 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -3,7 +3,7 @@
 
 import {expect} from 'chai';
 import {readFile} from 'fs';
-import {onnx as onnxProto} from 'onnx-proto';
+import {onnx} from 'onnx-proto';
 import * as ort from 'onnxruntime-common';
 import {extname} from 'path';
 import {inspect, promisify} from 'util';
@@ -14,6 +14,7 @@ import {createWebGLContext} from '../lib/onnxjs/backends/webgl/webgl-context-fac
 import {Logger, Profiler} from '../lib/onnxjs/instrument';
 import {Operator} from '../lib/onnxjs/operators';
 import {Tensor} from '../lib/onnxjs/tensor';
+import {ProtoUtil} from '../lib/onnxjs/util';
 
 import {base64toBuffer, createMockGraph} from './test-shared';
 import {Test} from './test-types';
@@ -56,12 +57,40 @@ async function loadFile(uri: string): Promise<Uint8Array> {
   }
 }
 
-async function loadTensorProto(uriOrData: string|Uint8Array): Promise<Test.NamedTensor> {
+async function loadTensorProto(uriOrData: string|Uint8Array, allowInt64 = false): Promise<Test.NamedTensor> {
   const buf = (typeof uriOrData === 'string') ? await loadFile(uriOrData) : uriOrData;
-  const tensorProto = onnxProto.TensorProto.decode(buf);
-  const tensor = Tensor.fromProto(tensorProto);
+  const tensorProto = onnx.TensorProto.decode(buf);
+
+  let tensor: ort.Tensor;
+
+  // by default, we don't allow (u)int64. this is for backward compatibility.
+  if (allowInt64 && tensorProto && tensorProto.dataType &&
+      ((tensorProto.dataType === onnx.TensorProto.DataType.INT64 ||
+        tensorProto.dataType === onnx.TensorProto.DataType.UINT64))) {
+    const signed = tensorProto.dataType === onnx.TensorProto.DataType.INT64;
+    const dataConstructor = signed ? BigInt64Array : BigUint64Array;
+    const length = tensorProto.rawData.byteLength / 8;
+    const data = new dataConstructor(length);
+
+    if (tensorProto.rawData && typeof tensorProto.rawData.byteLength === 'number' &&
+        tensorProto.rawData.byteLength > 0) {
+      const dataSource =
+          new DataView(tensorProto.rawData.buffer, tensorProto.rawData.byteOffset, tensorProto.rawData.byteLength);
+      for (let i = 0; i < length; i++) {
+        data[i] = signed ? dataSource.getBigInt64(i * 8, true) : dataSource.getBigUint64(i * 8, true);
+      }
+    } else {
+      for (let i = 0; i < length; i++) {
+        data[i] = BigInt((signed ? tensorProto.int64Data : tensorProto.uint64Data)![i].toString());
+      }
+    }
+    tensor = new ort.Tensor(signed ? 'int64' : 'uint64', data, ProtoUtil.tensorDimsFromProto(tensorProto.dims));
+  } else {
+    const internalTensor = Tensor.fromProto(tensorProto);
+    tensor = fromInternalTensor(internalTensor);
+  }
   // add property 'name' to the tensor object.
-  const namedTensor = fromInternalTensor(tensor) as unknown as Test.NamedTensor;
+  const namedTensor = tensor as unknown as Test.NamedTensor;
   namedTensor.name = tensorProto.name;
   return namedTensor;
 }
@@ -72,11 +101,13 @@ async function loadMlProto(_uriOrData: string|Uint8Array): Promise {
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
@@ -526,6 +528,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
   auto pattern = mem_patterns_->GetPatterns(location);
   auto block = pattern->GetBlock(ort_value_index);
   // if block not found, fall back to default behavior
   if (block) {
+    // printf("{{ memory patterns - found block.\n");
     auto it = buffers_.find(location);
     if (it != buffers_.end()) {
       // if the block is not correct, log message then fall back to default behavior
@@ -726,6 +729,9 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
 #endif
   AllocKind alloc_kind = per_alloc_plan.alloc_kind;
+#ifndef NDEBUG
+  printf("{{alloc_kind}}=%d\n", (int)alloc_kind);
+#endif
   switch (alloc_kind) {
     // Right now for kAllocate and kAllocateOutput we are using same approach.
     // In the future we may want to have different way to handle it.
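A note on the raw-data branch in the test-runner change above: TensorProto stores (u)int64 raw data as little-endian 8-byte values, which is why the DataView loop passes `true` for little-endian; the fallback path goes through the repeated int64Data/uint64Data fields instead, stringifying each element before converting to BigInt. A minimal standalone sketch of the same decoding convention follows; decodeInt64RawData is an illustrative helper for this document, not part of the patch:

// Hypothetical helper mirroring the decoding loop in loadTensorProto above.
// Assumes `rawData` follows ONNX TensorProto conventions: little-endian, 8 bytes per element.
const decodeInt64RawData = (rawData: Uint8Array, signed: boolean): BigInt64Array|BigUint64Array => {
  const length = rawData.byteLength / 8;
  const data = signed ? new BigInt64Array(length) : new BigUint64Array(length);
  const view = new DataView(rawData.buffer, rawData.byteOffset, rawData.byteLength);
  for (let i = 0; i < length; i++) {
    data[i] = signed ? view.getBigInt64(i * 8, true) : view.getBigUint64(i * 8, true);
  }
  return data;
};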
diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h index 933aed4542c06..6c52129965bc4 100644 --- a/onnxruntime/core/framework/kernel_lookup.h +++ b/onnxruntime/core/framework/kernel_lookup.h @@ -30,17 +30,23 @@ class KernelLookup final : public IExecutionProvider::IKernelLookup { const KernelCreateInfo* LookUpKernel(const Node& node) const override { const KernelCreateInfo* kernel_create_info{}; +#ifndef NDEBUG printf(" LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str()); +#endif for (const auto& registry : kernel_registries_) { const auto lookup_status = registry->TryFindKernel(node, provider_type_, kernel_type_str_resolver_, &kernel_create_info); if (lookup_status.IsOK() && kernel_create_info != nullptr) { +#ifndef NDEBUG printf(" - found\n"); +#endif return kernel_create_info; } } +#ifndef NDEBUG printf(" - not found\n"); +#endif return nullptr; } diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 652e2a8860e17..efa81b5a9f98c 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -166,7 +166,9 @@ Status KernelRegistry::TryFindKernel(const Node& node, const auto& node_provider = node.GetExecutionProviderType(); const auto& expected_provider = (node_provider.empty() ? exec_provider : node_provider); +#ifndef NDEBUG printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); +#endif auto range = kernel_creator_fn_map_.equal_range(GetMapKey(node.OpType(), node.Domain(), expected_provider)); if (out) *out = nullptr; @@ -176,7 +178,9 @@ Status KernelRegistry::TryFindKernel(const Node& node, std::string error_str; if (VerifyKernelDef(node, *i->second.kernel_def, kernel_type_str_resolver, error_str)) { if (out) *out = &i->second; +#ifndef NDEBUG printf(" KernelRegistry::TryFindKernel() OK\n"); +#endif return Status::OK(); } verify_kernel_def_error_strs.push_back(error_str); @@ -193,11 +197,15 @@ Status KernelRegistry::TryFindKernel(const Node& node, oss << ")"; VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); +#ifndef NDEBUG printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); +#endif return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } +#ifndef NDEBUG printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); +#endif return Status(common::ONNXRUNTIME, common::FAIL, "Kernel not found"); } diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 8abbe1ad04f4a..6be747a509f87 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -6,7 +6,7 @@ #include "core/providers/js/data_transfer.h" EM_ASYNC_JS(void, jsepDownload, (const void *src_data, void *dst_data, size_t bytes), { - await Module.jsepDownload(src_data, dst_data, bytes); + await Module.jsepCopyAsync(src_data, dst_data, bytes); }); namespace onnxruntime { @@ -14,6 +14,7 @@ namespace js { bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { return (dst_device.Type() == OrtDevice::GPU && src_device.Type() == OrtDevice::CPU) || + (dst_device.Type() == OrtDevice::GPU && src_device.Type() == OrtDevice::GPU) || (dst_device.Type() == 
OrtDevice::CPU && src_device.Type() == OrtDevice::GPU); } @@ -26,14 +27,16 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*un auto& dst_device = dst.Location().device; if (dst_device.Type() == OrtDevice::GPU) { - // copy from CPU to GPU - EM_ASM({ Module.jsepUpload($0, $1, $2); }, src_data, dst_data, bytes); - } else if (src_device.Type() == OrtDevice::GPU) { + if (src_device.Type() == OrtDevice::GPU) { + // copy from GPU to GPU + EM_ASM({ Module.jsepCopy($0, $1, $2, true); }, src_data, dst_data, bytes); + } else { + // copy from CPU to GPU + EM_ASM({ Module.jsepCopy($0, $1, $2); }, src_data, dst_data, bytes); + } + } else /* if (src_device.Type() == OrtDevice::GPU) */ { // copy from GPU to CPU jsepDownload(src_data, dst_data, bytes); - } else { - // copy from CPU to CPU (don't think we ever get here) - memcpy(dst_data, src_data, bytes); } return Status::OK(); diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 0adec04528f7a..92566493fa2c8 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -127,11 +127,22 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 14, Pow); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 15, Pow); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Shape); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 14, Shape); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 15, Shape); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 5, 12, Reshape); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Reshape); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Reshape); + //class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, float, Gemm); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, float, Gemm); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Gemm); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool); @@ -216,12 +227,22 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED(13, 14, Pow), KERNEL_CREATE_INFO(15, Pow), + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, //BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - // KERNEL_CREATE_INFO(11, Conv), + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index 7e36c2e38c37d..5c578a0a432f1 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -16,7 +16,13 @@ const void * JsepOutput(void * context, int index, void * data) { dims[i] = static_cast(*data_offset++); } - printf("JsepOutput(%d, %s)\n", index, onnxruntime::TensorShape(dims).ToString().c_str()); +#ifndef NDEBUG + printf("JsepOutput(%d, %s)\n", index, onnxruntime::TensorShape(dims).ToString().c_str()); +#endif auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); - return output->DataRaw(); + auto r = output->DataRaw(); +#ifndef NDEBUG + printf("JsepOutput -- data=%zu\n", (size_t)(r)); +#endif + return r; } diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 7f7485ed7e719..44a9126cfe493 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -58,6 +58,9 @@ public: \ , ({#attr_name:$1}), static_cast(value)) +// TODO: +// class JsMultiProgramKernel : public OpKernel { /* TBD */ }; + class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) @@ -66,10 +69,7 @@ class JsKernel : public OpKernel { EM_ASM({ Module.jsepReleaseKernel($0); }, this); } - Status Compute(OpKernelContext* context) const override { - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc)); - + void * SerializeKernelContext(OpKernelContext* context, AllocatorPtr alloc) const { // // temp_data_format (every item is (u)int32_t): // context_prt | input_count | [input_data_0] ... [input_data_N-1] @@ -81,36 +81,56 @@ class JsKernel : public OpKernel { for (int i = 0; i < context->InputCount(); i++) { temp_data_size += sizeof(size_t) * (3 + context->Input(i)->Shape().NumDimensions()); } - uint32_t *p_inputs_data = reinterpret_cast(alloc->Alloc(temp_data_size)); - p_inputs_data[0] = reinterpret_cast(context); - p_inputs_data[1] = static_cast(context->InputCount()); + uint32_t *p_serialized_kernel_context = reinterpret_cast(alloc->Alloc(temp_data_size)); + if (p_serialized_kernel_context == nullptr) { + return nullptr; + } + + p_serialized_kernel_context[0] = reinterpret_cast(context); + p_serialized_kernel_context[1] = static_cast(context->InputCount()); size_t index = 2; for (int i = 0; i < context->InputCount(); i++) { - p_inputs_data[index++] = static_cast(context->Input(i)->GetElementType()); - p_inputs_data[index++] = reinterpret_cast(context->Input(i)->DataRaw()); - p_inputs_data[index++] = static_cast(context->Input(i)->Shape().NumDimensions()); + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->GetElementType()); + p_serialized_kernel_context[index++] = reinterpret_cast(context->Input(i)->DataRaw()); + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape().NumDimensions()); for (size_t d = 0; d < context->Input(i)->Shape().NumDimensions(); d++) { - p_inputs_data[index++] = static_cast(context->Input(i)->Shape()[d]); + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape()[d]); } } +#ifndef NDEBUG printf("temp data size: %zu. 
Data: ", temp_data_size); for (int i=0; i < (int)temp_data_size/4;i++) { - printf("%u ", p_inputs_data[i]); + printf("%u ", p_serialized_kernel_context[i]); } printf("\n"); +#endif - int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_inputs_data); + return p_serialized_kernel_context; + } + + virtual Status ComputeInternal(OpKernelContext* context) const { + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc)); + + auto p_serialized_kernel_context = SerializeKernelContext(context, alloc); + + int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_serialized_kernel_context); // printf("outputs = %d. Y.data=%zu\n", context->OutputCount(), (size_t)(context->Output(0)->DataRaw())); - alloc->Free(p_inputs_data); + alloc->Free(p_serialized_kernel_context); + if (status == 0) { return Status::OK(); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to run JSEP kernel"); } } + + Status Compute(OpKernelContext* context) const override { + return ComputeInternal(context); + } }; } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/gemm.cc b/onnxruntime/core/providers/js/operators/gemm.cc new file mode 100644 index 0000000000000..a5cf40055031d --- /dev/null +++ b/onnxruntime/core/providers/js/operators/gemm.cc @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +#include "gemm.h" + +namespace onnxruntime { +namespace js { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Gemm, \ + kOnnxDomain, \ + 11, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Gemm); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Gemm, \ + kOnnxDomain, \ + 9, 10, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Gemm); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Gemm, \ + kOnnxDomain, \ + 7, 8, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Gemm); + + + +REGISTER_KERNEL_TYPED(float) + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/gemm.h b/onnxruntime/core/providers/js/operators/gemm.h new file mode 100644 index 0000000000000..50042fb298c3e --- /dev/null +++ b/onnxruntime/core/providers/js/operators/gemm.h @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +template +class Gemm : public JsKernel { + public: + Gemm(const OpKernelInfo& info) : JsKernel(info) { + + float alpha = info.GetAttrOrDefault("alpha", 1.0f); + float beta = info.GetAttrOrDefault("beta", 1.0f); + int64_t transA = info.GetAttrOrDefault("transA", 0); + int64_t transB = info.GetAttrOrDefault("transB", 0); + + // currently only support Conv2D. 
TODO: support other + JSEP_INIT_KERNEL_ATTRIBUTE(Gemm, ({ + "alpha": $1, + "beta": $2, + "transA": $3, + "transB": $4 + }), + static_cast(alpha), + static_cast(beta), + static_cast(transA), + static_cast(transB) + ); + } +}; + + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/reshape.cc b/onnxruntime/core/providers/js/operators/reshape.cc new file mode 100644 index 0000000000000..023c1cd022abc --- /dev/null +++ b/onnxruntime/core/providers/js/operators/reshape.cc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "reshape.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_KERNEL_EX( + Reshape, + kOnnxDomain, + 14, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("shape", DataTypeImpl::GetTensorType()) + .Alias(0, 0) + .InputMemoryType(OrtMemTypeCPUInput, 1), + Reshape); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Reshape, + kOnnxDomain, + 13, 13, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("shape", DataTypeImpl::GetTensorType()) + .Alias(0, 0) + .InputMemoryType(OrtMemTypeCPUInput, 1), + Reshape); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Reshape, + kOnnxDomain, + 5, 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("shape", DataTypeImpl::GetTensorType()) + .Alias(0, 0) + .InputMemoryType(OrtMemTypeCPUInput, 1), + Reshape); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/reshape.h b/onnxruntime/core/providers/js/operators/reshape.h new file mode 100644 index 0000000000000..db919f0021228 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/reshape.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/framework/data_transfer_manager.h" +#include "core/providers/cpu/tensor/reshape_helper.h" + +namespace onnxruntime { +namespace js { + +class Reshape final : public JsKernel { + public: + Reshape(const OpKernelInfo& info) : JsKernel(info), + allow_zero_(info.GetAttrOrDefault("allowzero", static_cast(0)) == 1) { + } + + Status Compute(OpKernelContext* context) const override { + // Copy the second input tensor into the shape vector + const Tensor* shapeTensor = context->Input(1); + if (shapeTensor == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); + if (shapeTensor->Shape().NumDimensions() != 1) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "A shape tensor must be a vector tensor, got ", shapeTensor->Shape().NumDimensions(), " dimensions"); + auto data_span = shapeTensor->template DataAsSpan(); + TensorShapeVector shape(data_span.begin(), data_span.end()); + const Tensor* X = context->Input(0); + if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); + const TensorShape& X_shape = X->Shape(); + + ReshapeHelper helper(X_shape, shape, allow_zero_); + + Tensor* Y = context->Output(0, TensorShape(shape)); + const void* source = X->DataRaw(); + void* target = Y->MutableDataRaw(); + //If source and target pointers are not equal (non-inplace operation), we need to copy the data. 
+ if (target != source) { + ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*X, *Y)); + } + + return Status::OK(); + } + + private: + bool allow_zero_; +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/shape_op.cc b/onnxruntime/core/providers/js/operators/shape_op.cc new file mode 100644 index 0000000000000..42710d26bb09d --- /dev/null +++ b/onnxruntime/core/providers/js/operators/shape_op.cc @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/tensor/shape_op.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Shape, + kOnnxDomain, + 1, 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + // properly force CPU/GPU synch inside the kernel + .OutputMemoryType(OrtMemTypeCPUInput, 0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), + Shape); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Shape, + kOnnxDomain, + 13, 14, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + // properly force CPU/GPU synch inside the kernel + .OutputMemoryType(OrtMemTypeCPUInput, 0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), + Shape); + +ONNX_OPERATOR_KERNEL_EX( + Shape, + kOnnxDomain, + 15, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + // properly force CPU/GPU synch inside the kernel + .OutputMemoryType(OrtMemTypeCPUInput, 0) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), + Shape); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/wasm/js_internal_api.js b/onnxruntime/wasm/js_internal_api.js index 77608029f8937..6c2c3522c7db2 100644 --- a/onnxruntime/wasm/js_internal_api.js +++ b/onnxruntime/wasm/js_internal_api.js @@ -4,12 +4,12 @@ 'use strict'; // init JSEP -Module["jsepInit"] = function (backend, alloc, free, upload, download, createKernel, releaseKernel, run) { +Module["jsepInit"] = function (backend, alloc, free, copy, copyAsync, createKernel, releaseKernel, run) { Module.jsepBackend = backend; Module.jsepAlloc = alloc; Module.jsepFree = free; - Module.jsepUpload = upload; - Module.jsepDownload = download; + Module.jsepCopy = copy; + Module.jsepCopyAsync = copyAsync; Module.jsepCreateKernel = createKernel; Module.jsepReleaseKernel = releaseKernel; Module.jsepRun = run; From 7cdbb710b1821256eda3dc5a1f7b599c0f126a5e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 9 Dec 2022 15:53:02 -0800 Subject: [PATCH 20/81] fix build --- bb.bat | 1 + br.bat | 1 + cmake/CMakeLists.txt | 1 - cmake/onnxruntime_providers.cmake | 4 +++- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bb.bat b/bb.bat index 70400d2630eaa..ff5c14fa0abd5 100644 --- a/bb.bat +++ b/bb.bat @@ -7,6 +7,7 @@ if ["%~1"]==["--clean"] ( ) setlocal +set PATH=C:\Program Files\Git\usr\bin;%PATH% if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe diff --git a/br.bat b/br.bat index bd491a9b71767..8642d417f0d30 100644 --- a/br.bat +++ b/br.bat @@ -7,6 +7,7 @@ if ["%~1"]==["--clean"] ( ) setlocal +set PATH=C:\Program Files\Git\usr\bin;%PATH% if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( 
set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d8ae4996701b2..36636eb993805 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -58,7 +58,6 @@ option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) option(onnxruntime_USE_JS "Build with JavaScript implemented kernels support" OFF) -option(onnxruntime_DEV_MODE "Enable developer warnings and treat most of them as error." OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 2d101603ffbc5..71b216b8d7a14 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1063,7 +1063,9 @@ if (onnxruntime_USE_JS) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_js_cc_srcs}) onnxruntime_add_static_library(onnxruntime_providers_js ${onnxruntime_providers_js_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_js onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers) + onnxruntime_add_include_to_target(onnxruntime_providers_js + onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers Boost::mp11 + ) add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES}) From 19f3a45dff52074248979ddcec7d3027d58bfeba Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:05:15 -0800 Subject: [PATCH 21/81] use larger asyncify stack --- cmake/onnxruntime_webassembly.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index a157faf811279..7f2588a989a8d 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -214,7 +214,7 @@ else() -s NO_FILESYSTEM=1 \ ${WASM_API_EXCEPTION_CATCHING} \ -s ASYNCIFY=1 \ - -s ASYNCIFY_STACK_SIZE=16384 \ + -s ASYNCIFY_STACK_SIZE=65536 \ -s ASYNCIFY_ADVISE=1 \ -s ASYNCIFY_DEBUG=0 \ -s ASYNCIFY_IGNORE_INDIRECT=0 \ From 1c45f343e2a38a8a0277fa0216ea9f903e2967fc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 16 Dec 2022 17:26:35 -0800 Subject: [PATCH 22/81] support temp buffer --- js/web/lib/wasm/jsep/backend-webgpu.ts | 57 +++++--- js/web/lib/wasm/jsep/init.ts | 24 +++- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 7 + js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 30 ++-- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 3 +- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 3 +- js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 3 +- js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 14 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 133 +++++++++++------- js/web/lib/wasm/jsep/webgpu/types.ts | 23 ++- js/web/lib/wasm/wasm-core-impl.ts | 3 + 11 files changed, 209 insertions(+), 91 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 3575d5779d195..822ac9f508699 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -7,18 +7,18 @@ import {TensorView} from './tensor'; import {createGpuDataManager, GpuDataManager} from 
'./webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; -import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; +import {ComputeContext, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; const getProgramInfoUniqueKey = - (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly TensorView[], - inputGpuDatas: readonly GpuData[]): string => { - const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_'); - const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_'); + (programInfo: ProgramInfo|ProgramInfoLoader, inputTensorShapes: ReadonlyArray, + inputGpuDataTypes: readonly GpuDataType[]): string => { + const inputTensorShapesToString = inputTensorShapes.map(d => `${d.join(',')}`).join('_'); + const inputGpuDataTypesToString = inputGpuDataTypes.join('_'); let key = programInfo.name; if (programInfo.cacheHint) { key += '[' + programInfo.cacheHint + ']'; } - key += ':' + inputTensorShapes + ';' + inputGpuDataTypes; + key += ':' + inputTensorShapesToString + ';' + inputGpuDataTypesToString; return key; }; @@ -27,6 +27,8 @@ export class WebGpuBackend { gpuDataManager: GpuDataManager; programManager: ProgramManager; + temporaryData: GpuData[]; + // TODO: remove value[0]. the string is only for debug kernels: Map; @@ -92,18 +94,14 @@ export class WebGpuBackend { this.pendingDispatchNumber = 0; } - run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[], - createOutput: (index: number, dims: readonly number[]) => number): number { + run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[], outputIndices: readonly number[], + createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView, + createTemporaryOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] { if (inputs.length !== program.inputTypes.length) { throw new Error(`Input size must be equal to ${program.inputTypes.length}.`); } - // // create info for inputs - // const inputDatas: GpuData[] = []; - // for (let i = 0; i < program.inputTypes.length; ++i) { - // inputDatas[i] = this.uploadGpuData(inputs[i], program.inputTypes[i]); - // } - + // create info for inputs const inputDatas: GpuData[] = []; for (let i = 0; i < inputs.length; ++i) { const gpuData = this.gpuDataManager.get(inputs[i].data); @@ -113,21 +111,31 @@ export class WebGpuBackend { inputDatas[i] = gpuData; } - const key = getProgramInfoUniqueKey(program, inputs, inputDatas); + const key = getProgramInfoUniqueKey(program, inputs.map(i => i.dims), inputDatas.map(i => i.type)); let artifact = this.programManager.getArtifact(key); const programInfo = artifact ? artifact.programInfo : (typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() : (program as ProgramInfo)); + // check ouput indices + const validatedOutputIndices = outputIndices.length === 0 ? 
programInfo.outputs.map((_, i) => i) : outputIndices; + if (validatedOutputIndices.length !== programInfo.outputs.length) { + throw new Error(`Output size must be equal to ${programInfo.outputs.length}.`); + } + // create info for outputs + const outputTensorViews: TensorView[] = []; const outputDatas: GpuData[] = []; for (let i = 0; i < programInfo.outputs.length; ++i) { - const dataId = createOutput(i, programInfo.outputs[i].dims); - const gpuData = this.gpuDataManager.get(dataId); + const tensorView = validatedOutputIndices[i] === -1 ? + createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) : + createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims); + const gpuData = this.gpuDataManager.get(tensorView.data); if (!gpuData) { - throw new Error(`no GPU data for output: ${dataId}`); + throw new Error(`no GPU data for output: ${tensorView.data}`); } + outputTensorViews.push(tensorView); outputDatas.push(gpuData); } @@ -138,7 +146,7 @@ export class WebGpuBackend { this.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs)); - return 0; + return outputTensorViews; } upload(gpuDataId: number, data: Uint8Array): void { @@ -190,6 +198,15 @@ export class WebGpuBackend { // eslint-disable-next-line no-console console.log(`[js] Start to run kernel "${name}"...`); } - return kernelEntry(context, attributes); + + this.temporaryData = []; + try { + return kernelEntry(context, attributes); + } finally { + for (const data of this.temporaryData) { + this.gpuDataManager.release(data.id); + } + this.temporaryData = []; + } } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 403961245d104..e14d37abdf3fe 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -4,11 +4,12 @@ import {env} from 'onnxruntime-common'; import {OrtWasmModule} from '../binding/ort-wasm'; +import {getTensorElementSize} from '../wasm-core-impl'; import {WebGpuBackend} from './backend-webgpu'; import {TensorView} from './tensor'; import {ShapeUtil} from './util'; -import {ComputeContext, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; +import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; /* eslint-disable no-bitwise */ @@ -47,9 +48,24 @@ class OpKernelContext implements ComputeContext { this.inputs = inputs; } - compute(program: ProgramInfoLoader|ProgramInfo, inputIndices?: readonly number[]): number { - const mappedInputs = inputIndices?.map(i => this.inputs[i]) ?? this.inputs; - return this.backend.run(program, mappedInputs, this.output.bind(this)); + compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): + TensorView[] { + // prepare inputs. inputs should always be valid data. + const mappedInputs = + inputsOutputsMapping?.inputs?.map(i => typeof i === 'number' ? this.inputs[i] : i) ?? this.inputs; + // prepare outputs. + const outputIndices = inputsOutputsMapping?.outputs ?? 
[]; + const createKernelOutput = (index: number, dataType: number, dims: readonly number[]): TensorView => + new TensorViewImpl(this.module, dataType, this.output(index, dims), dims); + const createTemporaryOutput = (dataType: number, dims: readonly number[]): TensorView => { + const elementSize = getTensorElementSize(dataType); + if (!elementSize) { + throw new Error(`Unsupported data type: ${dataType}`); + } + const bufferSize = elementSize * ShapeUtil.size(dims); + return new TensorViewImpl(this.module, dataType, this.backend.gpuDataManager.create(bufferSize).id, dims); + }; + return this.backend.run(program, mappedInputs, outputIndices, createKernelOutput, createTemporaryOutput); } output(index: number, dims: readonly number[]): number { diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index f4cac6be224ec..bcd13b8ef6697 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -38,6 +38,13 @@ export interface GpuDataManager { */ download(id: GpuDataId): Promise; + /** + * refresh the buffers that marked for release. + * + * when release() is called, the buffer is not released immediately. this is because we need to wait for the commands + * to be submitted to the GPU. this function is called after the commands are submitted so that the buffers can be + * actually released. + */ refreshPendingBuffers(): void; } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index 512ff09c93881..451cef484b6d8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -182,14 +182,18 @@ const createBinaryOpProgramInfoLoader = }; }; -export const add = (context: ComputeContext): number => - context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Add', (a, b) => `${a}+${b}`)); +export const add = (context: ComputeContext): number => { + context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Add', (a, b) => `${a}+${b}`)); + return 0; +}; // export const and = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslAnd(), 'bool'), inputs)]; -export const div = (context: ComputeContext): number => - context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Div', (a, b) => `${a}/${b}`)); +export const div = (context: ComputeContext): number => { + context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Div', (a, b) => `${a}/${b}`)); + return 0; +}; // export const equal = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslEqual(), 'bool'), inputs)]; @@ -200,20 +204,26 @@ export const div = (context: ComputeContext): number => // export const less = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslLess(), 'bool'), inputs)]; -export const mul = (context: ComputeContext): number => - context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Mul', (a, b) => `${a}*${b}`)); +export const mul = (context: ComputeContext): number => { + context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Mul', (a, b) => `${a}*${b}`)); + return 0; +}; // export const or = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslOr(), 'bool'), 
inputs)]; -export const pow = (context: ComputeContext): number => - context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Pow', 'pow')); +export const pow = (context: ComputeContext): number => { + context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Pow', 'pow')); + return 0; +}; // export const pRelu = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslPRelu()), inputs)]; -export const sub = (context: ComputeContext): number => - context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Sub', (a, b) => `${a}-${b}`)); +export const sub = (context: ComputeContext): number => { + context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Sub', (a, b) => `${a}-${b}`)); + return 0; +}; // export const xor = (backend: WebGLInferenceHandler, inputs: Tensor[]): // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslXor(), 'bool'), inputs)]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 0bf656c84d98e..588ffd37c723f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -165,5 +165,6 @@ const createConcatProgramInfoLoader = export const concat = (context: ComputeContext, attributes: ConcatAttributes): number => { validateInputs(context.inputs); - return context.compute(createConcatProgramInfoLoader(context.inputs, attributes)); + context.compute(createConcatProgramInfoLoader(context.inputs, attributes)); + return 0; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 79d8c7ce38976..076562d9916ad 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -136,12 +136,13 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; // if (adjustedAttributes.group > 1) { - return context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); // } else if (isPointwise) { // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); // } else { // return conv2D(inferenceHandler, inputs, adjustedAttributes); // } + return 0; }; export const conv = (context: ComputeContext, attributes: ConvAttributes): number => { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index b19f173e4fbd7..48920646a7e9e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -144,7 +144,8 @@ const createGemmProgramInfoLoader = (inputs: readonly TensorView[], attributes: export const gemm = (context: ComputeContext, attributes: GemmAttributes): number => { validateInputs(context.inputs); - return context.compute(createGemmProgramInfoLoader(context.inputs, attributes)); + context.compute(createGemmProgramInfoLoader(context.inputs, attributes)); + return 0; }; export const parseGemmAttributes = (attributes: Record): GemmAttributes => diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index c620038f1397d..e2c0d89fde1a8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -282,8 
+282,8 @@ export const parseAveragePoolAttributes = (attributes: Record): export const averagePool = (context: ComputeContext, attributes: AveragePoolAttributes): number => { validateInputs(context.inputs); const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return context.compute( - {...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, false, attributes)}); + context.compute({...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, false, attributes)}); + return 0; }; const globalPoolAttributes = { @@ -301,8 +301,9 @@ const globalPoolAttributes = { export const globalAveragePool = (context: ComputeContext): number => { validateInputs(context.inputs); const metadata = {name: 'GlobalAveragePool', inputTypes: [GpuDataType.default]}; - return context.compute( + context.compute( {...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, true, globalPoolAttributes)}); + return 0; }; export interface MaxPoolAttributes extends PoolCommonAttributes, AttributeWithCacheKey { @@ -330,8 +331,8 @@ const createMaxPoolProgramInfo = export const maxPool = (context: ComputeContext, attributes: MaxPoolAttributes): number => { validateInputs(context.inputs); const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return context.compute( - {...metadata, get: () => createMaxPoolProgramInfo(context.inputs, metadata, false, attributes)}); + context.compute({...metadata, get: () => createMaxPoolProgramInfo(context.inputs, metadata, false, attributes)}); + return 0; }; export const parseMaxPoolAttributes = (attributes: Record): MaxPoolAttributes => { @@ -357,8 +358,9 @@ const globalMaxPoolMetadata = { export const globalMaxPool = (context: ComputeContext): number => { validateInputs(context.inputs); - return context.compute({ + context.compute({ ...globalMaxPoolMetadata, get: () => createMaxPoolProgramInfo(context.inputs, globalMaxPoolMetadata, true, globalPoolAttributes) }); + return 0; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index e83e9c6a510d1..a6d7e0f340430 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -63,41 +63,56 @@ const createElementwiseProgramInfoLoader = }; }; -export const abs = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Abs', 'abs')); - -export const acos = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acos', 'acos')); +export const abs = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Abs', 'abs')); + return 0; +}; -export const acosh = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acosh', 'acosh')); +export const acos = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acos', 'acos')); + return 0; +}; -export const asin = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asin', 'asin')); +export const acosh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Acosh', 'acosh')); + return 0; +}; -export const asinh = (context: ComputeContext): number => - 
context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asinh', 'asinh')); +export const asin = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asin', 'asin')); + return 0; +}; -export const atan = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atan', 'atan')); +export const asinh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Asinh', 'asinh')); + return 0; +}; -export const atanh = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atanh', 'atanh')); +export const atan = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atan', 'atan')); + return 0; +}; +export const atanh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Atanh', 'atanh')); + return 0; +}; export interface ClipAttributes extends AttributeWithCacheKey { readonly min: number; readonly max: number; } -export const clip = (context: ComputeContext, attributes: ClipAttributes): number => context.compute( - createElementwiseProgramInfoLoader( - context.inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` +export const clip = (context: ComputeContext, attributes: ClipAttributes): number => { + context.compute( + createElementwiseProgramInfoLoader( + context.inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` const clip_min_: vec4 = vec4(f32(${attributes.min})); const clip_max_: vec4 = vec4(f32(${attributes.max})); `, - attributes.cacheKey), - [0]); - + attributes.cacheKey), + {inputs: [0]}); + return 0; +}; const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => { const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP; const max = (inputs.length >= 3) ? 
inputs[2].getFloat32Array()[0] : MAX_CLIP; @@ -109,22 +124,28 @@ export const clipV11 = (context: ComputeContext): number => { return clip(context, attributes); }; -export const ceil = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Ceil', 'ceil')); +export const ceil = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Ceil', 'ceil')); + return 0; +}; -export const cos = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cos', 'cos')); +export const cos = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cos', 'cos')); + return 0; +}; -export const cosh = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cosh', 'cosh')); +export const cosh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Cosh', 'cosh')); + return 0; +}; export interface EluAttributes extends AttributeWithCacheKey { readonly alpha: number; } -export const elu = (context: ComputeContext, attributes: EluAttributes): number => - context.compute(createElementwiseProgramInfoLoader( - context.inputs[0], 'Elu', a => `elu_vf32(${a})`, ` +export const elu = (context: ComputeContext, attributes: EluAttributes): number => { + context.compute(createElementwiseProgramInfoLoader( + context.inputs[0], 'Elu', a => `elu_vf32(${a})`, ` const elu_alpha_: f32 = f32(${attributes.alpha}); fn elu_f32(a: f32) -> f32 { @@ -134,7 +155,9 @@ export const elu = (context: ComputeContext, attributes: EluAttributes): number fn elu_vf32(v: vec4) -> vec4 { return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); }`, - attributes.cacheKey)); + attributes.cacheKey)); + return 0; +}; export const parseEluAttributes = (attributes: Record): EluAttributes => createAttributeWithCacheKey(attributes as {alpha: number}); @@ -142,8 +165,10 @@ export const parseEluAttributes = (attributes: Record): EluAttr // export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => // handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); -export const floor = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Floor', 'floor')); +export const floor = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Floor', 'floor')); + return 0; +}; // export interface LeakyReluAttributes extends AttributeWithCacheKey { // readonly alpha: number; @@ -171,14 +196,18 @@ export const floor = (context: ComputeContext): number => // export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => // handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); -export const neg = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Neg', a => `-${a}`)); +export const neg = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Neg', a => `-${a}`)); + return 0; +}; // // export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): // // Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; -export const reciprocal = (context: ComputeContext): 
number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Reciprocal', a => `1.0/${a}`)); +export const reciprocal = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Reciprocal', a => `1.0/${a}`)); + return 0; +}; // export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( // createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); @@ -186,17 +215,27 @@ export const reciprocal = (context: ComputeContext): number => // export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( // createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + exp(-${a})))`), inputs); -export const sin = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sin', 'sin')); +export const sin = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sin', 'sin')); + return 0; +}; -export const sinh = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sinh', 'sinh')); +export const sinh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sinh', 'sinh')); + return 0; +}; -export const sqrt = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sqrt', 'sqrt')); +export const sqrt = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sqrt', 'sqrt')); + return 0; +}; -export const tan = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tan', 'tan')); +export const tan = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tan', 'tan')); + return 0; +}; -export const tanh = (context: ComputeContext): number => - context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tanh', 'tanh')); +export const tanh = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Tanh', 'tanh')); + return 0; +}; diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index e0790030f7502..e9dd0378ccb59 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -95,9 +95,30 @@ export interface Artifact { // attribLocations: {position: number; textureCoord: number}; } +export interface ComputeContextInputsOutputsMapping { + /** + * specify the mapping to the program's inputs. the value can be a number or a tensor view. + * - if it's a number, it's the index of the kernel's input + * - if it's a tensor view, it's an existing tensor view that will be used as the input + * + * if inputs is not specified, the mapping will be the kernel's inputs in order. + */ + readonly inputs?: ReadonlyArray; + /** + * specify the mapping to the program's outputs. the value can be a number or undefined. + * - if it's a non-negative number, it's the index of the kernel's output + * - if it's -1, it's an output that will be created as a temporary value. this value will be released after + * the kernel is executed. + * + * if outputs is not specified, the mapping will be the kernel's outputs in order. 
+ */ + readonly outputs?: readonly number[]; +} + export interface ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; - compute(program: ProgramInfoLoader|ProgramInfo, inputIndices?: readonly number[]): number; + compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): + TensorView[]; output(index: number, dims: readonly number[]): number; } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index b0778cc6060e0..d0b342996c3e5 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -138,6 +138,9 @@ export const enum DataType { bfloat16 = 16 } +export const getTensorElementSize = (dataType: number): number| + undefined => [undefined, 4, 1, 1, 2, 2, 4, 8, undefined, 1, 2, 8, 4, 8, undefined, undefined, undefined][dataType]; + const tensorDataTypeStringToEnum = (type: string): DataType => { switch (type) { From 193bf99e6af038740ccdeef877c19de85db4551a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sat, 17 Dec 2022 10:44:39 -0800 Subject: [PATCH 23/81] fix build --- onnxruntime/core/providers/js/data_transfer.cc | 2 +- onnxruntime/core/providers/js/data_transfer.h | 2 +- onnxruntime/core/providers/js/js_execution_provider.cc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 6be747a509f87..ab5fb72878a00 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -18,7 +18,7 @@ bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_dev (dst_device.Type() == OrtDevice::CPU && src_device.Type() == OrtDevice::GPU); } -common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*unused_arg*/) const { +common::Status DataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& /*stream*/) const { size_t bytes = src.SizeInBytes(); const void* src_data = src.DataRaw(); void* dst_data = dst.MutableDataRaw(); diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h index d1e703ec1dc0c..580b1eb9fedcf 100644 --- a/onnxruntime/core/providers/js/data_transfer.h +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -16,7 +16,7 @@ class DataTransfer : public IDataTransfer { bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; - common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; + common::Status CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& stream) const override; }; } // namespace js diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 92566493fa2c8..506e144deee65 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -33,8 +33,8 @@ class Memcpy final : public OpKernel { Status Compute(OpKernelContext* ctx) const override { const auto* X = ctx->Input(0); Tensor* Y = ctx->Output(0, X->Shape()); - Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId()); - return retval; + auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device); + return data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream());
} }; From 8ee4c5ddfe75a63bc507c1cda21ed9bf8d32d159 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sat, 17 Dec 2022 12:04:01 -0800 Subject: [PATCH 24/81] fix copy tensor --- onnxruntime/core/providers/js/data_transfer.cc | 2 +- onnxruntime/core/providers/js/data_transfer.h | 2 +- onnxruntime/core/providers/js/js_execution_provider.cc | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index ab5fb72878a00..64d71fea8ce54 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -18,7 +18,7 @@ bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_dev (dst_device.Type() == OrtDevice::CPU && src_device.Type() == OrtDevice::GPU); } -common::Status DataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& /*stream*/) const { +common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { size_t bytes = src.SizeInBytes(); const void* src_data = src.DataRaw(); void* dst_data = dst.MutableDataRaw(); diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h index 580b1eb9fedcf..6a0e8586776a2 100644 --- a/onnxruntime/core/providers/js/data_transfer.h +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -16,7 +16,7 @@ class DataTransfer : public IDataTransfer { bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; - common::Status CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& stream) const override; + common::Status CopyTensor(const Tensor& src, Tensor& dst) const override; }; } // namespace js diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 506e144deee65..26c54598ca4a9 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -33,8 +33,7 @@ class Memcpy final : public OpKernel { Status Compute(OpKernelContext* ctx) const override { const auto* X = ctx->Input(0); Tensor* Y = ctx->Output(0, X->Shape()); - auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device); - return data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()); + return Info().GetDataTransferManager().CopyTensor(*X, *Y); } }; From 1a70d40d9883f698596435587ac106de720dc0ec Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 19 Dec 2022 16:03:38 -0800 Subject: [PATCH 25/81] fix temp allocation cleanup --- js/web/lib/wasm/jsep/backend-webgpu.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 822ac9f508699..6d9d9e29955e0 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -128,13 +128,17 @@ export class WebGpuBackend { const outputTensorViews: TensorView[] = []; const outputDatas: GpuData[] = []; for (let i = 0; i < programInfo.outputs.length; ++i) { - const tensorView = validatedOutputIndices[i] === -1 ? + const isTemporary = validatedOutputIndices[i] === -1; + const tensorView = isTemporary ? 
createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims); const gpuData = this.gpuDataManager.get(tensorView.data); if (!gpuData) { throw new Error(`no GPU data for output: ${tensorView.data}`); } + if (isTemporary) { + this.temporaryData.push(gpuData); + } outputTensorViews.push(tensorView); outputDatas.push(gpuData); } From cfecdce535857093b8bd1083dded08fbddf1d3b9 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 19 Dec 2022 16:04:04 -0800 Subject: [PATCH 26/81] less output to console --- onnxruntime/core/providers/js/js_execution_provider.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 26c54598ca4a9..47bd347e8b2c0 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -287,7 +287,9 @@ JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) // implement RegisterAllocator to test/validate sharing the CPU EP's allocator void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { +#ifndef NDEBUG printf("JsExecutionProvider::RegisterAllocator() \n"); +#endif AllocatorCreationInfo cpuInputAllocatorCreationInfo([&](int) { return std::make_unique(); @@ -314,6 +316,7 @@ std::vector> JsExecutionProvider::GetCapabili auto lookup = JsKernelLookup{kernel_lookup}; auto list = IExecutionProvider::GetCapability(graph, lookup); +#ifndef NDEBUG printf("JsExecutionProvider::GetCapability() results:\n"); for (size_t i = 0; i < list.size(); i++) { @@ -341,6 +344,7 @@ std::vector> JsExecutionProvider::GetCapabili // } } } +#endif return list; } From 39a37b678a2f6f5153ba071c6f87e97c113c7686 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 19 Dec 2022 16:04:23 -0800 Subject: [PATCH 27/81] remove unused internal tensor def --- js/web/lib/wasm/jsep/tensor.ts | 186 ++------------------------------- 1 file changed, 8 insertions(+), 178 deletions(-) diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts index 384fa509e8c86..720b2357df1f2 100644 --- a/js/web/lib/wasm/jsep/tensor.ts +++ b/js/web/lib/wasm/jsep/tensor.ts @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
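+
+// this module keeps the tensor type declarations, the sizeof/createView helpers,
+// and the TensorView interface consumed by JSEP kernels.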
-import {ShapeUtil} from './util'; - export declare namespace Tensor { export interface DataTypeMap { bool: Uint8Array; @@ -32,15 +30,6 @@ export declare namespace Tensor { export type Id = number; } -type TensorData = Tensor.DataTypeMap[Tensor.DataType]; - -type DataProvider = (id: Tensor.Id) => TensorData; -type AsyncDataProvider = (id: Tensor.Id) => Promise; - -let guid = 0; -const createNewTensorId = () => guid++; - - export const sizeof = (type: Tensor.DataType): number => { switch (type) { case 'bool': @@ -96,173 +85,9 @@ export const createView = (dataBuffer: ArrayBuffer, type: Tensor.DataType): Int3 BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => new (dataviewConstructor(type))(dataBuffer); -export class Tensor { - /** - * get the underlying tensor data - */ - get data(): TensorData { - if (this.cache === undefined) { - const data = this.dataProvider!(this.dataId); - if (data.length !== this.size) { - throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.'); - } - this.cache = data; - } - return this.cache; - } - - /** - * get the underlying string tensor data. Should only use when type is STRING - */ - get stringData(): Tensor.StringType { - if (this.type !== 'string') { - throw new TypeError('data type is not string'); - } - - return this.data as Tensor.StringType; - } - - /** - * get the underlying integer tensor data. Should only use when type is one of the following: (UINT8, INT8, UINT16, - * INT16, INT32, UINT32, BOOL) - */ - get integerData(): Tensor.IntegerType { - switch (this.type) { - case 'uint8': - case 'int8': - case 'uint16': - case 'int16': - case 'int32': - case 'uint32': - case 'int64': - case 'uint64': - case 'bool': - return this.data as Tensor.IntegerType; - - default: - throw new TypeError( - 'data type is not integer (uint8, int8, uint16, int16, int32, uint32, int64, uint64, bool)'); - } - } - - /** - * get the underlying float tensor data. Should only use when type is one of the following: (FLOAT, DOUBLE) - */ - get floatData(): Tensor.FloatType { - switch (this.type) { - case 'float32': - case 'float64': - return this.data as Tensor.FloatType; - - default: - throw new TypeError('data type is not float (float32, float64)'); - } - } - - /** - * get the underlying number tensor data. 
Should only use when type is one of the following: (UINT8, INT8, UINT16, - * INT16, INT32, UINT32, BOOL, FLOAT, DOUBLE) - */ - get numberData(): Tensor.NumberType { - if (this.type !== 'string') { - return this.data as Tensor.NumberType; - } - throw new TypeError('type cannot be non-number (string)'); - } - - /** - * get the underlying tensor data asynchronously - */ - async getData(): Promise { - if (this.cache === undefined) { - if (this.asyncDataProvider) { - const data = await this.asyncDataProvider(this.dataId); - if (data.length !== this.size) { - throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.'); - } - this.cache = data; - } else { - return this.data; - } - } - return this.cache; - } - - /** - * get the number of elements in the tensor - */ - public readonly size: number; - - private _strides: readonly number[]; - /** - * get the strides for each dimension - */ - get strides(): readonly number[] { - if (!this._strides) { - this._strides = ShapeUtil.computeStrides(this.dims); - } - return this._strides; - } - - constructor( - /** - * get the dimensions of the tensor - */ - public readonly dims: readonly number[], - /** - * get the type of the tensor - */ - public readonly type: Tensor.DataType, private dataProvider?: DataProvider, - private asyncDataProvider?: AsyncDataProvider, private cache?: TensorData, - /** - * get the data ID that used to map to a tensor data - */ - public readonly dataId: Tensor.Id = createNewTensorId()) { - this.size = ShapeUtil.validateDimsAndCalcSize(dims); - const size = this.size; - const empty = (dataProvider === undefined && asyncDataProvider === undefined && cache === undefined); - - if (cache !== undefined) { - if (cache.length !== size) { - throw new RangeError('Input dims doesn\'t match data length.'); - } - } - - if (type === 'string') { - if (cache !== undefined && (!Array.isArray(cache) || !cache.every(i => typeof i === 'string'))) { - throw new TypeError('cache should be a string array'); - } - - if (empty) { - this.cache = new Array(size); - } - } else { - if (cache !== undefined) { - const constructor = dataviewConstructor(type); - if (!(cache instanceof constructor)) { - throw new TypeError(`cache should be type ${constructor.name}`); - } - } - - if (empty) { - const buf = new ArrayBuffer(size * sizeof(type)); - this.cache = createView(buf, type); - } - } - } - - /** - * Construct new Tensor from raw data - * @param data the raw data object. Should be a string array for 'string' tensor, and the corresponding typed array - * for other types of tensor. - * @param dims the dimensions of the tensor - * @param type the type of the tensor - */ - static fromData(data: Tensor.DataTypeMap[Tensor.DataType], dims: readonly number[], type: Tensor.DataType): Tensor { - return new Tensor(dims, type, undefined, undefined, data); - } -} - +/** + * a TensorView does not own the data. + */ export interface TensorView { readonly data: number; readonly dataType: number; @@ -272,4 +97,9 @@ export interface TensorView { * get a Float32Array data view of the tensor data. tensor data must be on CPU. */ getFloat32Array(): Float32Array; + + /** + * create a new tensor view with the same data but different dimensions. 
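+ *
+ * the new dimensions must describe the same total number of elements as the current
+ * ones, otherwise the implementation throws; e.g. (illustrative) a view with dims
+ * [2, 6] may be reshaped to [3, 4] or [12], but not to [5, 3].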
+ */ + reshape(newDims: readonly number[]): TensorView; } From 2fd03b16a1ae29a0b4c59be2ddac45c47f4ec334 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 19 Dec 2022 16:04:43 -0800 Subject: [PATCH 28/81] add reshape impl --- js/web/lib/wasm/jsep/init.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index e14d37abdf3fe..3af91ae9aa53f 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -21,6 +21,13 @@ class TensorViewImpl implements TensorView { getFloat32Array(): Float32Array { return new Float32Array(this.module.HEAP8.buffer, this.data, ShapeUtil.size(this.dims)); } + + reshape(newDims: readonly number[]): TensorView { + if (ShapeUtil.size(newDims) !== ShapeUtil.size(this.dims)) { + throw new Error('Invalid new shape'); + } + return new TensorViewImpl(this.module, this.dataType, this.data, newDims); + } } class OpKernelContext implements ComputeContext { From bdb55f6b00bc1c4cb2eb2f455a43f0d4c3120b91 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 19 Dec 2022 21:49:03 -0800 Subject: [PATCH 29/81] remove 'HANDLE' memory type --- include/onnxruntime/core/framework/ortdevice.h | 1 - onnxruntime/core/providers/js/allocator.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h index efc5923031262..77f7c3e1743f0 100644 --- a/include/onnxruntime/core/framework/ortdevice.h +++ b/include/onnxruntime/core/framework/ortdevice.h @@ -23,7 +23,6 @@ struct OrtDevice { static const MemoryType CUDA_PINNED = 1; static const MemoryType HIP_PINNED = 2; static const MemoryType CANN_PINNED = 3; - static const MemoryType HANDLE = 4; }; constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_) diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h index 5f7a6aabf4984..1c57540c24b97 100644 --- a/onnxruntime/core/providers/js/allocator.h +++ b/onnxruntime/core/providers/js/allocator.h @@ -32,7 +32,7 @@ class JsCustomAllocator : public IAllocator { JsCustomAllocator() : IAllocator( OrtMemoryInfo("JsCustomAllocator", OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HANDLE, 0), + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0), 0, OrtMemTypeDefault)) { } From 1c28286c1a34a0d6b9d561702661749bb19cddab Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 6 Jan 2023 16:03:05 -0800 Subject: [PATCH 30/81] transpose --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 217 ++++++++---------- .../providers/js/js_execution_provider.cc | 6 + .../core/providers/js/operators/transpose.cc | 28 +++ .../core/providers/js/operators/transpose.h | 34 +++ 5 files changed, 172 insertions(+), 117 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/transpose.cc create mode 100644 onnxruntime/core/providers/js/operators/transpose.h diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 34dc221d25396..7f72d14e0a101 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -12,7 +12,7 @@ import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePool // 
'./ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; // import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; // import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; -// import {parseTransposeAttributes, transpose} from './ops/transpose'; +import {parseTransposeAttributes, transpose} from './ops/transpose'; import * as unaryOps from './ops/unary-op'; import {ComputeContext} from './types'; @@ -86,7 +86,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Sub', [binaryOps.sub]], // ['Sum', '', '6+', sum], ['Tan', [unaryOps.tan]], ['Tanh', [unaryOps.tanh]], // ['Tile', '', '6+', tile], - //['Transpose', '', '1+', transpose, parseTransposeAttributes], + ['Transpose', [transpose, parseTransposeAttributes]], // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], //['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 8519f319df7f5..9d666d724e897 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,118 +1,105 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {Graph} from '../../../graph'; -// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo} from '../types'; - -// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -// export interface TransposeAttributes extends AttributeWithCacheKey { -// readonly perm: number[]; -// } - -// const transposeProgramMetadata = { -// name: 'Transpose', -// inputTypes: [GpuDataType.default] -// }; - -// export const transpose: OperatorAsyncImplementation = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise -// => { -// validateInputs(inputs); -// return inferenceHandler.run( -// { -// ...transposeProgramMetadata, -// cacheHint: attributes.cacheKey, -// get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm) -// }, -// inputs); -// }; - -// export const parseTransposeAttributes: OperatorInitialization = -// (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', -// [])}); - -// const createTransposeProgramInfo = -// (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => { -// const dataType = 'f32'; // TODO: support other data type -// const inputShape = input.dims; -// perm = getAdjustedPerm(inputShape, perm); -// const outputShape = getOutputShape(inputShape, perm); -// const rank = inputShape.length; -// const outputSize = ShapeUtil.size(outputShape); -// // A dims=[${inputs[0].dims.toString()}] -// // out Dims=[${unpackedOutputShape.toString()}] -// // based on perm=[${perm.toString()}] - -// const outputIndicesHelper = createIndicesHelper('output', outputShape); -// const inputIndicesHelper = createIndicesHelper('a', inputShape); - -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - 
-// @group(0) @binding(0) var a : array<${dataType}>; -// @group(0) @binding(1) var output : array<${dataType}>; - -// ${permFunctionBody(perm, rank)} -// ${outputIndicesHelper.o2iImpl} -// ${inputIndicesHelper.i2oImpl} - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// ${outputIndicesHelper.indicesVariableDeclaration('indices')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} -// ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} -// perm(&aIndices, &indices); - -// output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; -// }`; -// return { -// ...transposeProgramMetadata, -// outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => { -// if (perm && perm.length !== inputShape.length) { -// perm = [...(inputShape.keys())].reverse(); -// } -// return perm; -// }; - -// const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => { -// perm = getAdjustedPerm(inputShape, perm); -// return ShapeUtil.sortBasedOnPerm(inputShape, perm); -// }; - -// const permFunctionBody = (perm: number[], rank: number): string => { -// const reverseFunc = []; -// reverseFunc.push(`fn perm(a: ptr>, i: ptr>) {`); -// for (let i = 0; i < rank; ++i) { -// reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); -// } -// reverseFunc.push('\t}'); -// return reverseFunc.join('\n'); -// }; - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 1) { -// throw new Error('Transpose requires 1 input.'); -// } - -// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { -// throw new Error('input should be float tensor'); -// } -// }; +import {DataType} from '../../../wasm-core-impl'; +import {TensorView} from '../../tensor'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; + +import {createIndicesHelper, WORKGROUP_SIZE} from './common'; + +export interface TransposeAttributes extends AttributeWithCacheKey { + readonly perm: number[]; +} + +const transposeProgramMetadata = { + name: 'Transpose', + inputTypes: [GpuDataType.default] +}; + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('Transpose requires 1 input.'); + } + + if (inputs[0].dataType !== DataType.float) { + throw new Error('input should be float tensor'); + } +}; + +const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => + (perm && perm.length !== inputShape.length) ? 
[...(inputShape.keys())].reverse() : perm; + +const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => + ShapeUtil.sortBasedOnPerm(inputShape, getAdjustedPerm(inputShape, perm)); + +const permFunctionBody = (perm: number[], rank: number): string => { + const reverseFunc = []; + reverseFunc.push(`fn perm(a: ptr<function, array<u32, ${rank}>>, i: ptr<function, array<u32, ${rank}>>) {`); + for (let i = 0; i < rank; ++i) { + reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); + } + reverseFunc.push('\t}'); + return reverseFunc.join('\n'); +}; + +const createTransposeProgramInfo = (input: TensorView, permAttr: number[]): ProgramInfo => { + const dataType = 'f32'; // TODO: support other data type + const inputShape = input.dims; + const perm = getAdjustedPerm(inputShape, permAttr); + const outputShape = getOutputShape(inputShape, perm); + const rank = inputShape.length; + const outputSize = ShapeUtil.size(outputShape); + // A dims=[${inputs[0].dims.toString()}] + // out Dims=[${unpackedOutputShape.toString()}] + // based on perm=[${perm.toString()}] + + const outputIndicesHelper = createIndicesHelper('output', outputShape); + const inputIndicesHelper = createIndicesHelper('a', inputShape); + + const shaderSource = ` + const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + + @group(0) @binding(0) var<storage, read> a : array<${dataType}>; + @group(0) @binding(1) var<storage, read_write> output : array<${dataType}>; + + ${permFunctionBody(perm, rank)} + ${outputIndicesHelper.o2iImpl} + ${inputIndicesHelper.i2oImpl} + + @compute @workgroup_size(WORKGROUP_SIZE) + fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { + + // Guard against out-of-bounds work group sizes + if (global_id.x >= ${outputSize}u) { + return; + } + + ${outputIndicesHelper.indicesVariableDeclaration('indices')} + ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} + perm(&aIndices, &indices); + + output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; + }`; + return { + ...transposeProgramMetadata, + outputs: [{dims: outputShape, dataType: input.dataType, gpuDataType: GpuDataType.default}], + shaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; +}; + +export const transpose = (context: ComputeContext, attributes: TransposeAttributes): number => { + validateInputs(context.inputs); + context.compute({ + ...transposeProgramMetadata, + cacheHint: attributes.cacheKey, + get: () => createTransposeProgramInfo(context.inputs[0], attributes.perm) + }); + return 0; +}; + +export const parseTransposeAttributes = (attributes: Record<string, unknown>): TransposeAttributes => + createAttributeWithCacheKey({perm: Array.from(attributes.perm as Iterable<number>)}); diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 47bd347e8b2c0..5527cfd7d4bba 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -134,6 +134,9 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 13, Reshape); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Reshape); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Transpose); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Transpose); + //class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider,
kMSInternalNHWCDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); @@ -234,6 +237,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Transpose)>, + BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Transpose)>, + //BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/operators/transpose.cc b/onnxruntime/core/providers/js/operators/transpose.cc new file mode 100644 index 0000000000000..6803e6e7a2a76 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/transpose.cc @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "transpose.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Transpose, + kOnnxDomain, + 1, 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Transpose); + +ONNX_OPERATOR_KERNEL_EX( + Transpose, + kOnnxDomain, + 13, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Transpose); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h new file mode 100644 index 0000000000000..97bc6d6a87b39 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/common/gsl.h" +#include "core/providers/cpu/tensor/transpose.h" + +namespace onnxruntime { +namespace js { + +class Transpose final : public JsKernel, public TransposeBase { + public: + Transpose(const OpKernelInfo& info) : JsKernel(info), TransposeBase(info) { + std::vector<int32_t> perm; + if (perm_specified_) { + perm.resize(perm_.size()); + for (size_t i = 0; i < perm_.size(); ++i) { + perm[i] = gsl::narrow_cast<int32_t>(perm_[i]); + } + } + JSEP_INIT_KERNEL_ATTRIBUTE(Transpose, ({ "perm": $1 ? Module.HEAP32.subarray($2, $2 + $1) : [] }), + gsl::narrow_cast<int32_t>(perm_specified_ ? perm_.size() : 0), + reinterpret_cast<int32_t>(perm_specified_ && !perm.empty() ? 
perm.data() : nullptr) >> 2 + ); + } +}; + +} // namespace js +} // namespace onnxruntime From f7df9bab6164e75a9802c2a65ac073367688426b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 12 Jan 2023 16:59:56 -0800 Subject: [PATCH 31/81] [js] support 2 flags from session options --- js/common/lib/inference-session.ts | 8 ++++++ js/web/lib/wasm/binding/ort-wasm.d.ts | 2 +- js/web/lib/wasm/session-options.ts | 7 +++++- js/web/script/test-runner-cli-args.ts | 29 +++++++++++++++++++++ js/web/script/test-runner-cli.ts | 2 ++ js/web/test/test-main.ts | 3 ++- js/web/test/test-runner.ts | 12 ++++++--- js/web/test/test-types.ts | 1 + onnxruntime/core/graph/model.cc | 36 +++++++++++++++++++++++++++ onnxruntime/wasm/api.cc | 7 +++++- onnxruntime/wasm/api.h | 4 ++- 11 files changed, 102 insertions(+), 9 deletions(-) diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 1f2f855a3e487..638cb90f36716 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -94,6 +94,14 @@ export declare namespace InferenceSession { */ executionMode?: 'sequential'|'parallel'; + /** + * Optimized model file path. + * + * If this setting is specified, the optimized model will be dumped. In browser, a blob will be created + * with a pop-up window. + */ + optimizedModelFilePath?: string; + /** * Wether enable profiling. * diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index efb73c9943518..2e51d3257ec9c 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -47,7 +47,7 @@ export interface OrtWasmModule extends EmscriptenModule { _OrtCreateSessionOptions( graphOptimizationLevel: number, enableCpuMemArena: boolean, enableMemPattern: boolean, executionMode: number, enableProfiling: boolean, profileFilePrefix: number, logId: number, logSeverityLevel: number, - logVerbosityLevel: number): number; + logVerbosityLevel: number, optimizedModelFilePath: number): number; _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): void; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 8bf293e2a2e0a..400757f4a7721 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -131,10 +131,15 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n sessionOptions.enableProfiling = false; } + let optimizedModelFilePathOffset = 0; + if (typeof options?.optimizedModelFilePath === 'string') { + optimizedModelFilePathOffset = allocWasmString(options.optimizedModelFilePath, allocs); + } + sessionOptionsHandle = wasm._OrtCreateSessionOptions( graphOptimizationLevel, !!sessionOptions.enableCpuMemArena!, !!sessionOptions.enableMemPattern!, executionMode, !!sessionOptions.enableProfiling!, 0, logIdDataOffset, sessionOptions.logSeverityLevel!, - sessionOptions.logVerbosityLevel!); + sessionOptions.logVerbosityLevel!, optimizedModelFilePathOffset); if (sessionOptionsHandle === 0) { throw new Error('Can\'t create session options'); } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 33494c714b98c..d860e84dfd99f 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -52,6 +52,10 @@ Options: 
This flag can be used with a number as value, specifying the total count of test cases to run. The test cases may be used multiple times. Default value is 10. -c, --file-cache Enable file cache. +*** Session Options *** + -u=<...>, --optimized-model-file-path=<...> Specify the file path to dump the optimized model. + -o=<...>, --graph-optimization-level=<...> Specify graph optimization level. + Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'all'. *** Logging Options *** --log-verbose=<...> Set log level to verbose @@ -151,6 +155,16 @@ export interface TestRunnerCliArgs { */ times?: number; + /** + * the file path to dump the optimized model, if specified + */ + optimizedModelFilePath?: string; + + /** + * Specify graph optimization level + */ + graphOptimizationLevel: 'disabled'|'basic'|'extended'|'all'; + cpuOptions?: InferenceSession.CpuExecutionProviderOption; cudaOptions?: InferenceSession.CudaExecutionProviderOption; cudaFlags?: Record; @@ -380,6 +394,19 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs logConfig.push({category: 'TestRunner.Perf', config: {minimalSeverity: 'verbose'}}); } + // Option: -u, --optimized-model-file-path + const optimizedModelFilePath = args['optimized-model-file-path'] || args.u || undefined; + if (typeof optimizedModelFilePath !== 'undefined' && typeof optimizedModelFilePath !== 'string') { + throw new Error('Flag "optimized-model-file-path" needs to be either empty or a valid file path.'); + } + + // Option: -o, --graph-optimization-level + const graphOptimizationLevel = args['graph-optimization-level'] || args.o || 'all'; + if (typeof graphOptimizationLevel !== 'string' || + ['disabled', 'basic', 'extended', 'all'].indexOf(graphOptimizationLevel) === -1) { + throw new Error(`graph optimization level is invalid: ${graphOptimizationLevel}`); + } + // Option: -c, --file-cache const fileCache = parseBooleanArg(args['file-cache'] || args.c, false); @@ -407,6 +434,8 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs logConfig, profile, times: perf ? 
times : undefined, + optimizedModelFilePath, + graphOptimizationLevel: graphOptimizationLevel as TestRunnerCliArgs['graphOptimizationLevel'], fileCache, cpuOptions, webglOptions, diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index ee32a58ea9c57..a04b7f78746d0 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -148,6 +148,8 @@ run({ log: args.logConfig, profile: args.profile, options: { + sessionOptions: + {graphOptimizationLevel: args.graphOptimizationLevel, optimizedModelFilePath: args.optimizedModelFilePath}, debug: args.debug, cpuOptions: args.cpuOptions, webglOptions: args.webglOptions, diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 64cef55bd1c3b..2610cbe1d82e6 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -103,7 +103,8 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { - context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile); + context = await ModelTestContext.create( + test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); }); after('release session', () => { diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 8dcc64025a7cf..4dedd678fbdce 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -147,7 +147,7 @@ async function loadTensors( } async function initializeSession( - modelFilePath: string, backendHint: string, profile: boolean, + modelFilePath: string, backendHint: string, profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise { const preloadModelData: Uint8Array|undefined = fileCache && fileCache[modelFilePath] ? fileCache[modelFilePath] : undefined; @@ -157,7 +157,8 @@ async function initializeSession( preloadModelData ? ` [preloaded(${preloadModelData.byteLength})]` : ''}`); const profilerConfig = profile ? {maxNumberEvents: 65536} : undefined; - const sessionConfig = {executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile}; + const sessionConfig = + {...sessionOptions, executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile}; let session: ort.InferenceSession; try { @@ -230,7 +231,9 @@ export class ModelTestContext { /** * create a ModelTestContext object that used in every test cases in the given ModelTest. 
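*
* an illustrative call (the arguments here are hypothetical):
*   `await ModelTestContext.create(modelTest, false, {graphOptimizationLevel: 'basic'})`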
*/ - static async create(modelTest: Test.ModelTest, profile: boolean): Promise { + static async create( + modelTest: Test.ModelTest, profile: boolean, + sessionOptions?: ort.InferenceSession.SessionOptions): Promise { if (this.initializing) { throw new Error('cannot create a ModelTestContext object when the previous creation is not done'); } @@ -239,7 +242,8 @@ export class ModelTestContext { this.initializing = true; const initStart = now(); - const session = await initializeSession(modelTest.modelUrl, modelTest.backend!, profile, this.cache); + const session = + await initializeSession(modelTest.modelUrl, modelTest.backend!, profile, sessionOptions || {}, this.cache); const initEnd = now(); for (const testCase of modelTest.cases) { diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts index a7ab9d7025706..966b1e704a5b7 100644 --- a/js/web/test/test-types.ts +++ b/js/web/test/test-types.ts @@ -104,6 +104,7 @@ export declare namespace Test { */ export interface Options { debug?: boolean; + sessionOptions?: InferenceSession.SessionOptions; cpuOptions?: InferenceSession.CpuExecutionProviderOption; cpuFlags?: Record; cudaOptions?: InferenceSession.CudaExecutionProviderOption; diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 8af9f99ed1d44..34ab1ccfc53c0 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -29,6 +29,10 @@ #include "core/graph/function_utils.h" #endif +#if defined(__wasm__) +#include +#endif + using namespace ONNX_NAMESPACE; using namespace onnxruntime; using namespace onnxruntime::common; @@ -500,6 +504,37 @@ static Status LoadModel(const T& file_path, std::shared_ptr& p_model, template static Status SaveModel(Model& model, const T& file_path) { +#if defined(__wasm__) + ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); + auto model_proto = model.ToProto(); + auto buffer_size = model_proto.ByteSizeLong(); + void* buffer = malloc(buffer_size); + model_proto.SerializeToArray(buffer, buffer_size); + + EM_ASM(({ + const buffer = $0; + const buffer_size = $1; + const file_path = UTF8ToString($2); + const bytes = new Uint8Array(buffer_size); + bytes.set(HEAPU8.subarray(buffer, buffer + buffer_size)); + if (typeof process == 'object' && typeof process.versions == 'object' && typeof process.versions.node == 'string') { + // Node.js + require('fs').writeFileSync(file_path, bytes); + } else { + // Browser + const file = new File([bytes], file_path, {type: "application/octet-stream" }); + const url = URL.createObjectURL(file); + window.open(url, '_blank'); + } + }), + reinterpret_cast(buffer), + static_cast(buffer_size), + reinterpret_cast(file_path.c_str())); + + free(buffer); + return Status::OK(); + +#else int fd; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); @@ -518,6 +553,7 @@ static Status SaveModel(Model& model, const T& file_path) { return status; } return Env::Default().FileClose(fd); +#endif } #ifdef _WIN32 diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index d24cbd495d1fa..69b179ddd6969 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -68,10 +68,15 @@ OrtSessionOptions* OrtCreateSessionOptions(size_t graph_optimization_level, const char* /*profile_file_prefix*/, const char* log_id, size_t log_severity_level, - size_t log_verbosity_level) { + size_t log_verbosity_level, + const char* optimized_model_filepath) { OrtSessionOptions* session_options = nullptr; RETURN_NULLPTR_IF_ERROR(CreateSessionOptions, &session_options); + 
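// when a path is given, ORT dumps the optimized model after graph transforms
+ // complete; on wasm builds the write happens in SaveModel (core/graph/model.cc).
+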
if (optimized_model_filepath) { + RETURN_NULLPTR_IF_ERROR(SetOptimizedModelFilePath, session_options, optimized_model_filepath); + } + // assume that a graph optimization level is checked and properly set at JavaScript RETURN_NULLPTR_IF_ERROR(SetSessionGraphOptimizationLevel, session_options, diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index d3435f2958a17..80466ecd871c4 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -46,6 +46,7 @@ int EMSCRIPTEN_KEEPALIVE OrtInit(int num_threads, int logging_level); * @param log_id logger id for session output * @param log_severity_level verbose, info, warning, error or fatal * @param log_verbosity_level vlog level + * @param optimized_model_filepath filepath of the optimized model to dump. * @returns a pointer to a session option handle and must be freed by calling OrtReleaseSessionOptions(). */ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t graph_optimization_level, @@ -56,7 +57,8 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t const char* profile_file_prefix, const char* log_id, size_t log_severity_level, - size_t log_verbosity_level); + size_t log_verbosity_level, + const char* optimized_model_filepath); /** * append an execution provider for a session. From a032718baf5375be315def1c3b6a3266823b7135 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 23 Jan 2023 14:28:58 -0800 Subject: [PATCH 32/81] conv_by_matmul --- js/.eslintrc.js | 6 + js/web/lib/wasm/jsep/backend-webgpu.ts | 49 ++- js/web/lib/wasm/jsep/init.ts | 1 + .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 10 +- .../webgpu/ops/3rd-party/activation_util.ts | 85 +++++ .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 254 ++++++++++++++ .../jsep/webgpu/ops/3rd-party/conv_util.ts | 157 +++++++++ .../ops/3rd-party/matmul_packed_webgpu.ts | 327 ++++++++++++++++++ js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 109 +++++- js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts | 28 ++ js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 75 ++-- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 6 +- js/web/lib/wasm/jsep/webgpu/types.ts | 5 +- onnxruntime/core/framework/session_state.cc | 6 + .../contrib_ops/internal_nhwc_onnx_opset.cc | 2 + .../providers/js/js_execution_provider.cc | 10 +- .../core/providers/js/js_execution_provider.h | 2 +- .../core/providers/js/operators/conv.h | 29 +- .../core/providers/js/operators/pool.cc | 107 +++--- .../core/providers/js/operators/pool.h | 19 +- .../core/providers/js/operators/transpose.h | 4 +- 21 files changed, 1172 insertions(+), 119 deletions(-) create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts diff --git a/js/.eslintrc.js b/js/.eslintrc.js index 24620a2791871..519284617f428 100644 --- a/js/.eslintrc.js +++ b/js/.eslintrc.js @@ -182,6 +182,12 @@ module.exports = { 'import/no-extraneous-dependencies': 'off', 'no-console': 'off' } + }, { + files: ['web/lib/**/3rd-party/**/*.ts'], rules: { + 'header/header': 'off', + 'unicorn/filename-case': 'off', + '@typescript-eslint/explicit-module-boundary-types': 'off', + } }], extends: [ 'eslint:recommended', diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts 
b/js/web/lib/wasm/jsep/backend-webgpu.ts index 6d9d9e29955e0..a7458742cbb95 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -28,9 +28,10 @@ export class WebGpuBackend { programManager: ProgramManager; temporaryData: GpuData[]; + currentKernelId: number|null = null; + kernelPersistentData: Map; - // TODO: remove value[0]. the string is only for debug - kernels: Map; + kernels: Map unknown) | undefined, unknown]]>; commandEncoder: GPUCommandEncoder|null = null; computePassEncoder: GPUComputePassEncoder|null = null; @@ -50,6 +51,7 @@ export class WebGpuBackend { this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); this.kernels = new Map(); + this.kernelPersistentData = new Map(); // TODO: set up flags this.device.onuncapturederror = ev => { @@ -128,8 +130,13 @@ export class WebGpuBackend { const outputTensorViews: TensorView[] = []; const outputDatas: GpuData[] = []; for (let i = 0; i < programInfo.outputs.length; ++i) { - const isTemporary = validatedOutputIndices[i] === -1; - const tensorView = isTemporary ? + if (!Number.isInteger(validatedOutputIndices[i]) || validatedOutputIndices[i] < -2 || + validatedOutputIndices[i] >= programInfo.outputs.length) { + throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`); + } + const isTemporary = validatedOutputIndices[i] === -2; + const isPersistent = validatedOutputIndices[i] === -1; + const tensorView = (isTemporary || isPersistent) ? createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims); const gpuData = this.gpuDataManager.get(tensorView.data); @@ -139,6 +146,14 @@ export class WebGpuBackend { if (isTemporary) { this.temporaryData.push(gpuData); } + if (isPersistent) { + let persistentData = this.kernelPersistentData.get(this.currentKernelId!); + if (!persistentData) { + persistentData = []; + this.kernelPersistentData.set(this.currentKernelId!, persistentData); + } + persistentData.push(gpuData); + } outputTensorViews.push(tensorView); outputDatas.push(gpuData); } @@ -180,14 +195,17 @@ export class WebGpuBackend { throw new Error(`kernel not implemented: ${name}`); } - let processedAttribute = attribute; - if (op.length > 1 && typeof op[1] !== 'undefined') { - processedAttribute = op[1](attribute); - } - this.kernels.set(kernelId, [name, op[0], processedAttribute]); + this.kernels.set(kernelId, [name, op[0], [op[1], attribute]]); } releaseKernel(kernelId: number): void { + const persistentData = this.kernelPersistentData.get(kernelId); + if (persistentData) { + for (const data of persistentData) { + this.gpuDataManager.release(data.id); + } + this.kernelPersistentData.delete(kernelId); + } this.kernels.delete(kernelId); } @@ -197,6 +215,16 @@ export class WebGpuBackend { throw new Error(`kernel not created: ${kernelId}`); } const [name, kernelEntry, attributes] = kernel; + if (this.currentKernelId !== null) { + throw new Error(`kernel "${name}" is not allowed to be called recursively`); + } + this.currentKernelId = kernelId; + + // parse attributes if necessary + if (attributes[0]) { + attributes[1] = attributes[0](attributes[1]); + attributes[0] = undefined; + } if (env.debug) { // eslint-disable-next-line no-console @@ -205,12 +233,13 @@ export class WebGpuBackend { this.temporaryData = []; try { - return kernelEntry(context, attributes); + return kernelEntry(context, attributes[1]); } finally { 
for (const data of this.temporaryData) { this.gpuDataManager.release(data.id); } this.temporaryData = []; + this.currentKernelId = null; } } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 3af91ae9aa53f..2082694b84f36 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -33,6 +33,7 @@ class TensorViewImpl implements TensorView { class OpKernelContext implements ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; + readonly customData: {[key: string]: unknown} = {}; constructor(private module: OrtWasmModule, private backend: WebGpuBackend, contextDataOffset: number) { const heapU32 = module.HEAPU32; diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 7f72d14e0a101..77fb88e274dd0 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -7,7 +7,7 @@ import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; import {gemm, parseGemmAttributes} from './ops/gemm'; // import {matMul, parseMatMulAttributes} from './ops/matmul'; -import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; +import * as pool from './ops/pool'; // import {sum} from // './ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; // import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; @@ -27,7 +27,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['And', '', '7+', binaryOps.and], ['Asin', [unaryOps.asin]], ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], // TODO: support new attributes for AveragePool-10 - ['AveragePool', [averagePool, parseAveragePoolAttributes]], + ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]], // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], // ['Cast', '', '6+', cast, parseCastAttributes], ['Ceil', [unaryOps.ceil]], ['ClipV10', [unaryOps.clip]], @@ -41,7 +41,9 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Floor', [unaryOps.floor]], // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], //['Gather', '', '1+', gather, parseGatherAttributes], - ['Gemm', [gemm, parseGemmAttributes]], ['GlobalAveragePool', [globalAveragePool]], ['GlobalMaxPool', [globalMaxPool]], + ['Gemm', [gemm, parseGemmAttributes]], + ['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]], + ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], // ['Greater', '', '7+', binaryOps.greater], // ['Identity', '', '1+', unaryOps.identity], // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], @@ -50,7 +52,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Less', '', '7+', binaryOps.less], //['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], // TODO: support new attributes for MaxPool-8 and MaxPool-10 - ['MaxPool', [maxPool, parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], + ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], // ['Not', '', '1+', unaryOps.not], // ['Or', '', '7+', binaryOps.or], // ['Pad', '', '2-10', padV2, parsePadAttributesV2], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts 
b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts new file mode 100644 index 0000000000000..3ac290103842e --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts @@ -0,0 +1,85 @@ +/** + * @license + * Copyright 2021 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +// sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/activation_util.ts +// +// modified to fit the needs of the project + +export declare type Activation = 'linear' | 'relu' | 'prelu' | 'elu' | 'relu6' | 'leakyrelu' | 'sigmoid'; + +export const typeSnippet = (component: number) => { + switch (component) { + case 1: + return 'f32'; + case 2: + return 'vec2'; + case 3: + return 'vec3'; + case 4: + return 'vec4'; + default: + throw new Error(`${component}-component is not supported.`); + } +}; + +export const activationFnSnippet = + (activation?: Activation, _hasPreluActivationWeights = false, _packed = false, _coordsLength = 3): string => { + if (!activation) { + return ''; + } + + return ''; + // let activationOpSnippet = ''; + // if (activation === 'linear') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.LINEAR); + // } else if (activation === 'relu') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.RELU, packed); + // } else if (activation === 'elu') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.ELU, packed); + // } else if (activation === 'relu6') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.RELU6, packed); + // } else if (activation === 'prelu') { + // activationOpSnippet = getBinaryOpString(BinaryOpType.PRELU, packed); + // } else if (activation === 'sigmoid') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.SIGMOID, packed); + // } else if (activation === 'leakyrelu') { + // activationOpSnippet = getUnaryOpString(UnaryOpType.LEAKYRELU, packed); + // } else { + // throw new Error(`Activation ${activation} has not been implemented for the WebGPU backend.`); + // } + // const elementSize = packed ? 4 : 1; + // const dataType = typeSnippet(elementSize); + // let activationFnSnippet = ''; + // if (hasPreluActivationWeights) { + // activationFnSnippet = ` + // fn activation(a : ${dataType}, coords : vec${coordsLength}) -> ${dataType} { + // let b = getPreluActivationWeightsByOutputCoords(coords); + // ${activationOpSnippet} + // }`; + // } else { + // activationFnSnippet = ` + // fn activation(a : ${dataType}, coords : vec${coordsLength}) -> ${dataType} { + // ${activationOpSnippet} + // }`; + // } + // return activationFnSnippet; + }; + +export const biasActivationSnippet = (hasBias: boolean, activation?: Activation): string => ` + ${hasBias ? 'value = value + getBiasByOutputCoords(coords);' : ''} + ${activation ? 
'value = activation(value, coords);' : ''} + `; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts new file mode 100644 index 0000000000000..4737f57980f57 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -0,0 +1,254 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +// sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts +// +// modified to fit the needs of the project + +import {env} from 'onnxruntime-common'; + +import {TensorView} from '../../../tensor'; +import {ShapeUtil} from '../../../util'; +import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; +import {ConvAttributes} from '../conv'; + +import {Activation, activationFnSnippet, biasActivationSnippet, typeSnippet} from './activation_util'; +import {utilFunctions} from './conv_util'; +import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packed_webgpu'; + +const conv2dCommonSnippet = + (isChannelsLast: boolean, fitAOuter: boolean, fitBOuter: boolean, fitInner: boolean, addBias = false, + activation?: Activation, hasPreluActivationWeights = false, innerElementSizeX = 4, innerElementSizeW = 4, + innerElementSize = 4): string => { + const getXSnippet = (innerElementSize: number) => { + switch (innerElementSize) { + case 1: + return 'resData = x[xIndex];'; + case 3: + return 'resData = vec3(x[xIndex], x[xIndex + 1], x[xIndex + 2]);'; + case 4: + return 'resData = x[xIndex / 4];'; + default: + throw new Error(`innerElementSize ${innerElementSize} is not supported.`); + } + }; + const getWSnippet = (innerElementSize: number) => { + switch (innerElementSize) { + case 1: + return 'return w[row * wShape[3] + colIn];'; + case 4: + return 'return w[row * wShape[3] / 4 + colIn];'; + default: + throw new Error(`innerElementSize ${innerElementSize} is not supported.`); + } + }; + const coordASnippet = isChannelsLast ? ` + let coord = vec4(batch, xRow, xCol, xCh); + ` : + ` + let coord = vec4(batch, xCh, xRow, xCol); + `; + + const coordResSnippet = isChannelsLast ? ` + let coords = vec4( + batch, + row / outWidth, + row % outWidth, + col); + ` : + ` + let coords = vec4( + batch, + row, + col / outWidth, + col % outWidth); + `; + + const xHight = isChannelsLast ? 'xShape[1]' : 'xShape[2]'; + const xWidth = isChannelsLast ? 'xShape[2]' : 'xShape[3]'; + const row = isChannelsLast ? 'row' : 'col'; + const col = isChannelsLast ? 'col' : 'row'; + const readXSnippet = ` + let inChannels = wShape[2]; + let outWidth = ${isChannelsLast ? 
'outShape[2]' : 'outShape[3]'}; + let outRow = ${row} / outWidth; + let outCol = ${row} % outWidth; + + let WRow = ${col} / (filterDims[1] * inChannels); + let WCol = ${col} / inChannels % filterDims[1]; + let xRow = outRow * stride[0] + dilation[0] * WRow - pad[0]; + let xCol = outCol * stride[1] + dilation[1] * WCol - pad[1]; + let xCh = ${col} % inChannels; + var resData = ${typeSnippet(innerElementSizeX)}(0.0); + // The bounds checking is always needed since we use it to pad zero for + // the 'same' padding type. + if (xRow >= 0 && xRow < ${xHight} && xCol >= 0 && xCol < ${xWidth}) { + ${coordASnippet} + let xIndex = getIndexFromCoords4D(coord, xShape); + ${getXSnippet(innerElementSizeX)} + } + return resData;`; + + const sampleX = isChannelsLast ? (fitAOuter && fitInner ? ` + let col = colIn * ${innerElementSizeX}; + ${readXSnippet}` : + ` + let col = colIn * ${innerElementSizeX}; + if (row < dimAOuter && col < dimInner) { + ${readXSnippet} + } + return ${typeSnippet(innerElementSizeX)}(0.0);`) : + (fitInner && fitBOuter ? ` + let col = colIn * ${innerElementSizeX}; + ${readXSnippet}` : + ` + let col = colIn * ${innerElementSizeX}; + if (row < dimInner && col < dimBOuter) { + ${readXSnippet} + } + return ${typeSnippet(innerElementSizeX)}(0.0);`); + + const sampleW = `${getWSnippet(innerElementSizeW)}`; + + const resType = typeSnippet(innerElementSize); + const aType = isChannelsLast ? typeSnippet(innerElementSizeX) : typeSnippet(innerElementSizeW); + const bType = isChannelsLast ? typeSnippet(innerElementSizeW) : typeSnippet(innerElementSizeX); + const userCode = ` + ${activationFnSnippet(activation, hasPreluActivationWeights, innerElementSize === 4, 4)} + fn mm_readA(batch: i32, row : i32, colIn : i32) -> ${aType} { + ${isChannelsLast ? sampleX : sampleW} + } + + fn mm_readB(batch: i32, row : i32, colIn : i32) -> ${bType} { + ${isChannelsLast ? sampleW : sampleX} + } + + fn mm_write(batch: i32, row : i32, colIn : i32, valueIn : ${resType}) { + let col = colIn * ${innerElementSize}; + if (row < dimAOuter && col < dimBOuter) + { + var value = valueIn; + let outWidth = ${isChannelsLast ? 'outShape[2]' : 'outShape[3]'}; + ${coordResSnippet} + ${biasActivationSnippet(addBias, activation)} + setOutputAtCoords(coords[0], coords[1], coords[2], coords[3], value); + } + }`; + return userCode; + }; + +export const createConv2DMatMulProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes, + outputShape: readonly number[], dimAOuter: number, dimBOuter: number, dimInner: number, hasBias: boolean, + sequentialAccessByThreads: boolean): ProgramInfo => { + const isChannelsLast = attributes.format === 'NHWC'; + const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; + const batchSize = outputShape[0]; + const outWidth = isChannelsLast ? outputShape[2] : outputShape[3]; + const outHeight = isChannelsLast ? outputShape[1] : outputShape[2]; + const outChannels = isChannelsLast ? outputShape[3] : outputShape[1]; + const isVec4 = (((inChannels % 4 === 0 || inChannels % 3 === 0) && isChannelsLast) || + (outWidth % 4 === 0 && !isChannelsLast)) && + outChannels % 4 === 0; + + // TODO: fine tune size + const dispatchX = isChannelsLast ? outChannels : outWidth * outHeight; + const dispatchY = isChannelsLast ? outWidth * outHeight : outChannels; + const workGroupSize: [number, number, number] = + isVec4 ? [8, 8, 1] : [dispatchX <= 4 ? 4 : 16, dispatchX > 4 && dispatchY <= 4 ? 4 : 16, 1]; + const elementsPerThread = + isVec4 ? 
[4, 4, 1] : [dispatchX <= 4 ? 1 : 2, dispatchX > 4 && dispatchY <= 4 ? 1 : 2, 1];
+      const dispatch = [
+        Math.ceil(dispatchX / workGroupSize[0] / elementsPerThread[0]),
+        Math.ceil(dispatchY / workGroupSize[1] / elementsPerThread[1]),
+        Math.ceil(batchSize / workGroupSize[2] / elementsPerThread[2])
+      ];
+
+      if (env.debug) {
+        // eslint-disable-next-line no-console
+        console.log(`dispatch = ${dispatch}`);
+      }
+
+      const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
+
+      const tileAOuter = workGroupSize[1] * elementsPerThread[1];
+      const tileBOuter = workGroupSize[0] * elementsPerThread[0];
+      const tileInner = Math.max(workGroupSize[0] * innerElementSize, workGroupSize[1]);
+
+      const fitAOuter = dimAOuter % tileAOuter === 0;
+      const fitBOuter = dimBOuter % tileBOuter === 0;
+      const fitInner = dimInner % tileInner === 0;
+
+      const elementsSize = isVec4 ? [innerElementSize, 4, 4] : [1, 1, 1];
+
+      const declareInputs = [
+        `@group(0) @binding(0) var<storage, read> x: array<${
+            isVec4 && innerElementSize === 4 ? 'vec4<f32>' : 'f32'}>;`,
+        `@group(0) @binding(1) var<storage, read> w: array<${isVec4 ? 'vec4<f32>' : 'f32'}>;`
+      ];
+      let declareFunctions = `
+      fn setOutputAtIndex(flatIndex : i32, value : ${isVec4 ? 'vec4<f32>' : 'f32'}) {
+        result[flatIndex] = ${isVec4 ? 'vec4<f32>' : 'f32'}(value);
+      }
+      fn setOutputAtCoords(d0 : i32, d1 : i32, d2 : i32, d3 : i32, value : ${isVec4 ? 'vec4<f32>' : 'f32'}) {
+        let flatIndex = getOutputIndexFromCoords(vec4<i32>(d0, d1, d2, d3));
+        setOutputAtIndex(flatIndex ${isVec4 ? '/ 4' : ''}, value);
+      }`;
+      if (hasBias) {
+        declareInputs.push(`@group(0) @binding(2) var<storage, read> bias: array<${isVec4 ? 'vec4<f32>' : 'f32'}>;`);
+        declareFunctions += `
+        fn getBiasByOutputCoords(coords : vec4<i32>) -> ${isVec4 ? 'vec4<f32>' : 'f32'} {
+          return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
+        }`;
+      }
+
+      return {
+        ...metadata,
+        outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
+        dispatchGroup: () => ({x: dispatch[0], y: dispatch[1], z: dispatch[2]}),
+        shaderSource: `
+        ${utilFunctions}
+        //struct Uniforms { xShape : vec4<i32>, wShape : vec4<i32>, outShape : vec4<i32>,
+        //  outShapeStrides: vec3<i32>, filterDims : vec2<i32>, pad : vec2<i32>, stride : vec2<i32>,
+        //  dilation : vec2<i32>, dimAOuter : i32, dimBOuter : i32, dimInner : i32 };
+        ${declareInputs.join('')}
+        @group(0) @binding(${declareInputs.length}) var<storage, read_write> result: array<${
+            isVec4 ? 'vec4<f32>' : 'f32'}>;
+        //@group(0) @binding(${declareInputs.length + 1}) var<uniform> uniforms: Uniforms;
+
+        const xShape : vec4<i32> = vec4<i32>(${inputs[0].dims.join(',')});
+        const wShape : vec4<i32> = vec4<i32>(${inputs[1].dims.join(',')});
+        const outShape : vec4<i32> = vec4<i32>(${outputShape.join(',')});
+        const outShapeStrides : vec3<i32> = vec3<i32>(${ShapeUtil.computeStrides(outputShape).slice(0, 3).join(',')});
+        const filterDims : vec2<i32> = vec2<i32>(${attributes.kernelShape[0]}, ${attributes.kernelShape[1]});
+        const pad : vec2<i32> = vec2<i32>(${attributes.pads[0]}, ${attributes.pads[1]});
+        const stride : vec2<i32> = vec2<i32>(${attributes.strides[0]}, ${attributes.strides[1]});
+        const dilation : vec2<i32> = vec2<i32>(${attributes.dilations[0]}, ${attributes.dilations[1]});
+        const dimAOuter : i32 = ${dimAOuter};
+        const dimBOuter : i32 = ${dimBOuter};
+        const dimInner : i32 = ${dimInner};
+        ${declareFunctions}
+        ${
+            conv2dCommonSnippet(
+                isChannelsLast, fitAOuter, fitBOuter, fitInner, hasBias, undefined, false, elementsSize[0],
+                elementsSize[1], elementsSize[2])}
+        ${
+            isVec4 ?
makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, !isChannelsLast, tileInner) :
+            makeMatMulPackedSource(
+                elementsPerThread, workGroupSize, !isChannelsLast, tileInner, false, undefined,
+                sequentialAccessByThreads)}`
+      };
+    };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts
new file mode 100644
index 0000000000000..dd79f88ee6880
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts
@@ -0,0 +1,157 @@
+/**
+ * @license
+ * Copyright 2020 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+// sampled from [@tensorflow/tfjs] tfjs-core/src/ops/conv_util.ts
+//
+// modified to fit the needs of the project
+
+export const utilFunctions = `
+fn getIndexFromCoords4D(coords : vec4<i32>, shape : vec4<i32>) -> i32 {
+  return dot(coords, vec4<i32>(
+      shape.y * shape.z * shape.w, shape.z * shape.w, shape.w, 1));
+}
+fn getOutputIndexFromCoords(coords : vec4<i32>) -> i32 {
+  return dot(coords, vec4<i32>(
+    outShapeStrides.x, outShapeStrides.y, outShapeStrides.z, 1));
+}
+`;
+// type PadType = 'SAME'|'VALID'|'NUMBER'|'EXPLICIT';

+// export interface PadInfo {
+//   top: number;
+//   left: number;
+//   right: number;
+//   bottom: number;
+//   type: PadType;
+// }

+// /**
+//  * Information about the forward pass of a convolution/pooling operation.
+//  * It includes input and output shape, strides, filter size and padding
+//  * information.
+//  */
+// export interface Conv2DInfo {
+//   batchSize: number;
+//   inHeight: number;
+//   inWidth: number;
+//   inChannels: number;
+//   outHeight: number;
+//   outWidth: number;
+//   outChannels: number;
+//   isChannelsFirst: boolean;
+//   strideHeight: number;
+//   strideWidth: number;
+//   dilationHeight: number;
+//   dilationWidth: number;
+//   filterHeight: number;
+//   filterWidth: number;
+//   effectiveFilterHeight: number;
+//   effectiveFilterWidth: number;
+//   padInfo: PadInfo;
+//   inShape: [number, number, number, number];
+//   outShape: [number, number, number, number];
+//   filterShape: [number, number, number, number];
+// }

+// const parseTupleParam = (param: number|number[]): [number, number, number] => {
+//   if (typeof param === 'number') {
+//     return [param, param, param];
+//   }
+//   if (param.length === 2) {
+//     return [param[0], param[1], 1];
+//   }
+//   return param as [number, number, number];
+// };

+// /* See https://www.tensorflow.org/api_docs/python/tf/nn/atrous_conv2d
+//  * Atrous convolution is equivalent to standard convolution with upsampled
+//  * filters with effective_filter_height =
+//  * filter_height + (filter_height - 1) * (dilation - 1)
+//  * and effective_filter_width =
+//  * filter_width + (filter_width - 1) * (dilation - 1),
+//  * produced by inserting dilation - 1 zeros along consecutive elements across
+//  * the filters' spatial dimensions.
+// * When there is a dilation, this converts a filter dimension to the +// * effective filter dimension, so it can be used in a standard convolution. +// */ +// const getEffectiveFilterSize = (filterSize: number, dilation: number): number => { +// if (dilation <= 1) { +// return filterSize; +// } + +// return filterSize + (filterSize - 1) * (dilation - 1); +// }; + + +// /** +// * Computes the information for a forward pass of a convolution/pooling +// * operation. +// */ +// export const computeConv2DInfo = +// (inShape: [number, number, number, number], filterShape: [number, number, number, number], +// strides: number|[number, number], dilations: number|[number, number], +// pad: 'SAME_UPPER'|'SAME_LOWER'|'VALID'|number|[number, number, number, number], +// roundingMode: 'floor'|'round'|'ceil', depthwise: boolean, isChannelsFirst: boolean): Conv2DInfo => { +// let [batchSize, inHeight, inWidth, inChannels] = [-1, -1, -1, -1]; +// if (isChannelsFirst) { +// [batchSize, inChannels, inHeight, inWidth] = inShape; +// } else { +// [batchSize, inHeight, inWidth, inChannels] = inShape; +// } + +// const [filterHeight, filterWidth, , filterChannels] = filterShape; +// const [strideHeight, strideWidth] = parseTupleParam(strides); +// const [dilationHeight, dilationWidth] = parseTupleParam(dilations); + +// const effectiveFilterHeight = getEffectiveFilterSize(filterHeight, dilationHeight); +// const effectiveFilterWidth = getEffectiveFilterSize(filterWidth, dilationWidth); +// const {padInfo, outHeight, outWidth} = getPadAndOutInfo( +// pad, inHeight, inWidth, strideHeight, strideWidth, effectiveFilterHeight, effectiveFilterWidth, +// roundingMode, dataFormat); + +// const outChannels = depthwise ? filterChannels * inChannels : filterChannels; + +// let outShape: [number, number, number, number]; +// if (dataFormat === 'channelsFirst') { +// outShape = [batchSize, outChannels, outHeight, outWidth]; +// } else if (dataFormat === 'channelsLast') { +// outShape = [batchSize, outHeight, outWidth, outChannels]; +// } + +// return { +// batchSize, +// dataFormat, +// inHeight, +// inWidth, +// inChannels, +// outHeight, +// outWidth, +// outChannels, +// padInfo, +// strideHeight, +// strideWidth, +// filterHeight, +// filterWidth, +// effectiveFilterHeight, +// effectiveFilterWidth, +// dilationHeight, +// dilationWidth, +// inShape, +// outShape, +// filterShape +// }; +// } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts new file mode 100644 index 0000000000000..d30821e508083 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -0,0 +1,327 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * =============================================================================
+ */
+
+// sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/matmul_packed_webgpu.ts
+//
+// modified to fit the needs of the project
+
+const writeDataToSubAVec4Snippet = (transpose: boolean) => {
+  if (transpose) {
+    return `
+        mm_Asub[inputRow][inputCol] = mm_readA(batch,
+          kStart + inputRow,
+          globalRowStart / innerElementSize + inputCol);
+        `;
+
+  } else {
+    return `
+        mm_Asub[inputRow][inputCol] = mm_readA(batch,
+          globalRow + innerRow,
+          kStart / innerElementSize + inputCol);
+        `;
+  }
+};
+
+const calculateResultSnippet = (transposeA: boolean, innerElementSize: number) => {
+  if (transposeA) {
+    return `
+      let ACached0 = mm_Asub[k * innerElementSize][localRow];
+      let ACached1 = mm_Asub[k * innerElementSize + 1][localRow];
+      let ACached2 = mm_Asub[k * innerElementSize + 2][localRow];
+      ${innerElementSize === 3 ? '' : 'let ACached3 = mm_Asub[k * innerElementSize + 3][localRow];'}
+      for (var i = 0; i < rowPerThread; i = i + 1) {
+        acc[i] = BCached0 * ACached0[i] + acc[i];
+        acc[i] = BCached1 * ACached1[i] + acc[i];
+        acc[i] = BCached2 * ACached2[i] + acc[i];
+        ${innerElementSize === 3 ? '' : 'acc[i] = BCached3 * ACached3[i] + acc[i];'}
+      }`;
+  } else {
+    return `
+      for (var i = 0; i < rowPerThread; i = i + 1) {
+        let ACached = mm_Asub[tileRow + i][k];
+        acc[i] = BCached0 * ACached.x + acc[i];
+        acc[i] = BCached1 * ACached.y + acc[i];
+        acc[i] = BCached2 * ACached.z + acc[i];
+        ${innerElementSize === 3 ? '' : 'acc[i] = BCached3 * ACached.w + acc[i];'}
+      }`;
+  }
+};
+
+export const makeMatMulPackedVec4Source =
+    (workPerThread: number[], workgroupSize: [number, number, number], transposeA = false, tileInner = 32,
+     splitK = false, splitedDimInner = 32, isVectorA = false): string => {
+      const tileAOuter = workgroupSize[1] * workPerThread[1];
+      const tileBOuter = workgroupSize[0] * workPerThread[0];
+      const tileAWidth = transposeA ? tileAOuter : tileInner;
+      const tileAHight = transposeA ? tileInner : tileAOuter;
+      const innerElementSize = tileAWidth / workgroupSize[0];
+      const rowPerThreadB = tileInner / workgroupSize[1];
+
+      if (!(((transposeA && innerElementSize === 4 && workPerThread[1] === 4) ||
+             (!transposeA && (innerElementSize === 3 || innerElementSize === 4))) &&
+            tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0 && workPerThread[0] === 4)) {
+        throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${
+            innerElementSize} and workPerThread[1] ${workPerThread[1]} must be 4.
+            Otherwise, innerElementSize ${innerElementSize} must be 3 or 4.
+        tileAWidth ${tileAWidth} must be divisible by workgroupSize[0] ${workgroupSize[0]}. tileInner ${
+            tileInner} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${
+            workPerThread[0]} must be 4.`);
+      }
+      return `
+var<workgroup> mm_Asub: array<array<vec4<f32>, ${tileAWidth / innerElementSize}>, ${tileAHight}>;
+var<workgroup> mm_Bsub: array<array<vec4<f32>, ${tileBOuter / workPerThread[0]}>, ${tileInner}>;
+
+const rowPerThread = ${workPerThread[1]};
+const colPerThread = ${workPerThread[0]};
+const innerElementSize = ${innerElementSize};
+const tileInner = ${tileInner};
+
+@compute @workgroup_size(${workgroupSize[0]}, ${workgroupSize[1]}, ${workgroupSize[2]})
+fn main(@builtin(local_invocation_id) localId : vec3<u32>,
+        @builtin(global_invocation_id) globalId : vec3<u32>,
+        @builtin(workgroup_id) workgroupId : vec3<u32>) {
+  let localRow = i32(localId.y);
+  let tileRow = ${isVectorA ?
'0' : 'localRow * rowPerThread'};
+  let tileCol = i32(localId.x);
+
+  let globalRow = ${isVectorA ? '0' : 'i32(globalId.y) * rowPerThread'};
+  let globalCol = i32(globalId.x);
+  let batch = ${splitK ? '0' : 'i32(globalId.z)'};
+  let globalRowStart = i32(workgroupId.y) * ${tileAOuter};
+
+  let numTiles = ${splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(dimInner - 1) / tileInner + 1'};
+  var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'};
+
+  var acc: array<vec4<f32>, rowPerThread>;
+
+  // Loop over shared dimension.
+  let tileRowB = localRow * ${rowPerThreadB};
+  for (var t = 0; t < numTiles; t = t + 1) {
+      // Load one tile of A into local memory.
+      for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+          let inputRow = tileRow + innerRow;
+          let inputCol = tileCol;
+          ${writeDataToSubAVec4Snippet(transposeA)}
+      }
+
+      // Load one tile of B into local memory.
+      for (var innerRow = 0; innerRow < ${rowPerThreadB}; innerRow = innerRow + 1) {
+          let inputRow = tileRowB + innerRow;
+          let inputCol = tileCol;
+          mm_Bsub[inputRow][inputCol] = mm_readB(batch, kStart + inputRow, globalCol);
+      }
+      kStart = kStart + tileInner;
+      workgroupBarrier();
+
+      // Compute acc values for a single thread.
+      for (var k = 0; k < tileInner / innerElementSize; k = k + 1) {
+          let BCached0 = mm_Bsub[k * innerElementSize][tileCol];
+          let BCached1 = mm_Bsub[k * innerElementSize + 1][tileCol];
+          let BCached2 = mm_Bsub[k * innerElementSize + 2][tileCol];
+          ${innerElementSize === 3 ? '' : 'let BCached3 = mm_Bsub[k * innerElementSize + 3][tileCol];'}
+
+          ${calculateResultSnippet(transposeA, innerElementSize)}
+      }
+
+      workgroupBarrier();
+  }
+
+  for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+      mm_write(batch, globalRow + innerRow, globalCol, acc[innerRow]);
+  }
+}`;
+    };
+
+const writeDataToSubASnippet = (transpose: boolean) => {
+  if (transpose) {
+    return `
+        mm_Asub[inputRow][inputCol] = mm_readA(batch,
+          kStart + inputRow,
+          globalRowStart + inputCol);
+        `;
+
+  } else {
+    return `
+        mm_Asub[inputRow][inputCol] = mm_readA(batch,
+          globalRowStart + inputRow,
+          kStart + inputCol);
+        `;
+  }
+};
+
+const readDataFromSubASnippet = (transposeA: boolean) =>
+    transposeA ? 'let ACached = mm_Asub[k][tileRow + innerRow];' : 'let ACached = mm_Asub[tileRow + innerRow][k];';
+
+// sequentialAccessByThreads means sequential data in memory is accessed by
+// threads, instead of a single thread (default behavior).
+export const makeMatMulPackedSource =
+    (workPerThread: number[], workgroupSize: [number, number, number], transposeA = false, tileInner = 32,
+     splitK = false, splitedDimInner = 32, sequentialAccessByThreads = false): string => {
+      const tileAOuter = workPerThread[1] * workgroupSize[1];
+      const tileBOuter = workPerThread[0] * workgroupSize[0];
+      const tileAWidth = transposeA ? tileAOuter : tileInner;
+      const tileAHight = transposeA ?
tileInner : tileAOuter;
+
+      if (!(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 &&
+            tileInner % workgroupSize[1] === 0)) {
+        throw new Error(`tileAHight ${tileAHight} must be divisible by workgroupSize[1] ${
+            workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0] ${
+            workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1] ${workgroupSize[1]}`);
+      }
+      const rowPerThreadA = tileAHight / workgroupSize[1];
+      const colPerThreadA = tileAWidth / workgroupSize[0];
+      const rowPerThreadB = tileInner / workgroupSize[1];
+      const matmulSnippet = sequentialAccessByThreads ?
+          `
+      let localRow = i32(localId.y);
+      let localCol = i32(localId.x);
+      let globalRowStart = i32(workgroupId.y) * ${tileAOuter};
+      let globalColStart = i32(workgroupId.x) * ${tileBOuter};
+
+      // Loop over shared dimension.
+      for (var t = 0; t < numTiles; t = t + 1) {
+        // Load one tile of A into local memory.
+        for (var inputRow = localRow; inputRow < ${tileAHight}; inputRow = inputRow + ${workgroupSize[1]}) {
+          for (var inputCol = localCol; inputCol < ${tileAWidth}; inputCol = inputCol + ${workgroupSize[0]}) {
+            ${writeDataToSubASnippet(transposeA)}
+          }
+        }
+        // Load one tile of B into local memory.
+        for (var inputRow = localRow; inputRow < ${tileInner}; inputRow = inputRow + ${workgroupSize[1]}) {
+          for (var inputCol = localCol; inputCol < ${tileBOuter}; inputCol = inputCol + ${workgroupSize[0]}) {
+            mm_Bsub[inputRow][inputCol] = mm_readB(batch,
+              kStart + inputRow,
+              globalColStart + inputCol);
+          }
+        }
+        kStart = kStart + tileInner;
+        workgroupBarrier();
+
+        // Compute acc values for a single thread.
+        var BCached : array<f32, colPerThread>;
+        for (var k = 0; k < tileInner; k = k + 1) {
+          for (var inner = 0; inner < colPerThread; inner = inner + 1) {
+            BCached[inner] = mm_Bsub[k][localCol + inner * ${workgroupSize[0]}];
+          }
+          for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+            let ACached = ${
+                transposeA ? `mm_Asub[k][localRow + innerRow * ${workgroupSize[1]}];` :
+                             `mm_Asub[localRow + innerRow * ${workgroupSize[1]}][k];`}
+            for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {
+              acc[innerRow][innerCol] = acc[innerRow][innerCol] +
+                  ACached * BCached[innerCol];
+            }
+          }
+        }
+        workgroupBarrier();
+      }
+      for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+        let gRow = globalRowStart + localRow + innerRow * ${workgroupSize[1]};
+        for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {
+          let gCol = globalColStart + localCol + innerCol * ${workgroupSize[0]};
+          mm_write(batch, gRow, gCol, acc[innerRow][innerCol]);
+        }
+      }
+      ` :
+          `
+let tileRow = i32(localId.y) * rowPerThread;
+let tileCol = i32(localId.x) * colPerThread;
+
+let globalRow = i32(globalId.y) * rowPerThread;
+let globalCol = i32(globalId.x) * colPerThread;
+let globalRowStart = i32(workgroupId.y) * ${tileAOuter};
+
+let tileRowA = i32(localId.y) * ${rowPerThreadA};
+let tileColA = i32(localId.x) * ${colPerThreadA};
+let tileRowB = i32(localId.y) * ${rowPerThreadB};
+// Loop over shared dimension.
+for (var t = 0; t < numTiles; t = t + 1) {
+  // Load one tile of A into local memory.
+  for (var innerRow = 0; innerRow < ${rowPerThreadA}; innerRow = innerRow + 1) {
+    for (var innerCol = 0; innerCol < ${colPerThreadA}; innerCol = innerCol + 1) {
+      let inputRow = tileRowA + innerRow;
+      let inputCol = tileColA + innerCol;
+      ${writeDataToSubASnippet(transposeA)}
+    }
+  }
+
+  // Load one tile of B into local memory.
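+  // (In this default path each thread fills its own contiguous colPerThread-wide strip of the
+  // tile, so consecutive threads access memory colPerThread elements apart; the
+  // sequentialAccessByThreads variant above interleaves consecutive threads over consecutive
+  // elements instead, which is expected to coalesce better on some GPUs.)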
+  for (var innerRow = 0; innerRow < ${rowPerThreadB}; innerRow = innerRow + 1) {
+    for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {
+      let inputRow = tileRowB + innerRow;
+      let inputCol = tileCol + innerCol;
+      mm_Bsub[inputRow][inputCol] = mm_readB(batch,
+        kStart + inputRow,
+        globalCol + innerCol);
+    }
+  }
+  kStart = kStart + tileInner;
+  workgroupBarrier();
+
+  // Compute acc values for a single thread.
+  var BCached : array<f32, colPerThread>;
+  for (var k = 0; k < tileInner; k = k + 1) {
+    for (var inner = 0; inner < colPerThread; inner = inner + 1) {
+      BCached[inner] = mm_Bsub[k][tileCol + inner];
+    }
+
+    for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+      ${readDataFromSubASnippet(transposeA)}
+      for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {
+        acc[innerRow][innerCol] = acc[innerRow][innerCol] + ACached * BCached[innerCol];
+      }
+    }
+  }
+
+  workgroupBarrier();
+}
+
+for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {
+  for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {
+    mm_write(batch, globalRow + innerRow, globalCol + innerCol,
+        acc[innerRow][innerCol]);
+  }
+}
+`;
+
+      return `
+  var<workgroup> mm_Asub : array<array<f32, ${tileAWidth}>, ${tileAHight}>;
+  var<workgroup> mm_Bsub : array<array<f32, ${tileBOuter}>, ${tileInner}>;
+  const rowPerThread = ${workPerThread[1]};
+  const colPerThread = ${workPerThread[0]};
+  const tileInner = ${tileInner};
+
+@compute @workgroup_size(${workgroupSize[0]}, ${workgroupSize[1]}, ${workgroupSize[2]})
+fn main(@builtin(local_invocation_id) localId : vec3<u32>,
+        @builtin(global_invocation_id) globalId : vec3<u32>,
+        @builtin(workgroup_id) workgroupId : vec3<u32>) {
+  let batch = ${splitK ? '0' : 'i32(globalId.z)'};
+  let numTiles = ${splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(dimInner - 1) / tileInner + 1'};
+  var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'};
+
+  var acc : array<array<f32, colPerThread>, rowPerThread>;
+
+  // Without this initialization strange values show up in acc.
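+  // (WGSL defines function-scope 'var' declarations to be zero-initialized, so this loop is
+  // presumably a workaround for non-conforming drivers; it is kept as a defensive measure.)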
+ for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) { + for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) { + acc[innerRow][innerCol] = 0.0; + } + } + ${matmulSnippet} + } +`; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 076562d9916ad..3aaa3db7e49af 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -8,8 +8,10 @@ import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-w import {ComputeContext} from '../types'; import {createGroupedConvProgramInfoLoader} from './conv-grouped'; +import {createConv2DMatMulProgramInfoLoader} from './conv2d-mm'; // import {createDotProductProgramInfoLoader} from './dot-product'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; +import {createTransposeProgramInfo, TransposeAttributes, transposeProgramMetadata} from './transpose'; // import {createIm2ColProgramInfoLoader} from './im2col'; // import {createMatmulProgramInfoLoader} from './matmul'; @@ -40,8 +42,12 @@ export interface ConvAttributes extends InternalActivationAttributes, AttributeW readonly kernelShape: readonly number[]; readonly pads: readonly number[]; readonly strides: readonly number[]; + readonly wIsConst: boolean; } +// for transposing weight tensor from [M, C/group, KH, KW] to [KH, KW, C/group, M] +const weightTransposeAttribute: TransposeAttributes = createAttributeWithCacheKey({perm: [2, 3, 1, 0]}); + const validateInputs = (inputs: readonly TensorView[], attributes: ConvAttributes): void => { // Refer to the below link for all input checks // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv @@ -127,21 +133,106 @@ export const parseConvAttributes = (attributes: Record): ConvAt const kernelShape = attributes.kernel_shape as [number, number]; const pads = attributes.pads as [number, number, number, number]; const strides = attributes.strides as [number, number]; + const wIsConst = (attributes.w_is_const as () => boolean)(); return createAttributeWithCacheKey( - {autoPad, format, dilations, group, kernelShape, pads, strides, ...activationAttributes}); + {autoPad, format, dilations, group, kernelShape, pads, strides, wIsConst, ...activationAttributes}); }; const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => { const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); - // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; - // if (adjustedAttributes.group > 1) { - context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); - // } else if (isPointwise) { - // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); - // } else { - // return conv2D(inferenceHandler, inputs, adjustedAttributes); - // } + + // check attributes + + const hasBias = context.inputs.length === 3; + // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; + + // const batchSize = context.inputs[0].dims[0]; + const inputHeight = context.inputs[0].dims[isChannelsLast ? 1 : 2]; + const inputWidth = context.inputs[0].dims[isChannelsLast ? 2 : 3]; + const inputChannels = context.inputs[0].dims[isChannelsLast ? 
3 : 1]; + const weightHeight = context.inputs[1].dims[2]; + const weightWidth = context.inputs[1].dims[3]; + + const outputShape = calculateOutputShape( + context.inputs[0].dims, context.inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, + isChannelsLast); + const outHeight = outputShape[isChannelsLast ? 1 : 2]; + const outWidth = outputShape[isChannelsLast ? 2 : 3]; + const outChannels = outputShape[isChannelsLast ? 3 : 1]; + + const sameSize = + isChannelsLast && weightHeight === inputHeight && weightWidth === inputWidth && attributes.autoPad === 'VALID'; + if (sameSize || + (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && + attributes.strides[0] === 1 && attributes.strides[1] === 1 && + (attributes.autoPad === 'SAME_UPPER' || attributes.autoPad === 'SAME_LOWER' || + attributes.autoPad === 'VALID'))) { + // return conv2dByMatMul({x, filter, convInfo, backend, bias, activation, preluActivationWeights, leakyreluAlpha}); + // eslint-disable-next-line no-console + console.log('[_CONV_]conv2dByMatMul'); + context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + return 0; + } + + if (!isChannelsLast || attributes.group !== 1) { + context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + return 0; + } + + // const thresholdToIncreaseWorkgroups = 8; + // const workgroupsBy32x32 = batchSize * Math.ceil((outHeight * outWidth) / 32) * Math.ceil(outChannels / 32); + // if (workgroupsBy32x32 <= thresholdToIncreaseWorkgroups) { + // // return conv2dWithIm2Col({x, filter, convInfo, backend, bias, preluActivationWeights, leakyreluAlpha, + // // activation}); + // // eslint-disable-next-line no-console + // console.log('[_CONV_]conv2dWithIm2Col'); + // context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + // return 0; + // } + + const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels; + const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; + const dimInner = weightHeight * weightWidth * inputChannels; + + const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; + // const inputs = [context.inputs[0], context.inputs[1]]; + // if (hasBias) { + // if (!isChannelsLast && context.inputs[2].dims.length === 1) { + // inputs.push(context.inputs[2].reshape([context.inputs[2].dims[0], 1, 1])); + // } else { + // inputs.push(context.inputs[2]); + // } + // } + // eslint-disable-next-line no-console + // console.log('[_CONV_]Conv2DMMProgram'); + + // STEP.1: transpose weight + const transposedWeight = (context.customData.wT as TensorView | undefined) ?? + context.compute( + { + ...transposeProgramMetadata, + cacheHint: weightTransposeAttribute.cacheKey, + get: () => createTransposeProgramInfo(context.inputs[1], weightTransposeAttribute.perm) + }, + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0];
+  if (attributes.wIsConst && !context.customData.wT) {
+    context.customData.wT = transposedWeight;
+  }
+
+  const inputs = [context.inputs[0], transposedWeight];
+  if (hasBias) {
+    if (!isChannelsLast && context.inputs[2].dims.length === 1) {
+      inputs.push(context.inputs[2].reshape([context.inputs[2].dims[0], 1, 1]));
+    } else {
+      inputs.push(context.inputs[2]);
+    }
+  }
+  context.compute(
+      createConv2DMatMulProgramInfoLoader(
+          inputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias,
+          sequentialAccessByThreads),
+      {inputs});
   return 0;
 };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts
new file mode 100644
index 0000000000000..0abece9559630
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {TensorView} from '../../tensor';
+import {GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types';
+
+import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu';
+import {ConvAttributes} from './conv';
+
+
+const createConv2DMatMulProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({
+  name: 'Conv2DMatMul',
+  inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+                        [GpuDataType.default, GpuDataType.default],
+  cacheHint
+});
+
+export const createConv2DMatMulProgramInfoLoader =
+    (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], dimAOuter: number,
+     dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean): ProgramInfoLoader => {
+      const metadata = createConv2DMatMulProgramMetadata(hasBias, attributes.cacheKey);
+      return {
+        ...metadata,
+        get: () => createConv2DMatMulProgramInfo(
+            inputs, metadata, attributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias,
+            sequentialAccessByThreads)
+      };
+    };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
index e2c0d89fde1a8..d4ab30ab00ccb 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
@@ -29,16 +29,19 @@ const validateInputs = (inputs: readonly TensorView[]): void => {
 
 const getAdjustedPoolAttributesAndOutputShape = <AttributeType extends AveragePoolAttributes|MaxPoolAttributes>(
     inputs: readonly TensorView[], attributes: AttributeType, isGlobalOperator: boolean): [AttributeType, number[]] => {
-  const inputShape = inputs[0].dims.slice();
+  const isChannelsLast = attributes.format === 'NHWC';
+  const inputShapeAsChannelFirst = isChannelsLast ?
+      [inputs[0].dims[0], inputs[0].dims[3], inputs[0].dims[1], inputs[0].dims[2]] :
+      inputs[0].dims.slice();
   const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations');
   const kernelShape = attributes.kernelShape.slice();
   const strides = attributes.strides.slice();
   const dilations: number[] = hasDilations ?
(attributes as MaxPoolAttributes).dilations.slice() : []; const pads = attributes.pads.slice(); - PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); + PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShapeAsChannelFirst, kernelShape, strides, dilations, pads); - const outputShape = PoolConvUtil.computePoolOutputShape( - isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); + const outputShapeAsChannelFirst = PoolConvUtil.computePoolOutputShape( + isGlobalOperator, inputShapeAsChannelFirst, strides, dilations, kernelShape, pads, attributes.autoPad); const newAttributes = Object.assign({}, attributes); if (hasDilations) { @@ -46,12 +49,21 @@ const getAdjustedPoolAttributesAndOutputShape = ( inputDims: readonly number[], outputShape: readonly number[], attributes: AttributeType, op1: string, op2: string, dataType: string, start: string): string => { + const isChannelsLast = attributes.format === 'NHWC'; const rank = inputDims.length; const outputSize = ShapeUtil.size(outputShape); const outputIndicesHelper = createIndicesHelper('output', outputShape); @@ -62,15 +74,15 @@ const generatePoolingCode = = ${dimW}) { + xIndices[${dimIdxW}] = indices[${dimIdxW}] * ${sw} - ${pwStart} + i; + if (xIndices[${dimIdxW}] < 0 || xIndices[${dimIdxW}] >= ${inputDims[dimIdxW]}) { pad++; continue; } @@ -80,7 +92,7 @@ const generatePoolingCode = = ${dimH}) { + xIndices[${dimIdxH}] = indices[${dimIdxH}] * ${sh} - ${phStart} + j; + if (xIndices[${dimIdxH}] < 0 || xIndices[${dimIdxH}] >= ${dimH}) { pad+= ${kw}; continue; } @@ -104,7 +117,7 @@ const generatePoolingCode = 2 is not supported for NHWC format.'); + } const kernelSize = ShapeUtil.size(attributes.kernelShape); const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); const stridesRank = kernelStrides.length; @@ -223,7 +239,11 @@ const generatePoolingCode = ): PoolCommonAttributes => ({ + format: attributes.format as FormatAttributes['format'], autoPad: ['NOTSET', 'VALID', 'SAME_UPPER', 'SAME_LOWER'][attributes.auto_pad as number], ceilMode: attributes.ceil_mode as number, kernelShape: attributes.kernel_shape as [number, number], @@ -298,11 +319,15 @@ const globalPoolAttributes = { cacheKey: '' }; -export const globalAveragePool = (context: ComputeContext): number => { +export const parseGlobalAveragePoolAttributes = (attributes: Record): AveragePoolAttributes => { + const format = attributes.format as FormatAttributes['format']; + return {format, ...globalPoolAttributes, cacheKey: format}; +}; + +export const globalAveragePool = (context: ComputeContext, attributes: AveragePoolAttributes): number => { validateInputs(context.inputs); - const metadata = {name: 'GlobalAveragePool', inputTypes: [GpuDataType.default]}; - context.compute( - {...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, true, globalPoolAttributes)}); + const metadata = {name: 'GlobalAveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; + context.compute({...metadata, get: () => createAveragePoolProgramInfo(context.inputs, metadata, true, attributes)}); return 0; }; @@ -351,16 +376,14 @@ export const parseMaxPoolAttributes = (attributes: Record): Max return createAttributeWithCacheKey({storageOrder, dilations, ...attr}); }; -const globalMaxPoolMetadata = { - name: 'GlobalMaxPool', - inputTypes: [GpuDataType.default] +export const parseGlobalMaxPoolAttributes = (attributes: Record): MaxPoolAttributes => { + const format = 
attributes.format as FormatAttributes['format'];
+  return {format, ...globalPoolAttributes, cacheKey: format};
 };
 
-export const globalMaxPool = (context: ComputeContext): number => {
+export const globalMaxPool = (context: ComputeContext, attributes: MaxPoolAttributes): number => {
   validateInputs(context.inputs);
-  context.compute({
-    ...globalMaxPoolMetadata,
-    get: () => createMaxPoolProgramInfo(context.inputs, globalMaxPoolMetadata, true, globalPoolAttributes)
-  });
+  const metadata = {name: 'GlobalMaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey};
+  context.compute({...metadata, get: () => createMaxPoolProgramInfo(context.inputs, metadata, true, attributes)});
   return 0;
 };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
index 9d666d724e897..06c3729510b9c 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
@@ -13,7 +13,7 @@ export interface TransposeAttributes extends AttributeWithCacheKey {
   readonly perm: number[];
 }
 
-const transposeProgramMetadata = {
+export const transposeProgramMetadata = {
   name: 'Transpose',
   inputTypes: [GpuDataType.default]
 };
@@ -44,7 +44,7 @@ const permFunctionBody = (perm: number[], rank: number): string => {
   return reverseFunc.join('\n');
 };
 
-const createTransposeProgramInfo = (input: TensorView, permAttr: number[]): ProgramInfo => {
+export const createTransposeProgramInfo = (input: TensorView, permAttr: number[]): ProgramInfo => {
   const dataType = 'f32';  // TODO: support other data type
   const inputShape = input.dims;
   const perm = getAdjustedPerm(inputShape, permAttr);
@@ -102,4 +102,4 @@ export const transpose = (context: ComputeContext, attributes: TransposeAttribut
 };
 
 export const parseTransposeAttributes = (attributes: Record<string, unknown>): TransposeAttributes =>
-    createAttributeWithCacheKey({perm: Array.from(attributes.perm as Iterable<number>)});
+    createAttributeWithCacheKey({perm: attributes.perm as number[]});
diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts
index e9dd0378ccb59..34ab337105ff4 100644
--- a/js/web/lib/wasm/jsep/webgpu/types.ts
+++ b/js/web/lib/wasm/jsep/webgpu/types.ts
@@ -105,10 +105,12 @@ export interface ComputeContextInputsOutputsMapping {
    */
   readonly inputs?: ReadonlyArray<TensorView|number>;
   /**
-   * specify the mapping to the program's outputs. the value can be a number or undefined.
+   * specify the mapping to the program's outputs. the value must be a number.
    * - if it's a non-negative number, it's the index of the kernel's output
    * - if it's -1, it's an output that will be created as a temporary value. this value will be released after
    *   the kernel is executed.
+   * - if it's -2, it's an output that will be created as a persistent value. this value will be released when the
+   *   kernel is released.
    *
    * if outputs is not specified, the mapping will be the kernel's outputs in order.
*/ @@ -118,6 +120,7 @@ export interface ComputeContextInputsOutputsMapping { export interface ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; + readonly customData: {[key: string]: unknown}; compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[]; output(index: number, dims: readonly number[]): number; diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index facce93cde798..9ea751b455046 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -1444,6 +1444,9 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_stringGetMemoryInfo().RecordInitializerAllocInfo(GetInitializedTensors()); #endif +#ifndef NDEBUG + printf("after SaveInitializedTensors()\n"); +#endif // remove weights from the graph now to save memory but in many cases it won't save memory, if the tensor was // preallocated with the some other tensors in a single 'allocate' call, which is very common. diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc index 16eb812e8b223..ad45139070e2d 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc @@ -85,6 +85,8 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - //BuildKernelCreateInfo, BuildKernelCreateInfo, + //BuildKernelCreateInfo, + //BuildKernelCreateInfo, + //BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 9c5e653006ce9..ac5f20f185288 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -42,7 +42,7 @@ class JsExecutionProvider : public IExecutionProvider { void RegisterAllocator(AllocatorManager& /*allocator_manager*/) override; - //DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; } + DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; } FusionStyle GetFusionStyle() const override { return FusionStyle::FilteredGraphViewer; } diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 91ba2f085243f..ca303f855a8bc 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -12,13 +12,15 @@ namespace js { template class Conv : public JsKernel { public: - Conv(const OpKernelInfo& info) : JsKernel(info), conv_attrs_(info) { + Conv(const OpKernelInfo& info) : JsKernel(info), conv_attrs_(info), w_is_const_(false) { TensorShapeVector kernel_shape; if (conv_attrs_.kernel_shape_specified) { ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK()); } + int64_t channels_last = is_channels_last ? 1 : info.GetAttrOrDefault("channels_last", 0); + // currently only support Conv2D. TODO: support other JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format": $13 ? 
"NHWC" : "NCHW", @@ -27,7 +29,8 @@ class Conv : public JsKernel { "group": $4, "kernel_shape": [$5, $6], "pads": [$7, $8, $9, $10], - "strides": [$11, $12] + "strides": [$11, $12], + "w_is_const": () => (!!HEAP8[$14]) }), static_cast(conv_attrs_.auto_pad), static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), @@ -41,12 +44,32 @@ class Conv : public JsKernel { static_cast(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), - static_cast(is_channels_last) + static_cast(channels_last), + reinterpret_cast(&w_is_const_) ); } + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* /* prepacked_weights */) override { + is_packed = false; + + if (input_idx == 1) { + // Only handle the common case of conv2D + if (tensor.Shape().NumDimensions() != 4 || tensor.SizeInBytes() == 0) { + return Status::OK(); + } + + w_is_const_ = true; + } + + return Status::OK(); + } + protected: ConvAttributes conv_attrs_; + bool w_is_const_; + //Tensor w_transposed_; }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 0bcc34a210009..9ccc654e08eb8 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -8,64 +8,65 @@ namespace onnxruntime { namespace js { -#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Pool); -#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + end_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Pool); -#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + 
.TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); -#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + end_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10) -POOLING_KERNEL(AveragePool, float, AveragePool, 11) -POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1) +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 10, 10) +POOLING_KERNEL(AveragePool, kOnnxDomain, false, float, AveragePool, 11) +POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, float, AveragePool, 1) +POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 1) -POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11) -POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12) -POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) +POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, float, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 11, 11) +POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 12) +POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, float, MaxPool<1>, 1) } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pool.h b/onnxruntime/core/providers/js/operators/pool.h index b2ec8947e25b8..294cbfac1e03f 100644 --- a/onnxruntime/core/providers/js/operators/pool.h +++ b/onnxruntime/core/providers/js/operators/pool.h @@ -10,6 +10,7 @@ namespace onnxruntime { namespace js { #define POOL_ATTRIBUTES_JS_OBJ_MAPPING ({ \ + "format": $15 ? "NHWC" : "NCHW", \ "auto_pad": $1, \ "ceil_mode": $2, \ "count_include_pad": $3, \ @@ -34,18 +35,22 @@ namespace js { static_cast(pool_attrs_.pads.size() > 2 ? pool_attrs_.pads[2] : 0), \ static_cast(pool_attrs_.pads.size() > 3 ? pool_attrs_.pads[3] : 0), \ static_cast(pool_attrs_.strides.size() > 0 ? pool_attrs_.strides[0] : 0), \ - static_cast(pool_attrs_.strides.size() > 1 ? pool_attrs_.strides[1] : 0) + static_cast(pool_attrs_.strides.size() > 1 ? pool_attrs_.strides[1] : 0), \ + static_cast(is_channels_last) +#define GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING ({ "format": $1 ? 
"NHWC" : "NCHW" }) +#define GLOBAL_POOL_ATTRIBUTES_PARAM_LIST static_cast(is_channels_last) -template + +template class Pool : public JsKernel, public PoolBase { public: Pool(const OpKernelInfo& info) : JsKernel(info), PoolBase(info) { if (pool_attrs_.global_pooling) { if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) { - JSEP_INIT_KERNEL(GlobalAveragePool); + JSEP_INIT_KERNEL_ATTRIBUTE(GlobalAveragePool, GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING, GLOBAL_POOL_ATTRIBUTES_PARAM_LIST); } else if constexpr (PoolType::type == onnxruntime::PoolType::kMaxPool) { - JSEP_INIT_KERNEL(GlobalMaxPool); + JSEP_INIT_KERNEL_ATTRIBUTE(GlobalMaxPool, GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING, GLOBAL_POOL_ATTRIBUTES_PARAM_LIST); } else { // TODO: GlobalLpPool } @@ -61,10 +66,10 @@ class Pool : public JsKernel, public PoolBase { } }; -template -class Pool> final : public Pool> { +template +class Pool, is_channels_last> final : public Pool, is_channels_last> { public: - Pool(const OpKernelInfo& info) : Pool>(info) {} + Pool(const OpKernelInfo& info) : Pool, is_channels_last>(info) {} }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index 97bc6d6a87b39..c8ace13bddfd5 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -21,8 +21,10 @@ class Transpose final : public JsKernel, public TransposeBase { perm[i] = gsl::narrow_cast(perm_[i]); } } + // printf("Transpose: perm_specified_ = %d, perm.size() = %d, perm[0] = %d, perm[1] = %d, perm[2] = %d, perm[3] = %d\n", + // perm_specified_, static_cast(perm.size()), perm[0], perm[1], perm[2], perm[3]); JSEP_INIT_KERNEL_ATTRIBUTE(Transpose, ({ - "perm": $1 ? Module.HEAP32.subarray($2, $2 + $1) : [] + "perm": $1 ? Array.from(HEAP32.subarray($2, $2 + $1)) : [] }), gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), reinterpret_cast(perm_specified_ && !perm.empty() ? 
perm.data() : nullptr) >> 2 From ce206b663e06aea691a0dd6bcc589087ac2718ca Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 23 Jan 2023 17:39:34 -0800 Subject: [PATCH 33/81] fix buildbreak (api upgrade) --- .../lib/onnxjs/backends/webgpu/program-manager.ts | 5 +++-- js/web/lib/wasm/jsep/webgpu/program-manager.ts | 5 +++-- js/web/package-lock.json | 14 +++++++------- js/web/package.json | 2 +- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/js/web/lib/onnxjs/backends/webgpu/program-manager.ts b/js/web/lib/onnxjs/backends/webgpu/program-manager.ts index dac32ccbe4f72..3a6ae37e3ab54 100644 --- a/js/web/lib/onnxjs/backends/webgpu/program-manager.ts +++ b/js/web/lib/onnxjs/backends/webgpu/program-manager.ts @@ -49,7 +49,7 @@ export class ProgramManager { computePassEncoder.setBindGroup(0, bindGroup); const {x, y, z} = dispatchGroup; - computePassEncoder.dispatch(x, y, z); + computePassEncoder.dispatchWorkgroups(x, y, z); this.backend.pendingDispatchNumber++; @@ -68,7 +68,8 @@ export class ProgramManager { Logger.verbose('WebGpuProgram', programInfo.shaderSource); } - const computePipeline = device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}}); + const computePipeline = + device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}, layout: 'auto'}); return {programInfo, computePipeline}; } diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index fdb917dc2e4d5..e1992ac3b58f4 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -48,7 +48,7 @@ export class ProgramManager { computePassEncoder.setBindGroup(0, bindGroup); const {x, y, z} = dispatchGroup; - computePassEncoder.dispatch(x, y, z); + computePassEncoder.dispatchWorkgroups(x, y, z); this.backend.pendingDispatchNumber++; @@ -68,7 +68,8 @@ export class ProgramManager { console.log('WebGpuProgram: ' + programInfo.shaderSource); } - const computePipeline = device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}}); + const computePipeline = + device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}, layout: 'auto'}); return {programInfo, computePipeline}; } diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 4e68256a80774..fbaa91c877487 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -29,7 +29,7 @@ "@types/mocha": "^8.2.2", "@types/npmlog": "^4.1.2", "@types/platform": "^1.3.3", - "@webgpu/types": "^0.1.13", + "@webgpu/types": "^0.1.24", "base64-js": "^1.5.1", "chai": "^4.3.4", "dir-compare": "^3.3.0", @@ -593,9 +593,9 @@ } }, "node_modules/@webgpu/types": { - "version": "0.1.13", - "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.13.tgz", - "integrity": "sha512-SAq8FRONvMANQi/eXw5ArKfSvih6am/EC+5y7+du2xf1VyprtKn4ylUPKGW4T6ZkDogtH3xZgGE+J/cx601L5w==", + "version": "0.1.24", + "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.24.tgz", + "integrity": "sha512-Mkz+SVJwHApTg6nCzqIuHDt3HsGRcCvHJNkWT2PgZTTC2Gy+LXvN4+7x6YvduAcx3F/pEDWW5OfAHs6VSo6J4Q==", "dev": true }, "node_modules/@webpack-cli/configtest": { @@ -7560,9 +7560,9 @@ } }, "@webgpu/types": { - "version": "0.1.13", - "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.13.tgz", - "integrity": "sha512-SAq8FRONvMANQi/eXw5ArKfSvih6am/EC+5y7+du2xf1VyprtKn4ylUPKGW4T6ZkDogtH3xZgGE+J/cx601L5w==", + "version": "0.1.24", + 
"resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.24.tgz", + "integrity": "sha512-Mkz+SVJwHApTg6nCzqIuHDt3HsGRcCvHJNkWT2PgZTTC2Gy+LXvN4+7x6YvduAcx3F/pEDWW5OfAHs6VSo6J4Q==", "dev": true }, "@webpack-cli/configtest": { diff --git a/js/web/package.json b/js/web/package.json index 1aa8e3e673de7..89d1db7c52dfb 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -45,7 +45,7 @@ "@types/mocha": "^8.2.2", "@types/npmlog": "^4.1.2", "@types/platform": "^1.3.3", - "@webgpu/types": "^0.1.13", + "@webgpu/types": "^0.1.24", "base64-js": "^1.5.1", "chai": "^4.3.4", "dir-compare": "^3.3.0", From 429f426364262540feca54c0f2a8b62d6abac62a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 24 Jan 2023 17:24:58 -0800 Subject: [PATCH 34/81] fix conv padding --- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 3aaa3db7e49af..431e3fdbf1969 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -156,7 +156,7 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => const weightWidth = context.inputs[1].dims[3]; const outputShape = calculateOutputShape( - context.inputs[0].dims, context.inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, + context.inputs[0].dims, context.inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, isChannelsLast); const outHeight = outputShape[isChannelsLast ? 1 : 2]; const outWidth = outputShape[isChannelsLast ? 2 : 3]; From 5791040bb74b6b31e642c5c58a930e2ac1a30e50 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 24 Jan 2023 17:25:16 -0800 Subject: [PATCH 35/81] support pooling channel last --- .../graph/contrib_ops/internal_nhwc_onnx_opset.cc | 1 + .../core/providers/js/js_execution_provider.cc | 15 ++++++++------- onnxruntime/core/providers/js/operators/pool.cc | 4 ++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc index ad45139070e2d..58baed9c4236c 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_opset.cc @@ -86,6 +86,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - //BuildKernelCreateInfo, - //BuildKernelCreateInfo, - //BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 9ccc654e08eb8..0a3491e1a308c 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -58,6 +58,7 @@ namespace js { POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 10, 10) POOLING_KERNEL(AveragePool, kOnnxDomain, false, float, AveragePool, 11) +POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 11) POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, float, AveragePool, 
1) POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 1) @@ -65,8 +66,11 @@ POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, float, MaxPool<1>, 1, 7) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 8, 9) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 10, 10) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 11, 11) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 11, 11) POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 12) +POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 12) POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, float, MaxPool<1>, 1) +POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, float, MaxPool<1>, 1) } // namespace js } // namespace onnxruntime From 30c69afddb9786027833776651b74c80de90efbc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Jan 2023 17:18:05 -0800 Subject: [PATCH 36/81] time measuring --- js/web/lib/wasm/jsep/init.ts | 10 ++++++---- js/web/test/test-main.ts | 2 ++ js/web/test/test-runner.ts | 2 ++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 2082694b84f36..f75e6fb730302 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -114,13 +114,14 @@ export const init = async(module: OrtWasmModule): Promise => { if (isSourceGpu) { if (env.debug) { // eslint-disable-next-line no-console - console.log(`[js] jsepCopyGpuToGpu: src=${src}, dst=${dst}, size=${size}`); + console.log(`[js][${performance.now()}] jsepCopyGpuToGpu: src=${src}, dst=${dst}, size=${size}`); } backend.memcpy(src, dst); } else { if (env.debug) { // eslint-disable-next-line no-console - console.log(`[js] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); + console.log( + `[js][${performance.now()}] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); } const data = module.HEAPU8.subarray(src, src + size); backend.upload(dst, data); @@ -134,7 +135,8 @@ export const init = async(module: OrtWasmModule): Promise => { if (env.debug) { // eslint-disable-next-line no-console - console.log(`[js] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); + console.log(`[js][${performance.now()}] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${ + dataOffset}, size=${size}`); } await backend.download(gpuDataId, data); @@ -150,7 +152,7 @@ export const init = async(module: OrtWasmModule): Promise => { (kernel: number, contextDataOffset: number) => { if (env.debug) { // eslint-disable-next-line no-console - console.log(`[js] jsepRun on ${contextDataOffset}`); + console.log(`[js][${performance.now()}] jsepRun on ${contextDataOffset}`); } const context = new OpKernelContext(module, backend, contextDataOffset); return backend.computeKernel(kernel, context); diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 2610cbe1d82e6..4cd419589b350 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -103,8 +103,10 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { + console.log(`[_BEFORE_PREPARE_SESSION_] ${performance.now()}`); context = await ModelTestContext.create( test, ORT_WEB_TEST_CONFIG.profile, 
ORT_WEB_TEST_CONFIG.options.sessionOptions); + console.log(`[_AFTER_PREPARE_SESSION_] ${performance.now()}`); }); after('release session', () => { diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 4dedd678fbdce..452ef828e768a 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -502,8 +502,10 @@ export async function runModelTestSet( const feeds: Record = {}; testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor); const start = now(); + console.log(`[_BEFORE_SESSION_RUN_] ${start}`); const outputs = await context.session.run(feeds); const end = now(); + console.log(`[_AFTER_SESSION_RUN_] ${end}`); if (context.perfData.count === 0) { context.perfData.firstRun = end - start; } else { From c30dd1bda5e97cebc0011159ec28ac4f017b0844 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Jan 2023 18:39:02 -0800 Subject: [PATCH 37/81] operator: Sigmoid --- js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 ++-- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 6 ++++-- onnxruntime/core/providers/js/js_execution_provider.cc | 4 ++++ onnxruntime/core/providers/js/operators/unary.cc | 4 ++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 77fb88e274dd0..b6d331ecea60b 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -70,8 +70,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new //['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], // ['Resize', '', '10', resize, parseResizeAttributesV10], // ['Resize', '', '11+', resize, parseResizeAttributesV11], - //['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], - ['Sin', [unaryOps.sin]], ['Sinh', [unaryOps.sinh]], + //['Shape', '', '1+', shape], + ['Sigmoid', [unaryOps.sigmoid]], ['Sin', [unaryOps.sin]], ['Sinh', [unaryOps.sinh]], //['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 //['Slice', '', '1-9', slice, parseSliceAttributes], // // The "semantic" meaning of axis has changed in opset-13. 
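Note on the pattern used by these unary operators: each kernel contributes only a scalar WGSL expression, and the shared createElementwiseProgramInfoLoader helper expands it into a complete compute shader. The following standalone TypeScript sketch models that expansion (simplified: the real helper also derives the dispatch size from the input shape and allocates the output; `elementwiseShader` and the fixed `vecSize` argument are illustrative names, not part of this patch):

type ElementwiseExpr = (expr: string) => string;

// Expand a scalar expression into a full elementwise WGSL shader.
// `vecSize` is the number of vec4<f32> elements to process.
const elementwiseShader = (funcCall: ElementwiseExpr, vecSize: number): string => `
  @group(0) @binding(0) var<storage, read> inputData : array<vec4<f32>>;
  @group(0) @binding(1) var<storage, read_write> outputData : array<vec4<f32>>;

  @compute @workgroup_size(64)
  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
    // Guard against out-of-bounds invocations.
    if (global_id.x >= ${vecSize}u) {
      return;
    }
    outputData[global_id.x] = ${funcCall('inputData[global_id.x]')};
  }`;

// The Sigmoid implementation in the diff below plugs in its expression the same way:
// eslint-disable-next-line no-console
console.log(elementwiseShader(a => `(1.0 / (1.0 + exp(-${a})))`, 1024));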
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index a6d7e0f340430..fe9753f75b3b8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -212,8 +212,10 @@ export const reciprocal = (context: ComputeContext): number => { // export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run( // createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4<f32>(0.0))`), inputs); -// export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]>=>handler.run( -// createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4<f32>(1.0) / (vec4<f32>(1.0) + exp(-${a})))`), inputs); +export const sigmoid = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sigmoid', a => `(1.0 / (1.0 + exp(-${a})))`)); + return 0; +}; export const sin = (context: ComputeContext): number => { context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sin', 'sin')); diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 6a8279ef8678f..edf2eef5cbb66 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -90,6 +90,8 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Exp); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 12, Erf); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Erf); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 12, Sigmoid); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Sigmoid); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Sin); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, Cos); @@ -195,6 +197,8 @@ std::unique_ptr<KernelRegistry> RegisterKernels() { KERNEL_CREATE_INFO(13, Exp), KERNEL_CREATE_INFO_VERSIONED(9, 12, Erf), KERNEL_CREATE_INFO(13, Erf), + KERNEL_CREATE_INFO_VERSIONED(6, 12, Sigmoid), + KERNEL_CREATE_INFO(13, Sigmoid), KERNEL_CREATE_INFO(7, Sin), KERNEL_CREATE_INFO(7, Cos), diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index 06f8b82755410..5ed76972dc363 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -52,6 +52,10 @@ JSEP_KERNEL_IMPL(Erf, Erf) JSEP_ELEMENTWISE_VERSIONED_KERNEL(Erf, 9, 12, float, Erf) JSEP_ELEMENTWISE_KERNEL(Erf, 13, float, Erf) +JSEP_KERNEL_IMPL(Sigmoid, Sigmoid) +JSEP_ELEMENTWISE_VERSIONED_KERNEL(Sigmoid, 6, 12, float, Sigmoid) +JSEP_ELEMENTWISE_KERNEL(Sigmoid, 13, float, Sigmoid) + JSEP_KERNEL_IMPL(Sin, Sin) JSEP_ELEMENTWISE_KERNEL(Sin, 7, float, Sin) From 5ed5ab7b021493e25b8619f11e7b13deb5274a1b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 2 Feb 2023 14:31:13 -0800 Subject: [PATCH 38/81] profiling --- js/common/lib/env-impl.ts | 3 +- js/common/lib/env.ts | 9 ++++ js/web/karma.conf.js | 3 +- js/web/lib/wasm/jsep/backend-webgpu.ts | 27 +++++++++- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 9 ++-- .../lib/wasm/jsep/webgpu/program-manager.ts | 52 ++++++++++++++++++- js/web/script/test-runner-cli-args.ts |
16 ++++-- js/web/script/test-runner-cli.ts | 13 ++--- js/web/test/test-main.ts | 3 ++ 9 files changed, 117 insertions(+), 18 deletions(-) diff --git a/js/common/lib/env-impl.ts b/js/common/lib/env-impl.ts index 9b9ea78e83364..f4f3f447b4c1a 100644 --- a/js/common/lib/env-impl.ts +++ b/js/common/lib/env-impl.ts @@ -8,6 +8,7 @@ export class EnvImpl implements Env { constructor() { this.wasm = {}; this.webgl = {}; + this.webgpu = {}; this.logLevelInternal = 'warning'; } @@ -28,8 +29,8 @@ export class EnvImpl implements Env { debug?: boolean; wasm: Env.WebAssemblyFlags; - webgl: Env.WebGLFlags; + webgpu: Env.WebGpuFlags; [name: string]: unknown; diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index 2369c63612b45..259df88462697 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -83,6 +83,10 @@ export declare namespace Env { */ async?: boolean; } + + export interface WebGpuFlags { + profilingMode?: 'off'|'default'; + } } export interface Env { @@ -109,6 +113,11 @@ export interface Env { */ webgl: Env.WebGLFlags; + /** + * Represent a set of flags for WebGPU + */ + webgpu: Env.WebGpuFlags; + [name: string]: unknown; } diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index bb85fa90bafe1..b51a3b16ddc04 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -84,8 +84,9 @@ module.exports = function (config) { ChromePerf: { base: 'Chrome', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer'] }, ChromeDebug: { debug: true, base: 'Chrome', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer'] }, ChromeCanaryTest: { base: 'ChromeCanary', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu'] }, + ChromeCanaryProfileTest: { base: 'ChromeCanary', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu', '--disable-dawn-features=disallow_unsafe_apis'] }, ChromeCanaryDebug: { debug: true, base: 'ChromeCanary', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu'] }, - + ChromeCanaryProfileDebug: { debug: true, base: 'ChromeCanary', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer', '--enable-unsafe-webgpu', '--disable-dawn-features=disallow_unsafe_apis'] }, // // ==== BrowserStack browsers ==== // diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index a7458742cbb95..6ce156142bfa4 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -37,6 +37,10 @@ export class WebGpuBackend { computePassEncoder: GPUComputePassEncoder|null = null; pendingDispatchNumber = 0; + profilingEnabled = false; + profilingQuerySet: GPUQuerySet; + profilingTimeBase?: bigint; + async initialize(): Promise { if (!navigator.gpu) { // WebGPU is not available. 
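For reference, the profilingMode flag introduced above is consumed from the public env object before a session is created. A minimal usage sketch (the model path is a placeholder, and note that until the 'clean code' commit below renames it, the backend registered for this EP is still called 'jsep-webgpu'):

import * as ort from 'onnxruntime-web';

async function main(): Promise<void> {
  // Opt in to per-kernel GPU timing. This only takes effect when the browser
  // exposes 'timestamp-query-inside-passes' (hence the new
  // --disable-dawn-features=disallow_unsafe_apis flag in the karma launchers above).
  ort.env.webgpu.profilingMode = 'default';

  const session = await ort.InferenceSession.create('./model.onnx', {executionProviders: ['webgpu']});
  // session.run(...) now logs per-kernel execution times to the console.
}

void main();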
@@ -47,7 +51,21 @@ export class WebGpuBackend { if (!adapter) { throw new Error('WebGpuBackend: Failed to get GPU adapter.'); } - this.device = await adapter.requestDevice(); + + const deviceDescriptor: GPUDeviceDescriptor = { + requiredLimits: { + maxComputeWorkgroupStorageSize: adapter.limits.maxComputeWorkgroupStorageSize, + maxComputeWorkgroupsPerDimension: adapter.limits.maxComputeWorkgroupsPerDimension, + maxStorageBufferBindingSize: adapter.limits.maxStorageBufferBindingSize, + } + }; + if (adapter.features.has('timestamp-query-inside-passes') && env.webgpu.profilingMode === 'default') { + this.profilingEnabled = true; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + deviceDescriptor.requiredFeatures = ['timestamp-query-inside-passes' as any]; + } + + this.device = await adapter.requestDevice(deviceDescriptor); this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); this.kernels = new Map(); @@ -60,6 +78,13 @@ export class WebGpuBackend { console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`); } }; + + if (this.profilingEnabled) { + this.profilingQuerySet = this.device.createQuerySet({ + type: 'timestamp', + count: 2, + }); + } } dispose(): void { diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index bcd13b8ef6697..c8c456d248e7e 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -22,7 +22,7 @@ export interface GpuDataManager { /** * create new data on GPU. */ - create(size: number): GpuData; + create(size: number, usage?: number): GpuData; /** * get GPU data by ID. */ @@ -143,7 +143,8 @@ class GpuDataManagerImpl implements GpuDataManager { sourceGpuDataCache.gpuData.buffer, 0, destinationGpuDataCache.gpuData.buffer, 0, size); } - create(size: number): GpuData { + // eslint-disable-next-line no-bitwise + create(size: number, usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST): GpuData { // !!! // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. // !!! This need to be figured out by performance test results. 
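The two writeTimestamp calls added to program-manager.ts below are resolved through a pair of small GPU buffers and read back asynchronously. A standalone sketch of that round-trip (assumes a device created with the descriptor above and a 2-entry timestamp query set; `readKernelTimeNs` is an illustrative name, not part of this patch):

// Read back the [start, end] pair written by writeTimestamp(querySet, 0) / writeTimestamp(querySet, 1).
async function readKernelTimeNs(
    device: GPUDevice, querySet: GPUQuerySet, encoder: GPUCommandEncoder): Promise<number> {
  // 16 bytes = two 64-bit timestamps.
  // eslint-disable-next-line no-bitwise
  const queryBuffer = device.createBuffer({size: 16, usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC});
  // eslint-disable-next-line no-bitwise
  const readBuffer = device.createBuffer({size: 16, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});

  encoder.resolveQuerySet(querySet, 0, 2, queryBuffer, 0);
  encoder.copyBufferToBuffer(queryBuffer, 0, readBuffer, 0, 16);
  device.queue.submit([encoder.finish()]);

  await readBuffer.mapAsync(GPUMapMode.READ);
  const [start, end] = new BigUint64Array(readBuffer.getMappedRange());
  readBuffer.unmap();
  return Number(end - start);  // elapsed GPU time in timestamp ticks (nanoseconds)
}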
@@ -152,9 +153,7 @@ class GpuDataManagerImpl implements GpuDataManager { const bufferSize = calcNormalizedBufferSize(size); // create gpu buffer - const gpuBuffer = this.backend.device.createBuffer( - // eslint-disable-next-line no-bitwise - {size: bufferSize, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST}); + const gpuBuffer = this.backend.device.createBuffer({size: bufferSize, usage}); const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; this.storageCache.set(gpuData.id, {gpuData, originalSize: size}); diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index e1992ac3b58f4..692b9ba5ec0fb 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -33,9 +33,15 @@ export class ProgramManager { run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], dispatchGroup: {x: number; y?: number; z?: number}): void { const device = this.backend.device; - const computePassEncoder = this.backend.getComputePassEncoder(); + if (this.backend.profilingEnabled) { + // profiling write start timestamp + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (computePassEncoder as any).writeTimestamp(this.backend.profilingQuerySet, 0); + } + computePassEncoder.setPipeline(buildArtifact.computePipeline); const entries = []; for (const input of inputs) { @@ -52,6 +58,50 @@ export class ProgramManager { this.backend.pendingDispatchNumber++; + if (this.backend.profilingEnabled) { + // profiling write end timestamp + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (computePassEncoder as any).writeTimestamp(this.backend.profilingQuerySet, 1); + // eslint-disable-next-line no-bitwise + const queryData = this.backend.gpuDataManager.create(16, GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE); + // eslint-disable-next-line no-bitwise + const syncData = this.backend.gpuDataManager.create(16, GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST); + + this.backend.endComputePass(); + this.backend.getCommandEncoder().resolveQuerySet(this.backend.profilingQuerySet, 0, 2, queryData.buffer, 0); + this.backend.getCommandEncoder().copyBufferToBuffer(queryData.buffer, 0, syncData.buffer, 0, 16); + this.backend.flush(); + + const kernelId = this.backend.currentKernelId!; + const kernelName = this.backend.kernels.get(kernelId)![0]; + + syncData.buffer.mapAsync(GPUMapMode.READ).then(() => { + const mappedData = new BigUint64Array(syncData.buffer.getMappedRange()); + const startTimeU64 = mappedData[0]; + const endTimeU64 = mappedData[1]; + + syncData.buffer.unmap(); + + if (typeof this.backend.profilingTimeBase === 'undefined') { + this.backend.profilingTimeBase = startTimeU64; + } + + const startTime = Number(startTimeU64 - this.backend.profilingTimeBase); + const endTime = Number(endTimeU64 - this.backend.profilingTimeBase); + + if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) { + throw new RangeError('incorrect timestamp range'); + } + + this.backend.gpuDataManager.release(queryData.id); + this.backend.gpuDataManager.release(syncData.id); + + // eslint-disable-next-line no-console + console.log(`[profiling] kernel "${kernelId}|${kernelName}" execution time: ${endTime - startTime} ns`); + }); + } + if (this.backend.pendingDispatchNumber >= 16) { this.backend.flush(); } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 
d860e84dfd99f..249f89d42a490 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -74,6 +74,7 @@ Options: --webgl-matmul-max-batch-size Set the WebGL matmulMaxBatchSize --webgl-texture-cache-mode Set the WebGL texture cache mode (initializerOnly/full) --webgl-texture-pack-mode Set the WebGL texture pack mode (true/false) + --webgpu-profiling-mode Set the WebGPU profiling mode (off/default) *** Browser Options *** @@ -318,11 +319,20 @@ function parseWebglFlags(args: minimist.ParsedArgs): Env.WebGLFlags { return {contextId, matmulMaxBatchSize, textureCacheMode, pack}; } +function parseWebgpuFlags(args: minimist.ParsedArgs): Env.WebGpuFlags { + const profilingMode = args['webgpu-profiling-mode']; + if (profilingMode !== undefined && profilingMode !== 'off' && profilingMode !== 'default') { + throw new Error('Flag "webgpu-profiling-mode" is invalid'); + } + return {profilingMode}; +} + function parseGlobalEnvFlags(args: minimist.ParsedArgs): Env { - const wasmFlags = parseWasmFlags(args); - const webglFlags = parseWebglFlags(args); + const wasm = parseWasmFlags(args); + const webgl = parseWebglFlags(args); + const webgpu = parseWebgpuFlags(args); const cpuFlags = parseCpuFlags(args); - return {webgl: webglFlags, wasm: wasmFlags, cpuFlags}; + return {webgl, wasm, webgpu, ...cpuFlags}; } export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs { diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index a04b7f78746d0..dd9a832595b61 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -464,7 +464,7 @@ function run(config: Test.Config) { args.bundleMode === 'perf' ? 'perf' : args.debug ? 'debug' : 'test', - webgpu); + webgpu, config.options.globalEnvFlags?.webgpu?.profilingMode === 'default'); const karmaArgs = ['start', `--browsers ${browser}`]; if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); @@ -569,10 +569,11 @@ function saveConfig(config: Test.Config) { } -function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) { +function getBrowserNameFromEnv( + env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, profile: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu); + return selectChromeBrowser(mode, webgpu, profile); case 'edge': return 'Edge'; case 'firefox': @@ -588,13 +589,13 @@ function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'per } } -function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) { +function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, profile: boolean) { if (webgpu) { switch (mode) { case 'debug': - return 'ChromeCanaryDebug'; + return profile ? 'ChromeCanaryProfileDebug' : 'ChromeCanaryDebug'; default: - return 'ChromeCanaryTest'; + return profile ? 
'ChromeCanaryProfileTest' : 'ChromeCanaryTest'; } } else { switch (mode) { diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 4cd419589b350..345faf509504d 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -54,6 +54,9 @@ if (options.globalEnvFlags) { if (flags.wasm?.initTimeout !== undefined) { ort.env.wasm.initTimeout = flags.wasm.initTimeout; } + if (flags.webgpu?.profilingMode !== undefined) { + ort.env.webgpu.profilingMode = flags.webgpu.profilingMode; + } } // Set logging configuration From ba14c5db72af4c4c7c8f3041721612127566dfc7 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 3 Feb 2023 17:37:15 -0800 Subject: [PATCH 39/81] clean code --- cmake/onnxruntime_webassembly.cmake | 17 +- js/web/lib/backend-onnxjs.ts | 11 +- js/web/lib/index.ts | 7 +- js/web/lib/onnxjs/backend.ts | 4 +- js/web/lib/onnxjs/backends/backend-webgpu.ts | 95 ---- .../backends/webgpu/gpu-data-manager.ts | 167 ------- .../backends/webgpu/inference-handler.ts | 89 ---- .../backends/webgpu/op-resolve-rules.ts | 90 ---- .../onnxjs/backends/webgpu/ops/binary-op.ts | 217 -------- .../lib/onnxjs/backends/webgpu/ops/common.ts | 91 ---- .../lib/onnxjs/backends/webgpu/ops/concat.ts | 176 ------- .../backends/webgpu/ops/conv-grouped.ts | 127 ----- js/web/lib/onnxjs/backends/webgpu/ops/conv.ts | 150 ------ .../onnxjs/backends/webgpu/ops/fuse-utils.ts | 39 -- .../lib/onnxjs/backends/webgpu/ops/gather.ts | 131 ----- js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts | 165 ------ .../lib/onnxjs/backends/webgpu/ops/matmul.ts | 115 ----- js/web/lib/onnxjs/backends/webgpu/ops/pool.ts | 376 -------------- .../backends/webgpu/ops/reduce-tensors.ts | 85 ---- .../lib/onnxjs/backends/webgpu/ops/reshape.ts | 22 - .../lib/onnxjs/backends/webgpu/ops/shape.ts | 16 - .../lib/onnxjs/backends/webgpu/ops/slice.ts | 180 ------- .../lib/onnxjs/backends/webgpu/ops/squeeze.ts | 44 -- .../onnxjs/backends/webgpu/ops/transpose.ts | 116 ----- .../onnxjs/backends/webgpu/ops/unary-op.ts | 197 -------- .../onnxjs/backends/webgpu/ops/unsqueeze.ts | 43 -- .../onnxjs/backends/webgpu/program-manager.ts | 76 --- .../onnxjs/backends/webgpu/session-handler.ts | 47 -- .../backends/webgpu/tensor-data-manager.ts | 140 ------ js/web/lib/onnxjs/backends/webgpu/types.ts | 96 ---- js/web/lib/onnxjs/execution-plan.ts | 22 +- js/web/lib/onnxjs/operators.ts | 6 +- js/web/lib/onnxjs/opset.ts | 12 +- js/web/lib/onnxjs/tensor.ts | 14 +- js/web/lib/wasm/session-options.ts | 4 +- js/web/script/test-runner-cli-args.ts | 5 +- js/web/script/test-runner-cli.ts | 4 +- js/web/test/suite-test-list.jsonc | 468 ++++-------------- js/web/test/test-runner.ts | 4 +- .../unittests/backends/webgl/test-conv-new.ts | 2 +- onnxruntime/core/framework/execution_frame.cc | 6 - .../core/framework/execution_provider.cc | 1 - .../core/framework/graph_partitioner.cc | 4 - onnxruntime/core/framework/kernel_lookup.h | 9 - onnxruntime/core/framework/kernel_registry.cc | 12 - onnxruntime/core/framework/session_state.cc | 6 - 46 files changed, 129 insertions(+), 3579 deletions(-) delete mode 100644 js/web/lib/onnxjs/backends/backend-webgpu.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/inference-handler.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/common.ts delete mode 100644
js/web/lib/onnxjs/backends/webgpu/ops/concat.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/conv.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/gather.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/pool.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/shape.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/slice.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/program-manager.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/session-handler.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts delete mode 100644 js/web/lib/onnxjs/backends/webgpu/types.ts diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 7f2588a989a8d..922470b82939a 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -199,10 +199,15 @@ else() endif() set(EXPORTED_RUNTIME_METHODS "['stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8']") + if (onnxruntime_USE_JS) + set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput") + else() + set(EXPORTED_FUNCTIONS "_malloc,_free") + endif() set_target_properties(onnxruntime_webassembly PROPERTIES LINK_FLAGS " \ -s \"EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}\" \ - -s \"EXPORTED_FUNCTIONS=_malloc,_free,_JsepOutput\" \ + -s \"EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}\" \ -s MAXIMUM_MEMORY=4294967296 \ -s WASM=1 \ -s NO_EXIT_RUNTIME=0 \ @@ -213,18 +218,12 @@ else() -s VERBOSE=0 \ -s NO_FILESYSTEM=1 \ ${WASM_API_EXCEPTION_CATCHING} \ - -s ASYNCIFY=1 \ - -s ASYNCIFY_STACK_SIZE=65536 \ - -s ASYNCIFY_ADVISE=1 \ - -s ASYNCIFY_DEBUG=0 \ - -s ASYNCIFY_IGNORE_INDIRECT=0 \ - -s ASYNCIFY_REMOVE=OrtInit \ - -s ASYNCIFY_ADD=OrtRun \ --no-entry") if (onnxruntime_USE_JS) target_compile_definitions(onnxruntime_webassembly PRIVATE -DUSE_JS=1) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\"") + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS + " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\" -s ASYNCIFY=1 -s ASYNCIFY_STACK_SIZE=65536") endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) diff --git a/js/web/lib/backend-onnxjs.ts b/js/web/lib/backend-onnxjs.ts index c4fe1f1db38af..18a068e0ced8b 100644 --- a/js/web/lib/backend-onnxjs.ts +++ b/js/web/lib/backend-onnxjs.ts @@ -17,16 +17,7 @@ class OnnxjsBackend implements Backend { // onnxruntime-common). // In future we should remove Session.Config and use InferenceSession.SessionOptions. // Currently we allow this to happen to make test runner work. 
- const onnxjsOptions = {...options as unknown as Session.Config}; - if (!onnxjsOptions.backendHint && options?.executionProviders && options?.executionProviders[0]) { - const ep = options?.executionProviders[0]; - if (typeof ep === 'string') { - onnxjsOptions.backendHint = ep; - } else { - onnxjsOptions.backendHint = ep.name; - } - } - const session = new Session(onnxjsOptions); + const session = new Session(options as unknown as Session.Config); // typescript cannot merge method override correctly (so far in 4.2.3). need if-else to call the method. if (typeof pathOrBuffer === 'string') { diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 708c92a261fde..749331058cc4a 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -14,15 +14,10 @@ if (!BUILD_DEFS.DISABLE_WEBGL) { registerBackend('webgl', onnxjsBackend, -10); } -if (!BUILD_DEFS.DISABLE_WEBGPU) { - const onnxjsBackend = require('./backend-onnxjs').onnxjsBackend; - registerBackend('webgpu', onnxjsBackend, 999); // set to 999 as the highest priority -} - if (!BUILD_DEFS.DISABLE_WASM) { const wasmBackend = require('./backend-wasm').wasmBackend; if (!BUILD_DEFS.DISABLE_WEBGPU) { - registerBackend('jsep-webgpu', wasmBackend, 11); + registerBackend('webgpu', wasmBackend, 5); } registerBackend('cpu', wasmBackend, 10); registerBackend('wasm', wasmBackend, 10); diff --git a/js/web/lib/onnxjs/backend.ts b/js/web/lib/onnxjs/backend.ts index 5ac77ae2f5fcb..f402b820e76e1 100644 --- a/js/web/lib/onnxjs/backend.ts +++ b/js/web/lib/onnxjs/backend.ts @@ -2,7 +2,6 @@ // Licensed under the MIT License. import {WebGLBackend} from './backends/backend-webgl'; -import {WebGpuBackend} from './backends/backend-webgpu'; import {Graph} from './graph'; import {Operator} from './operators'; import {OpSet} from './opset'; @@ -79,8 +78,7 @@ export interface Backend { const backendsCache: Map = new Map(); export const backend: {[name: string]: Backend} = { - webgl: new WebGLBackend(), - webgpu: new WebGpuBackend() + webgl: new WebGLBackend() }; /** diff --git a/js/web/lib/onnxjs/backends/backend-webgpu.ts b/js/web/lib/onnxjs/backends/backend-webgpu.ts deleted file mode 100644 index e0f247eb135cb..0000000000000 --- a/js/web/lib/onnxjs/backends/backend-webgpu.ts +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {env} from 'onnxruntime-common'; - -import {Backend, SessionHandler} from '../backend'; -import {Logger} from '../instrument'; -import {Session} from '../session'; - -import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; -import {WebGpuSessionHandler} from './webgpu/session-handler'; - -export class WebGpuBackend implements Backend { - device: GPUDevice; - gpuDataManager: GpuDataManager; - - commandEncoder: GPUCommandEncoder|null = null; - computePassEncoder: GPUComputePassEncoder|null = null; - pendingDispatchNumber = 0; - - // #region interface Backend - - async initialize(): Promise { - try { - if (!navigator.gpu) { - // WebGPU is not available. 
- Logger.warning('WebGpuBackend', 'WebGPU is not available.'); - return false; - } - - const adapter = await navigator.gpu.requestAdapter(); - if (!adapter) { - Logger.warning('WebGpuBackend', 'Failed to get GPU adapter.'); - return false; - } - this.device = await adapter.requestDevice(); - this.gpuDataManager = createGpuDataManager(this); - - // TODO: set up flags - - Logger.setWithEnv(env); - - Logger.verbose('WebGpuBackend', 'Initialized successfully.'); - - this.device.onuncapturederror = ev => { - if (ev.error instanceof GPUValidationError) { - // eslint-disable-next-line no-console - console.error(`An uncaught WebGPU validation error was raised: ${ev.error.message}`); - } - }; - - return true; - } catch (e) { - Logger.warning('WebGpuBackend', `Unable to initialize WebGpuBackend. ${e}`); - return false; - } - } - createSessionHandler(context: Session.Context): SessionHandler { - return new WebGpuSessionHandler(this, context); - } - dispose(): void { - // TODO: uninitialization - // this.glContext.dispose(); - } - - // #endregion interface Backend - - getCommandEncoder(): GPUCommandEncoder { - if (!this.commandEncoder) { - this.commandEncoder = this.device.createCommandEncoder(); - } - return this.commandEncoder; - } - - getComputePassEncoder(): GPUComputePassEncoder { - if (!this.computePassEncoder) { - this.computePassEncoder = this.getCommandEncoder().beginComputePass(); - } - return this.computePassEncoder; - } - - endComputePass(): void { - if (this.computePassEncoder) { - this.computePassEncoder.end(); - this.computePassEncoder = null; - } - } - - flush(): void { - this.endComputePass(); - this.device.queue.submit([this.commandEncoder!.finish()]); - this.commandEncoder = null; - this.pendingDispatchNumber = 0; - } -} diff --git a/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts b/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts deleted file mode 100644 index 297d4bae64aed..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/gpu-data-manager.ts +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Guid} from 'guid-typescript'; - -import {Logger} from '../../instrument'; - -import {sizeof, Tensor} from '../../tensor'; -import {ShapeUtil} from '../../util'; -import {WebGpuBackend} from '../backend-webgpu'; -import {GpuData, GpuDataId, GpuDataType} from './types'; - -/** - * manages GpuDataId -> GpuBuffer - */ -export interface GpuDataManager { - /** - * upload data to GPU. if the ID already exists in cache, returns the cached value without uploading anything. - */ - upload(data: Tensor.NumberType, gpuDataType: GpuDataType): Promise; - /** - * create new data on GPU. - */ - create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData; - /** - * get GPU data by ID. - */ - get(id: GpuDataId): GpuData|undefined; - /** - * release the data on GPU by ID. - */ - release(id: GpuDataId): void; - /** - * download the data from GPU. - */ - download(id: GpuDataId): Promise; -} - -interface StorageCacheValue { - gpuData: GpuData; - size: number; -} - -interface DownloadCacheValue { - gpuData: GpuData; - data: Promise; -} - -/** - * normalize the buffer size so that it fits the 128-bits (16 bytes) alignment. 
- */ -const calcNormalizedBufferSize = (size: number) => Math.ceil(size / 16) * 16; - -class GpuDataManagerImpl implements GpuDataManager { - // GPU Data ID => GPU Data ( storage buffer ) - storageCache: Map; - - // GPU Data ID => GPU Data ( read buffer ) - downloadCache: Map; - - constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) { - this.storageCache = new Map(); - this.downloadCache = new Map(); - } - - async upload(data: Tensor.NumberType, gpuDataType: GpuDataType): Promise { - if (gpuDataType !== GpuDataType.default) { - throw new Error('we only support default GPU data type now'); - } - - Logger.verbose('GpuData', `Uploading data to GPU: {${data.length}}`); - - const srcArrayBuffer = data.buffer; - const srcOffset = data.byteOffset; - const srcLength = data.byteLength; - const size = calcNormalizedBufferSize(srcLength); - - // create gpu buffer - const gpuBuffer = this.backend.device.createBuffer({mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE}); - - // copy (upload) data - const arrayBuffer = gpuBuffer.getMappedRange(); - new Uint8Array(arrayBuffer).set(new Uint8Array(srcArrayBuffer, srcOffset, srcLength)); - gpuBuffer.unmap(); - - const gpuData = {id: Guid.create(), type: GpuDataType.default, buffer: gpuBuffer}; - this.storageCache.set(gpuData.id, {gpuData, size: srcLength}); - return gpuData; - } - - create(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): GpuData { - if (gpuDataType !== GpuDataType.default) { - throw new Error('we only support default GPU data type now'); - } - - // !!! - // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. - // !!! This need to be figured out by performance test results. - // !!! - - const elemCount = ShapeUtil.size(dims); - const bufferLength = sizeof(type) * elemCount; - const size = calcNormalizedBufferSize(bufferLength); - - // create gpu buffer - const gpuBuffer = - // eslint-disable-next-line no-bitwise - this.backend.device.createBuffer({size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC}); - - const gpuData = {id: Guid.create(), type: GpuDataType.default, buffer: gpuBuffer}; - this.storageCache.set(gpuData.id, {gpuData, size: bufferLength}); - return gpuData; - } - - get(id: GpuDataId): GpuData|undefined { - return this.storageCache.get(id)?.gpuData; - } - - release(id: GpuDataId): void { - const cachedData = this.storageCache.get(id); - if (!cachedData) { - throw new Error('releasing data does not exist'); - } - - this.storageCache.delete(id); - cachedData.gpuData.buffer.destroy(); - - const downloadingData = this.downloadCache.get(id); - if (downloadingData) { - void downloadingData.data.then(() => { - downloadingData.gpuData.buffer.destroy(); - }); - this.downloadCache.delete(id); - } - } - - async download(id: GpuDataId): Promise { - const downloadData = this.downloadCache.get(id); - if (downloadData) { - return downloadData.data; - } - - const cachedData = this.storageCache.get(id); - if (!cachedData) { - throw new Error('data does not exist'); - } - - Logger.verbose('GpuData', `Downloading data from GPU: {${id}}`); - - const commandEncoder = this.backend.getCommandEncoder(); - this.backend.endComputePass(); - const gpuReadBuffer = this.backend.device.createBuffer( - // eslint-disable-next-line no-bitwise - {size: cachedData.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ}); - commandEncoder.copyBufferToBuffer( - cachedData.gpuData.buffer /* source buffer */, 0 /* source offset */, 
gpuReadBuffer /* destination buffer */, - 0 /* destination offset */, cachedData.size /* size */ - ); - this.backend.flush(); - - await gpuReadBuffer.mapAsync(GPUMapMode.READ); - return gpuReadBuffer.getMappedRange(); - } -} - -export const createGpuDataManager = (...args: ConstructorParameters): GpuDataManager => - new GpuDataManagerImpl(...args); diff --git a/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts b/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts deleted file mode 100644 index 2509814c353f1..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/inference-handler.ts +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {InferenceHandler} from '../../backend'; -import {Tensor} from '../../tensor'; - -import {WebGpuSessionHandler} from './session-handler'; -import {createTensorDataManager, TensorDataManager} from './tensor-data-manager'; -import {GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './types'; - -const getProgramInfoUniqueKey = - (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly Tensor[], inputGpuDatas: readonly GpuData[]): - string => { - const inputGpuDataTypes = inputGpuDatas.map(data => `${data.type}`).join('_'); - const inputTensorShapes = inputTensors.map(t => `${t.dims.join(',')}`).join('_'); - let key = programInfo.name; - if (programInfo.cacheHint) { - key += '[' + programInfo.cacheHint + ']'; - } - key += ':' + inputTensorShapes + ';' + inputGpuDataTypes; - return key; - }; - -export class WebGpuInferenceHandler implements InferenceHandler { - // per inference context - dataManager: TensorDataManager; - - constructor(public session: WebGpuSessionHandler) { - this.dataManager = createTensorDataManager(session.backend.gpuDataManager); - } - - private async uploadGpuData(tensor: Tensor, textureType: GpuDataType): Promise { - if (this.session.isInitializer(tensor.dataId)) { - return this.session.dataManager.uploadTensorToGpu(tensor, textureType); - } - - return this.dataManager.uploadTensorToGpu(tensor, textureType); - } - - private createGpuData(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { - return this.dataManager.createGpuTensor(type, dims, gpuDataType); - } - - async run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly Tensor[]): Promise { - if (inputs.length !== program.inputTypes.length) { - throw new Error(`Input size must be equal to ${program.inputTypes.length}.`); - } - - // create info for inputs - const inputDatas: GpuData[] = []; - for (let i = 0; i < program.inputTypes.length; ++i) { - inputDatas[i] = await this.uploadGpuData(inputs[i], program.inputTypes[i]); - } - - const key = getProgramInfoUniqueKey(program, inputs, inputDatas); - let artifact = this.session.programManager.getArtifact(key); - const programInfo = artifact ? - artifact.programInfo : - (typeof (program as ProgramInfoLoader).get === 'function' ? 
(program as ProgramInfoLoader).get() : - (program as ProgramInfo)); - - // create info for outputs - const outputDatas: GpuData[] = []; - const outputTensors: Tensor[] = []; - for (let i = 0; i < programInfo.outputs.length; ++i) { - const [tensor, gpuData] = this.createGpuData( - programInfo.outputs[i].type, programInfo.outputs[i].dims, programInfo.outputs[i].gpuDataType); - outputTensors.push(tensor); - outputDatas.push(gpuData); - } - - if (!artifact) { - artifact = this.session.programManager.build(programInfo); - this.session.programManager.setArtifact(key, artifact); - } - - this.session.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs)); - - return outputTensors; - } - - reshape(input: Tensor, reshapedDims: readonly number[]): Tensor { - return this.dataManager.hasGpuData(input.dataId) ? - this.dataManager.createGpuRef(input.dataId, input.type, reshapedDims)[0] : - new Tensor(reshapedDims, input.type, undefined, undefined, input.data); - } - - dispose(): void {} -} diff --git a/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts b/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts deleted file mode 100644 index 4adfb180893a6..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/op-resolve-rules.ts +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {OpSet} from '../../opset'; - -import * as binaryOps from './ops/binary-op'; -import {concat, parseConcatAttributes} from './ops/concat'; -import {conv, parseConvAttributes} from './ops/conv'; -import {gather, parseGatherAttributes} from './ops/gather'; -import {gemm, parseGemmAttributesV11, parseGemmAttributesV7} from './ops/gemm'; -import {matMul, parseMatMulAttributes} from './ops/matmul'; -import {averagePool, globalAveragePool, globalMaxPool, maxPool, parseAveragePoolAttributes, parseGlobalAveragePoolAttributes, parseMaxPoolAttributes} from './ops/pool'; -import {sum} from './ops/reduce-tensors'; -import {reshape} from './ops/reshape'; -import {shape} from './ops/shape'; -import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; -import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; -import {parseTransposeAttributes, transpose} from './ops/transpose'; -import * as unaryOps from './ops/unary-op'; -import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; - -export const WEBGPU_OP_RESOLVE_RULES: readonly OpSet.ResolveRule[] = [ - ['Abs', '', '6+', unaryOps.abs], ['Acos', '', '7+', unaryOps.acos], ['Add', '', '7+', binaryOps.add], - // ['And', '', '7+', binaryOps.and], - ['Asin', '', '7+', unaryOps.asin], ['Atan', '', '7+', unaryOps.atan], - // TODO: support new attributes for AveragePool-10 - ['AveragePool', '', '7+', averagePool, parseAveragePoolAttributes], - // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], - // ['Cast', '', '6+', cast, parseCastAttributes], - ['Ceil', '', '6+', unaryOps.ceil], ['Clip', '', '6-10', unaryOps.clip, unaryOps.parseClipAttributes], - ['Clip', '', '11+', unaryOps.clipV11], ['Concat', '', '4+', concat, parseConcatAttributes], - ['Conv', '', '1+', conv, parseConvAttributes], ['Cos', '', '7+', unaryOps.cos], ['Div', '', '7+', binaryOps.div], - // ['Dropout', '', '7+', unaryOps.identity], - // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], - // ['Equal', '', '7+', binaryOps.equal], - ['Elu', '', '6+', unaryOps.elu, unaryOps.parseEluAttributes], ['Exp', 
'', '6+', unaryOps.exp], - // ['Flatten', '', '1+', flatten, parseFlattenAttributes], - ['Floor', '', '6+', unaryOps.floor], - // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], - ['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', '', '7-10', gemm, parseGemmAttributesV7], - ['Gemm', '', '11+', gemm, parseGemmAttributesV11], - ['GlobalAveragePool', '', '1+', globalAveragePool, parseGlobalAveragePoolAttributes], - ['GlobalMaxPool', '', '1+', globalMaxPool], - // ['Greater', '', '7+', binaryOps.greater], - // ['Identity', '', '1+', unaryOps.identity], - // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], - // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes], - ['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], - // ['Less', '', '7+', binaryOps.less], - ['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], - // TODO: support new attributes for MaxPool-8 and MaxPool-10 - ['MaxPool', '', '1+', maxPool, parseMaxPoolAttributes], ['Mul', '', '7+', binaryOps.mul], - ['Neg', '', '6+', unaryOps.neg], - // ['Not', '', '1+', unaryOps.not], - // ['Or', '', '7+', binaryOps.or], - // ['Pad', '', '2-10', padV2, parsePadAttributesV2], - // ['Pad', '', '11+', padV11, parsePadAttributesV11], - ['Pow', '', '7+', binaryOps.pow], - // ['PRelu', '', '7+', binaryOps.pRelu], - // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], - // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], - // ['ReduceMean', '', '1+', reduceMean, parseReduceAttributes], - // ['ReduceMin', '', '1+', reduceMin, parseReduceAttributes], - // ['ReduceProd', '', '1+', reduceProd, parseReduceAttributes], - // ['ReduceSum', '', '1-12', reduceSum, parseReduceAttributes], - // ['ReduceSumSquare', '', '1+', reduceLogSumSquare, parseReduceAttributes], - ['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], - // ['Resize', '', '10', resize, parseResizeAttributesV10], - // ['Resize', '', '11+', resize, parseResizeAttributesV11], - ['Shape', '', '1+', shape], ['Sigmoid', '', '6+', unaryOps.sigmoid], ['Sin', '', '7+', unaryOps.sin], - ['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 - ['Slice', '', '1-9', slice, parseSliceAttributes], - // // The "semantic" meaning of axis has changed in opset-13. - // ['Softmax', '', '1-12', softmax, parseSoftmaxAttributes], - // ['Softmax', '', '13+', softmaxV13, parseSoftmaxAttributesV13], - // // 'Split' operator has an optional attribute 'split' - // // this attribute determines how the specified axis of input data is split. 
- // // When the attribute is missing, we need the count of number of outputs - // // so that we can determine the 'split' attribute from the runtime input to the Operator - // ['Split', '', '2-12', split, parseSplitAttributes], - ['Sqrt', '', '6+', unaryOps.sqrt], ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], - ['Squeeze', '', '13+', squeezeV13], ['Sub', '', '7+', binaryOps.sub], ['Sum', '', '6+', sum], - ['Tan', '', '7+', unaryOps.tan], ['Tanh', '', '6+', unaryOps.tanh], - // ['Tile', '', '6+', tile], - ['Transpose', '', '1+', transpose, parseTransposeAttributes], - // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], - // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], - ['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], - // ['Xor', '', '7+', binaryOps.xor], -]; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts b/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts deleted file mode 100644 index 8997932602a2f..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/binary-op.ts +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Tensor} from '../../../tensor'; -import {BroadcastUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -type BuiltinFunctionName = string; -type BinaryCustomExpression = (expressionA: string, expressionB: string) => string; -type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{ - scalar: BinaryCustomExpression; - vector: BinaryCustomExpression; -}; - -const createBinaryOpProgramShader = - (dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[], vectorize: boolean, - doBroadcast: boolean, funcCall: BinaryFunctionCall, additionalImplementation?: string, typeA = 'f32', - typeB = 'f32', typeOutput = 'f32') => { - const outputSize = ShapeUtil.size(dimsOutput); - const vecSize = Math.ceil(outputSize / 4); - - let expressionScalar: BinaryCustomExpression; - let expressionVector: BinaryCustomExpression; - if (typeof funcCall === 'string') { - expressionScalar = expressionVector = (a, b) => `${funcCall}((${a}),(${b}))`; - } else if (typeof funcCall === 'function') { - expressionScalar = expressionVector = funcCall; - } else { - expressionScalar = funcCall.scalar; - expressionVector = funcCall.vector; - } - - let broadcastImpl = ''; - const outputIndicesHelper = createIndicesHelper('output', dimsOutput); - if (doBroadcast) { - const calcOffsetImpl = (dims: readonly number[]) => { - const strides = ShapeUtil.computeStrides(dims); - const offsets: string[] = []; - for (let i = dims.length - 1; i >= 0; i--) { - offsets.push(`${strides[i]}u * ((*outputIndices)[${i + dimsOutput.length - dims.length}] % ${dims[i]}u)`); - } - return offsets.length > 0 ? 
offsets.join('+') : '0u'; - }; - - broadcastImpl = ` - ${outputIndicesHelper.o2iImpl} - - fn calcOffsetA(outputIndices: ptr>) -> u32 { - return ${calcOffsetImpl(dimsA)}; - } - - fn calcOffsetB(outputIndices: ptr>) -> u32 { - return ${calcOffsetImpl(dimsB)}; - } - `; - } - - let assignment: string; - if (vectorize) { - if (doBroadcast) { - assignment = ` - ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} - ${outputIndicesHelper.o2iCall('global_id.x * 4u', 'outputIndices')} - let offsetA = calcOffsetA(&outputIndices); - let offsetB = calcOffsetB(&outputIndices); - outputData[global_id.x] = ${expressionVector('aData[offsetA / 4u]', 'bData[offsetB / 4u]')};`; - } else { - assignment = `outputData[global_id.x] = ${expressionVector('aData[global_id.x]', 'bData[global_id.x]')};`; - } - } else { - if (!doBroadcast) { - throw new Error('no necessary to use scalar implementation for element-wise binary op implementation.'); - } - const singleAssignment = (x: number) => { - const expressionA = `aData[indexA${x}][componentA${x}]`; - const expressionB = `bData[indexB${x}][componentB${x}]`; - return ` - ${outputIndicesHelper.o2iCall(`global_id.x * 4u + ${x}u`, 'outputIndices')} - let offsetA${x} = calcOffsetA(&outputIndices); - let offsetB${x} = calcOffsetB(&outputIndices); - let indexA${x} = offsetA${x} / 4u; - let indexB${x} = offsetB${x} / 4u; - let componentA${x} = offsetA${x} % 4u; - let componentB${x} = offsetB${x} % 4u; - outputData[global_id.x][${x}] = ${expressionScalar(expressionA, expressionB)};`; - }; - - assignment = ` - ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} - ${singleAssignment(0)} - ${singleAssignment(1)} - ${singleAssignment(2)} - ${singleAssignment(3)}`; - } - - return ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var aData : array>; - @group(0) @binding(1) var bData : array>; - @group(0) @binding(2) var outputData : array>; - - ${additionalImplementation ?? ''} - ${broadcastImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${vecSize}u) { - return; - } - - ${assignment} - }`; - }; - -const createBinaryOpProgramInfo = - (metadata: ProgramMetadata, a: Tensor, b: Tensor, funcCall: BinaryFunctionCall, additionalImplementation?: string, - outputTensorType: Tensor.DataType = a.type): ProgramInfo => { - const isBroadcast = !ShapeUtil.areEqual(a.dims, b.dims); - let outputShape = a.dims; - let outputSize = a.size; - - let vectorize = false; - - // TODO: deal with zero-sized tensors (eg. dims=[1,0]) - - if (isBroadcast) { - const calculatedShape = BroadcastUtil.calcShape(a.dims, b.dims, false); - if (!calculatedShape) { - throw new Error('Can\'t perform binary op on the given tensors'); - } - outputShape = calculatedShape; - outputSize = ShapeUtil.size(outputShape); - - // check whether vectorize can be enabled - let sharedDimension = 1; - for (let i = 0; i < outputShape.length; i++) { - const dimA = a.dims[a.dims.length - i] ?? 1; - const dimB = b.dims[b.dims.length - i] ?? 
1; - if (dimA === dimB) { - sharedDimension *= dimA; - } else { - break; - } - } - if (sharedDimension % 4 === 0) { - vectorize = true; - } - - - } else { - // element-wise - vectorize = true; - } - - return { - ...metadata, - shaderSource: createBinaryOpProgramShader( - a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation), - outputs: [{dims: outputShape, type: outputTensorType, gpuDataType: GpuDataType.default}], - dispatchGroup: () => - ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)}) - }; - }; - -const createBinaryOpProgramInfoLoader = - (inputs: Tensor[], name: string, funcCall: BinaryFunctionCall, additionalImplementation?: string, - cacheKey?: string): ProgramInfoLoader => { - const metadata: - ProgramMetadata = {name, inputTypes: [GpuDataType.default, GpuDataType.default], cacheHint: cacheKey}; - return { - ...metadata, - get: () => createBinaryOpProgramInfo(metadata, inputs[0], inputs[1], funcCall, additionalImplementation) - }; - }; - -export const add = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createBinaryOpProgramInfoLoader(inputs, 'Add', (a, b) => `${a}+${b}`), inputs); - -// export const and = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslAnd(), 'bool'), inputs)]; - -export const div = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createBinaryOpProgramInfoLoader(inputs, 'Div', (a, b) => `${a}/${b}`), inputs); - -// export const equal = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslEqual(), 'bool'), inputs)]; - -// export const greater = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslGreater(), 'bool'), inputs)]; - -// export const less = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslLess(), 'bool'), inputs)]; - -export const mul = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createBinaryOpProgramInfoLoader(inputs, 'Mul', (a, b) => `${a}*${b}`), inputs); - -// export const or = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslOr(), 'bool'), inputs)]; - -export const pow = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createBinaryOpProgramInfoLoader(inputs, 'Pow', 'pow'), inputs); - -// export const pRelu = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslPRelu()), inputs)]; - -export const sub = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createBinaryOpProgramInfoLoader(inputs, 'Sub', (a, b) => `${a}-${b}`), inputs); - -// export const xor = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createBinaryProgramInfoLoader(handler, inputs, glslXor(), 'bool'), inputs)]; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/common.ts b/js/web/lib/onnxjs/backends/webgpu/ops/common.ts deleted file mode 100644 index ec7ec3107e084..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/common.ts +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// Licensed under the MIT License. - - import {ShapeUtil} from '../../../util'; - - /** - * constant value for a workgroup size. - * - * We can certainly optimize this further in the future, but for now we use 64. - * - * rule of thumb: Use [a workgroup size of] 64 unless you know what GPU you are targeting or that your workload - * needs something different. - * - * from: https://surma.dev/things/webgpu/ - **/ -export const WORKGROUP_SIZE = 64; - -export interface IndicesHelper { - /** - * WGSL code of function implementation for offset-to-indices - */ - o2iImpl: string; - /** - * WGSL code of function call for offset-to-indices - */ - o2iCall: (varOffset: string, varIndices: string) => string; - /** - * WGSL code of function implementation for indices-to-offset - */ - i2oImpl: string; - /** - * WGSL code of the expression for indices-to-offset - * - * @param isPtr - whether the variable is a pointer. default is false. - */ - i2oExpression: (varIndices: string, isPtr?: boolean) => string; - /** - * WGSL code of indices variable declaration - * - * @param v - variable name. - * @param init - initial value. - */ - indicesVariableDeclaration: (v: string, init?: string[]) => string; - /** - * data type of indices - */ - iType: string; -} - -export const createIndicesHelper = (name: string, shape: readonly number[]) => { - const iType = shape.length < 2 ? 'u32' : `array<u32, ${shape.length}>`; - - const strides = ShapeUtil.computeStrides(shape); - let o2iSnippet = ''; - for (let i = 0; i < shape.length - 1; i++) { - o2iSnippet += ` - let dim${i} = current / ${strides[i]}u; - let rest${i} = current % ${strides[i]}u; - (*indices)[${i}] = dim${i}; - current = rest${i}; - `; - } - o2iSnippet += `(*indices)[${shape.length - 1}] = current;`; - - const o2iImpl = shape.length < 2 ? '' : ` - fn ih_o2i_${name}(offset: u32, indices: ptr<function, ${iType}>) { - var current = offset; - ${o2iSnippet} - }`; - - const o2iCall = (varOffset: string, varIndices: string) => - shape.length < 2 ? `${varIndices}=${varOffset};` : `ih_o2i_${name}(${varOffset}, &${varIndices});`; - - const offsets: string[] = []; - for (let i = shape.length - 1; i >= 0; i--) { - offsets.push(`${strides[i]}u * ((*indices)[${i}])`); - } - - const i2oImpl = shape.length < 2 ? '' : ` - fn ih_i2o_${name}(indices: ptr<function, ${iType}>) -> u32 { - return ${offsets.length > 0 ? offsets.join('+') : '0u'}; - }`; - - const i2oExpression = (varIndices: string, isPtr?: boolean) => - shape.length < 2 ? `(${isPtr ? '*' : ''}${varIndices})` : `ih_i2o_${name}(${isPtr ? '' : '&'}${varIndices})`; - - const indicesVariableDeclaration = (v: string, init?: string[]) => - `var ${v}:${iType}${init ? `=${iType}(${init.join(',')})` : ''};`; - - return {o2iImpl, o2iCall, i2oImpl, i2oExpression, indicesVariableDeclaration, iType}; -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts b/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts deleted file mode 100644 index 9616ca7ae5196..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/concat.ts +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
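createIndicesHelper (common.ts, above) emits WGSL that converts between a flat buffer offset and an n-dimensional index using row-major strides. The same arithmetic in plain TypeScript, as a hypothetical reference that is not part of this diff:

// Row-major strides; offset -> indices by repeated div/mod, and back by a dot product.
const computeStrides = (shape: readonly number[]): number[] => {
  const strides = new Array<number>(shape.length).fill(1);
  for (let i = shape.length - 2; i >= 0; i--) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
  return strides;
};

const offsetToIndices = (offset: number, strides: readonly number[]): number[] =>
    strides.map(s => { const d = Math.floor(offset / s); offset %= s; return d; });

const indicesToOffset = (indices: readonly number[], strides: readonly number[]): number =>
    indices.reduce((acc, idx, i) => acc + idx * strides[i], 0);

// Example for shape [2, 3, 4]: strides are [12, 4, 1], so offset 17 <-> [1, 1, 1].
const strides = computeStrides([2, 3, 4]);
console.log(offsetToIndices(17, strides));         // [1, 1, 1]
console.log(indicesToOffset([1, 1, 1], strides));  // 17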
- - import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; - import {Graph} from '../../../graph'; - import {OperatorInitialization} from '../../../operators'; - import {Tensor} from '../../../tensor'; - import {ShapeUtil} from '../../../util'; - import {WebGpuInferenceHandler} from '../inference-handler'; - import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - - import {createIndicesHelper, IndicesHelper, WORKGROUP_SIZE} from './common'; - - export interface ConcatAttributes extends AttributeWithCacheKey { - readonly axis: number; - } - - export const concat = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConcatAttributes): Promise<Tensor[]> => { - validateInputs(inputs); - return inferenceHandler.run(createConcatProgramInfoLoader(inputs, attributes), inputs); -}; - -const createConcatProgramMetadata = (inputCount: number, cacheHint: string) => - ({name: 'Concat', inputTypes: Array(inputCount).fill(GpuDataType.default), cacheHint}); - -const createConcatProgramInfo = - (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { - const inputShape = inputs[0].dims.slice(); - if (axis >= inputShape.length || axis < (-1 * inputShape.length)) { - throw new Error('axis specified for concat doesn\'t match input dimensionality'); - } - if (axis < 0) { - axis = inputShape.length + axis; - } - // ensure all of the non-concatenated axes match each other - // calculate the shape of the output tensor while we do that - const outputShape = inputShape.slice(0); - for (let i = 1; i < inputs.length; i++) { - const dataNShape = inputs[i].dims.slice(); - for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) { - // add to the placeholder for computing output shape - if (axisIndex === axis) { - outputShape[axis] += dataNShape[axisIndex]; - } - // ensure all non-concatenated axes match each other - else if (inputShape[axisIndex] !== dataNShape[axisIndex]) { - throw new Error('non concat dimensions must match'); - } - } - } - - const outputSize = ShapeUtil.size(outputShape); - const rank = outputShape.length; - - const sizeInConcatAxis = new Array<number>(inputs.length); - const inputStorageBuffersDeclarations = new Array<string>(inputs.length); - const inputIndicesHelpers = new Array<IndicesHelper>(inputs.length); - - let previousSum = 0; - for (let i = 0; i < inputs.length; ++i) { - previousSum += inputs[i].dims[axis]; - sizeInConcatAxis[i] = previousSum; - - inputStorageBuffersDeclarations[i] = - `@group(0) @binding(${i}) var<storage, read> input${i} : array<${dataType}>;`; - - inputIndicesHelpers[i] = createIndicesHelper(`input${i}`, inputs[i].dims); - } - - const outputIndicesHelper = createIndicesHelper('output', outputShape); - - const indicesAxis = rank < 2 ? 'indices' : `indices[${axis}]`; - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var<storage, write> output : array<${dataType}>; - - ${inputIndicesHelpers.map(i => i.i2oImpl).join('\n')} - ${outputIndicesHelper.o2iImpl} - - const sizeInConcatAxis = array<u32, ${sizeInConcatAxis.length}>(${sizeInConcatAxis.map(i => `${i}u`).join(',')}); - ${calculateInputIndexImpl(sizeInConcatAxis.length)} - ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - - let textureIndex = calculateInputIndex(${indicesAxis}); - if (textureIndex != 0u) { - ${indicesAxis} -= sizeInConcatAxis[textureIndex - 1u]; - } - - output[global_id.x] = readBufferData(textureIndex, &indices); - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const createConcatProgramInfoLoader = (inputs: Tensor[], attributes: ConcatAttributes): ProgramInfoLoader => { - const metadata = createConcatProgramMetadata(inputs.length, attributes.cacheKey); - return {...metadata, get: () => createConcatProgramInfo(metadata, inputs, attributes.axis)}; -}; - -const calculateInputIndexImpl = (numberOfTensors: number): string => ` - fn calculateInputIndex(index: u32) -> u32 { - for (var i: u32 = 0u; i < ${numberOfTensors}u; i += 1u ) { - if (index < sizeInConcatAxis[i]) { - return i; - } - } - return ${numberOfTensors}u; - }`; - -const readBufferDataImpl = (indicesHelper: readonly IndicesHelper[], tensorRank: number, dataType: string) => { - const numberOfTensors = indicesHelper.length; - const codeLines: string[] = []; - for (let i = 0; i < numberOfTensors; ++i) { - const returnSnippet = `return input${i}[${indicesHelper[i].i2oExpression('indices', true)}];`; - if (numberOfTensors === 1) { - codeLines.push(returnSnippet); - } else if (i === 0) { - codeLines.push(`if (textureIndex == ${i}u) { ${returnSnippet} }`); - } else if (i === numberOfTensors - 1) { - codeLines.push(`else { ${returnSnippet} }`); - } else { - codeLines.push(`else if (textureIndex == ${i}u) { ${returnSnippet} }`); - } - } - return ` - fn readBufferData(textureIndex: u32, indices: ptr<function, ${tensorRank < 2 ? 'u32' : `array<u32, ${tensorRank}>`}>) -> ${dataType} { - ${codeLines.join('\n')} - }`; -}; - -export const parseConcatAttributes: OperatorInitialization<ConcatAttributes> = (node: Graph.Node): ConcatAttributes => - createAttributeWithCacheKey({axis: node.attributes.getInt('axis')}); - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length < 1) { - throw new Error('too few inputs'); - } - - const inputType = inputs[0].type; - const inputDimensionality = inputs[0].dims.length; - - // TODO: Support string concat - if (inputType === 'string') { - throw new Error('string tensor is not supported yet'); - } - - for (const input of inputs) { - // make sure types of all inputs match - if (input.type !== inputType) { - throw new Error('input tensors should be one type'); - } - - // make sure the dimensionality of all inputs are the same - if (input.dims.length !== inputDimensionality) { - throw new Error('input tensors should have the same shape'); - } - } -}; diff 
--git a/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts b/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts deleted file mode 100644 index 570ec041a34fc..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/conv-grouped.ts +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Logger} from '../../../instrument'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; -import {calculateOutputShape, ConvAttributes} from './conv'; -import {getActicationSnippet} from './fuse-utils'; - -const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): ProgramMetadata => ({ - name: 'GroupedConv', - inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint -}); - -const createGroupedConvProgramInfo = - (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], metadata: ProgramMetadata, - attributes: ConvAttributes): ProgramInfo => { - const hasBias = inputs.length > 2; - const processBias = hasBias ? 'value += b[output_channel];' : ''; - const xShape = inputs[0].dims; - const wShape = inputs[1].dims; - const outputChannelsPerGroup = wShape[0] / attributes.group; - - const dataType = 'f32'; // TODO: support other data type - const {activationFunction, applyActivation} = getActicationSnippet(attributes); - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var x : array<${dataType}>;`, - `@group(0) @binding(1) var w : array<${dataType}>;` - ]; - if (hasBias) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var b : array<${dataType}>;`); - } - - Logger.verbose( - 'GroupedConv', - `autpPad:${attributes.autoPad}, dilations:${attributes.dilations}, group:${attributes.group}, kernelShape:${ - attributes.kernelShape}, pads:${attributes.pads}, strides:${attributes.strides}`); - const outputShape = - calculateOutputShape(xShape, wShape, attributes.dilations, attributes.pads, attributes.strides); - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const xIndicesHelper = createIndicesHelper('x', xShape); - const wIndicesHelper = createIndicesHelper('w', wShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); - const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); - - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputStorageBuffersDeclarations.length}) var output : array<${dataType}>; - - ${activationFunction} - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - ${wIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} - let batch: u32 = outputIndices[0]; - let output_channel: u32 = outputIndices[1]; - let xRCCorner: vec2 = vec2(outputIndices[2], outputIndices[3]) * strides - pads; - let 
group_id: u32 = output_channel / ${outputChannelsPerGroup}u; - - var value: ${dataType} = ${dataType}(0); - for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { - let input_channel = group_id * ${wShape[1]}u + wInChannel; - for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { - let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; - - if (xHeight < 0u || xHeight >= ${xShape[2]}u) { - continue; - } - - for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { - let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; - if (xWidth < 0u || xWidth >= ${xShape[3]}u) { - continue; - } - - ${ - xIndicesHelper.indicesVariableDeclaration( - 'xIndices', - [ - 'batch', 'input_channel', 'xHeight', 'xWidth' - ])} - let xVal = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${ - wIndicesHelper.indicesVariableDeclaration('wIndices', [ - 'output_channel', 'wInChannel', 'wHeight', 'wWidth' - ])} - let wVal = w[${wIndicesHelper.i2oExpression('wIndices')}]; - value += xVal*wVal; - } - } - } - ${processBias} - ${applyActivation} - output[global_id.x] = value; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -export const createGroupedConvProgramInfoLoader = - (inferenceHandler: WebGpuInferenceHandler, inputs: readonly Tensor[], attributes: ConvAttributes): - ProgramInfoLoader => { - const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); - return {...metadata, get: () => createGroupedConvProgramInfo(inferenceHandler, inputs, metadata, attributes)}; - }; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts b/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts deleted file mode 100644 index 644e9b08c7030..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/conv.ts +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
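The GroupedConv kernel deleted above maps every output channel to a group and reads only that group's slice of input channels. A CPU-side TypeScript sketch of that bookkeeping, with made-up helper names that are not part of this patch:

// Sketch: with G groups, output channel `oc` belongs to group
// floor(oc / outPerGroup) and reads input channels [g*inPerGroup, (g+1)*inPerGroup).
const groupedChannelMapping = (outChannels: number, inChannels: number, group: number) => {
  const outPerGroup = outChannels / group;  // corresponds to wShape[0] / group above
  const inPerGroup = inChannels / group;    // corresponds to wShape[1] above
  return (oc: number) => {
    const g = Math.floor(oc / outPerGroup);
    // input channels visited for this output channel:
    return Array.from({length: inPerGroup}, (_, wIn) => g * inPerGroup + wIn);
  };
};

// Example: 4 output channels, 4 input channels, 2 groups ->
// output channel 3 is in group 1 and reads input channels [2, 3].
console.log(groupedChannelMapping(4, 4, 2)(3));  // [2, 3]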
- -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {InferenceHandler} from '../../../backend'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {PoolConvUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -import {createGroupedConvProgramInfoLoader} from './conv-grouped'; -// import {createDotProductProgramInfoLoader} from './dot-product'; -import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -// import {createIm2ColProgramInfoLoader} from './im2col'; -// import {createMatmulProgramInfoLoader} from './matmul'; - - -export const calculateOutputShape = - (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], - adjustPads: readonly number[], strides: readonly number[]): number[] => { - const batchSize = inputShape[0]; - const inputSpatialShape = inputShape.slice(2); - const spatialRank = inputSpatialShape.length; - const outChannels = kernelShape[0]; - const kernelSpatialShape = kernelShape.slice(2); - const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); - const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); - const outputSpatialShape = - inputSpatialShapeWithPad.map((v, i) => Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i])); - const outputShape = [batchSize, outChannels].concat(...outputSpatialShape); - return outputShape; - }; - -export interface ConvAttributes extends InternalActivationAttributes, AttributeWithCacheKey { - readonly autoPad: string; - readonly dilations: readonly number[]; - readonly group: number; - readonly kernelShape: readonly number[]; - readonly pads: readonly number[]; - readonly strides: readonly number[]; -} - -export const conv: OperatorAsyncImplementation = - async(inferenceHandler: InferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { - validateInputs(inputs, attributes); // currently will fail if not conv2D - return conv2d(inferenceHandler, inputs, attributes); -}; - -const conv2d: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ConvAttributes): Promise => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); - // const isPointwise = adjustedAttributes.kernelShape[0] === 1 && adjustedAttributes.kernelShape[1] === 1; - // if (adjustedAttributes.group > 1) { - return inferenceHandler.run(createGroupedConvProgramInfoLoader(inferenceHandler, inputs, adjustedAttributes), inputs); - // } else if (isPointwise) { - // return conv2DPointwise(inferenceHandler, inputs, adjustedAttributes); - // } else { - // return conv2D(inferenceHandler, inputs, adjustedAttributes); - // } -}; - -const getAdjustedConvAttributes = (attributes: T, inputs: Tensor[]): T => { - const kernelShape = attributes.kernelShape.slice(); - // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims - if (attributes.kernelShape.length === 0) { - for (let i = 2; i < inputs[1].dims.length; ++i) { - kernelShape.push(inputs[1].dims[i]); - } - } - const pads = attributes.pads.slice(); - PoolConvUtil.adjustPadsBasedOnAutoPad( - inputs[0].dims, attributes.strides, attributes.dilations, kernelShape, pads, attributes.autoPad); - - // 
always return a new object so does not modify the original attributes - const newAttributes: T = Object.assign({}, attributes); - Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); - return newAttributes; -}; - -export const parseConvAttributes: OperatorInitialization = (node: Graph.Node): ConvAttributes => { - const attributes = node.attributes; - const activationAttributes = parseInternalActivationAttributes(attributes); - // TODO : Make this generic enough to compute default attributes for multi-dimensional conv - const autoPad = attributes.getString('auto_pad', 'NOTSET'); - const dilations = attributes.getInts('dilations', [1, 1]); - const group = attributes.getInt('group', 1); - const kernelShape = attributes.getInts('kernel_shape', []); - const pads = attributes.getInts('pads', [0, 0, 0, 0]); - const strides = attributes.getInts('strides', [1, 1]); - - return createAttributeWithCacheKey({autoPad, dilations, group, kernelShape, pads, strides, ...activationAttributes}); -}; - -const validateInputs = (inputs: Tensor[], attributes: ConvAttributes): void => { - // Refer to the below link for all input checks - // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv - if (!inputs || (inputs.length !== 2 && inputs.length !== 3)) { - throw new Error('Conv requires 2 or 3 inputs'); - } - - // TODO : Need to add support for multi-dimensional conv - if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { - throw new Error('currently only support 2-dimensional conv'); - } - - // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL - const dataChannel = inputs[0].dims[1]; - const filterInChannel = inputs[1].dims[1] * attributes.group; - if (dataChannel !== filterInChannel) { - throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); - } - - // if bias is provided it should be 1D and the number of elements should be equal to the number of feature maps - if (inputs.length === 3 && (inputs[2].dims.length !== 1 || inputs[1].dims[0] !== inputs[2].dims[0])) { - throw new Error('invalid bias'); - } - - const spatialRank = inputs[0].dims.length - 2; - // wrong dilations dimension - if (attributes.dilations.length !== spatialRank) { - throw new Error(`dilations should be ${spatialRank}D`); - } - - // Wrong strides dimension - if (attributes.strides.length !== spatialRank) { - throw new Error(`strides should be ${spatialRank}D`); - } - - // Wrong pads dimension - if (attributes.pads.length !== spatialRank * 2) { - throw new Error(`pads should be ${spatialRank * 2}D`); - } - - // if kernelShape is specified, it's data length must be 2 less than dims length of the weights tensor - // (the first 2 dims are batch_size and channels) - if (attributes.kernelShape.length !== 0 && attributes.kernelShape.length !== inputs[1].dims.length - 2) { - throw new Error('invalid kernel shape'); - } - - // TODO : Need to add support for float64 - if (inputs[0].type !== 'float32' || inputs[1].type !== 'float32') { - throw new Error('Conv input(X,W) should be float tensor'); - } - - if (inputs.length === 3 && inputs[2].type !== 'float32') { - throw new Error('Conv input(bias) should be float tensor'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts b/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts deleted file mode 100644 index 355685b55ad6a..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/fuse-utils.ts +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// Licensed under the MIT License. - -import {Attribute} from '../../../attribute'; -import {MAX_CLIP, MIN_CLIP} from '../../../util'; - -export interface InternalActivationAttributes { - readonly activation: string; - readonly clipMin?: number; - readonly clipMax?: number; - readonly activationCacheKey: string; -} - -export function getActicationSnippet(attributes: InternalActivationAttributes) { - switch (attributes.activation) { - case 'Relu': - return {activationFunction: '', applyActivation: 'value = max(value, 0.0);'}; - case 'Sigmoid': - return {activationFunction: '', applyActivation: 'value = (1.0 / (1.0 + exp(-value)));'}; - case 'Clip': - return { - activationFunction: `const clip_min_=f32(${attributes.clipMin!});const clip_max_=f32(${attributes.clipMax!});`, - applyActivation: 'value = clamp(value, clip_min_, clip_max_);' - }; - // TODO: adding other activations that can be fused. - default: - return {activationFunction: '', applyActivation: ''}; - } -} - -export const parseInternalActivationAttributes = (attributes: Attribute): InternalActivationAttributes => { - const activation = attributes.getString('activation', ''); - - if (activation === 'Clip') { - const [clipMin, clipMax] = attributes.getFloats('activation_params', [MIN_CLIP, MAX_CLIP]); - return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; - } - return {activation, activationCacheKey: activation}; -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts b/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts deleted file mode 100644 index 65f679a2cea83..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/gather.ts +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
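For the activation-fusion helper above, the returned strings are spliced into a generated shader: activationFunction is emitted once at module scope and applyActivation runs on `value` just before the final store. A usage sketch (the attribute literal is illustrative; getActicationSnippet is the deleted helper above):

// Fused Clip(0, 6) becomes a clamp() on `value` in the generated WGSL.
const clipAttrs = {activation: 'Clip', clipMin: 0, clipMax: 6, activationCacheKey: 'Clip:0,6'};
const {activationFunction, applyActivation} = getActicationSnippet(clipAttrs);
// activationFunction: 'const clip_min_=f32(0);const clip_max_=f32(6);'
// applyActivation:    'value = clamp(value, clip_min_, clip_max_);'
console.log(activationFunction, applyActivation);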
- -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {NUMBER_TYPES, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -interface GatherAttributes extends AttributeWithCacheKey { - readonly axis: number; -} - -export const gather = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise => { - validateInputs(inputs, attributes.axis); - return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs); -}; - -export const parseGatherAttributes: OperatorInitialization = (node: Graph.Node): GatherAttributes => - createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)}); - -const gatherProgramMetadata = { - name: 'Gather', - inputTypes: [GpuDataType.default, GpuDataType.default] -}; - -const createGatherProgramInfo = - (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { - const dataShape = inputs[0].dims.slice(); - const indicesShape = inputs[1].dims.slice(); - const outputShape = new Array(dataShape.length + indicesShape.length - 1); - - axis = ShapeUtil.normalizeAxis(axis, dataShape.length); - const indexCopyOps: string[] = []; - if (indicesShape.length > 1) { - indexCopyOps.push('indicesIdx[0] = 0u;'); - } else { - indexCopyOps.push('indicesIdx = 0u;'); - } - for (let i = 0; i < outputShape.length; i++) { - // outputShape is divided into three parts: A, B, C - // |0 axis| axis + indicesShape.length | end| - // | A | B | C | - // - // dataIdx: [A, inputs[1][B], C] - const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx'; - if (i < axis) { // A - const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i}]` : 'dataIdx'; - outputShape[i] = dataShape[i]; - indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); - } else { - if (i < axis + indicesShape.length) { // B - const indicesIdxLValue = indicesShape.length > 1 ? `indicesIdx[${i - axis}]` : 'indicesIdx'; - outputShape[i] = indicesShape[i - axis]; - indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`); - } else { // C - const dataIdxLValue = dataShape.length > 1 ? 
`dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx'; - outputShape[i] = dataShape[i - indicesShape.length + 1]; // skip 1 for axis - indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); - } - } - } - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const dataIndicesHelper = createIndicesHelper('data', dataShape); - const indicesIndicesHelper = createIndicesHelper('indices', indicesShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var<storage, read> data : array<${dataType}>; - @group(0) @binding(1) var<storage, read> indices : array<i32>; - @group(0) @binding(2) var<storage, write> output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${indicesIndicesHelper.i2oImpl} - ${dataIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')} - ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')} - ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')} - ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')} - ${indexCopyOps.join('\n ')} - let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}]; - dataIdx${dataShape.length > 1 ? `[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0)); - output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}]; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => { - const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey}; - return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)}; -}; - -const validateInputs = (inputs: Tensor[], axis: number): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Gather requires 2 inputs.'); - } - const tensorRank = inputs[0].dims.length; - if (tensorRank < 1) { - throw new Error('Invalid input shape.'); - } - if (axis < -tensorRank || axis > tensorRank - 1) { - throw new Error('Invalid axis.'); - } - if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { - throw new Error('Invalid input type.'); - } - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts b/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts deleted file mode 100644 index 3eeb49c91033a..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/gemm.ts +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
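The Gather kernel (gather.ts, above) builds its output shape by substituting the indices shape for the gathered axis of the data shape. A one-function TypeScript sketch of that shape rule, for illustration only:

// Output shape of Gather: data.shape with dimension `axis` replaced by indices.shape.
const gatherShape = (dataShape: number[], indicesShape: number[], axis: number): number[] =>
    [...dataShape.slice(0, axis), ...indicesShape, ...dataShape.slice(axis + 1)];

// Example: data [3, 4, 5], indices [2, 2], axis=1 -> output [3, 2, 2, 5].
console.log(gatherShape([3, 4, 5], [2, 2], 1));  // [3, 2, 2, 5]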
- -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {GemmUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export interface GemmAttributes extends AttributeWithCacheKey { - transA: boolean; - transB: boolean; - alpha: number; - beta: number; - isOptionalC: boolean; // in opset 11, C becomes optional -} - -export const gemm: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GemmAttributes): Promise => { - validateInputs(inputs, attributes); - return inferenceHandler.run(createGemmProgramInfoLoader(inputs, attributes), inputs); -}; - -const parseGemmAttributes = (node: Graph.Node, isOptionalC: boolean): GemmAttributes => { - const transA = node.attributes.getInt('transA', 0) !== 0; - const transB = node.attributes.getInt('transB', 0) !== 0; - const alpha = node.attributes.getFloat('alpha', 1.0); - const beta = node.attributes.getFloat('beta', 1.0); - return createAttributeWithCacheKey({transA, transB, alpha, beta, isOptionalC}); -}; - -export const parseGemmAttributesV7: OperatorInitialization = (node: Graph.Node): GemmAttributes => - parseGemmAttributes(node, false); - -export const parseGemmAttributesV11: OperatorInitialization = (node: Graph.Node): GemmAttributes => - parseGemmAttributes(node, true); - -const createGemmProgramInfoLoader = (inputs: Tensor[], attributes: GemmAttributes): ProgramInfoLoader => { - const metadata = { - name: 'Gemm', - inputTypes: inputs.length === 3 ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint: attributes.cacheKey - }; - - return {...metadata, get: () => createGemmProgramInfo(metadata, inputs, attributes)}; -}; - -const offsetC = (m: number, n: number, dims: readonly number[]): string => { - const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); - const broadcastN = dims[dims.length - 1] !== n; - - let offset = '0u'; - if (!broadcastM) { - offset += `+ m * ${dims[dims.length - 1]}u`; - } - if (!broadcastN) { - offset += '+n'; - } - - return offset; -}; - -const createGemmProgramInfo = - (metadata: ProgramMetadata, inputs: Tensor[], attributes: GemmAttributes): ProgramInfo => { - const aShape = inputs[0].dims.slice(); - const bShape = inputs[1].dims.slice(); - const [M, N, K] = GemmUtil.getShapeOfGemmResult( - aShape, attributes.transA, bShape, attributes.transB, inputs.length === 3 ? 
inputs[2].dims : undefined); - const outputShape = [M, N]; - if (!outputShape) { - throw new Error('Can\'t use gemm on the given tensors'); - } - const outputSize = ShapeUtil.size(outputShape); - let line = ''; - if (attributes.transA && attributes.transB) { - line = 'value += a[k * M + m] * b[n * K + k];'; - } else if (attributes.transA && !attributes.transB) { - line = 'value += a[k * M + m] * b[k * N + n];'; - } else if (!attributes.transA && attributes.transB) { - line = 'value += a[m * K + k] * b[n * K + k];'; - } else if (!attributes.transA && !attributes.transB) { - line = 'value += a[m * K + k] * b[k * N + n];'; - } - - const dataType = 'f32'; // TODO: support other data type - const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; - const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var<storage, read> a : array<${dataType}>;`, - `@group(0) @binding(1) var<storage, read> b : array<${dataType}>;` - ]; - if (inputs.length === 3) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var<storage, read> c : array<${dataType}>;`); - } - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha = ${dataType}(${attributes.alpha}); - const beta = ${dataType}(${attributes.beta}); - - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var<storage, write> output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3<u32>) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let m = global_id.x / N; - let n = global_id.x % N; - - var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { - ${line} - } - - ${calculateAlpha} - ${calculateC} - output[global_id.x] = value; - - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const validateInputs = (inputs: Tensor[], attributes: GemmAttributes): void => { - if (!inputs) { - throw new Error('Input is missing'); - } - if (attributes.isOptionalC && (inputs.length < 2 || inputs.length > 3)) { - throw new Error('Invalid input shape.'); - } - if (!attributes.isOptionalC && inputs.length !== 3) { - throw new Error('Gemm requires 3 inputs'); - } - - // 'C' can be of dimensionality 1 or 2 only - if (inputs.length === 3 && inputs[2].dims.length !== 1 && inputs[2].dims.length !== 2) { - throw new Error('Invalid input shape of C'); - } - - if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || - (inputs[1].type !== 'float32' && inputs[1].type !== 'float64') || - (inputs.length === 3 && inputs[2].type !== 'float32' && inputs[2].type !== 'float64')) { - throw new Error('Invalid input type.'); - } - - if ((inputs[0].type !== inputs[1].type) || (inputs.length === 3 && inputs[0].type !== inputs[2].type)) { - throw new Error('Input types are mismatched'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts b/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts deleted file mode 100644 index 5b8f0bf94733e..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/matmul.ts +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
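The Gemm shader above accumulates value = sum_k a[...] * b[...] with one of four index patterns selected by transA/transB, then applies alpha and the broadcast C term. A plain TypeScript reference for the non-transposed case, assuming (unlike offsetC above, which handles broadcasting) that C is already a full M-by-N buffer:

// Reference sketch, CPU-side, of Y[m,n] = alpha * sum_k A[m,k]*B[k,n] + beta * C[m,n].
const gemmRef = (a: Float32Array, b: Float32Array, M: number, N: number, K: number,
                 alpha = 1, beta = 0, c?: Float32Array): Float32Array => {
  const y = new Float32Array(M * N);
  for (let m = 0; m < M; m++) {
    for (let n = 0; n < N; n++) {
      let value = 0;
      for (let k = 0; k < K; k++) {
        value += a[m * K + k] * b[k * N + n];  // same as the `!transA && !transB` line above
      }
      y[m * N + n] = alpha * value + (c ? beta * c[m * N + n] : 0);
    }
  }
  return y;
};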
- -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {BroadcastUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; -import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -export const matMul: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes): - Promise => { - validateInputs(inputs); - - return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs); - }; - -export const parseMatMulAttributes: OperatorInitialization = - (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes); - -const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ - name: 'MatMul', - inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : - [GpuDataType.default, GpuDataType.default], - cacheHint -}); - -function createMatmulProgramInfo( - metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo { - const aShape = inputs[0].dims; - const bShape = inputs[1].dims; - const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); - if (!outputShape) { - throw new Error('Can\'t use matmul on the given tensors'); - } - const outputSize = ShapeUtil.size(outputShape); - // TODO: support broadcasting - - const dataType = 'f32'; // TODO: support other data type - const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); - - const M = outputShape[outputShape.length - 2]; - const K = aShape[aShape.length - 1]; - const N = outputShape[outputShape.length - 1]; - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - - @group(0) @binding(0) var a : array<${dataType}>; - @group(0) @binding(1) var b : array<${dataType}>; - @group(0) @binding(2) var output : array<${dataType}>; - - ${activationFunction} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let stack = global_id.x / (M * N); - let mn = global_id.x % (M * N); - let n = global_id.x % N; - let m = mn / N; - - let offsetA = stack * (M * K); - let offsetB = stack * (K * N); - - var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { - value += a[offsetA + m * K + k] * b[offsetB + k * N + n]; - } - ${applyActivation} - output[global_id.x] = value; - }`; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; -} - -export function createMatmulProgramInfoLoader( - inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader { - const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); - return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; -} - -const validateInputs = (inputs: 
Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('MatMul requires 2 inputs.'); - } - - if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { - throw new Error('shared dimension does not match.'); - } - - if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || - (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) { - throw new Error('inputs should be float type'); - } - - if (inputs[0].type !== inputs[1].type) { - throw new Error('inputs types should match'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts b/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts deleted file mode 100644 index 0e92ff8cb906a..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/pool.ts +++ /dev/null @@ -1,376 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {PoolConvUtil, ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -export interface AveragePoolAttributes extends AttributeWithCacheKey { - readonly autoPad: string; - readonly ceilMode: number; - readonly countIncludePad: boolean; - readonly kernelShape: readonly number[]; - readonly strides: readonly number[]; - readonly pads: readonly number[]; -} - -export const averagePool: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): - Promise => { - validateInputs(inputs); - const metadata = {name: 'AveragePool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return inferenceHandler.run( - {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, false, attributes)}, inputs); - }; - -export const parseAveragePoolAttributes: OperatorInitialization = - (node: Graph.Node): AveragePoolAttributes => { - const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); - const ceilMode = node.attributes.getInt('ceil_mode', 0); - const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? 
false : true); - const kernelShape = node.attributes.getInts('kernel_shape'); - const strides = node.attributes.getInts('strides', []); - const pads = node.attributes.getInts('pads', []); - - // TODO: support attribute 'ceil_mode' - if (ceilMode !== 0) { - throw new Error('using ceil() in shape computation is not yet supported for AveragePool'); - } - - return createAttributeWithCacheKey({autoPad, ceilMode, countIncludePad, kernelShape, strides, pads}); - }; - -const createAveragePoolProgramInfo = - (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, - attributes: AveragePoolAttributes): ProgramInfo => { - const [adjustedAttributes, outputShape] = - getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); - const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); - - const dataType = 'f32'; - - const op1 = 'value += x_val;'; - let op2 = ''; - if (adjustedAttributes.countIncludePad) { - op2 += `value /= ${dataType}(${kernelSize});`; - } else { - op2 += `value /= ${dataType}(${kernelSize} - pad);`; - } - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'), - dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) - }; - }; - -export const globalAveragePool: OperatorAsyncImplementation = - async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: AveragePoolAttributes): - Promise => { - validateInputs(inputs); - const metadata = { - name: 'GlobalAveragePool', - inputTypes: [GpuDataType.default], - cacheHint: `${attributes.countIncludePad}` - }; - return inferenceHandler.run( - {...metadata, get: () => createAveragePoolProgramInfo(inputs, metadata, true, attributes)}, inputs); - }; - -export const parseGlobalAveragePoolAttributes: OperatorInitialization = - (node: Graph.Node): AveragePoolAttributes => { - const countIncludePad = (node.attributes.getInt('count_include_pad', 0) === 0 ? 
false : true); - return createAttributeWithCacheKey( - {autoPad: '', ceilMode: 0, countIncludePad, kernelShape: [], strides: [], pads: []}); - }; - -export interface MaxPoolAttributes extends AveragePoolAttributes { - readonly storageOrder: number; - readonly dilations: number[]; -} - -export const maxPool: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: MaxPoolAttributes): Promise => { - validateInputs(inputs); - const metadata = {name: 'MaxPool', inputTypes: [GpuDataType.default], cacheHint: attributes.cacheKey}; - return inferenceHandler.run( - {...metadata, get: () => createMaxPoolProgramInfo(inputs, metadata, false, attributes)}, inputs); -}; - -export const parseMaxPoolAttributes: OperatorInitialization = - (node: Graph.Node): MaxPoolAttributes => { - const autoPad = node.attributes.getString('auto_pad', 'NOTSET'); - const ceilMode = node.attributes.getInt('ceil_mode', 0); - const kernelShape = node.attributes.getInts('kernel_shape'); - const strides = node.attributes.getInts('strides', []); - const pads = node.attributes.getInts('pads', []); - const storageOrder = node.attributes.getInt('storage_order', 0); - const dilations = node.attributes.getInts('dilations', []); - - // TODO: support attribute 'ceil_mode' and 'storage_order' - if (storageOrder !== 0) { - throw new Error('column major storage order is not yet supported for MaxPool'); - } - if (ceilMode !== 0) { - throw new Error('using ceil() in shape computation is not yet supported for MaxPool'); - } - - return createAttributeWithCacheKey( - {autoPad, ceilMode, countIncludePad: false, kernelShape, strides, pads, storageOrder, dilations}); - }; - -const createMaxPoolProgramInfo = - (inputs: Tensor[], metadata: ProgramMetadata, isGlobalOperator: boolean, attributes: MaxPoolAttributes): - ProgramInfo => { - const [adjustedAttributes, outputShape] = - getAdjustedPoolAttributesAndOutputShape(inputs, attributes, isGlobalOperator); - const op1 = ` - value = max(x_val, value); - `; - const op2 = ''; - return { - ...metadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource: generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), - dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) - }; - }; - -const getAdjustedPoolAttributesAndOutputShape = - (inputs: Tensor[], attributes: AveragePoolAttributes|MaxPoolAttributes, isGlobalOperator: boolean): - [AveragePoolAttributes|MaxPoolAttributes, number[]] => { - const inputShape = inputs[0].dims.slice(); - const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); - const kernelShape = attributes.kernelShape.slice(); - const strides = attributes.strides.slice(); - const dilations: number[] = hasDilations ? 
(attributes as MaxPoolAttributes).dilations.slice() : []; - const pads = attributes.pads.slice(); - PoolConvUtil.adjustPoolAttributes(isGlobalOperator, inputShape, kernelShape, strides, dilations, pads); - - const outputShape = PoolConvUtil.computePoolOutputShape( - isGlobalOperator, inputShape, strides, dilations, kernelShape, pads, attributes.autoPad); - - const newAttributes = Object.assign({}, attributes); - if (hasDilations) { - Object.assign(newAttributes, {kernelShape, strides, pads, dilations, cacheKey: attributes.cacheKey}); - } else { - Object.assign(newAttributes, {kernelShape, strides, pads, cacheKey: attributes.cacheKey}); - } - return [newAttributes, outputShape]; - }; - -const globalMaxPoolAttributes = { - autoPad: '', - ceilMode: 0, - countIncludePad: false, - kernelShape: [], - strides: [], - pads: [], - storageOrder: 0, - dilations: [], - cacheKey: '' -}; - -const globalMaxPoolMetadata = { - name: 'GlobalMaxPool', - inputTypes: [GpuDataType.default] -}; - -export const globalMaxPool = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...globalMaxPoolMetadata, - get: () => createMaxPoolProgramInfo(inputs, globalMaxPoolMetadata, true, globalMaxPoolAttributes) - }, - inputs); -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Pool ops requires 1 input.'); - } - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('Invalid input type.'); - } -}; - -const generatePoolingCode = - (inputDims: readonly number[], outputShape: readonly number[], attributes: AveragePoolAttributes, op1: string, - op2: string, dataType: string, start: string): string => { - const rank = inputDims.length; - const outputSize = ShapeUtil.size(outputShape); - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const xIndicesHelper = createIndicesHelper('x', inputDims); - - if (attributes.kernelShape.length <= 2) { - const kw = attributes.kernelShape[attributes.kernelShape.length - 1]; - const sw = attributes.strides[attributes.strides.length - 1]; - const pwStart = attributes.pads[attributes.pads.length / 2 - 1]; - const pwEnd = attributes.pads[attributes.pads.length - 1]; - const dimW = inputDims[rank - 1]; - let codeW = ''; - let codeH = ''; - let codeHEnd = ''; - if (pwStart + pwEnd !== 0) { - codeW = ` - for (var i: u32 = 0u; i < ${kw}u; i++) { - xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; - if (xIndices[${rank - 1}] < 0 || xIndices[${rank - 1}] >= ${dimW}) { - pad++; - continue; - } - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } else { - codeW = ` - for (var i: u32 = 0u; i < ${kw}u; i++) { - xIndices[${rank - 1}] = indices[${rank - 1}] * ${sw} - ${pwStart} + i; - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } - - if (attributes.kernelShape.length === 2) { - const kh = attributes.kernelShape[attributes.kernelShape.length - 2]; - const sh = attributes.strides[attributes.strides.length - 2]; - const phStart = attributes.pads[attributes.pads.length / 2 - 2]; - const phEnd = attributes.pads[attributes.pads.length - 2]; - const dimH = inputDims[rank - 2]; - if (phStart + phEnd !== 0) { - codeH = ` - for (var j: u32 = 0u; j < ${kh}u; j++) { - xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; - if (xIndices[${rank - 2}] < 0 || xIndices[${rank - 2}] >= ${dimH}) { - pad+= ${kw}; - 
continue; - } - `; - } else { - codeH = ` - for (var j: u32 = 0u; j < ${kh}u; j++) { - xIndices[${rank - 2}] = indices[${rank - 2}] * ${sh} - ${phStart} + j; - `; - } - codeHEnd = ` - } - `; - } - - const poolingCode = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - - var value: ${dataType} = ${dataType}(${start}); - var pad = 0; - ${codeH} - ${codeW} - ${codeHEnd} - ${op2} - - output[global_id.x] = value; - }`; - return poolingCode; - } else { - const kernelSize = ShapeUtil.size(attributes.kernelShape); - const kernelStrides = ShapeUtil.computeStrides(attributes.kernelShape); - const stridesRank = kernelStrides.length; - const padsRank = attributes.pads.length; - const hasPads = attributes.pads.reduce((sum, cur) => sum + cur); - let padCode = ''; - if (hasPads) { - padCode = ` - if (xIndices[j] >= inputDims[j]) { - pad++; - isPad = true; - break; - } - } - if (!isPad) { - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - }`; - } else { - padCode = ` - } - let x_val = x[${xIndicesHelper.i2oExpression('xIndices')}]; - ${op1} - `; - } - const poolingCode = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${outputIndicesHelper.o2iImpl} - ${xIndicesHelper.i2oImpl} - - const pads = array(${attributes.pads.map(i => `${i}u`).join(',')}); - const inputDims = array(${inputDims.map(i => `${i}u`).join(',')}); - const kernelStrides = array(${kernelStrides.map(i => `${i}u`).join(',')}); - const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} - - var offsets: array; - - var value = ${dataType}(${start}); - var pad = 0; - var isPad = false; - - for (var i: u32 = 0u; i < ${kernelSize}u; i++) { - var offset = i; - for (var j = 0u; j < ${stridesRank - 1}u; j++) { - offsets[j] = offset / kernelStrides[j]; - offset -= offsets[j] * kernelStrides[j]; - } - offsets[${stridesRank - 1}] = offset; - - isPad = false; - for (var j = ${rank - stridesRank}u; j < ${rank}u; j++) { - xIndices[j] = indices[j] * strides[j - ${rank - stridesRank}u] - + offsets[j - ${rank - stridesRank}u] - pads[j - 2u]; - ${padCode} - } - ${op2} - - output[global_id.x] = value; - }`; - return poolingCode; - } - }; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts b/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts deleted file mode 100644 index 763a656d92abb..0000000000000 --- 
a/js/web/lib/onnxjs/backends/webgpu/ops/reduce-tensors.ts +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - - const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)}; - - return inferenceHandler.run( - {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, inputs); -}; - -const createSumProgramInfo = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo => { - const dataType = 'f32'; - const outputShape = inputs[0].dims; - const outputSize = ShapeUtil.size(outputShape); - - - const inputsDeclaration = - inputs.map((_, i) => `@group(0) @binding(${i}) var input${i} : array<${dataType}>;`); - const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+'); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - ${inputsDeclaration.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - let offset = global_id.x; - - var value = ${dataType}(0); - value = ${sumLine}; - - output[offset] = value; - }`; - return { - ...sumProgramMetadata, - outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length === 0) { - throw new Error('Sum requires inputs.'); - } - - const length = inputs[0].dims.length; - for (let i = 1; i < inputs.length; i++) { - if (length !== inputs[i].dims.length) { - throw new Error('Input shapes are mismatched. broadcasting not supported yet'); - } - - for (let j = 0; j < length; j++) { - if (inputs[0].dims[j] !== inputs[i].dims[j]) { - throw new Error('Input shapes are not matched. broadcasting not supported yet'); - } - } - } - - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('Invalid input type.'); - } - for (let i = 1; i < inputs.length; i++) { - if (inputs[0].type !== inputs[i].type) { - throw new Error('Input types are not matched.'); - } - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts b/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts deleted file mode 100644 index 323e80bdb596a..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/reshape.ts +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
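The Sum kernel deleted above is a pure element-wise accumulation over N same-shaped inputs, one output element per invocation. The equivalent CPU-side TypeScript sketch, for illustration:

// Element-wise sum over same-length inputs (broadcasting unsupported, as above).
const sumRef = (inputs: Float32Array[]): Float32Array => {
  const out = new Float32Array(inputs[0].length);
  for (const input of inputs) {
    for (let i = 0; i < out.length; i++) out[i] += input[i];
  }
  return out;
};

console.log(sumRef([new Float32Array([1, 2]), new Float32Array([3, 4])]));  // Float32Array [4, 6]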
- -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - const shape = await inputs[1].getData(); - const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array); - return [handler.reshape(inputs[0], reshapedDims)]; -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Reshape requires 2 inputs.'); - } - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts b/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts deleted file mode 100644 index 94ba9293c457a..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/shape.ts +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Tensor} from '../../../tensor'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const shape = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputs(inputs); - return [new Tensor([inputs[0].dims.length], 'int32', undefined, undefined, new Int32Array(inputs[0].dims))]; -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Shape requires 1 input.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts b/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts deleted file mode 100644 index fd5d6e2d2299e..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/slice.ts +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
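The Reshape helper removed above defers all of the shape arithmetic to ShapeUtil.calculateReshapedDims. A rough sketch of that computation under standard ONNX Reshape semantics (a single -1 entry is inferred, 0 copies the input dimension); this is an approximation for orientation, not the actual ShapeUtil implementation:

const calculateReshapedDims = (inputDims: readonly number[], shape: readonly number[]): number[] => {
  const inputSize = inputDims.reduce((a, b) => a * b, 1);
  // 0 means "keep the corresponding input dimension"
  const output = shape.map((d, i) => (d === 0 ? inputDims[i] : d));
  // a single -1 is inferred from the remaining element count
  const inferredAxis = output.indexOf(-1);
  if (inferredAxis !== -1) {
    const known = output.reduce((a, b, i) => (i === inferredAxis ? a : a * b), 1);
    output[inferredAxis] = inputSize / known;
  }
  if (output.reduce((a, b) => a * b, 1) !== inputSize) {
    throw new Error('invalid reshape: element counts do not match');
  }
  return output;
};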
- -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -export interface SliceAttributes extends AttributeWithCacheKey { - readonly axes: number[]; - readonly ends: number[]; - readonly starts: number[]; -} - -const sliceProgramMetadata = { - name: 'Slice', - inputTypes: [GpuDataType.default] -}; - -export const slice: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...sliceProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createSliceProgramInfo(inputs[0], attributes) - }, - inputs); -}; - -export const parseSliceAttributes: OperatorInitialization = (node: Graph.Node): SliceAttributes => { - const starts = node.attributes.getInts('starts'); - const ends = node.attributes.getInts('ends'); - const axes = node.attributes.getInts('axes', []); - return createAttributeWithCacheKey({starts, ends, axes}); -}; - -const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => { - const outputLines: string[] = []; - - for (let i = 0; i < strides.length - 1; i++) { - outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`); - outputLines.push(`${offset}%=${strides[i]}u;`); - } - outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`); - - return outputLines.join('\n'); -}; - -const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => { - const outputLines: string[] = []; - - for (let i = 0; i < strides.length - 1; i++) { - outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`); - } - outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`); - - return outputLines.join('\n'); -}; - -const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => { - const axes = (attributes.axes.length === 0) ? 
input.dims.slice(0).map((val, i) => i) : attributes.axes; - const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length); - const starts = attributes.starts.map((start, i) => { - if (start > input.dims[normalizedAxes[i]] - 1) { - return input.dims[normalizedAxes[i]]; - } - return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]); - }); - const ends = attributes.ends.map((end, i) => { - if (end > input.dims[normalizedAxes[i]] - 1) { - return input.dims[normalizedAxes[i]]; - } - return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]); - }); - - const outputShape = input.dims.slice(); - - const sliceOps: string[] = []; - for (let i = 0; i < normalizedAxes.length; i++) { - outputShape[normalizedAxes[i]] = ends[i] - starts[i]; - if (starts[i] > 0) { - sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`); - } // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); } - } - - const outputSize = ShapeUtil.size(outputShape); - const outputStrides = ShapeUtil.computeStrides(outputShape); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var input : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - var offset = global_id.x; - ${offsetToIndices('offset', outputStrides, 'idx_')} - ${sliceOps.join('')} - var offsetInput = 0u; - ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')} - output[global_id.x] = input[offsetInput]; - }`; - return { - ...sliceProgramMetadata, - outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Slice requires 1 input.'); - } - if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { - throw new Error('Invalid input type.'); - } -}; - -export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - validateInputsV10(inputs); - const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs); - return inferenceHandler.run( - { - ...sliceProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createSliceProgramInfo(inputs[0], attributes) - }, - [inputs[0]]); -}; - -const generateSliceAttributesFromInputs = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => { - if (!inferenceHandler.session.isInitializer(inputs[1].dataId) || - !inferenceHandler.session.isInitializer(inputs[2].dataId) || - (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) || - (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) { - throw new Error('dynamic slice attributes are not allowed'); - } - - if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) { - throw new Error('currently non-1 steps is not supported for Slice'); - } - - const starts = Array.from(inputs[1].integerData); - const ends = Array.from(inputs[2].integerData); - const axes = inputs.length >= 4 ? 
Array.from(inputs[3].integerData) : []; - const cacheKey = `${axes};${starts};${ends}`; - return {starts, ends, axes, cacheKey}; - }; - -const validateInputsV10 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length < 3 || inputs.length > 5) { - throw new Error('Invalid input number.'); - } - if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) { - throw new Error('Invalid input type.'); - } - if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) { - throw new Error('Invalid input type.'); - } - if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) { - throw new Error('Invalid input type.'); - } - if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) { - throw new Error('Invalid input type.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts b/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts deleted file mode 100644 index 7cd85e6877b03..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/squeeze.ts +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Graph} from '../../../graph'; -import {OperatorImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const squeeze: OperatorImplementation = - (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { - validateInputs(inputs); - const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes); - const output = inferenceHandler.reshape(inputs[0], outputShape); - return [output]; - }; - -export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { - validateInputsV13(inputs); - return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -}; - -export const parseSqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => - node.attributes.getInts('axes'); - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Squeeze requires 1 input.'); - } - - if (inputs[0].type === 'string') { - throw new Error('invalid input tensor types.'); - } -}; - -const validateInputsV13 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Squeeze requires 2 inputs.'); - } - - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts b/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts deleted file mode 100644 index e83dd7fcbb0b9..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/transpose.ts +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
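The Slice kernel removed above normalizes each start/end against the corresponding input dimension: values past the end clamp to the dimension size, and negative values wrap around. A small sketch of that bound arithmetic (`normalizeSliceBound` is an illustrative name, not the real helper):

const normalizeSliceBound = (value: number, dim: number): number => {
  if (value > dim - 1) return dim;         // clamp past-the-end values to the dim size
  return value < 0 ? value + dim : value;  // wrap negative indices from the end
};

// e.g. for dim = 10: start = -3 -> 7, end = 1000 -> 10, so the slice covers 3 elements.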
- -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo} from '../types'; - -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -export interface TransposeAttributes extends AttributeWithCacheKey { - readonly perm: number[]; -} - -const transposeProgramMetadata = { - name: 'Transpose', - inputTypes: [GpuDataType.default] -}; - -export const transpose: OperatorAsyncImplementation = async( - inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: TransposeAttributes): Promise => { - validateInputs(inputs); - return inferenceHandler.run( - { - ...transposeProgramMetadata, - cacheHint: attributes.cacheKey, - get: () => createTransposeProgramInfo(inferenceHandler, inputs[0], attributes.perm) - }, - inputs); -}; - -export const parseTransposeAttributes: OperatorInitialization = - (node: Graph.Node): TransposeAttributes => createAttributeWithCacheKey({perm: node.attributes.getInts('perm', [])}); - -const createTransposeProgramInfo = - (_inferenceHandler: WebGpuInferenceHandler, input: Tensor, perm: number[]): ProgramInfo => { - const dataType = 'f32'; // TODO: support other data type - const inputShape = input.dims; - perm = getAdjustedPerm(inputShape, perm); - const outputShape = getOutputShape(inputShape, perm); - const rank = inputShape.length; - const outputSize = ShapeUtil.size(outputShape); - // A dims=[${inputs[0].dims.toString()}] - // out Dims=[${unpackedOutputShape.toString()}] - // based on perm=[${perm.toString()}] - - const outputIndicesHelper = createIndicesHelper('output', outputShape); - const inputIndicesHelper = createIndicesHelper('a', inputShape); - - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var a : array<${dataType}>; - @group(0) @binding(1) var output : array<${dataType}>; - - ${permFunctionBody(perm, rank)} - ${outputIndicesHelper.o2iImpl} - ${inputIndicesHelper.i2oImpl} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } - - ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} - ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} - perm(&aIndices, &indices); - - output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; - }`; - return { - ...transposeProgramMetadata, - outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], - shaderSource, - dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) - }; - }; - -const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => { - if (perm && perm.length !== inputShape.length) { - perm = [...(inputShape.keys())].reverse(); - } - return perm; -}; - -const getOutputShape = (inputShape: readonly number[], perm: number[]): readonly number[] => { - perm = getAdjustedPerm(inputShape, perm); - return ShapeUtil.sortBasedOnPerm(inputShape, perm); -}; - -const permFunctionBody = (perm: number[], rank: number): string => { - const reverseFunc = []; - reverseFunc.push(`fn perm(a: ptr>, i: 
ptr>) {`); - for (let i = 0; i < rank; ++i) { - reverseFunc.push(`\t(*a)[${perm[i]}]=(*i)[${i}];`); - } - reverseFunc.push('\t}'); - return reverseFunc.join('\n'); -}; - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Transpose requires 1 input.'); - } - - if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { - throw new Error('input should be float tensor'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts b/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts deleted file mode 100644 index 82c58f78e232d..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/unary-op.ts +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -import {Graph} from '../../../graph'; -import {Tensor} from '../../../tensor'; -import {MAX_CLIP, MIN_CLIP} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; -import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -import {WORKGROUP_SIZE} from './common'; - -type BuiltinFunctionName = string; -type ElementwiseCustomExpression = (expression: string) => string; -type ElementwiseFunctionCall = BuiltinFunctionName|ElementwiseCustomExpression; - -const createElementwiseProgramShader = - (datasize: number, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): string => { - const vecSize = Math.ceil(datasize / 4); - - let expression = ''; - if (typeof funcCall === 'string') { - expression = `${funcCall}(a)`; - } else { - expression = funcCall('a'); - } - return ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - - @group(0) @binding(0) var inputData : array>; - @group(0) @binding(1) var outputData : array>; - - ${additionalImplementation ?? 
''} - - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${vecSize}u) { - return; - } - - let a = inputData[global_id.x]; - outputData[global_id.x] = ${expression}; - }`; - }; - -const createElementwiseProgramInfo = - (metadata: ProgramMetadata, input: Tensor, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): - ProgramInfo => ({ - ...metadata, - shaderSource: createElementwiseProgramShader(input.size, funcCall, additionalImplementation), - outputs: [{dims: input.dims, type: input.type, gpuDataType: GpuDataType.default}], - dispatchGroup: (inputTensors) => - ({x: Math.ceil(inputTensors[0].size / 64 /* workgroup size */ / 4 /* vec size */)}) - }); - -const createElementwiseProgramInfoLoader = - (input: Tensor, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string, - cacheKey?: string): ProgramInfoLoader => { - const metadata: ProgramMetadata = {name, inputTypes: [GpuDataType.default], cacheHint: cacheKey}; - return { - ...metadata, - get: () => createElementwiseProgramInfo(metadata, input, funcCall, additionalImplementation) - }; - }; - -export const abs = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Abs', 'abs'), inputs); - -export const acos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Acos', 'acos'), inputs); - -export const asin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Asin', 'asin'), inputs); - -export const atan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Atan', 'atan'), inputs); - -export interface ClipAttributes extends AttributeWithCacheKey { - readonly min: number; - readonly max: number; -} - -export const clip = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: ClipAttributes): - Promise=>handler.run( - createElementwiseProgramInfoLoader( - inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` - const clip_min_: vec4 = vec4(f32(${attributes.min})); - const clip_max_: vec4 = vec4(f32(${attributes.max})); -`, - attributes.cacheKey), - inputs); - -export const parseClipAttributes = (node: Graph.Node): ClipAttributes => createAttributeWithCacheKey( - {min: node.attributes.getFloat('min', MIN_CLIP), max: node.attributes.getFloat('max', MAX_CLIP)}); - -const generateClipAttributesFromInputs = (handler: WebGpuInferenceHandler, inputs: Tensor[]): ClipAttributes => { - if (inputs.length >= 3 && - (!handler.session.isInitializer(inputs[1].dataId) || !handler.session.isInitializer(inputs[2].dataId))) { - throw new Error('dynamic clip attributes are not allowed'); - } - - const min = (inputs.length >= 3) ? inputs[1].numberData[0] : MIN_CLIP; - const max = (inputs.length >= 3) ? 
inputs[2].numberData[0] : MAX_CLIP; - return createAttributeWithCacheKey({min, max}); -}; - -export const clipV11 = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { - const attributes = generateClipAttributesFromInputs(handler, inputs); - return clip(handler, [inputs[0]], attributes); -}; - -export const ceil = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Ceil', 'ceil'), inputs); - -export const cos = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Cos', 'cos'), inputs); - -export interface EluAttributes extends AttributeWithCacheKey { - readonly alpha: number; -} - -export const elu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): - Promise=>handler.run( - createElementwiseProgramInfoLoader( - inputs[0], 'Elu', a => `elu_vf32(${a})`, ` - let elu_alpha_: f32 = f32(${attributes.alpha}); - - fn elu_f32(a: f32) -> f32 { - return select((exp(a) - 1.0) * elu_alpha_, a, a >= 0.0); - } - - fn elu_vf32(v: vec4) -> vec4 { - return vec4(elu_f32(v.x), elu_f32(v.y), elu_f32(v.z), elu_f32(v.w)); - }`, - attributes.cacheKey), - inputs); - -export const parseEluAttributes = (node: Graph.Node): EluAttributes => - createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 1.0)}); - -export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); - -export const floor = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Floor', 'floor'), inputs); - -export interface LeakyReluAttributes extends AttributeWithCacheKey { - readonly alpha: number; -} - -export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): - Promise=>handler.run( - createElementwiseProgramInfoLoader( - inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, ` - let leaky_relu_alpha_: f32 = f32(${attributes.alpha}); - - fn leaky_relu_f32(a: f32) -> f32 { - return select(a, a * leaky_relu_alpha_, a < 0.0); - } - - fn leaky_relu_vf32(v: vec4) -> vec4 { - return vec4(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w)); - }`, - attributes.cacheKey), - inputs); - -export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes => - createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)}); - -export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); - -export const neg = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Neg', a => `-${a}`), inputs); - -// export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; - -export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( - createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); - -export const sigmoid = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( - createElementwiseProgramInfoLoader(inputs[0], 'Sigmoid', a => `(vec4(1.0) / (vec4(1.0) + 
exp(-${a})))`), inputs); - -export const sin = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sin', 'sin'), inputs); - -export const sqrt = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Sqrt', 'sqrt'), inputs); - -export const tan = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tan', 'tan'), inputs); - -export const tanh = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => - handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Tanh', 'tanh'), inputs); diff --git a/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts b/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts deleted file mode 100644 index 8a099dc92cbd9..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/ops/unsqueeze.ts +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {Graph} from '../../../graph'; -import {OperatorInitialization} from '../../../operators'; -import {Tensor} from '../../../tensor'; -import {ShapeUtil} from '../../../util'; -import {WebGpuInferenceHandler} from '../inference-handler'; - -export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { - validateInputs(inputs); - const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes); - const output = inferenceHandler.reshape(inputs[0], outputShape); - return [output]; -}; - -export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { - validateInputsV13(inputs); - return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -}; - -export const parseUnsqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => - node.attributes.getInts('axes'); - -const validateInputs = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 1) { - throw new Error('Unsqueeze requires 1 input.'); - } - - if (inputs[0].type === 'string') { - throw new Error('invalid input tensor types.'); - } -}; - -const validateInputsV13 = (inputs: Tensor[]): void => { - if (!inputs || inputs.length !== 2) { - throw new Error('Unsqueeze requires 2 inputs.'); - } - - if (inputs[1].type !== 'int32') { - throw new Error('Invalid input type.'); - } -}; diff --git a/js/web/lib/onnxjs/backends/webgpu/program-manager.ts b/js/web/lib/onnxjs/backends/webgpu/program-manager.ts deleted file mode 100644 index 3a6ae37e3ab54..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/program-manager.ts +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
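The elementwise kernels removed above read and write vec4<f32>, so both the shader's bounds guard and the dispatch-group size are computed over 4-wide vectors rather than scalars. A sketch of that arithmetic, assuming the workgroup size of 64 used throughout these shaders:

const WORKGROUP_SIZE = 64;
const elementwiseDispatch = (tensorSize: number): {x: number} => {
  const vecSize = Math.ceil(tensorSize / 4);        // number of vec4 lanes needed
  return {x: Math.ceil(vecSize / WORKGROUP_SIZE)};  // workgroups to cover them
};
// elementwiseDispatch(1000) -> {x: 4}  (250 vec4 elements / 64 invocations per workgroup)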
- -import {env} from 'onnxruntime-common'; - -import {Logger, Profiler} from '../../instrument'; -import {WebGpuBackend} from '../backend-webgpu'; - -import {Artifact, GpuData, ProgramInfo} from './types'; - -/** - * ProgramManager is the main class behind running computations - * It builds ProgramInfo's into Artifacts - * It compiles given ProgramInfo's into WebGL Prorams (cached as Artifacts) - * Uses the artifact to run the computation by calling Draw on - * the WebGL drawing buffer - * ProgramManager automatically maps (binds) input variables to their - * corresponding Location's in the binary program - */ -export class ProgramManager { - repo: Map; // this should be per-session object - attributesBound: boolean; - - constructor(private backend: WebGpuBackend, public profiler: Readonly) { - this.repo = new Map(); - this.attributesBound = false; - } - getArtifact(key: unknown): Artifact|undefined { - return this.repo.get(key); - } - setArtifact(key: unknown, artifact: Artifact): void { - this.repo.set(key, artifact); - } - run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], - dispatchGroup: {x: number; y?: number; z?: number}): void { - const device = this.backend.device; - - const computePassEncoder = this.backend.getComputePassEncoder(); - - computePassEncoder.setPipeline(buildArtifact.computePipeline); - const entries = []; - for (const input of inputs) { - entries.push({binding: entries.length, resource: {buffer: input.buffer}}); - } - for (const output of outputs) { - entries.push({binding: entries.length, resource: {buffer: output.buffer}}); - } - const bindGroup = device.createBindGroup({layout: buildArtifact.computePipeline.getBindGroupLayout(0), entries}); - computePassEncoder.setBindGroup(0, bindGroup); - - const {x, y, z} = dispatchGroup; - computePassEncoder.dispatchWorkgroups(x, y, z); - - this.backend.pendingDispatchNumber++; - - if (this.backend.pendingDispatchNumber >= 16) { - this.backend.flush(); - } - } - dispose(): void { - // this.repo.forEach(a => this.glContext.deleteProgram(a.program)); - } - build(programInfo: ProgramInfo): Artifact { - const device = this.backend.device; - - const shaderModule = device.createShaderModule({code: programInfo.shaderSource}); - if (env.debug) { - Logger.verbose('WebGpuProgram', programInfo.shaderSource); - } - - const computePipeline = - device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}, layout: 'auto'}); - - return {programInfo, computePipeline}; - } -} diff --git a/js/web/lib/onnxjs/backends/webgpu/session-handler.ts b/js/web/lib/onnxjs/backends/webgpu/session-handler.ts deleted file mode 100644 index 1fe288c36dd1e..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/session-handler.ts +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
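ProgramManager.run above assigns bindings positionally: inputs first, then outputs, with each entry's binding index equal to its position in the list, which keeps it aligned with the @binding(n) declarations the shader codegen emits in the same order. A sketch of that pattern, assuming WebGPU ambient type declarations (e.g. @webgpu/types) are available:

const makeBindGroupEntries = (inputs: GPUBuffer[], outputs: GPUBuffer[]): GPUBindGroupEntry[] => {
  const entries: GPUBindGroupEntry[] = [];
  for (const buffer of [...inputs, ...outputs]) {
    // binding index == position in the list, matching the generated @binding(n)
    entries.push({binding: entries.length, resource: {buffer}});
  }
  return entries;
};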
- -import {SessionHandler} from '../../backend'; -import {Graph} from '../../graph'; -import {Operator} from '../../operators'; -import {OpSet, resolveOperator} from '../../opset'; -import {Session} from '../../session'; -import {Tensor} from '../../tensor'; -import {WebGpuBackend} from '../backend-webgpu'; - -import {WebGpuInferenceHandler} from './inference-handler'; -import {WEBGPU_OP_RESOLVE_RULES} from './op-resolve-rules'; -import {ProgramManager} from './program-manager'; -import {createTensorDataManager, TensorDataManager} from './tensor-data-manager'; - -export class WebGpuSessionHandler implements SessionHandler { - private initializers: Set; - readonly dataManager: TensorDataManager; - readonly programManager: ProgramManager; - - constructor(public readonly backend: WebGpuBackend, public readonly context: Session.Context) { - this.dataManager = createTensorDataManager(this.backend.gpuDataManager); - this.programManager = new ProgramManager(this.backend, this.context.profiler); - } - - createInferenceHandler() { - return new WebGpuInferenceHandler(this); - } - onGraphInitialized(graph: Graph): void { - const initializers = graph.getValues().filter(v => v.from === -1 && v.tensor).map(v => v.tensor!.dataId); - this.initializers = new Set(initializers); - } - isInitializer(tensorId: Tensor.Id): boolean { - return this.initializers ? this.initializers.has(tensorId) : false; - } - addInitializer(tensorId: Tensor.Id): void { - this.initializers.add(tensorId); - } - dispose(): void { - // TODO - } - resolve(node: Graph.Node, opsets: readonly OpSet[], graph: Graph): Operator { - const op = resolveOperator(node, opsets, WEBGPU_OP_RESOLVE_RULES); - return {impl: op.opImpl, context: op.opInit ? op.opInit(node, graph) : node}; - } -} diff --git a/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts b/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts deleted file mode 100644 index bdf6c7f9ebe42..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/tensor-data-manager.ts +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -import {createView, Tensor} from '../../tensor'; - -import {GpuDataManager} from './gpu-data-manager'; -import {GpuData, GpuDataId, GpuDataType} from './types'; - -/** - * manages Tensor ID -> Gpu Data ID - * - * A tensor ID is a unique ID representing a value(tensor), which is the graph's node's input or output. - * A GPU Data ID is a unique ID representing an abstract data on GPU memory. Specifically, for current WebGPU scenarios, - * GPU Data is a storage buffer, and GPU Data ID is a handle to a storage buffer. - * - * - a value is different to the graph's edge. if a node's output is consumed by 2 other downstream nodes, there are - * 2 edges, but only one value. - * - * - a tensor ID maps to 0 or 1 GPU Data ID, depending on whether the data is available on GPU or not. - * - * - a GPU Data ID maps to 1 or more tensor ID. - * - */ -export interface TensorDataManager { - /** - * upload a CPU tensor to GPU. - */ - uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): Promise; - - /** - * create a new GPU tensor. - */ - createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData]; - - /** - * check whether the tensor has GPU data - */ - hasGpuData(tensorId: Tensor.Id): boolean; - - /** - * create a reference to the GPU data. 
- */ - createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData]; - - /** - * release the GPU resources referred by the tensor. - */ - releaseGpuTensor(tensorId: Tensor.Id): void; -} - -class TensorDataManagerImpl implements TensorDataManager { - private map: Map; - private reverseMap: Map>; - - constructor(private gpuDataManager: GpuDataManager) { - this.map = new Map(); - this.reverseMap = new Map(); - } - - private registerIdMapping(tensorId: Tensor.Id, gpuDataId: GpuDataId): void { - this.map.set(tensorId, gpuDataId); - - let tensorIds = this.reverseMap.get(gpuDataId); - if (!tensorIds) { - tensorIds = new Set(); - this.reverseMap.set(gpuDataId, tensorIds); - } - tensorIds.add(tensorId); - } - - async uploadTensorToGpu(tensor: Tensor, gpuDataType: GpuDataType): Promise { - const gpuDataId = this.map.get(tensor.dataId); - if (gpuDataId) { - const gpuData = this.gpuDataManager.get(gpuDataId); - if (!gpuData) { - throw new Error('internal error. this should never happen'); - } - return gpuData; - } - - const gpuData = await this.gpuDataManager.upload(tensor.numberData, gpuDataType); - this.registerIdMapping(tensor.dataId, gpuData.id); - return gpuData; - } - - createGpuTensor(type: Tensor.DataType, dims: readonly number[], gpuDataType: GpuDataType): [Tensor, GpuData] { - const gpuData = this.gpuDataManager.create(type, dims, gpuDataType); - const tensor = new Tensor(dims, type, undefined, async () => { - const data = await this.gpuDataManager.download(gpuData.id); - return createView(data, type); - }); - - this.registerIdMapping(tensor.dataId, gpuData.id); - return [tensor, gpuData]; - } - - hasGpuData(tensorId: Tensor.Id): boolean { - return this.map.has(tensorId); - } - - createGpuRef(tensorId: Tensor.Id, type: Tensor.DataType, dims: readonly number[]): [Tensor, GpuData] { - const gpuDataId = this.map.get(tensorId); - if (!gpuDataId) { - throw new Error('internal error. this should never happen'); - } - - const gpuData = this.gpuDataManager.get(gpuDataId); - if (!gpuData) { - throw new Error('internal error. this should never happen'); - } - - const tensor = new Tensor(dims, type, undefined, async () => { - const data = await this.gpuDataManager.download(gpuData.id); - return createView(data, type); - }); - - this.registerIdMapping(tensor.dataId, gpuData.id); - return [tensor, gpuData]; - } - - releaseGpuTensor(tensorId: Tensor.Id): void { - const gpuDataId = this.map.get(tensorId); - if (gpuDataId) { - this.map.delete(tensorId); - - const tensorIds = this.reverseMap.get(gpuDataId); - if (!tensorIds) { - throw new Error('internal error. this should never happen'); - } - tensorIds.delete(tensorId); - if (tensorIds.size === 0) { - this.gpuDataManager.release(gpuDataId); - this.reverseMap.delete(gpuDataId); - } - } - } -} - -export const createTensorDataManager = (gpuDataManager: GpuDataManager): TensorDataManager => - new TensorDataManagerImpl(gpuDataManager); diff --git a/js/web/lib/onnxjs/backends/webgpu/types.ts b/js/web/lib/onnxjs/backends/webgpu/types.ts deleted file mode 100644 index 96f6e247de5a3..0000000000000 --- a/js/web/lib/onnxjs/backends/webgpu/types.ts +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
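The tensor-data manager removed above keeps a forward map (tensor ID to GPU data ID) plus a reverse map of reference sets, and releases a GPU buffer only once the last tensor referring to it is gone. A condensed sketch of that bookkeeping; the class and method names here are illustrative:

class IdMap<TensorId, GpuId> {
  private map = new Map<TensorId, GpuId>();
  private reverse = new Map<GpuId, Set<TensorId>>();

  register(tensorId: TensorId, gpuId: GpuId): void {
    this.map.set(tensorId, gpuId);
    let refs = this.reverse.get(gpuId);
    if (!refs) {
      refs = new Set();
      this.reverse.set(gpuId, refs);
    }
    refs.add(tensorId);
  }

  // returns the gpu-data id to release, if and only if this was the last reference
  release(tensorId: TensorId): GpuId|undefined {
    const gpuId = this.map.get(tensorId);
    if (gpuId === undefined) return undefined;
    this.map.delete(tensorId);
    const refs = this.reverse.get(gpuId)!;
    refs.delete(tensorId);
    if (refs.size === 0) {
      this.reverse.delete(gpuId);
      return gpuId;
    }
    return undefined;
  }
}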
- -import {Guid} from 'guid-typescript'; - -import {Tensor} from '../../tensor'; - -export enum GpuDataType { - default = 0 -} -export type GpuDataId = Guid; - -export interface GpuData { - type: GpuDataType; - id: GpuDataId; - buffer: GPUBuffer; -} - -export interface TensorInfo { - id?: Tensor.Id; - dims: readonly number[]; - type: Tensor.DataType; - gpuDataType: GpuDataType; -} - - -export interface ProgramVariable { - type: 'float'|'int'; - name: string; - arrayLength?: number; - data: number|number[]; -} - - -export interface ProgramMetadata { - /** - * the name of the program. used for debugging and profiling - */ - name: string; - - // inputLayouts: GPUBindGroupLayoutEntry[]; - // outputLayouts: GPUBindGroupLayoutEntry[]; - - /** - * gpu data types for each input - */ - inputTypes: GpuDataType[]; - /** - * an optional string as a cache hint in the artifact cache - */ - cacheHint?: string; -} - -/** - * A ProgramInfoLoader allows - */ -export interface ProgramInfoLoader extends ProgramMetadata { - /** - * a function to get the program info - */ - get(): ProgramInfo; -} - -/** - * A set of data that represent a shader program - */ -export interface ProgramInfo extends ProgramMetadata { - /** - * information of uniform variables - */ - variables?: ProgramVariable[]; - /** - * tensor info for outputs - */ - outputs: TensorInfo[]; - /** - * the shader's processing source code - */ - shaderSource: string; - /** - * default is "main" - */ - // entryPoint: string; - - dispatchGroup: (inputs: readonly Tensor[]) => { - x: number; - y?: number; - z?: number; - }; -} - -export interface Artifact { - programInfo: ProgramInfo; - computePipeline: GPUComputePipeline; - // attribLocations: {position: number; textureCoord: number}; -} diff --git a/js/web/lib/onnxjs/execution-plan.ts b/js/web/lib/onnxjs/execution-plan.ts index 5136e1283d119..b95e639817dbf 100644 --- a/js/web/lib/onnxjs/execution-plan.ts +++ b/js/web/lib/onnxjs/execution-plan.ts @@ -1,10 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
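The ProgramInfoLoader shape deleted in types.ts above splits cheap metadata (name plus an optional cacheHint, usable together as a cache key) from the expensive get() that generates the full shader source. A sketch of how a consumer might cache against it; the concrete cache in this codebase lives in the program manager / inference handler, so this is only an assumed usage:

interface Loader {
  name: string;
  cacheHint?: string;
  get(): {shaderSource: string};
}

const artifactCache = new Map<string, {shaderSource: string}>();
const getOrBuild = (loader: Loader) => {
  const key = `${loader.name}|${loader.cacheHint ?? ''}`;
  let artifact = artifactCache.get(key);
  if (!artifact) {
    artifact = loader.get();  // shader codegen cost is paid only on a cache miss
    artifactCache.set(key, artifact);
  }
  return artifact;
};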
-import {env} from 'onnxruntime-common'; - import {SessionHandler} from './backend'; -import {WebGpuBackend} from './backends/backend-webgpu'; import {Graph} from './graph'; import {Logger, Profiler} from './instrument'; import {Operator} from './operators'; @@ -60,7 +57,6 @@ export class ExecutionPlan { // create inference handler const inferenceHandler = sessionHandler.createInferenceHandler(); - const IS_WEBGPU = sessionHandler.backend instanceof WebGpuBackend; // populate inputs value const graphInputs = this.graph.getInputIndices(); @@ -107,17 +103,6 @@ export class ExecutionPlan { throw new Error('the size of output does not match model definition.'); } - if (env.debug) { - for (let i = 0; i < outputList.length; i++) { - if (IS_WEBGPU) { - await outputList[i].getData(); - } else { - // eslint-disable-next-line no-unused-expressions - outputList[i].data; - } - } - } - // fill value outputList.forEach((output, i) => { const j = thisOp.node.outputs[i]; @@ -125,10 +110,6 @@ export class ExecutionPlan { throw new Error(`output [${j}] already has value: op:${thisOp.node.name}`); } this._values[j] = output; - - if (env.debug) { - Logger.verbose('ExecPlanDataDump', `output${i}[${output.dims}]:${output.data}`); - } }); // resolve downstream nodes @@ -159,8 +140,7 @@ export class ExecutionPlan { if (outputTensor === undefined) { throw new Error(`required output [${outputIndex}] does not have value`); } - - if (IS_WEBGPU) { + if (outputIndex === 0) { await outputTensor.getData(); } else { // eslint-disable-next-line no-unused-expressions diff --git a/js/web/lib/onnxjs/operators.ts b/js/web/lib/onnxjs/operators.ts index 2117484316dca..4d664f6dcda5a 100644 --- a/js/web/lib/onnxjs/operators.ts +++ b/js/web/lib/onnxjs/operators.ts @@ -5,13 +5,11 @@ import {InferenceHandler} from './backend'; import {Graph} from './graph'; import {Tensor} from './tensor'; -export type OperatorImplementation = Tensor[]> = - (inferenceHandler: InferenceHandler, inputs: Tensor[], context: ContextType) => ReturnType; -export type OperatorAsyncImplementation = OperatorImplementation>; +export type OperatorImplementation = (inferenceHandler: InferenceHandler, inputs: Tensor[], context: T) => Tensor[]; export type OperatorInitialization = (node: Graph.Node, graph: Graph) => T; export interface Operator { - readonly impl: OperatorImplementation|OperatorAsyncImplementation; + readonly impl: OperatorImplementation; readonly context: Graph.Node|unknown; } diff --git a/js/web/lib/onnxjs/opset.ts b/js/web/lib/onnxjs/opset.ts index 12618969efc1a..e7eb3251babc5 100644 --- a/js/web/lib/onnxjs/opset.ts +++ b/js/web/lib/onnxjs/opset.ts @@ -2,28 +2,24 @@ // Licensed under the MIT License. 
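With this change every operator implementation is synchronous again; the async variant and its Promise-returning signature are gone. A minimal sketch of the collapsed signature, where Tensor and InferenceHandler are stand-ins for the real imports:

type Tensor = unknown;
type InferenceHandler = unknown;
type OperatorImplementation<T = unknown> =
    (inferenceHandler: InferenceHandler, inputs: Tensor[], context: T) => Tensor[];

// e.g. a context-free op simply returns tensors synchronously:
const identity: OperatorImplementation = (_handler, inputs) => inputs;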
import {Graph} from './graph'; -import {OperatorAsyncImplementation, OperatorImplementation, OperatorInitialization} from './operators'; +import {OperatorImplementation, OperatorInitialization} from './operators'; export interface OpSet { domain: string; version: number; } - export declare namespace OpSet { /** * Domain of an opset, it can be an empty string(default value, represent for ai.onnx), or 'ai.onnx.ml' */ type Domain = ''|'ai.onnx.ml'|'com.microsoft'; - /** * A resolve rule consists of 4 or 5 items: opType, opSetDomain, versionSelector, operatorImplementation and * operatorInitialization (optional) */ - type ResolveRule = - [ - string, Domain, string, OperatorImplementation| OperatorAsyncImplementation - ]|[string, Domain, string, OperatorImplementation| OperatorAsyncImplementation, - OperatorInitialization]; + type ResolveRule = [ + string, Domain, string, OperatorImplementation + ]|[string, Domain, string, OperatorImplementation, OperatorInitialization]; } export function resolveOperator(node: Graph.Node, opsets: readonly OpSet[], rules: readonly OpSet.ResolveRule[]) { diff --git a/js/web/lib/onnxjs/tensor.ts b/js/web/lib/onnxjs/tensor.ts index db5e599fd68dc..4ec49f7b936ea 100644 --- a/js/web/lib/onnxjs/tensor.ts +++ b/js/web/lib/onnxjs/tensor.ts @@ -131,15 +131,7 @@ export class Tensor { */ async getData(): Promise { if (this.cache === undefined) { - if (this.asyncDataProvider) { - const data = await this.asyncDataProvider(this.dataId); - if (data.length !== this.size) { - throw new Error('Length of data provided by the Data Provider is inconsistent with the dims of this Tensor.'); - } - this.cache = data; - } else { - return this.data; - } + this.cache = await this.asyncDataProvider!(this.dataId); } return this.cache; } @@ -356,7 +348,7 @@ export class Tensor { } } -export function sizeof(type: Tensor.DataType): number { +function sizeof(type: Tensor.DataType): number { switch (type) { case 'bool': case 'int8': @@ -398,7 +390,7 @@ function sizeofProto(type: onnx.TensorProto.DataType|ortFbs.TensorDataType): num } } -export function createView(dataBuffer: ArrayBuffer, type: Tensor.DataType) { +function createView(dataBuffer: ArrayBuffer, type: Tensor.DataType) { return new (dataviewConstructor(type))(dataBuffer); } diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 400757f4a7721..e41019b1023c4 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -48,7 +48,7 @@ const appendDefaultOptions = (options: InferenceSession.SessionOptions): void => // if using JSEP with WebGPU, always disable memory pattern if (options.executionProviders && - options.executionProviders.some(ep => ['jsep-webgpu'].includes(typeof ep === 'string' ? ep : ep.name))) { + options.executionProviders.some(ep => (typeof ep === 'string' ? ep : ep.name) === 'webgpu')) { options.enableMemPattern = false; } }; @@ -64,7 +64,7 @@ const setExecutionProviders = case 'xnnpack': epName = 'XNNPACK'; break; - case 'jsep-webgpu': + case 'webgpu': epName = 'JS'; break; case 'wasm': diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 249f89d42a490..87507acc60d9b 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -37,7 +37,6 @@ Options: webgpu wasm xnnpack - jsep-webgpu -e=<...>, --env=<...> Specify the environment to run the test. 
Should be one of the following: chrome (default) edge (Windows only) @@ -105,7 +104,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'jsep-webgpu'; + type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'prod'|'dev'|'perf'; } @@ -360,7 +359,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'jsep-webgpu']; + const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack']; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index dd9a832595b61..68eda8aea17ff 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -50,7 +50,7 @@ if (shouldLoadSuiteTestData) { // The default backends and opset version lists. Those will be used in suite tests. const DEFAULT_BACKENDS: readonly TestRunnerCliArgs.Backend[] = - args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu', 'jsep-webgpu']; + args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu']; const DEFAULT_OPSET_VERSIONS = fs.readdirSync(TEST_DATA_MODEL_NODE_ROOT, {withFileTypes: true}) .filter(dir => dir.isDirectory() && dir.name.startsWith('opset')) .map(dir => dir.name.slice(5)); @@ -458,7 +458,7 @@ function run(config: Test.Config) { // STEP 5. use Karma to run test npmlog.info('TestRunnerCli.Run', '(5/5) Running karma to start test runner...'); const karmaCommand = path.join(npmBin, 'karma'); - const webgpu = args.backends.indexOf('webgpu') > -1 || args.backends.indexOf('jsep-webgpu') > -1; + const webgpu = args.backends.indexOf('webgpu') > -1; const browser = getBrowserNameFromEnv( args.env, args.bundleMode === 'perf' ? 
'perf' : diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index ef14eafae0792..73c08a5bda92b 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -283,375 +283,6 @@ ] }, "webgpu": { - "onnx": [], - "node": [ - "test_abs", - "test_acos_example", - "test_acos", - "test_add_bcast", - "test_add", - // "test_and_bcast3v1d", - // "test_and_bcast3v2d", - // "test_and_bcast4v2d", - // "test_and_bcast4v3d", - // "test_and_bcast4v4d", - // "test_and2d", - // "test_and3d", - // "test_and4d", - "test_asin_example", - "test_asin", - "test_atan_example", - "test_atan", - "test_averagepool_1d_default", - "test_averagepool_2d_default", - "test_averagepool_2d_pads", - "test_averagepool_2d_precomputed_pads", - "test_averagepool_2d_precomputed_same_upper", - "test_averagepool_2d_precomputed_strides", - "test_averagepool_2d_same_upper", - "test_averagepool_2d_same_lower", - "test_averagepool_2d_strides", - "test_averagepool_3d_default", - "test_basic_conv_with_padding", - "test_basic_conv_without_padding", - // "test_batchnorm_epsilon", - // "test_batchnorm_example", - // "test_cast_DOUBLE_to_FLOAT", - // "test_cast_FLOAT_to_DOUBLE", - "opset{7,8,9,10}/test_clip_splitbounds", - "opset{7,8,9,10}/test_clip_outbounds", - "opset{7,8,9,10}/test_clip_inbounds", - "opset{7,8,9,10}/test_clip_example", - "opset{7,8,9,10}/test_clip_default_min", - "opset{7,8,9,10}/test_clip_default_max", - "opset{7,8,9,10}/test_clip_default_inbounds", - "opset{7,8,9,10}/test_clip", - "test_concat_1d_axis_0", - "test_concat_2d_axis_0", - "test_concat_2d_axis_1", - "test_concat_3d_axis_0", - "test_concat_3d_axis_1", - "test_concat_3d_axis_2", - "test_conv_with_strides_and_asymmetric_padding", - "test_conv_with_strides_no_padding", - "test_conv_with_strides_padding", - "test_constant", - "test_cos_example", - "test_cos", - "test_div_bcast", - "test_div_example", - "test_div", - // "test_dropout_default", - // "test_dropout_random", - // "test_depthtospace_crd_mode", - // "test_depthtospace_crd_mode_example", - // "test_depthtospace_dcr_mode", - // "test_depthtospace_example", - "test_elu_example", - "test_elu", - "test_elu_default", - // "test_flatten_axis0", - // "test_flatten_axis1", - // "test_flatten_axis2", - // "test_flatten_axis3", - // "test_flatten_default_axis", - "test_gather_0", - "test_gather_1", - "test_gemm_nobroadcast", - "test_gemm_broadcast", - "test_globalaveragepool_precomputed", - "test_globalaveragepool", - "test_globalmaxpool_precomputed", - "test_globalmaxpool", - // "test_greater_bcast", - // "test_greater", - // "test_instancenorm_epsilon", - // "test_instancenorm_example", - // "test_less_bcast", - // "test_less", - // "test_equal_bcast", - // "test_equal", - // "test_identity", - "test_leakyrelu_default", - "test_leakyrelu_example", - "test_leakyrelu", - // "test_lrn_default", <-- failing due to low precison. If absolute CPU error threshold is increased from 1e-4 to 1e-2 (100x increase), it passes the test. - // "test_lrn", <-- failing due to low precison. If absolute CPU error threshold is increased from 1e-4 to 1e-3 (10x increase), it passes the test. 
- "test_matmul_2d", - "test_matmul_3d", - "test_matmul_4d", - "test_maxpool_1d_default", - "test_maxpool_2d_default", - "test_maxpool_2d_pads", - "test_maxpool_2d_precomputed_pads", - "test_maxpool_2d_precomputed_same_upper", - "test_maxpool_2d_precomputed_strides", - "test_maxpool_2d_same_lower", - "test_maxpool_2d_same_upper", - "test_maxpool_2d_strides", - "test_maxpool_3d_default", - "test_mul_bcast", - "test_mul_example", - "test_mul", - "test_neg", - "test_neg_example", - // "test_not_2d", - // "test_not_3d", - // "test_not_4d", - // "test_or_bcast3v1d", - // "test_or_bcast3v2d", - // "test_or_bcast4v2d", - // "test_or_bcast4v3d", - // "test_or_bcast4v4d", - // "test_prelu_broadcast", - // "test_prelu_example", - "test_relu", - // "test_reshape_extended_dims", - // "test_reshape_negative_dim", - // "test_reshape_one_dim", - // "test_reshape_reduced_dims", - // "test_reshape_reordered_dims", - "test_sigmoid", - "test_sigmoid_example", - "test_sin_example", - "test_sin", - // "test_softmax_axis_0", - // "test_softmax_axis_1", - // "test_softmax_axis_2", - // "test_softmax_default_axis", - // "test_softmax_example", - // { - // "name": "test_softmax_large_number", - // "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment - // }, - "test_sub_bcast", - "test_sub_example", - "test_sub", - // "test_sum_example", - // "test_sum_one_input", - // "test_sum_two_inputs", - // "test_reduce_log_sum_asc_axes", - // "test_reduce_log_sum_default", - // "test_reduce_log_sum_desc_axes", - // "test_reduce_max_default_axes_keepdim_example", - // "test_reduce_max_default_axes_keepdims_random", - // "test_reduce_max_do_not_keepdims_example", - // "test_reduce_max_do_not_keepdims_random", - // "test_reduce_max_keepdims_example", - // "test_reduce_max_keepdims_random", - // "test_reduce_mean_default_axes_keepdims_example", - // "test_reduce_mean_default_axes_keepdims_random", - // "test_reduce_mean_do_not_keepdims_example", - // "test_reduce_mean_do_not_keepdims_random", - // "test_reduce_mean_keepdims_example", - // "test_reduce_mean_keepdims_random", - // "test_reduce_min_default_axes_keepdims_example", - // "test_reduce_min_default_axes_keepdims_random", - // "test_reduce_min_do_not_keepdims_example", - // "test_reduce_min_do_not_keepdims_random", - // "test_reduce_min_keepdims_example", - // "test_reduce_min_keepdims_random", - // { - // "name": "test_reduce_prod_default_axes_keepdims_example", - // "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment - // }, - // "test_reduce_prod_default_axes_keepdims_random", - // "test_reduce_prod_do_not_keepdims_example", - // "test_reduce_prod_do_not_keepdims_random", - // "test_reduce_prod_keepdims_example", - // "test_reduce_prod_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_example", - // "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_example", - // "opset{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_keepdims_example", - // "opset{7,8,9,10,11,12}/test_reduce_sum_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_example", - // "opset{7,8,9,10,11,12}/test_reduce_sum_square_default_axes_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_example", - // 
"opset{7,8,9,10,11,12}/test_reduce_sum_square_do_not_keepdims_random", - // "opset{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_example", - // "opset{7,8,9,10,11,12}/test_reduce_sum_square_keepdims_random", - // "opset{7,8,9,10,11,12}/test_split_variable_parts_default_axis", - // "opset{7,8,9,10,11,12}/test_split_variable_parts_1d", - // "opset{7,8,9,10,11,12}/test_split_variable_parts_2d", - // "opset{7,8,9,10,11,12}/test_split_equal_parts_default_axis", - // "opset{7,8,9,10,11,12}/test_split_equal_parts_1d", - // "opset{7,8,9,10,11,12}/test_split_equal_parts_2d", - "opset{7,8,9}/test_slice", - "opset{7,8,9}/test_slice_default_axes", - "opset{7,8,9}/test_slice_end_out_of_bounds", - "opset{7,8,9}/test_slice_neg", - // "test_slice_start_out_of_bounds", // tensor shape of 0 - // "test_squeeze", - "test_tan_example", - "test_tan", - "test_tanh_example", - "test_tanh", - // "test_tile", - // "test_tile_precomputed", - "test_transpose_all_permutations_0", - "test_transpose_all_permutations_1", - "test_transpose_all_permutations_2", - "test_transpose_all_permutations_3", - "test_transpose_all_permutations_4", - "test_transpose_all_permutations_5", - "test_transpose_default" - // "test_unsqueeze", - // "test_xor_bcast3v1d", - // "test_xor_bcast3v2d", - // "test_xor_bcast4v2d", - // "test_xor_bcast4v3d", - // "test_xor_bcast4v4d", - // "test_xor2d", - // "test_xor3d", - // "test_xor4d" - ], - "ops": [ - "abs.jsonc", - "acos.jsonc", - "add.jsonc", - //"and.jsonc", - "asin.jsonc", - "ceil.jsonc", - "concat.jsonc", - "conv.jsonc", - "cos.jsonc", - "div.jsonc", - //"depth-to-space.jsonc", - //"equal.jsonc", - "exp.jsonc", - "floor.jsonc", - "global-average-pool.jsonc", - "gemm.jsonc", - //"greater.jsonc", - ////"identity.jsonc", - //"image-scaler.jsonc", - //"less.jsonc", - "log.jsonc", - //"matmul.jsonc", - "mul.jsonc", - "neg.jsonc", - //"not.jsonc", - //"or.jsonc", - "leaky-relu.jsonc", - //"reduce-min.jsonc", - "relu.jsonc", - //"pad.jsonc", - //"pad-big.jsonc", - "pow.jsonc", - "pow-big-number.jsonc", - //"reshape.jsonc", - //"softmax.jsonc", - "sin.jsonc", - //"split.jsonc", - "sqrt.jsonc", - "sub.jsonc", - "tan.jsonc", - "transpose.jsonc" - //"xor.jsonc" - ] - }, - "wasm": { - "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], - "node": [ - // Check in node tests that have native Wasm implementations - // (i.e.) 
not tests that rely on the fallback cpu implementations - // Use the 'cpu' level of node tests to test those implementations - "test_add_bcast", - "test_add", - "test_sub_bcast", - "test_sub_example", - "test_sub", - "test_mul_bcast", - "test_mul_example", - "test_mul", - "test_div_bcast", - "test_div_example", - "test_div", - "test_xor_bcast3v1d", - "test_xor_bcast3v2d", - "test_xor_bcast4v2d", - "test_xor_bcast4v3d", - "test_xor_bcast4v4d", - "test_xor2d", - "test_xor3d", - "test_xor4d", - "test_or_bcast3v1d", - "test_or_bcast3v2d", - "test_or_bcast4v2d", - "test_or_bcast4v3d", - "test_or_bcast4v4d", - "test_and_bcast3v1d", - "test_and_bcast3v2d", - "test_and_bcast4v2d", - "test_and_bcast4v3d", - "test_and_bcast4v4d", - "test_and2d", - "test_and3d", - "test_and4d", - "test_prelu_broadcast", - "test_prelu_example", - "test_basic_conv_with_padding", - "test_basic_conv_without_padding", - "test_batchnorm_epsilon", - "test_batchnorm_example", - "opset{10,11,12}/test_cast_STRING_to_FLOAT", - "test_clip_splitbounds", - "test_clip_outbounds", - "test_clip_inbounds", - "test_clip_example", - "test_clip_default_min", - "test_clip_default_max", - "test_clip_default_inbounds", - "test_clip", - "test_conv_with_strides_and_asymmetric_padding", - "test_conv_with_strides_no_padding", - "test_conv_with_strides_padding", - "test_gemm_nobroadcast", - "test_gemm_broadcast", - "test_matmul_2d", - "test_matmul_3d", - "test_matmul_4d", - "test_softmax_axis_0", - "test_softmax_axis_1", - "test_softmax_axis_2", - "test_softmax_default_axis", - "test_softmax_example", - "test_softmax_large_number", - "test_sum_example", - "test_sum_one_input", - "test_sum_two_inputs", - "test_averagepool_1d_default", - "test_averagepool_2d_default", - "test_averagepool_2d_pads", - "test_averagepool_2d_precomputed_pads", - "test_averagepool_2d_precomputed_same_upper", - "test_averagepool_2d_precomputed_strides", - "test_averagepool_2d_same_upper", - "test_averagepool_2d_same_lower", - "test_averagepool_2d_strides", - "test_averagepool_3d_default", - "test_maxpool_1d_default", - "test_maxpool_2d_default", - "test_maxpool_2d_pads", - "test_maxpool_2d_precomputed_pads", - "test_maxpool_2d_precomputed_same_upper", - "test_maxpool_2d_precomputed_strides", - "test_maxpool_2d_same_lower", - "test_maxpool_2d_same_upper", - "test_maxpool_2d_strides", - "test_maxpool_3d_default", - "test_globalaveragepool_precomputed", - "test_globalaveragepool", - "test_globalmaxpool_precomputed", - "test_globalmaxpool", - "test_instancenorm_epsilon", - "test_instancenorm_example" - ], - "ops": [] - }, - "jsep-webgpu": { "onnx": [], "node": [ "test_abs", @@ -1689,5 +1320,104 @@ // "test_xor4d" ], "ops": [] + }, + "wasm": { + "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], + "node": [ + // Check in node tests that have native Wasm implementations + // (i.e.) 
not tests that rely on the fallback cpu implementations + // Use the 'cpu' level of node tests to test those implementations + "test_add_bcast", + "test_add", + "test_sub_bcast", + "test_sub_example", + "test_sub", + "test_mul_bcast", + "test_mul_example", + "test_mul", + "test_div_bcast", + "test_div_example", + "test_div", + "test_xor_bcast3v1d", + "test_xor_bcast3v2d", + "test_xor_bcast4v2d", + "test_xor_bcast4v3d", + "test_xor_bcast4v4d", + "test_xor2d", + "test_xor3d", + "test_xor4d", + "test_or_bcast3v1d", + "test_or_bcast3v2d", + "test_or_bcast4v2d", + "test_or_bcast4v3d", + "test_or_bcast4v4d", + "test_and_bcast3v1d", + "test_and_bcast3v2d", + "test_and_bcast4v2d", + "test_and_bcast4v3d", + "test_and_bcast4v4d", + "test_and2d", + "test_and3d", + "test_and4d", + "test_prelu_broadcast", + "test_prelu_example", + "test_basic_conv_with_padding", + "test_basic_conv_without_padding", + "test_batchnorm_epsilon", + "test_batchnorm_example", + "opset{10,11,12}/test_cast_STRING_to_FLOAT", + "test_clip_splitbounds", + "test_clip_outbounds", + "test_clip_inbounds", + "test_clip_example", + "test_clip_default_min", + "test_clip_default_max", + "test_clip_default_inbounds", + "test_clip", + "test_conv_with_strides_and_asymmetric_padding", + "test_conv_with_strides_no_padding", + "test_conv_with_strides_padding", + "test_gemm_nobroadcast", + "test_gemm_broadcast", + "test_matmul_2d", + "test_matmul_3d", + "test_matmul_4d", + "test_softmax_axis_0", + "test_softmax_axis_1", + "test_softmax_axis_2", + "test_softmax_default_axis", + "test_softmax_example", + "test_softmax_large_number", + "test_sum_example", + "test_sum_one_input", + "test_sum_two_inputs", + "test_averagepool_1d_default", + "test_averagepool_2d_default", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_strides", + "test_averagepool_3d_default", + "test_maxpool_1d_default", + "test_maxpool_2d_default", + "test_maxpool_2d_pads", + "test_maxpool_2d_precomputed_pads", + "test_maxpool_2d_precomputed_same_upper", + "test_maxpool_2d_precomputed_strides", + "test_maxpool_2d_same_lower", + "test_maxpool_2d_same_upper", + "test_maxpool_2d_strides", + "test_maxpool_3d_default", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + "test_globalmaxpool_precomputed", + "test_globalmaxpool", + "test_instancenorm_epsilon", + "test_instancenorm_example" + ], + "ops": [] } } diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 452ef828e768a..f84dcef33fa28 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -106,7 +106,7 @@ async function loadTensors( const outputs: Test.NamedTensor[] = []; let dataFileType: 'none'|'pb'|'npy' = 'none'; - const allowInt64 = ['wasm', 'xnnpack', 'jsep-webgpu'].includes(backendName); + const allowInt64 = ['wasm', 'xnnpack', 'webgpu'].includes(backendName); for (const dataFile of testCase.dataFiles) { const ext = extname(dataFile); @@ -308,7 +308,7 @@ export class TensorResultValidator { this.absoluteThreshold = WEBGL_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGL_THRESHOLD_RELATIVE_ERROR; } - } else if (backend === 'webgpu' || backend === 'jsep-webgpu') { + } else if (backend === 'webgpu') { this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'wasm' || backend 
=== 'xnnpack') { diff --git a/js/web/test/unittests/backends/webgl/test-conv-new.ts b/js/web/test/unittests/backends/webgl/test-conv-new.ts index fa783acb6c4d0..0fddddf58181c 100644 --- a/js/web/test/unittests/backends/webgl/test-conv-new.ts +++ b/js/web/test/unittests/backends/webgl/test-conv-new.ts @@ -832,7 +832,7 @@ function webglConv( if (biasTensor) { inputs.push(biasTensor); } - return (op.impl(webglInferenceHandler!, inputs, op.context) as Tensor[])[0]; + return (op.impl(webglInferenceHandler!, inputs, op.context))[0]; } function cpuConv( inputTensor: Tensor, kernelTensor: Tensor, biasTensor: Tensor|null, autoPad: string|undefined, dilations: number[], diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index de9b004b6c897..9c9472860a072 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -183,7 +183,6 @@ Status IExecutionFrame::GetOrCreateNodeOutputMLValue(const int output_index, int if (shape != nullptr && IsOutput(ort_value_idx)) { VerifyOutputSizes(output_index, node, *shape); } - // printf("before CreateNodeOutputMLValueImpl()\n"); status = CreateNodeOutputMLValueImpl(*p_ort_value, ort_value_idx, shape); } } @@ -520,7 +519,6 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va // try to allocated on pre-allocated big chunk. const auto& per_alloc_plan = GetAllocationPlan(ort_value_index); - // printf("{{ before check memory patterns:\n"); if (mem_patterns_ && per_alloc_plan.alloc_kind != AllocKind::kAllocateOutput && per_alloc_plan.alloc_kind != AllocKind::kAllocatedExternally) { auto pattern = mem_patterns_->GetPatterns(location); @@ -528,7 +526,6 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va auto block = pattern->GetBlock(ort_value_index); // if block not found, fall back to default behavior if (block) { - // printf("{{ memory patterns - found block.\n"); auto it = buffers_.find(location); if (it != buffers_.end()) { // if the block is not correct, log message then fall back to default behavior @@ -729,9 +726,6 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_ #endif AllocKind alloc_kind = per_alloc_plan.alloc_kind; -#ifndef NDEBUG - printf("{{alloc_kind}}=%d\n", (int)alloc_kind); -#endif switch (alloc_kind) { // Right now for kAllocate and kAllocateOutput we are using same approach. // In the future we may want to have different way to handle it. 
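Note on the TensorResultValidator hunk above: after the backend rename, 'webgpu' selects WEBGPU_THRESHOLD_ABSOLUTE_ERROR and WEBGPU_THRESHOLD_RELATIVE_ERROR. A minimal sketch of one common way such paired thresholds are applied when comparing tensor elements (illustrative names only, not the exact test-runner logic):

    // illustrative sketch: accept a value if it is within the absolute
    // threshold OR within the relative threshold scaled by the expected value
    const matchesWithTolerance =
        (actual: number, expected: number, absoluteThreshold: number, relativeThreshold: number): boolean => {
          const delta = Math.abs(actual - expected);
          return delta <= absoluteThreshold || delta <= relativeThreshold * Math.abs(expected);
        };

Using both bounds lets small magnitudes be judged by the absolute term and large magnitudes by the relative term, which is why each backend carries its own pair of constants.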
diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index 23dd9176cede9..5bc5dcdbd7696 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -32,7 +32,6 @@ IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup) const { std::vector> result; for (const auto& node : graph.Nodes()) { - //printf("IExecutionProvider::GetCapability() calling on node: [%s][%s][%s]\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str()); if (const KernelCreateInfo* kernel_create_info = kernel_lookup.LookUpKernel(node); kernel_create_info != nullptr) { std::unique_ptr sub_graph = std::make_unique(); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 15bb6ea120f7f..d34f685415f27 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -106,10 +106,6 @@ static bool TryAssignSingleNode(Graph& graph, assert(indexed_sub_graph.GetMetaDef() == nullptr && indexed_sub_graph.nodes.size() == 1); auto* node = graph.GetNode(indexed_sub_graph.nodes[0]); - // if (node->Domain() == kMSInternalNHWCDomain && node->Op() == nullptr ) { - // printf("TryAssignSingleNode() calling SetOpSchemaFromRegistryForNode()\n"); - // graph.SetOpSchemaFromRegistryForNode(*node); - // } if (nullptr != node && node->GetExecutionProviderType().empty()) { // The node was not fused or assigned. Assign it to . node->SetExecutionProviderType(provider_type); diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h index 6c52129965bc4..2b4d3ce81623a 100644 --- a/onnxruntime/core/framework/kernel_lookup.h +++ b/onnxruntime/core/framework/kernel_lookup.h @@ -30,23 +30,14 @@ class KernelLookup final : public IExecutionProvider::IKernelLookup { const KernelCreateInfo* LookUpKernel(const Node& node) const override { const KernelCreateInfo* kernel_create_info{}; -#ifndef NDEBUG - printf(" LookUpKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), provider_type_.c_str()); -#endif for (const auto& registry : kernel_registries_) { const auto lookup_status = registry->TryFindKernel(node, provider_type_, kernel_type_str_resolver_, &kernel_create_info); if (lookup_status.IsOK() && kernel_create_info != nullptr) { -#ifndef NDEBUG - printf(" - found\n"); -#endif return kernel_create_info; } } -#ifndef NDEBUG - printf(" - not found\n"); -#endif return nullptr; } diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index efa81b5a9f98c..e2bc7c3e3ce6f 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -166,9 +166,6 @@ Status KernelRegistry::TryFindKernel(const Node& node, const auto& node_provider = node.GetExecutionProviderType(); const auto& expected_provider = (node_provider.empty() ? 
exec_provider : node_provider); -#ifndef NDEBUG - printf(" KernelRegistry::TryFindKernel() calling on node: [%s][%s][%s], provider type=%s\n", node.Domain().c_str(), node.OpType().c_str(), node.Name().c_str(), expected_provider.c_str()); -#endif auto range = kernel_creator_fn_map_.equal_range(GetMapKey(node.OpType(), node.Domain(), expected_provider)); if (out) *out = nullptr; @@ -178,9 +175,6 @@ Status KernelRegistry::TryFindKernel(const Node& node, std::string error_str; if (VerifyKernelDef(node, *i->second.kernel_def, kernel_type_str_resolver, error_str)) { if (out) *out = &i->second; -#ifndef NDEBUG - printf(" KernelRegistry::TryFindKernel() OK\n"); -#endif return Status::OK(); } verify_kernel_def_error_strs.push_back(error_str); @@ -197,15 +191,9 @@ Status KernelRegistry::TryFindKernel(const Node& node, oss << ")"; VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); -#ifndef NDEBUG - printf(" KernelRegistry::TryFindKernel() failed: %s\n",oss.str().c_str()); -#endif return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } -#ifndef NDEBUG - printf(" KernelRegistry::TryFindKernel() failed: Kernel not found\n"); -#endif return Status(common::ONNXRUNTIME, common::FAIL, "Kernel not found"); } diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 9ea751b455046..facce93cde798 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -1444,9 +1444,6 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_stringGetMemoryInfo().RecordInitializerAllocInfo(GetInitializedTensors()); #endif -#ifndef NDEBUG - printf("after SaveInitializedTensors()\n"); -#endif // remove weights from the graph now to save memory but in many cases it won't save memory, if the tensor was // preallocated with the some other tensors in a single 'allocate' call, which is very common. From 39fabc3e9558d9fa0d510c636fa0cb861c6d52f4 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Feb 2023 16:18:53 -0800 Subject: [PATCH 40/81] Revert "[js] support 2 flags from session options" This reverts commit f7df9bab6164e75a9802c2a65ac073367688426b. --- js/common/lib/inference-session.ts | 8 ------ js/web/lib/wasm/binding/ort-wasm.d.ts | 2 +- js/web/lib/wasm/session-options.ts | 7 +----- js/web/script/test-runner-cli-args.ts | 29 --------------------- js/web/script/test-runner-cli.ts | 2 -- js/web/test/test-main.ts | 5 +--- js/web/test/test-runner.ts | 12 +++------ js/web/test/test-types.ts | 1 - onnxruntime/core/graph/model.cc | 36 --------------------------- onnxruntime/wasm/api.cc | 7 +----- onnxruntime/wasm/api.h | 4 +-- 11 files changed, 9 insertions(+), 104 deletions(-) diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 638cb90f36716..1f2f855a3e487 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -94,14 +94,6 @@ export declare namespace InferenceSession { */ executionMode?: 'sequential'|'parallel'; - /** - * Optimized model file path. - * - * If this setting is specified, the optimized model will be dumped. In browser, a blob will be created - * with a pop-up window. - */ - optimizedModelFilePath?: string; - /** * Wether enable profiling. 
* diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 2e51d3257ec9c..efb73c9943518 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -47,7 +47,7 @@ export interface OrtWasmModule extends EmscriptenModule { _OrtCreateSessionOptions( graphOptimizationLevel: number, enableCpuMemArena: boolean, enableMemPattern: boolean, executionMode: number, enableProfiling: boolean, profileFilePrefix: number, logId: number, logSeverityLevel: number, - logVerbosityLevel: number, optimizedModelFilePath: number): number; + logVerbosityLevel: number): number; _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): void; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index e41019b1023c4..87005346784f4 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -131,15 +131,10 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n sessionOptions.enableProfiling = false; } - let optimizedModelFilePathOffset = 0; - if (typeof options?.optimizedModelFilePath === 'string') { - optimizedModelFilePathOffset = allocWasmString(options.optimizedModelFilePath, allocs); - } - sessionOptionsHandle = wasm._OrtCreateSessionOptions( graphOptimizationLevel, !!sessionOptions.enableCpuMemArena!, !!sessionOptions.enableMemPattern!, executionMode, !!sessionOptions.enableProfiling!, 0, logIdDataOffset, sessionOptions.logSeverityLevel!, - sessionOptions.logVerbosityLevel!, optimizedModelFilePathOffset); + sessionOptions.logVerbosityLevel!); if (sessionOptionsHandle === 0) { throw new Error('Can\'t create session options'); } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 87507acc60d9b..1dbe095c7e90f 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -51,10 +51,6 @@ Options: This flag can be used with a number as value, specifying the total count of test cases to run. The test cases may be used multiple times. Default value is 10. -c, --file-cache Enable file cache. -*** Session Options *** - -u=<...>, --optimized-model-file-path=<...> Specify whether to dump the optimized model. - -o=<...>, --graph-optimization-level=<...> Specify graph optimization level. - Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'all'. 
*** Logging Options *** --log-verbose=<...> Set log level to verbose @@ -155,16 +151,6 @@ export interface TestRunnerCliArgs { */ times?: number; - /** - * whether to dump the optimized model - */ - optimizedModelFilePath?: string; - - /** - * Specify graph optimization level - */ - graphOptimizationLevel: 'disabled'|'basic'|'extended'|'all'; - cpuOptions?: InferenceSession.CpuExecutionProviderOption; cudaOptions?: InferenceSession.CudaExecutionProviderOption; cudaFlags?: Record; @@ -403,19 +389,6 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs logConfig.push({category: 'TestRunner.Perf', config: {minimalSeverity: 'verbose'}}); } - // Option: -u, --optimized-model-file-path - const optimizedModelFilePath = args['optimized-model-file-path'] || args.u || undefined; - if (typeof optimizedModelFilePath !== 'undefined' && typeof optimizedModelFilePath !== 'string') { - throw new Error('Flag "optimized-model-file-path" need to be either empty or a valid file path.'); - } - - // Option: -o, --graph-optimization-level - const graphOptimizationLevel = args['graph-optimization-level'] || args.o || 'all'; - if (typeof graphOptimizationLevel !== 'string' || - ['disabled', 'basic', 'extended', 'all'].indexOf(graphOptimizationLevel) === -1) { - throw new Error(`graph optimization level is invalid: ${graphOptimizationLevel}`); - } - // Option: -c, --file-cache const fileCache = parseBooleanArg(args['file-cache'] || args.c, false); @@ -443,8 +416,6 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs logConfig, profile, times: perf ? times : undefined, - optimizedModelFilePath, - graphOptimizationLevel: graphOptimizationLevel as TestRunnerCliArgs['graphOptimizationLevel'], fileCache, cpuOptions, webglOptions, diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 68eda8aea17ff..ad6e5a9a92675 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -148,8 +148,6 @@ run({ log: args.logConfig, profile: args.profile, options: { - sessionOptions: - {graphOptimizationLevel: args.graphOptimizationLevel, optimizedModelFilePath: args.optimizedModelFilePath}, debug: args.debug, cpuOptions: args.cpuOptions, webglOptions: args.webglOptions, diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 345faf509504d..b85ed541372ad 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -106,10 +106,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { - console.log(`[_BEFORE_PREPARE_SESSION_] ${performance.now()}`); - context = await ModelTestContext.create( - test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); - console.log(`[_AFTER_PREPARE_SESSION_] ${performance.now()}`); + context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile); }); after('release session', () => { diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index f84dcef33fa28..54da471fb95ce 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -147,7 +147,7 @@ async function loadTensors( } async function initializeSession( - modelFilePath: string, backendHint: string, profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions, + modelFilePath: string, backendHint: string, profile: boolean, fileCache?: FileCacheBuffer): Promise { const preloadModelData: Uint8Array|undefined = fileCache && fileCache[modelFilePath] ? 
fileCache[modelFilePath] : undefined; @@ -157,8 +157,7 @@ async function initializeSession( preloadModelData ? ` [preloaded(${preloadModelData.byteLength})]` : ''}`); const profilerConfig = profile ? {maxNumberEvents: 65536} : undefined; - const sessionConfig = - {...sessionOptions, executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile}; + const sessionConfig = {executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile}; let session: ort.InferenceSession; try { @@ -231,9 +230,7 @@ export class ModelTestContext { /** * create a ModelTestContext object that used in every test cases in the given ModelTest. */ - static async create( - modelTest: Test.ModelTest, profile: boolean, - sessionOptions?: ort.InferenceSession.SessionOptions): Promise { + static async create(modelTest: Test.ModelTest, profile: boolean): Promise { if (this.initializing) { throw new Error('cannot create a ModelTestContext object when the previous creation is not done'); } @@ -242,8 +239,7 @@ export class ModelTestContext { this.initializing = true; const initStart = now(); - const session = - await initializeSession(modelTest.modelUrl, modelTest.backend!, profile, sessionOptions || {}, this.cache); + const session = await initializeSession(modelTest.modelUrl, modelTest.backend!, profile, this.cache); const initEnd = now(); for (const testCase of modelTest.cases) { diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts index 966b1e704a5b7..a7ab9d7025706 100644 --- a/js/web/test/test-types.ts +++ b/js/web/test/test-types.ts @@ -104,7 +104,6 @@ export declare namespace Test { */ export interface Options { debug?: boolean; - sessionOptions?: InferenceSession.SessionOptions; cpuOptions?: InferenceSession.CpuExecutionProviderOption; cpuFlags?: Record; cudaOptions?: InferenceSession.CudaExecutionProviderOption; diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 34ab1ccfc53c0..8af9f99ed1d44 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -29,10 +29,6 @@ #include "core/graph/function_utils.h" #endif -#if defined(__wasm__) -#include -#endif - using namespace ONNX_NAMESPACE; using namespace onnxruntime; using namespace onnxruntime::common; @@ -504,37 +500,6 @@ static Status LoadModel(const T& file_path, std::shared_ptr& p_model, template static Status SaveModel(Model& model, const T& file_path) { -#if defined(__wasm__) - ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); - auto model_proto = model.ToProto(); - auto buffer_size = model_proto.ByteSizeLong(); - void* buffer = malloc(buffer_size); - model_proto.SerializeToArray(buffer, buffer_size); - - EM_ASM(({ - const buffer = $0; - const buffer_size = $1; - const file_path = UTF8ToString($2); - const bytes = new Uint8Array(buffer_size); - bytes.set(HEAPU8.subarray(buffer, buffer + buffer_size)); - if (typeof process == 'object' && typeof process.versions == 'object' && typeof process.versions.node == 'string') { - // Node.js - require('fs').writeFileSync(file_path, bytes); - } else { - // Browser - const file = new File([bytes], file_path, {type: "application/octet-stream" }); - const url = URL.createObjectURL(file); - window.open(url, '_blank'); - } - }), - reinterpret_cast(buffer), - static_cast(buffer_size), - reinterpret_cast(file_path.c_str())); - - free(buffer); - return Status::OK(); - -#else int fd; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); @@ -553,7 +518,6 @@ static Status 
SaveModel(Model& model, const T& file_path) { return status; } return Env::Default().FileClose(fd); -#endif } #ifdef _WIN32 diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 69b179ddd6969..d24cbd495d1fa 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -68,15 +68,10 @@ OrtSessionOptions* OrtCreateSessionOptions(size_t graph_optimization_level, const char* /*profile_file_prefix*/, const char* log_id, size_t log_severity_level, - size_t log_verbosity_level, - const char* optimized_model_filepath) { + size_t log_verbosity_level) { OrtSessionOptions* session_options = nullptr; RETURN_NULLPTR_IF_ERROR(CreateSessionOptions, &session_options); - if (optimized_model_filepath) { - RETURN_NULLPTR_IF_ERROR(SetOptimizedModelFilePath, session_options, optimized_model_filepath); - } - // assume that a graph optimization level is checked and properly set at JavaScript RETURN_NULLPTR_IF_ERROR(SetSessionGraphOptimizationLevel, session_options, diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index 80466ecd871c4..d3435f2958a17 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -46,7 +46,6 @@ int EMSCRIPTEN_KEEPALIVE OrtInit(int num_threads, int logging_level); * @param log_id logger id for session output * @param log_severity_level verbose, info, warning, error or fatal * @param log_verbosity_level vlog level - * @param optimized_model_filepath filepath of the optimized model to dump. * @returns a pointer to a session option handle and must be freed by calling OrtReleaseSessionOptions(). */ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t graph_optimization_level, @@ -57,8 +56,7 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t const char* profile_file_prefix, const char* log_id, size_t log_severity_level, - size_t log_verbosity_level, - const char* optimized_model_filepath); + size_t log_verbosity_level); /** * append an execution provider for a session. 
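Note on the revert above: the removed session-options plumbing relied on allocWasmString to pass the file path across the JS/wasm boundary. The helper itself is not shown in this patch, but the marshalling pattern it implements looks roughly like the sketch below — encode the string as NUL-terminated UTF-8 on the wasm heap, hand the offset to the C API, and record the allocation so the caller can free it afterwards. The parameter shape here is assumed (the real helper in js/web/lib/wasm closes over the module instead of taking it as an argument):

    // sketch of the allocWasmString pattern, assuming Emscripten's
    // lengthBytesUTF8/stringToUTF8 helpers are exported on the module
    interface WasmHeapApi {
      lengthBytesUTF8(s: string): number;
      stringToUTF8(s: string, ptr: number, maxBytes: number): number;
      _malloc(size: number): number;
      _free(ptr: number): void;
    }

    const allocWasmString = (wasm: WasmHeapApi, data: string, allocs: number[]): number => {
      const dataLength = wasm.lengthBytesUTF8(data) + 1;  // +1 for the NUL terminator
      const dataOffset = wasm._malloc(dataLength);
      wasm.stringToUTF8(data, dataOffset, dataLength);    // writes NUL-terminated UTF-8 into the heap
      allocs.push(dataOffset);                            // caller is expected to _free() every offset when done
      return dataOffset;
    };

The same pattern is what passes logId and the other string-valued options to _OrtCreateSessionOptions, which is why removing optimizedModelFilePath only drops one allocation site rather than the helper.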
From d4a07505637645ce7e71e602ed623a3979f62b19 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Feb 2023 16:32:54 -0800 Subject: [PATCH 41/81] allow reuse allocator --- .../providers/js/js_execution_provider.cc | 57 +++++++++++++------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index edf2eef5cbb66..6f327a9fe7e90 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -310,23 +310,46 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) printf("JsExecutionProvider::RegisterAllocator() \n"); #endif - AllocatorCreationInfo cpuInputAllocatorCreationInfo([&](int) { - return std::make_unique(); - }); - InsertAllocator(CreateAllocator(cpuInputAllocatorCreationInfo)); - - AllocatorCreationInfo cpuOutputAllocatorCreationInfo([&](int) { - return std::make_unique(); - }); - InsertAllocator(CreateAllocator(cpuOutputAllocatorCreationInfo)); - - // use_arena might have some issue, for this to work need to change - // https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/framework/execution_frame.cc#L507 - AllocatorCreationInfo memory_info( - [&](int) { return std::make_unique(); }, 0, false); - - AllocatorPtr allocator = CreateAllocator(memory_info); - InsertAllocator(allocator); + OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; + auto cpu_input_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeCPUInput); + if (!cpu_input_alloc) { + cpu_input_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUInput, cpu_device); + if (!cpu_input_alloc) { + AllocatorCreationInfo cpuInputAllocatorCreationInfo([&](int) { + return std::make_unique(); + }); + cpu_input_alloc = CreateAllocator(cpuInputAllocatorCreationInfo); + allocator_manager.InsertAllocator(cpu_input_alloc); + } + InsertAllocator(cpu_input_alloc); + } + + auto cpu_output_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeCPUOutput); + if (!cpu_output_alloc) { + cpu_output_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUOutput, cpu_device); + if (!cpu_output_alloc) { + AllocatorCreationInfo cpuOutputAllocatorCreationInfo([&](int) { + return std::make_unique(); + }); + cpu_output_alloc = CreateAllocator(cpuOutputAllocatorCreationInfo); + allocator_manager.InsertAllocator(cpu_output_alloc); + } + InsertAllocator(cpu_output_alloc); + } + + OrtDevice custom_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0}; + auto custom_alloc = GetAllocator(custom_device.Id(), OrtMemTypeDefault); + if (!custom_alloc) { + custom_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, custom_device); + if (!custom_alloc) { + AllocatorCreationInfo customAllocatorCreationInfo([&](int) { + return std::make_unique(); + }); + custom_alloc = CreateAllocator(customAllocatorCreationInfo); + allocator_manager.InsertAllocator(custom_alloc); + } + InsertAllocator(custom_alloc); + } } std::vector> JsExecutionProvider::GetCapability( From acabe32801106dc5ce1ff9049c5fa6f3c34bec8e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Feb 2023 17:28:29 -0800 Subject: [PATCH 42/81] remove js_kernel_lookup --- .../providers/js/js_execution_provider.cc | 41 +------------------ .../core/providers/js/js_kernel_lookup.cc | 18 -------- .../core/providers/js/js_kernel_lookup.h | 22 ---------- 3 files 
changed, 1 insertion(+), 80 deletions(-) delete mode 100644 onnxruntime/core/providers/js/js_kernel_lookup.cc delete mode 100644 onnxruntime/core/providers/js/js_kernel_lookup.h diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 6f327a9fe7e90..e691605be1066 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -15,7 +15,6 @@ #include "core/providers/shared/node_unit/node_unit.h" #include "allocator.h" #include "data_transfer.h" -#include "js_kernel_lookup.h" namespace onnxruntime { @@ -305,11 +304,6 @@ JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) // implement RegisterAllocator to test/validate sharing the CPU EP's allocator void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { - -#ifndef NDEBUG - printf("JsExecutionProvider::RegisterAllocator() \n"); -#endif - OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; auto cpu_input_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeCPUInput); if (!cpu_input_alloc) { @@ -355,40 +349,7 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) std::vector> JsExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup) const { - - auto lookup = JsKernelLookup{kernel_lookup}; - auto list = IExecutionProvider::GetCapability(graph, lookup); -#ifndef NDEBUG - printf("JsExecutionProvider::GetCapability() results:\n"); - - for (size_t i = 0; i < list.size(); i++) { - auto &nodes = list[i]->sub_graph->nodes; - printf(" subgraph %zu: %zu node(s)\n", i, list[i]->sub_graph->nodes.size()); - for (size_t j = 0; j < nodes.size(); j++) { - auto node_index = nodes[j]; - auto *node = graph.GetNode(node_index); - auto *kernel_info = lookup.LookUpKernel(*node); - - (void)(node_index); - (void)(node); - (void)(kernel_info); - printf(" node[%zu]: [%s][%s][%s]\n", node_index, node->Domain().c_str(), node->OpType().c_str(), node->Name().c_str()); - - // if (node->OpType() == "Clip" && node->InputDefs().size() == 3) { - // printf("Clip node: [%s] %s, %s\n", node->Name().c_str(), node->InputDefs()[1]->Name().c_str(), node->InputDefs()[2]->Name().c_str()); - // if (!graph.IsConstantInitializer(node->InputDefs()[1]->Name(), true) || - // !graph.IsConstantInitializer(node->InputDefs()[2]->Name(), true)) { - // printf("--erasing\n"); - // nodes.erase(nodes.begin() + j); - // j--; - // continue; - // } - // } - } - } -#endif - - return list; + return IExecutionProvider::GetCapability(graph, kernel_lookup); } std::shared_ptr JsExecutionProvider::GetKernelRegistry() const { diff --git a/onnxruntime/core/providers/js/js_kernel_lookup.cc b/onnxruntime/core/providers/js/js_kernel_lookup.cc deleted file mode 100644 index 18108589bfcd1..0000000000000 --- a/onnxruntime/core/providers/js/js_kernel_lookup.cc +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "js_kernel_lookup.h" - -namespace onnxruntime { -namespace js { - -const KernelCreateInfo* JsKernelLookup::LookUpKernel(const Node& node) const { - // if (node.OpType() == "Clip") { - // node. 
- // } - - return orig_.LookUpKernel(node); -} - -} // namespace js -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_kernel_lookup.h b/onnxruntime/core/providers/js/js_kernel_lookup.h deleted file mode 100644 index aad3da26778fe..0000000000000 --- a/onnxruntime/core/providers/js/js_kernel_lookup.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/framework/execution_provider.h" -#include "core/framework/op_kernel.h" - -namespace onnxruntime { -namespace js { - -class JsKernelLookup : public IExecutionProvider::IKernelLookup { - public: - JsKernelLookup(const IKernelLookup& orig): orig_(orig) { - } - const KernelCreateInfo* LookUpKernel(const Node& node) const override; - private: - const IKernelLookup& orig_; -}; - -} // namespace js -} // namespace onnxruntime From a95384802dbfdf8a4fa38bdea0b6b0b19556d406 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Feb 2023 17:37:53 -0800 Subject: [PATCH 43/81] remove some std output --- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 431e3fdbf1969..b5d661f0122b4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -170,8 +170,6 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => (attributes.autoPad === 'SAME_UPPER' || attributes.autoPad === 'SAME_LOWER' || attributes.autoPad === 'VALID'))) { // return conv2dByMatMul({x, filter, convInfo, backend, bias, activation, preluActivationWeights, leakyreluAlpha}); - // eslint-disable-next-line no-console - console.log('[_CONV_]conv2dByMatMul'); context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); return 0; } @@ -186,8 +184,6 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => // if (workgroupsBy32x32 <= thresholdToIncreaseWorkgroups) { // // return conv2dWithIm2Col({x, filter, convInfo, backend, bias, preluActivationWeights, leakyreluAlpha, // // activation}); - // // eslint-disable-next-line no-console - // console.log('[_CONV_]conv2dWithIm2Col'); // context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); // return 0; // } @@ -205,8 +201,6 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => // inputs.push(context.inputs[2]); // } // } - // eslint-disable-next-line no-console - // console.log('[_CONV_]Conv2DMMProgram'); // STEP.1: transpose weight const transposedWeight = (context.customData.wT as TensorView | undefined) ?? 
From b931ef10032f655fec26d630c575ee12601d9089 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Feb 2023 18:05:49 -0800 Subject: [PATCH 44/81] disable arena for js custom allocator --- onnxruntime/core/providers/js/js_execution_provider.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index e691605be1066..c4b5b22f0530a 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -338,7 +338,7 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) if (!custom_alloc) { AllocatorCreationInfo customAllocatorCreationInfo([&](int) { return std::make_unique(); - }); + }, 0, false); custom_alloc = CreateAllocator(customAllocatorCreationInfo); allocator_manager.InsertAllocator(custom_alloc); } From 8f87dd9b13fd49c6827527f2faec19e20475deb5 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 28 Feb 2023 15:08:25 -0800 Subject: [PATCH 45/81] fix parameter of GetAllocator() --- onnxruntime/core/providers/js/js_execution_provider.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index c4b5b22f0530a..3b9443d1e7c99 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -305,7 +305,7 @@ JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) // implement RegisterAllocator to test/validate sharing the CPU EP's allocator void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; - auto cpu_input_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeCPUInput); + auto cpu_input_alloc = GetAllocator(OrtMemTypeCPUInput); if (!cpu_input_alloc) { cpu_input_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUInput, cpu_device); if (!cpu_input_alloc) { @@ -318,7 +318,7 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) InsertAllocator(cpu_input_alloc); } - auto cpu_output_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeCPUOutput); + auto cpu_output_alloc = GetAllocator(OrtMemTypeCPUOutput); if (!cpu_output_alloc) { cpu_output_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUOutput, cpu_device); if (!cpu_output_alloc) { @@ -332,7 +332,7 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) } OrtDevice custom_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0}; - auto custom_alloc = GetAllocator(custom_device.Id(), OrtMemTypeDefault); + auto custom_alloc = GetAllocator(OrtMemTypeDefault); if (!custom_alloc) { custom_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, custom_device); if (!custom_alloc) { From df225fbc12179b3473280a95fa322e7944dc82a2 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 28 Feb 2023 17:49:22 -0800 Subject: [PATCH 46/81] remove bb.bat and br.bat --- bb.bat | 25 ------------------------- br.bat | 25 ------------------------- 2 files changed, 50 deletions(-) delete mode 100644 bb.bat delete mode 100644 br.bat diff --git a/bb.bat b/bb.bat deleted file mode 100644 index ff5c14fa0abd5..0000000000000 --- 
a/bb.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -if ["%~1"]==["--clean"] ( - if exist "%~dp0build\Windows\Debug" ( - rd /s /q %~dp0build\Windows\Debug - ) -) - -setlocal -set PATH=C:\Program Files\Git\usr\bin;%PATH% - -if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( - set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe -) else ( - set protoc_path_flag= -) - -call .\build.bat --config Debug --skip_submodule_sync --skip_tests --build_wasm --use_xnnpack --enable_wasm_simd --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 - -IF %ERRORLEVEL% == 0 ( -copy /Y .\build\Windows\Debug\ort-wasm-simd.js .\js\web\lib\wasm\binding\ort-wasm.js -copy /Y .\build\Windows\Debug\ort-wasm-simd.wasm .\js\web\dist\ -) - -endlocal diff --git a/br.bat b/br.bat deleted file mode 100644 index 8642d417f0d30..0000000000000 --- a/br.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -if ["%~1"]==["--clean"] ( - if exist "%~dp0build\Windows\Release" ( - rd /s /q %~dp0build\Windows\Release - ) -) - -setlocal -set PATH=C:\Program Files\Git\usr\bin;%PATH% - -if exist "%~dp0build\Windows\host_protoc\Release\protoc.exe" ( - set protoc_path_flag=--path_to_protoc_exe %~dp0build\Windows\host_protoc\Release\protoc.exe -) else ( - set protoc_path_flag= -) - -call .\build.bat --config Release --skip_submodule_sync --skip_tests --disable_wasm_exception_catching --disable_rtti --build_wasm --use_xnnpack --enable_wasm_simd --use_js --cmake_generator "Visual Studio 17 2022" %protoc_path_flag% --target onnxruntime_webassembly - -IF %ERRORLEVEL% == 0 ( -copy /Y .\build\Windows\Release\ort-wasm-simd.js .\js\web\lib\wasm\binding\ort-wasm.js -copy /Y .\build\Windows\Release\ort-wasm-simd.wasm .\js\web\dist\ -) - -endlocal From 09b74396877c1ebc423df35f7d70ef637953ea7c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 1 Mar 2023 16:20:15 -0800 Subject: [PATCH 47/81] remove unused comments --- .../providers/js/js_execution_provider.cc | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 3b9443d1e7c99..db3b44d995327 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -161,14 +161,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnn class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool); -// class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Softmax); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax); -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, uint8_t, QLinearConv); -// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, int8_t, QLinearConv); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, QLinearAveragePool); -// class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, -// kDynamicDomainByCreate, 1, 
QLinearSoftmax); std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -270,18 +263,6 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - // // layout insensitive, use ONNX-domain directly - // BuildKernelCreateInfo< - // ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax)>, - // BuildKernelCreateInfo< - // ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Softmax)>, - - // // quantization op - // KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv), - // KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv), - // KERNEL_CREATE_INFO(1, QLinearAveragePool), - // BuildKernelCreateInfo< - // ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax)>, }; for (auto& function_table_entry : function_table) { From 0d0b0a17dd43fdec7098c14ee6e02ad9df3743fc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 9 Mar 2023 11:32:59 -0800 Subject: [PATCH 48/81] use target_link_options --- cmake/onnxruntime_webassembly.cmake | 69 ++++++++++++++++++----------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 7d98a42ca5fc4..ac17088b9fa84 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -205,56 +205,75 @@ else() set(EXPORTED_FUNCTIONS "_malloc,_free") endif() - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_FLAGS " \ - -s \"EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}\" \ - -s \"EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}\" \ - -s MAXIMUM_MEMORY=4294967296 \ - -s WASM=1 \ - -s EXIT_RUNTIME=0 \ - -s ALLOW_MEMORY_GROWTH=1 \ - -s MODULARIZE=1 \ - -s EXPORT_ALL=0 \ - -s LLD_REPORT_UNDEFINED \ - -s VERBOSE=0 \ - -s FILESYSTEM=0 \ - ${WASM_API_EXCEPTION_CATCHING} \ - --no-entry") + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:-s EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}" + "SHELL:-s EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}" + "SHELL:-s MAXIMUM_MEMORY=4294967296" + "SHELL:-s WASM=1" + "SHELL:-s EXIT_RUNTIME=0" + "SHELL:-s ALLOW_MEMORY_GROWTH=1" + "SHELL:-s MODULARIZE=1" + "SHELL:-s EXPORT_ALL=0" + "SHELL:-s LLD_REPORT_UNDEFINED" + "SHELL:-s VERBOSE=0" + "SHELL:-s FILESYSTEM=0" + ${WASM_API_EXCEPTION_CATCHING} + --no-entry + ) if (onnxruntime_USE_JS) - target_compile_definitions(onnxruntime_webassembly PRIVATE -DUSE_JS=1) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS - " --pre-js \"${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js\" -s ASYNCIFY=1 -s ASYNCIFY_STACK_SIZE=65536") + target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JS=1) + target_link_options(onnxruntime_webassembly PRIVATE + --pre-js "${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js" + "SHELL:-s ASYNCIFY=1" + "SHELL:-s ASYNCIFY_STACK_SIZE=65536" + ) endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) foreach(setting IN LISTS onnxruntime_EMSCRIPTEN_SETTINGS) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS - " -s ${setting}") + target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s ${setting}") endforeach() endif() if (CMAKE_BUILD_TYPE STREQUAL "Debug") - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -s ASSERTIONS=2 -s SAFE_HEAP=1 -s STACK_OVERFLOW_CHECK=1 -s DEMANGLE_SUPPORT=1") + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:-s 
ASSERTIONS=2" + "SHELL:-s SAFE_HEAP=1" + "SHELL:-s STACK_OVERFLOW_CHECK=1" + "SHELL:-s DEMANGLE_SUPPORT=1" + ) else() - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -s ASSERTIONS=0 -s SAFE_HEAP=0 -s STACK_OVERFLOW_CHECK=0 -s DEMANGLE_SUPPORT=0 --closure 1") + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:-s ASSERTIONS=0" + "SHELL:-s SAFE_HEAP=0" + "SHELL:-s STACK_OVERFLOW_CHECK=0" + "SHELL:-s DEMANGLE_SUPPORT=0" + --closure 1 + ) endif() # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions. - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -s DISABLE_EXCEPTION_THROWING=0") + target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s DISABLE_EXCEPTION_THROWING=0") if (onnxruntime_ENABLE_WEBASSEMBLY_PROFILING) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --profiling --profiling-funcs") + target_link_options(onnxruntime_webassembly PRIVATE --profiling --profiling-funcs) endif() if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -s EXPORT_NAME=ortWasmThreaded -s USE_PTHREADS=1") + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:-s EXPORT_NAME=ortWasmThreaded" + "SHELL:-s USE_PTHREADS=1" + ) if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME "ort-wasm-simd-threaded") else() set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME "ort-wasm-threaded") endif() else() - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -s EXPORT_NAME=ortWasm") + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:-s EXPORT_NAME=ortWasm" + ) if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME "ort-wasm-simd") else() From 5c8c14afe74274ad9ac4b621233cb7c87af34d03 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 15 Mar 2023 17:10:54 -0700 Subject: [PATCH 49/81] resolve comments #2 --- js/web/lib/wasm/jsep/backend-webgpu.ts | 21 ++++++++++++++++----- js/web/lib/wasm/jsep/webgpu/types.ts | 5 +++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 6ce156142bfa4..ab6c122ee4d2a 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -7,8 +7,15 @@ import {TensorView} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; -import {ComputeContext, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; - +import {ComputeContext, ComputeContextInputsOutputsMapping, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; + +/** + * get a unique key representing the program from the program info,input shapes and types. + * + * @returns a unique key is a shorter string than the shader source, which contains all the information to identify a + * program. if the key is the same, the program shader source should be the same, so we can reuse the program. 
+ * + */ const getProgramInfoUniqueKey = (programInfo: ProgramInfo|ProgramInfoLoader, inputTensorShapes: ReadonlyArray, inputGpuDataTypes: readonly GpuDataType[]): string => { @@ -59,6 +66,8 @@ export class WebGpuBackend { maxStorageBufferBindingSize: adapter.limits.maxStorageBufferBindingSize, } }; + // WebGPU Spec: Timestamp Queries Inside Passes + // https://github.com/gpuweb/gpuweb/blob/main/proposals/timestamp-query-inside-passes.md if (adapter.features.has('timestamp-query-inside-passes') && env.webgpu.profilingMode === 'default') { this.profilingEnabled = true; // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -148,19 +157,21 @@ export class WebGpuBackend { // check ouput indices const validatedOutputIndices = outputIndices.length === 0 ? programInfo.outputs.map((_, i) => i) : outputIndices; if (validatedOutputIndices.length !== programInfo.outputs.length) { - throw new Error(`Output size must be equal to ${programInfo.outputs.length}.`); + throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${programInfo.outputs.length}.`); } // create info for outputs const outputTensorViews: TensorView[] = []; const outputDatas: GpuData[] = []; for (let i = 0; i < programInfo.outputs.length; ++i) { + // value -1 and -2 are used for creating temporary and persistent outputs. so -2, -1 and 0, 1, 2, ... are valid + // output indices. see type definition of ComputeContextInputsOutputsMapping for more details. if (!Number.isInteger(validatedOutputIndices[i]) || validatedOutputIndices[i] < -2 || validatedOutputIndices[i] >= programInfo.outputs.length) { throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`); } - const isTemporary = validatedOutputIndices[i] === -2; - const isPersistent = validatedOutputIndices[i] === -1; + const isTemporary = validatedOutputIndices[i] === ComputeContextInputsOutputsMapping.TEMPORARY_OUTPUT; + const isPersistent = validatedOutputIndices[i] === ComputeContextInputsOutputsMapping.PERSISTENT_OUTPUT; const tensorView = (isTemporary || isPersistent) ? 
createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 34ab337105ff4..b6513d8f81028 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -117,6 +117,11 @@ export interface ComputeContextInputsOutputsMapping { readonly outputs?: readonly number[]; } +export declare namespace ComputeContextInputsOutputsMapping { + export const TEMPORARY_OUTPUT: -1; + export const PERSISTENT_OUTPUT: -2; +} + export interface ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; From 372aa7f33debef829b3360566d0af562638a9f9b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 21 Mar 2023 16:39:21 -0700 Subject: [PATCH 50/81] match correct ipaddress family --- js/web/karma.conf.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index b51a3b16ddc04..2a4e71e064632 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -38,6 +38,11 @@ function getMachineIpAddress() { return 'localhost'; } +const hostname = getMachineIpAddress(); +// In Node.js v16 and below, 'localhost' is using IPv4, so need to listen to '0.0.0.0' +// In Node.js v17+, 'localhost' is using IPv6, so need to listen to '::' +const listenAddress = Number.parseInt(process.versions.node.split('.')[0]) >= 17 ? '::' : '0.0.0.0'; + module.exports = function (config) { config.set({ // global config of your BrowserStack account @@ -78,7 +83,8 @@ module.exports = function (config) { browserNoActivityTimeout: 300000, browserDisconnectTolerance: 0, browserSocketTimeout: 60000, - hostname: getMachineIpAddress(), + hostname, + listenAddress, customLaunchers: { ChromeTest: { base: 'ChromeHeadless', flags: ['--enable-features=SharedArrayBuffer'] }, ChromePerf: { base: 'Chrome', flags: ['--window-size=1,1', '--enable-features=SharedArrayBuffer'] }, From 4c0a2b99057c656e91cb2e8ad056e5e6aeabcf63 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 23 Mar 2023 15:44:17 -0700 Subject: [PATCH 51/81] const value in declare namepsace does not work - revert --- js/web/lib/wasm/jsep/backend-webgpu.ts | 6 +++--- js/web/lib/wasm/jsep/webgpu/types.ts | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index ab6c122ee4d2a..a0befd6dcf3bb 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -7,7 +7,7 @@ import {TensorView} from './tensor'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; -import {ComputeContext, ComputeContextInputsOutputsMapping, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; +import {ComputeContext, GpuData, GpuDataType, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; /** * get a unique key representing the program from the program info,input shapes and types. 
@@ -170,8 +170,8 @@ export class WebGpuBackend { validatedOutputIndices[i] >= programInfo.outputs.length) { throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`); } - const isTemporary = validatedOutputIndices[i] === ComputeContextInputsOutputsMapping.TEMPORARY_OUTPUT; - const isPersistent = validatedOutputIndices[i] === ComputeContextInputsOutputsMapping.PERSISTENT_OUTPUT; + const isTemporary = validatedOutputIndices[i] === -1; + const isPersistent = validatedOutputIndices[i] === -2; const tensorView = (isTemporary || isPersistent) ? createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index b6513d8f81028..34ab337105ff4 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -117,11 +117,6 @@ export interface ComputeContextInputsOutputsMapping { readonly outputs?: readonly number[]; } -export declare namespace ComputeContextInputsOutputsMapping { - export const TEMPORARY_OUTPUT: -1; - export const PERSISTENT_OUTPUT: -2; -} - export interface ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; From 41f888622385dbae1cfcd6ca357a6018d86ee82c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 23 Mar 2023 15:51:53 -0700 Subject: [PATCH 52/81] erf --- js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 17 +++++++++++++++++ js/web/test/suite-test-list.jsonc | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index b6d331ecea60b..75dde0511d81e 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -36,7 +36,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['Dropout', '', '7+', unaryOps.identity], // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], // ['Equal', '', '7+', binaryOps.equal], - ['Elu', [unaryOps.elu, unaryOps.parseEluAttributes]], //['Exp', [unaryOps.exp]], + ['Elu', [unaryOps.elu, unaryOps.parseEluAttributes]], ['Erf', [unaryOps.erf]], //['Exp', [unaryOps.exp]], // ['Flatten', '', '1+', flatten, parseFlattenAttributes], ['Floor', [unaryOps.floor]], // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index fe9753f75b3b8..cadd6f293d969 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -162,6 +162,23 @@ export const elu = (context: ComputeContext, attributes: EluAttributes): number export const parseEluAttributes = (attributes: Record): EluAttributes => createAttributeWithCacheKey(attributes as {alpha: number}); +export const erf = (context: ComputeContext): number => { + context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Erf', a => `erf_vf32(${a})`, ` + const r0: f32 = 0.3275911; + const r1: f32 = 0.254829592; + const r2: f32 = -0.284496736; + const r3: f32 = 1.421413741; + const r4: f32 = -1.453152027; + const r5: f32 = 1.061405429; + + fn erf_vf32(v: vec4) -> vec4 { + let absv = abs(v); + let x = 1.0 / (1.0 + r0 * absv); + return sign(v) * (1.0 - ((((r5 * x + r4) * x + r3) 
* x + r2) * x + r1) * x * exp(-absv * absv));
+  }`));
+  return 0;
+};
+
 // export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise<Tensor[]> =>
 //     handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs);

diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 73c08a5bda92b..5714c7b284848 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -516,7 +516,7 @@
     "test_elu",
     // "test_equal_bcast",
     // "test_equal",
-    // "test_erf",
+    "test_erf",
     // "test_exp_example",
     // "test_exp",
     // "test_expand_dim_changed",

From 6c2bb34251757cec8bb81ec07fad1221f31b006a Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 23 Mar 2023 18:56:09 -0700
Subject: [PATCH 53/81] resolve comments #

---
 js/web/lib/wasm/jsep/backend-webgpu.ts        | 43 +++++++++++++++++--
 js/web/lib/wasm/jsep/util.ts                  | 20 ++++---
 .../lib/wasm/jsep/webgpu/gpu-data-manager.ts  |  4 +-
 js/web/lib/wasm/jsep/webgpu/types.ts          |  6 +++
 js/web/lib/wasm/proxy-wrapper.ts              |  4 +-
 js/web/script/test-runner-cli.ts              | 20 ++++-----
 js/web/test/test-runner.ts                    |  2 +-
 7 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index a0befd6dcf3bb..bdb52771caeff 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -29,15 +29,40 @@ const getProgramInfoUniqueKey =
       return key;
     };

+/**
+ * This class is designed to store state and to be used as a singleton for JSEP. It will be passed to jsepInit() as
+ * the first parameter so that it is stored for future use.
+ */
 export class WebGpuBackend {
   device: GPUDevice;
+  /**
+   * an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
+   */
   gpuDataManager: GpuDataManager;
+  /**
+   * an instance of ProgramManager to build and run WebGPU compute shader programs, and manage a ProgramKey ->
+   * Program artifacts mapping
+   */
   programManager: ProgramManager;
-  temporaryData: GpuData[];
+  /**
+   * the ID of the kernel that is currently being computed (from the CPU code's perspective).
+   * `null` means no kernel is being computed.
+   * only one kernel can be computed at a time.
+   */
   currentKernelId: number|null = null;
+  /**
+   * a list of temporary GPU data for the current kernel. should be released when the kernel finishes computation.
+   */
+  temporaryData: GpuData[];
+  /**
+   * a KernelID -> GPU data list mapping, which stores persistent GPU data owned by the specific kernel.
+   */
   kernelPersistentData: Map<number, GpuData[]>;
+  /**
+   * a KernelID -> kernel info mapping. value is [name, run function, [optional] preprocess_attribute_once function]
+   */
  kernels: Map<number, [string, (context: ComputeContext, attribute: unknown) => number,
                        [((attribute: unknown) => unknown) | undefined, unknown]]>;

   commandEncoder: GPUCommandEncoder|null = null;

@@ -130,9 +155,21 @@ export class WebGpuBackend {
     this.pendingDispatchNumber = 0;
   }

+  /**
+   * run a WebGPU program.
+   * @param program either a ProgramInfo instance containing metadata including the shader code, or a function that
+   *     can be called and returns a ProgramInfo instance
+   * @param inputs a TensorView array. each element represents a value that already exists on the GPU.
+   * @param outputIndices an indices array. each element can be either -1 (temporary data), -2 (persistent data) or an
+   *     index to the kernel's output.
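+   * (illustrative example, not part of the original patch: a call like
+   *     run(programInfo, inputs, [0, -1, -2], createKernelOutput, createIntermediateOutput)
+   * would map the program's first output to the kernel's output 0, create the second as temporary data that is
+   * released when the kernel finishes, and store the third in kernelPersistentData, owned by the current kernel.)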
+   * @param createKernelOutput a callback function that creates a value for the kernel's output at the given index
+   * @param createIntermediateOutput a callback function that creates an intermediate value, either temporary or
+   *     persistent (owned by the current kernel)
+   * @returns a TensorView array representing the result.
+   */
   run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[], outputIndices: readonly number[],
       createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
-      createTemporaryOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] {
+      createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] {
     if (inputs.length !== program.inputTypes.length) {
       throw new Error(`Input size must be equal to ${program.inputTypes.length}.`);
     }
@@ -173,7 +210,7 @@ export class WebGpuBackend {
       const isTemporary = validatedOutputIndices[i] === -1;
       const isPersistent = validatedOutputIndices[i] === -2;
       const tensorView = (isTemporary || isPersistent) ?
-          createTemporaryOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) :
+          createIntermediateOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) :
           createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims);
       const gpuData = this.gpuDataManager.get(tensorView.data);
       if (!gpuData) {
diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts
index b3ac9c9e8e7ed..cd128ad5e501d 100644
--- a/js/web/lib/wasm/jsep/util.ts
+++ b/js/web/lib/wasm/jsep/util.ts
@@ -179,11 +179,16 @@ export class BroadcastUtil {

 export class ShapeUtil {
+  /**
+   * calculate the size (number of elements)
+   */
   static size(dims: readonly number[]): number {
     return ShapeUtil.getSizeFromDimensionRange(dims, 0, dims.length);
   }

-  // `axis` inclusive
+  /**
+   * calculate the size (number of elements) from the given axis (inclusive)
+   */
   static sizeFromDimension(dims: readonly number[], axis: number): number {
     if (axis < 0 || axis > dims.length) {
       throw new Error(`invalid dimension of ${axis} for sizeFromDimension as Tensor has ${dims.length} dimensions.`);
@@ -191,7 +196,9 @@ export class ShapeUtil {
     return ShapeUtil.getSizeFromDimensionRange(dims, axis, dims.length);
   }

-  // `axis` exclusive
+  /**
+   * calculate the size (number of elements) up to the given axis (exclusive)
+   */
   static sizeToDimension(dims: readonly number[], axis: number): number {
     if (axis < 0 || axis > dims.length) {
       throw new Error(`invalid dimension of ${axis} for sizeToDimension as Tensor has ${dims.length} dimensions.`);
@@ -199,6 +206,9 @@ export class ShapeUtil {
     return ShapeUtil.getSizeFromDimensionRange(dims, 0, axis);
   }

+  /**
+   * calculate the size (number of elements) in the given axis range [start, end)
+   */
   static getSizeFromDimensionRange(dims: readonly number[], start: number, end: number): number {
     let size = 1;
     for (let i = start; i < end; i++) {
@@ -272,12 +282,10 @@ export class ShapeUtil {
     return axis < 0 ? axis + tensorRank : axis;
   }

-  static normalizeAxes(axes: readonly number[], tensorRank: number): number[] {
-    return axes.map(x => this.normalizeAxis(x, tensorRank));
+  static normalizeAxes(axes: readonly number[], tensorRank?: number): number[] {
+    return axes.map(x => this.normalizeAxis(x, tensorRank ?? axes.length));
   }

-  // Increment an index into a tensor (in lexicographic
-  // ordering), wrapping around the specified upper_bound.
/** * Increment an index into a tensor (in lexicographic ordering), wrapping around the specified upper_bound. * @param index Given index to increment (Will be mutated) diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index c8c456d248e7e..c3d565e9a47b8 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -72,15 +72,15 @@ class GpuDataManagerImpl implements GpuDataManager { // GPU Data ID => GPU Data ( read buffer ) downloadCache: Map; + // pending buffers for uploading ( data is unmapped ) private buffersForUploadingPending: GPUBuffer[]; - // private buffersForDownloadingPending: GPUBuffer[]; + // pending buffers for computing private buffersPending: GPUBuffer[]; constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) { this.storageCache = new Map(); this.downloadCache = new Map(); this.buffersForUploadingPending = []; - // this.buffersForDownloadingPending = []; this.buffersPending = []; } diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 34ab337105ff4..eb0ec850b37ff 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -117,7 +117,13 @@ export interface ComputeContextInputsOutputsMapping { readonly outputs?: readonly number[]; } +/** + * A ComputeContext instance carries the states that representing the current running of a kernel. + */ export interface ComputeContext { + /** + * stores the pointer to OpKernelContext + */ readonly opKernelContext: number; readonly inputs: readonly TensorView[]; readonly customData: {[key: string]: unknown}; diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 73aade5c1556a..02c61800d9e66 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -3,7 +3,7 @@ import {env, InferenceSession} from 'onnxruntime-common'; -import {init} from './jsep/init'; +import {init as initJsep} from './jsep/init'; import {OrtWasmMessage, SerializableModeldata, SerializableSessionMetadata, SerializableTensor} from './proxy-messages'; import * as core from './wasm-core-impl'; import {getInstance, initializeWebAssembly} from './wasm-factory'; @@ -149,7 +149,7 @@ export const initOrt = async(numThreads: number, loggingLevel: number): Promise< core.initOrt(numThreads, loggingLevel); // init JSEP if available - await init(getInstance()); + await initJsep(getInstance()); } }; diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index ee2c17df66bf1..72938789bc2df 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -53,10 +53,10 @@ async function main() { // The default backends and opset version lists. Those will be used in suite tests. const DEFAULT_BACKENDS: readonly TestRunnerCliArgs.Backend[] = - args.env === 'node' ? ['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu']; - const DEFAULT_OPSET_VERSIONS = fs.readdirSync(TEST_DATA_MODEL_NODE_ROOT, {withFileTypes: true}) - .filter(dir => dir.isDirectory() && dir.name.startsWith('opset')) - .map(dir => dir.name.slice(5)); + args.env === 'node' ? 
['cpu', 'wasm'] : ['wasm', 'webgl', 'webgpu']; + const DEFAULT_OPSET_VERSIONS = fs.readdirSync(TEST_DATA_MODEL_NODE_ROOT, {withFileTypes: true}) + .filter(dir => dir.isDirectory() && dir.name.startsWith('opset')) + .map(dir => dir.name.slice(5)); const FILE_CACHE_ENABLED = args.fileCache; // whether to enable file cache const FILE_CACHE_MAX_FILE_SIZE = 1 * 1024 * 1024; // The max size of the file that will be put into file cache @@ -458,13 +458,13 @@ async function main() { } else { // STEP 5. use Karma to run test npmlog.info('TestRunnerCli.Run', '(4/4) Running karma to start test runner...'); - const webgpu = args.backends.indexOf('webgpu') > -1; - const browser = getBrowserNameFromEnv( + const webgpu = args.backends.indexOf('webgpu') > -1; + const browser = getBrowserNameFromEnv( args.env, args.bundleMode === 'perf' ? 'perf' : args.debug ? 'debug' : 'test', - webgpu, config.options.globalEnvFlags?.webgpu?.profilingMode === 'default'); + webgpu, config.options.globalEnvFlags?.webgpu?.profilingMode === 'default'); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); @@ -477,7 +477,7 @@ async function main() { if (webgpu) { karmaArgs.push('--force-localhost'); } - karmaArgs.push(`--bundle-mode=${args.bundleMode}`); + karmaArgs.push(`--bundle-mode=${args.bundleMode}`); if (browser === 'Edge') { // There are currently 2 Edge browser launchers: // - karma-edge-launcher: used to launch the old Edge browser @@ -569,10 +569,10 @@ async function main() { } function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, profile: boolean) { + env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, profile: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, profile); + return selectChromeBrowser(mode, webgpu, profile); case 'edge': return 'Edge'; case 'firefox': diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index f84dcef33fa28..8f9784de053d4 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -595,7 +595,7 @@ async function runOpTestcase( const inputTensors = testcase.inputs.map(input => createTensor(input.dims, input.type as Tensor.DataType, input.data)); - const results = await operator.impl(inferenceHandler, inputTensors, operator.context); + const results = operator.impl(inferenceHandler, inputTensors, operator.context); // try async data read. 
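  // (illustrative note, not part of the original patch: `operator.impl` is now synchronous, since kernels only
  // record GPU dispatches and return TensorViews immediately, so the `await` above is dropped; only reading the
  // result data back from the GPU below remains asynchronous.)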
for (const result of results) { From f7d6a6a8e7f55ae9351838d77bb57aecbd9b4653 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:48:20 -0700 Subject: [PATCH 54/81] Update onnxruntime/core/providers/js/js_kernel.h Co-authored-by: Scott McKay --- onnxruntime/core/providers/js/js_kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 44a9126cfe493..47850f59871af 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -65,7 +65,7 @@ class JsKernel : public OpKernel { public: explicit JsKernel(const OpKernelInfo& info) : OpKernel(info) {} - virtual ~JsKernel() { + ~JsKernel() override { EM_ASM({ Module.jsepReleaseKernel($0); }, this); } From 83727cd29ffab7f0f745de48a0b57176559abf6d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 29 Mar 2023 15:52:10 -0700 Subject: [PATCH 55/81] conv1d --- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 21 ++++- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 79 +++++++++++++------ .../core/providers/js/operators/conv.h | 77 ++++++++++++------ 3 files changed, 124 insertions(+), 53 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index fb9a57a10b755..39d59391866fd 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -17,7 +17,8 @@ const createGroupedConvProgramMetadata = (hasBias: boolean, cacheHint: string): }); const createGroupedConvProgramInfo = - (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes): ProgramInfo => { + (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const hasBias = inputs.length > 2; const processBias = hasBias ? 'value += b[output_channel];' : ''; const xShape = inputs[0].dims; @@ -109,14 +110,26 @@ const createGroupedConvProgramInfo = }`; return { ...metadata, - outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + outputs: [{ + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType, + gpuDataType: GpuDataType.default + }], shaderSource, dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; +/** + * naive grouped conv implementation, supports 1d/2d conv + * @param squeezeOutputShapeFunction - an optional function to squeeze the output shape, only used in conv1d + */ export const createGroupedConvProgramInfoLoader = - (inputs: readonly TensorView[], attributes: ConvAttributes): ProgramInfoLoader => { + (inputs: readonly TensorView[], attributes: ConvAttributes, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfoLoader => { const metadata = createGroupedConvProgramMetadata(inputs.length > 2, attributes.cacheKey); - return {...metadata, get: () => createGroupedConvProgramInfo(inputs, metadata, attributes)}; + return { + ...metadata, + get: () => createGroupedConvProgramInfo(inputs, metadata, attributes, squeezeOutputShapeFunction) + }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index b5d661f0122b4..1b097bcd7d291 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -56,12 +56,16 @@ const validateInputs = (inputs: readonly TensorView[], attributes: ConvAttribute } // TODO : Need to add support for multi-dimensional conv - if (inputs[0].dims.length !== 4 || inputs[1].dims.length !== 4) { - throw new Error('currently only support 2-dimensional conv'); + if (inputs[0].dims.length !== 4 && inputs[0].dims.length !== 3) { + throw new Error('currently only support conv 1D and 2D'); + } + + if (inputs[0].dims.length !== inputs[1].dims.length) { + throw new Error('filter does not have same dimension as input'); } // FILTER_IN_CHANNEL should be equal to DATA_CHANNEL - const dataChannel = inputs[0].dims[attributes.format === 'NHWC' ? 3 : 1]; + const dataChannel = inputs[0].dims[attributes.format === 'NHWC' ? inputs[0].dims.length - 1 : 1]; const filterInChannel = inputs[1].dims[1] * attributes.group; if (dataChannel !== filterInChannel) { throw new Error('FILTER_IN_CHANNEL should be equal to DATA_CHANNEL'); @@ -139,24 +143,24 @@ export const parseConvAttributes = (attributes: Record): ConvAt {autoPad, format, dilations, group, kernelShape, pads, strides, wIsConst, ...activationAttributes}); }; -const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); +const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): number => { + const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); // check attributes - const hasBias = context.inputs.length === 3; + const hasBias = inputs.length === 3; // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ const isChannelsLast = attributes.format === 'NHWC'; // const batchSize = context.inputs[0].dims[0]; - const inputHeight = context.inputs[0].dims[isChannelsLast ? 1 : 2]; - const inputWidth = context.inputs[0].dims[isChannelsLast ? 2 : 3]; - const inputChannels = context.inputs[0].dims[isChannelsLast ? 3 : 1]; - const weightHeight = context.inputs[1].dims[2]; - const weightWidth = context.inputs[1].dims[3]; + const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; + const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; + const inputChannels = inputs[0].dims[isChannelsLast ? 
3 : 1]; + const weightHeight = inputs[1].dims[2]; + const weightWidth = inputs[1].dims[3]; const outputShape = calculateOutputShape( - context.inputs[0].dims, context.inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, + inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, isChannelsLast); const outHeight = outputShape[isChannelsLast ? 1 : 2]; const outWidth = outputShape[isChannelsLast ? 2 : 3]; @@ -170,12 +174,12 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => (attributes.autoPad === 'SAME_UPPER' || attributes.autoPad === 'SAME_LOWER' || attributes.autoPad === 'VALID'))) { // return conv2dByMatMul({x, filter, convInfo, backend, bias, activation, preluActivationWeights, leakyreluAlpha}); - context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); return 0; } if (!isChannelsLast || attributes.group !== 1) { - context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); + context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); return 0; } @@ -208,29 +212,58 @@ const conv2d = (context: ComputeContext, attributes: ConvAttributes): number => { ...transposeProgramMetadata, cacheHint: weightTransposeAttribute.cacheKey, - get: () => createTransposeProgramInfo(context.inputs[1], weightTransposeAttribute.perm) + get: () => createTransposeProgramInfo(inputs[1], weightTransposeAttribute.perm) }, {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; if (attributes.wIsConst && !context.customData.wT) { context.customData.wT = transposedWeight; } - const inputs = [context.inputs[0], transposedWeight]; + const convInputs = [inputs[0], transposedWeight]; if (hasBias) { - if (!isChannelsLast && context.inputs[2].dims.length === 1) { - inputs.push(context.inputs[2].reshape([context.inputs[2].dims[0], 1, 1])); + if (!isChannelsLast && inputs[2].dims.length === 1) { + convInputs.push(inputs[2].reshape([inputs[2].dims[0], 1, 1])); } else { - inputs.push(context.inputs[2]); + convInputs.push(inputs[2]); } } context.compute( createConv2DMatMulProgramInfoLoader( - inputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, sequentialAccessByThreads), - {inputs}); + convInputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, + sequentialAccessByThreads), + {inputs: convInputs}); + return 0; +}; + +const conv1d = (context: ComputeContext, attributes: ConvAttributes): number => { + // extend the input to 2D by adding H dimension + const isChannelLast = attributes.format === 'NHWC'; + const inputs = [ + context.inputs[0].reshape( + isChannelLast ? 
+ // [N, W, C] -> [N, H=1, W, C] + [context.inputs[0].dims[0], 1, context.inputs[0].dims[1], context.inputs[0].dims[2]] : + // [N, C, W] -> [N, C, H=1, W] + [context.inputs[0].dims[0], context.inputs[0].dims[1], 1, context.inputs[0].dims[2]]), + //[FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kW] -> [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kH=1, kW] + context.inputs[1].reshape([context.inputs[1].dims[0], context.inputs[1].dims[1], 1, context.inputs[1].dims[2]]) + ]; + if (context.inputs.length === 3) { + inputs.push(context.inputs[2]); + } + const pads = [0, attributes.pads[0], 0, attributes.pads[1]]; + const strides = [1].concat(attributes.strides); + const dilations = [1].concat(attributes.dilations); + const kernelShape = [1].concat(attributes.kernelShape); + const adjustedAttributes = getAdjustedConvAttributes({...attributes, pads, strides, dilations, kernelShape}, inputs); + context.compute(createGroupedConvProgramInfoLoader( + inputs, adjustedAttributes, + outputShape => isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [])); return 0; }; export const conv = (context: ComputeContext, attributes: ConvAttributes): number => { - validateInputs(context.inputs, attributes); // currently will fail if not conv2D - return conv2d(context, attributes); + validateInputs(context.inputs, attributes); // currently will fail if not conv1D/2D + return context.inputs[0].dims.length === 3 ? conv1d(context, attributes) : + conv2d(context, context.inputs, attributes); }; diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index ca303f855a8bc..8881faf5b7f5f 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -21,32 +21,57 @@ class Conv : public JsKernel { int64_t channels_last = is_channels_last ? 1 : info.GetAttrOrDefault("channels_last", 0); - // currently only support Conv2D. TODO: support other - JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ - "format": $13 ? "NHWC" : "NCHW", - "auto_pad": $1, - "dilations": [$2, $3], - "group": $4, - "kernel_shape": [$5, $6], - "pads": [$7, $8, $9, $10], - "strides": [$11, $12], - "w_is_const": () => (!!HEAP8[$14]) - }), - static_cast(conv_attrs_.auto_pad), - static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), - static_cast(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0), - static_cast(conv_attrs_.group), - static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), - static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? kernel_shape[1] : 0), - static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), - static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), - static_cast(conv_attrs_.pads.size() > 2 ? conv_attrs_.pads[2] : 0), - static_cast(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), - static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), - static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), - static_cast(channels_last), - reinterpret_cast(&w_is_const_) - ); + // currently only support Conv 1D/2D. TODO: support Conv3D and other + if (conv_attrs_.dilations.size() == 1 || + (conv_attrs_.kernel_shape_specified && kernel_shape.size() == 1) || + conv_attrs_.strides.size() == 1) { + JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ + "format": $8 ? 
"NHWC" : "NCHW", + "auto_pad": $1, + "dilations": [$2], + "group": $3, + "kernel_shape": [$4], + "pads": [$5, $6], + "strides": [$7], + "w_is_const": () => (!!HEAP8[$9]) + }), + static_cast(conv_attrs_.auto_pad), + static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), + static_cast(conv_attrs_.group), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), + static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), + static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), + static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), + static_cast(channels_last), + reinterpret_cast(&w_is_const_) + ); + } else { + JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ + "format": $13 ? "NHWC" : "NCHW", + "auto_pad": $1, + "dilations": [$2, $3], + "group": $4, + "kernel_shape": [$5, $6], + "pads": [$7, $8, $9, $10], + "strides": [$11, $12], + "w_is_const": () => (!!HEAP8[$14]) + }), + static_cast(conv_attrs_.auto_pad), + static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), + static_cast(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0), + static_cast(conv_attrs_.group), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? kernel_shape[1] : 0), + static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), + static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), + static_cast(conv_attrs_.pads.size() > 2 ? conv_attrs_.pads[2] : 0), + static_cast(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), + static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), + static_cast(conv_attrs_.strides.size() > 1 ? 
conv_attrs_.strides[1] : 0), + static_cast(channels_last), + reinterpret_cast(&w_is_const_) + ); + } } Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, From 88e5e3f4c80d46d24795cbfedf99c1cb96eab0cb Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 30 Mar 2023 16:39:06 -0700 Subject: [PATCH 56/81] use shader helper --- js/web/lib/wasm/jsep/backend-webgpu.ts | 6 ++- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 50 +++++++++++-------- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 37 ++++++++++++++ js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 20 +++----- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 19 +++---- js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 16 ++---- js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 44 +++++++--------- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 21 +++----- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 23 ++++----- .../lib/wasm/jsep/webgpu/program-manager.ts | 35 ++++++++++--- js/web/lib/wasm/jsep/webgpu/types.ts | 4 +- 12 files changed, 153 insertions(+), 124 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index bdb52771caeff..323d3cdb2c48b 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -231,12 +231,14 @@ export class WebGpuBackend { outputDatas.push(gpuData); } + const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(programInfo.dispatchGroup(inputs)); + if (!artifact) { - artifact = this.programManager.build(programInfo); + artifact = this.programManager.build(programInfo, normalizedDispatchGroup); this.programManager.setArtifact(key, artifact); } - this.programManager.run(artifact, inputDatas, outputDatas, artifact.programInfo.dispatchGroup(inputs)); + this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup); return outputTensorViews; } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 4737f57980f57..b6b8fe91c78b2 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -219,7 +219,7 @@ export const createConv2DMatMulProgramInfo = ...metadata, outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], dispatchGroup: () => ({x: dispatch[0], y: dispatch[1], z: dispatch[2]}), - shaderSource: ` + getShaderSource: () => ` ${utilFunctions} //struct Uniforms { xShape : vec4, wShape : vec4, outShape : vec4, // outShapeStrides: vec3, filterDims : vec2, pad : vec2, stride : vec2, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index 451cef484b6d8..d56bbd6e4364e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -6,7 +6,7 @@ import {TensorView} from '../../tensor'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {createIndicesHelper, ShaderHelper} from './common'; type BuiltinFunctionName = string; type BinaryCustomExpression = (expressionA: string, expressionB: string) => string; @@ -16,9 +16,9 @@ type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{ }; const 
createBinaryOpProgramShader =
-    (dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[], vectorize: boolean,
-     doBroadcast: boolean, funcCall: BinaryFunctionCall, additionalImplementation?: string, typeA = 'f32',
-     typeB = 'f32', typeOutput = 'f32') => {
+    (shaderHelper: ShaderHelper, dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[],
+     vectorize: boolean, doBroadcast: boolean, funcCall: BinaryFunctionCall, additionalImplementation?: string,
+     typeA = 'f32', typeB = 'f32', typeOutput = 'f32') => {
       const outputSize = ShapeUtil.size(dimsOutput);
       const vecSize = Math.ceil(outputSize / 4);
@@ -66,12 +66,12 @@ const createBinaryOpProgramShader =
         if (doBroadcast) {
           assignment = `
             ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')}
-            ${outputIndicesHelper.o2iCall('global_id.x * 4u', 'outputIndices')}
+            ${outputIndicesHelper.o2iCall('global_idx * 4u', 'outputIndices')}
             let offsetA = calcOffsetA(&outputIndices);
             let offsetB = calcOffsetB(&outputIndices);
-            outputData[global_id.x] = ${expressionVector('aData[offsetA / 4u]', 'bData[offsetB / 4u]')};`;
+            outputData[global_idx] = ${expressionVector('aData[offsetA / 4u]', 'bData[offsetB / 4u]')};`;
         } else {
-          assignment = `outputData[global_id.x] = ${expressionVector('aData[global_id.x]', 'bData[global_id.x]')};`;
+          assignment = `outputData[global_idx] = ${expressionVector('aData[global_idx]', 'bData[global_idx]')};`;
         }
       } else {
         if (!doBroadcast) {
@@ -81,14 +81,14 @@ const createBinaryOpProgramShader =
           const expressionA = `aData[indexA${x}][componentA${x}]`;
           const expressionB = `bData[indexB${x}][componentB${x}]`;
           return `
-            ${outputIndicesHelper.o2iCall(`global_id.x * 4u + ${x}u`, 'outputIndices')}
+            ${outputIndicesHelper.o2iCall(`global_idx * 4u + ${x}u`, 'outputIndices')}
             let offsetA${x} = calcOffsetA(&outputIndices);
             let offsetB${x} = calcOffsetB(&outputIndices);
             let indexA${x} = offsetA${x} / 4u;
             let indexB${x} = offsetB${x} / 4u;
             let componentA${x} = offsetA${x} % 4u;
             let componentB${x} = offsetB${x} % 4u;
-            outputData[global_id.x][${x}] = ${expressionScalar(expressionA, expressionB)};`;
+            outputData[global_idx][${x}] = ${expressionScalar(expressionA, expressionB)};`;
         };

         assignment = `
@@ -100,8 +100,6 @@ const createBinaryOpProgramShader =
       }

       return `
-  const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u;
-
  @group(0) @binding(0) var<storage, read> aData : array<vec4<${typeA}>>;
  @group(0) @binding(1) var<storage, read> bData : array<vec4<${typeB}>>;
  @group(0) @binding(2) var<storage, read_write> outputData : array<vec4<${typeOutput}>>;
@@ -109,14 +107,8 @@ const createBinaryOpProgramShader =
  ${additionalImplementation ?? ''}
  ${broadcastImpl}

-  @compute @workgroup_size(WORKGROUP_SIZE)
-  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
-
-    // Guard against out-of-bounds work group sizes
-    if (global_id.x >= ${vecSize}u) {
-      return;
-    }
-
+  ${shaderHelper.mainStart()}
+  ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(vecSize)}
  ${assignment}
}`;
     };
@@ -163,8 +155,8 @@ const createBinaryOpProgramInfo =
       return {
         ...metadata,
-        shaderSource: createBinaryOpProgramShader(
-            a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation),
+        getShaderSource: (shaderHelper) => createBinaryOpProgramShader(
+            shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, additionalImplementation),
         outputs: [{dims: outputShape, dataType: outputDataType, gpuDataType: GpuDataType.default}],
         dispatchGroup: () =>
             ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ?
4 : 1) /* vec size */)})
@@ -213,7 +205,21 @@ export const mul = (context: ComputeContext): number => {
 // Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslOr(), 'bool'), inputs)];

 export const pow = (context: ComputeContext): number => {
-  context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Pow', 'pow'));
+  context.compute(createBinaryOpProgramInfoLoader(
+      context.inputs, 'Pow', ({scalar: (a, b) => `pow_f32(${a},${b})`, vector: (a, b) => `pow_vf32(${a},${b})`}), `
+    fn pow_f32(a : f32, b : f32) -> f32 {
+      if (b == 0.0) {
+        return 1.0;
+      } else if (a < 0.0 && b != floor(b)) {
+        return pow(a, b); // NaN
+      }
+      return select(sign(a), 1.0, round(abs(b) % 2.0) != 1.0) * pow(abs(a), b);
+    }
+    fn pow_vf32(a : vec4<f32>, b : vec4<f32>) -> vec4<f32> {
+      // TODO: implement vectorized pow
+      return vec4<f32>(pow_f32(a.x, b.x), pow_f32(a.y, b.y), pow_f32(a.z, b.z), pow_f32(a.w, b.w));
+    }
+  `));
   return 0;
 };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index dfe4f3c8106e1..2022f61d0aa83 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -95,3 +95,40 @@ export const createIndicesHelper = (name: string, shape: readonly number[]): Ind

   return {o2iImpl, o2iCall, i2oImpl, i2oExpression, indicesVariableDeclaration, iType};
 };
+
+export interface ShaderHelper {
+  mainStart(workgroupSize?: number|[number, number, number]): string;
+  guardAgainstOutOfBoundsWorkgroupSizes(size: unknown): string;
+}
+
+class ShaderHelperImpl implements ShaderHelper {
+  constructor(private normalizedDispatchGroup: [number, number, number]) {}
+  guardAgainstOutOfBoundsWorkgroupSizes(size: number|string): string {
+    // Guard against out-of-bounds work group sizes
+    const sizeInCode = typeof size === 'number' ? `${size}u` : size;
+    return `if (global_idx >= ${sizeInCode}) { return; }`;
+  }
+  mainStart(workgroupSize: number|[number, number, number] = WORKGROUP_SIZE) {
+    const workgroupSizeX = typeof workgroupSize === 'number' ? workgroupSize : workgroupSize[0];
+    const workgroupSizeY = typeof workgroupSize === 'number' ? 1 : workgroupSize[1];
+    const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2];
+
+    const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
+    const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3<u32>' :
+                                             `@builtin(local_invocation_index) local_index : u32,
+    @builtin(workgroup_id) workgroup_id : vec3<u32>`;
+    const globalIdxDefinition = is1DimensionDispatch ?
+ 'let global_idx = global_id.x;' : + `let global_idx = (workgroup_id.z * ${this.normalizedDispatchGroup[0] * this.normalizedDispatchGroup[1]}u + + workgroup_id.y * ${this.normalizedDispatchGroup[0]}u + workgroup_id.x) * ${ + workgroupSizeX * workgroupSizeY * workgroupSizeZ}u + local_index;`; + + return `@compute @workgroup_size(${workgroupSizeX}, ${workgroupSizeY}, ${workgroupSizeZ}) + fn main(${paramList}) { + ${globalIdxDefinition} + `; + } +} + +export const createShaderHelper = (dispatchGroup: [number, number, number]): ShaderHelper => + new ShaderHelperImpl(dispatchGroup); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 588ffd37c723f..4f4b91b9283c1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -7,7 +7,7 @@ import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {createIndicesHelper, IndicesHelper, WORKGROUP_SIZE} from './common'; +import {createIndicesHelper, IndicesHelper, ShaderHelper} from './common'; export interface ConcatAttributes extends AttributeWithCacheKey { readonly axis: number; @@ -118,8 +118,7 @@ const createConcatProgramInfo = const outputIndicesHelper = createIndicesHelper('output', outputShape); const indicesAxis = rank < 2 ? 'indices' : `indices[${adjustedAxis}]`; - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const getShaderSource = (shaderHelper: ShaderHelper) => ` ${inputStorageBuffersDeclarations.join('\n')} @group(0) @binding(${inputs.length}) var output : array<${dataType}>; @@ -131,28 +130,23 @@ const createConcatProgramInfo = ${calculateInputIndexImpl(sizeInConcatAxis.length)} ${readBufferDataImpl(inputIndicesHelpers, rank, dataType)} - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.o2iCall('global_idx', 'indices')} let textureIndex = calculateInputIndex(${indicesAxis}); if (textureIndex != 0u) { ${indicesAxis} -= sizeInConcatAxis[textureIndex - 1u]; } - output[global_id.x] = readBufferData(textureIndex, &indices); + output[global_idx] = readBufferData(textureIndex, &indices); }`; return { ...metadata, outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], - shaderSource, + getShaderSource, dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 39d59391866fd..ebf305a129ce9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -5,7 +5,7 @@ import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {createIndicesHelper, ShaderHelper} from './common'; import {calculateOutputShape, ConvAttributes} from './conv'; import {getActicationSnippet} from 
'./fuse-utils'; @@ -43,8 +43,7 @@ const createGroupedConvProgramInfo = const xIndicesHelper = createIndicesHelper('x', xShape); const wIndicesHelper = createIndicesHelper('w', wShape); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const getShaderSource = (shaderHelper: ShaderHelper) => ` const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); @@ -56,15 +55,11 @@ const createGroupedConvProgramInfo = ${xIndicesHelper.i2oImpl} ${wIndicesHelper.i2oImpl} - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} ${outputIndicesHelper.indicesVariableDeclaration('outputIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'outputIndices')} + ${outputIndicesHelper.o2iCall('global_idx', 'outputIndices')} let batch: u32 = outputIndices[0]; let output_channel: u32 = outputIndices[${isChannelLast ? 3 : 1}]; let xRCCorner: vec2 = vec2(outputIndices[${isChannelLast ? 1 : 2}], outputIndices[${ @@ -106,7 +101,7 @@ const createGroupedConvProgramInfo = } ${processBias} ${applyActivation} - output[global_id.x] = value; + output[global_idx] = value; }`; return { ...metadata, @@ -115,7 +110,7 @@ const createGroupedConvProgramInfo = dataType: inputs[0].dataType, gpuDataType: GpuDataType.default }], - shaderSource, + getShaderSource, dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 48920646a7e9e..6759c58374001 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -7,7 +7,7 @@ import {GemmUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {WORKGROUP_SIZE} from './common'; +import {ShaderHelper} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs) { @@ -91,8 +91,7 @@ const createGemmProgramInfo = if (inputs.length === 3) { inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); } - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; + const getShaderSource = (shaderHelper: ShaderHelper) => ` const M: u32 = ${M}u; const N: u32 = ${N}u; const K: u32 = ${K}u; @@ -102,13 +101,8 @@ const createGemmProgramInfo = ${inputStorageBuffersDeclarations.join('\n')} @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} let m = global_id.x / N; let n = global_id.x % N; @@ -126,7 +120,7 @@ const createGemmProgramInfo = return { ...metadata, outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], - shaderSource, + getShaderSource, dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index d4ab30ab00ccb..d73b07911e8cf 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -7,7 +7,7 @@ import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {createIndicesHelper, ShaderHelper} from './common'; // TODO: support: // - ceil_mode "test_maxpool_2d_ceil" @@ -61,8 +61,8 @@ const getAdjustedPoolAttributesAndOutputShape = ( - inputDims: readonly number[], outputShape: readonly number[], attributes: AttributeType, op1: string, op2: string, - dataType: string, start: string): string => { + shaderHelper: ShaderHelper, inputDims: readonly number[], outputShape: readonly number[], attributes: AttributeType, + op1: string, op2: string, dataType: string, start: string): string => { const isChannelsLast = attributes.format === 'NHWC'; const rank = inputDims.length; const outputSize = ShapeUtil.size(outputShape); @@ -126,25 +126,19 @@ const generatePoolingCode = x : array<${dataType}>; @group(0) @binding(1) var output : array<${dataType}>; ${outputIndicesHelper.o2iImpl} ${xIndicesHelper.i2oImpl} - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.o2iCall('global_idx', 'indices')} ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + ${outputIndicesHelper.o2iCall('global_idx', 'xIndices')} var value: ${dataType} = ${dataType}(${start}); var pad = 0; @@ -153,7 +147,7 @@ const generatePoolingCode = x : array<${dataType}>; @group(0) @binding(1) var output : array<${dataType}>; @@ -198,18 +191,13 @@ const generatePoolingCode = (${kernelStrides.map(i => `${i}u`).join(',')}); const strides = array(${attributes.strides.map(i => `${i}u`).join(',')}); - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.o2iCall('global_idx', 'indices')} ${outputIndicesHelper.indicesVariableDeclaration('xIndices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'xIndices')} + ${outputIndicesHelper.o2iCall('global_idx', 'xIndices')} var offsets: array; @@ -233,7 +221,7 @@ const generatePoolingCode = generatePoolingCode( + shaderHelper, inputs[0].dims, outputShape, adjustedAttributes, op1, op2, dataType, '0.0'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) }; }; @@ -348,7 +337,8 @@ const createMaxPoolProgramInfo = return { ...metadata, outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], - shaderSource: 
generatePoolingCode(inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), + getShaderSource: shaderHelper => + generatePoolingCode(shaderHelper, inputs[0].dims, outputShape, adjustedAttributes, op1, op2, 'f32', '-1e5'), dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 06c3729510b9c..7cdd8df26d6a7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -7,7 +7,7 @@ import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; -import {createIndicesHelper, WORKGROUP_SIZE} from './common'; +import {createIndicesHelper, ShaderHelper} from './common'; export interface TransposeAttributes extends AttributeWithCacheKey { readonly perm: number[]; @@ -58,9 +58,7 @@ export const createTransposeProgramInfo = (input: TensorView, permAttr: number[] const outputIndicesHelper = createIndicesHelper('output', outputShape); const inputIndicesHelper = createIndicesHelper('a', inputShape); - const shaderSource = ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - + const getShaderSource = (shaderHelper: ShaderHelper) => ` @group(0) @binding(0) var a : array<${dataType}>; @group(0) @binding(1) var output : array<${dataType}>; @@ -68,25 +66,20 @@ export const createTransposeProgramInfo = (input: TensorView, permAttr: number[] ${outputIndicesHelper.o2iImpl} ${inputIndicesHelper.i2oImpl} - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${outputSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} ${outputIndicesHelper.indicesVariableDeclaration('indices')} - ${outputIndicesHelper.o2iCall('global_id.x', 'indices')} + ${outputIndicesHelper.o2iCall('global_idx', 'indices')} ${inputIndicesHelper.indicesVariableDeclaration('aIndices')} perm(&aIndices, &indices); - output[global_id.x] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; + output[global_idx] = a[${inputIndicesHelper.i2oExpression('aIndices')}]; }`; return { ...transposeProgramMetadata, outputs: [{dims: outputShape, dataType: input.dataType, gpuDataType: GpuDataType.default}], - shaderSource, + getShaderSource, dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index cadd6f293d969..b432da1247d6b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -6,14 +6,15 @@ import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {WORKGROUP_SIZE} from './common'; +import {ShaderHelper} from './common'; type BuiltinFunctionName = string; type ElementwiseCustomExpression = (expression: string) => string; type ElementwiseFunctionCall = BuiltinFunctionName|ElementwiseCustomExpression; const createElementwiseProgramShader = - (datasize: number, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): 
string => { + (shaderHelper: ShaderHelper, datasize: number, funcCall: ElementwiseFunctionCall, + additionalImplementation?: string): string => { const vecSize = Math.ceil(datasize / 4); let expression = ''; @@ -23,23 +24,16 @@ const createElementwiseProgramShader = expression = funcCall('a'); } return ` - const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - @group(0) @binding(0) var inputData : array>; @group(0) @binding(1) var outputData : array>; ${additionalImplementation ?? ''} - @compute @workgroup_size(WORKGROUP_SIZE) - fn main(@builtin(global_invocation_id) global_id : vec3) { - - // Guard against out-of-bounds work group sizes - if (global_id.x >= ${vecSize}u) { - return; - } + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(vecSize)} - let a = inputData[global_id.x]; - outputData[global_id.x] = ${expression}; + let a = inputData[global_idx]; + outputData[global_idx] = ${expression}; }`; }; @@ -47,7 +41,8 @@ const createElementwiseProgramInfo = (metadata: ProgramMetadata, input: TensorView, funcCall: ElementwiseFunctionCall, additionalImplementation?: string): ProgramInfo => ({ ...metadata, - shaderSource: createElementwiseProgramShader(ShapeUtil.size(input.dims), funcCall, additionalImplementation), + getShaderSource: shaderHelper => + createElementwiseProgramShader(shaderHelper, ShapeUtil.size(input.dims), funcCall, additionalImplementation), outputs: [{dims: input.dims, dataType: input.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: (inputTensors) => ({x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)}) diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index 692b9ba5ec0fb..088a1fffcda29 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -5,6 +5,7 @@ import {env} from 'onnxruntime-common'; import {WebGpuBackend} from '../backend-webgpu'; +import {createShaderHelper} from './ops/common'; import {Artifact, GpuData, ProgramInfo} from './types'; /** @@ -30,8 +31,7 @@ export class ProgramManager { setArtifact(key: unknown, artifact: Artifact): void { this.repo.set(key, artifact); } - run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], - dispatchGroup: {x: number; y?: number; z?: number}): void { + run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], dispatchGroup: [number, number, number]): void { const device = this.backend.device; const computePassEncoder = this.backend.getComputePassEncoder(); @@ -53,8 +53,7 @@ export class ProgramManager { const bindGroup = device.createBindGroup({layout: buildArtifact.computePipeline.getBindGroupLayout(0), entries}); computePassEncoder.setBindGroup(0, bindGroup); - const {x, y, z} = dispatchGroup; - computePassEncoder.dispatchWorkgroups(x, y, z); + computePassEncoder.dispatchWorkgroups(...dispatchGroup); this.backend.pendingDispatchNumber++; @@ -109,13 +108,14 @@ export class ProgramManager { dispose(): void { // this.repo.forEach(a => this.glContext.deleteProgram(a.program)); } - build(programInfo: ProgramInfo): Artifact { + build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact { const device = this.backend.device; - const shaderModule = device.createShaderModule({code: programInfo.shaderSource}); + const code = programInfo.getShaderSource(createShaderHelper(normalizedDispatchGroupSize)); + const shaderModule = device.createShaderModule({code}); if (env.debug) 
{ // eslint-disable-next-line no-console - console.log('WebGpuProgram: ' + programInfo.shaderSource); + console.log(`WebGpuProgram: ${code}`); } const computePipeline = @@ -123,4 +123,25 @@ export class ProgramManager { return {programInfo, computePipeline}; } + + normalizeDispatchGroupSize(dispatchGroup: ReturnType): [number, number, number] { + const x = typeof dispatchGroup === 'number' ? dispatchGroup : dispatchGroup.x; + const y = typeof dispatchGroup === 'number' ? 1 : (dispatchGroup.y || 1); + const z = typeof dispatchGroup === 'number' ? 1 : (dispatchGroup.z || 1); + const limitPerDimension = this.backend.device.limits.maxComputeWorkgroupsPerDimension; + if (x <= limitPerDimension && y <= limitPerDimension && z <= limitPerDimension) { + return [x, y, z]; + } + const size = x * y * z; + let dispatchAverage = Math.ceil(Math.sqrt(size)); + if (dispatchAverage > limitPerDimension) { + dispatchAverage = Math.ceil(Math.cbrt(size)); + if (dispatchAverage > limitPerDimension) { + throw new Error('Total dispatch size exceeds WebGPU maximum.'); + } + return [dispatchAverage, dispatchAverage, dispatchAverage]; + } else { + return [dispatchAverage, dispatchAverage, 1]; + } + } } diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index eb0ec850b37ff..db841b5e890c2 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -3,6 +3,8 @@ import {Tensor, TensorView} from '../tensor'; +import {ShaderHelper} from './ops/common'; + export enum GpuDataType { default = 0, upload = 1, @@ -76,7 +78,7 @@ export interface ProgramInfo extends ProgramMetadata { /** * the shader's processing source code */ - shaderSource: string; + getShaderSource: (shaderHelper: ShaderHelper) => string; /** * default is "main" */ From 8b7a5705b2389900d8ff96e3d6cde1f9d9834b6b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 30 Mar 2023 16:39:22 -0700 Subject: [PATCH 57/81] matmul --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 5 +- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 209 ++++++++---------- .../providers/js/js_execution_provider.cc | 4 + .../core/providers/js/operators/matmul.cc | 21 ++ 4 files changed, 125 insertions(+), 114 deletions(-) create mode 100644 onnxruntime/core/providers/js/operators/matmul.cc diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 75dde0511d81e..28092c37f6db8 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -6,7 +6,7 @@ import * as binaryOps from './ops/binary-op'; import {conv, parseConvAttributes} from './ops/conv'; // import {gather, parseGatherAttributes} from './ops/gather'; import {gemm, parseGemmAttributes} from './ops/gemm'; -// import {matMul, parseMatMulAttributes} from './ops/matmul'; +import {matMul} from './ops/matmul'; import * as pool from './ops/pool'; // import {sum} from // './ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; @@ -50,7 +50,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes], //['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes], // ['Less', '', '7+', binaryOps.less], - //['Log', '', '6+', unaryOps.log], ['MatMul', '', '1+', matMul, parseMatMulAttributes], + //['Log', '', '6+', unaryOps.log], + ['MatMul', [matMul]], // TODO: support new 
attributes for MaxPool-8 and MaxPool-10 ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], // ['Not', '', '1+', unaryOps.not], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index d6f63820eff04..d6225473371b2 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -1,115 +1,100 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -// import {Graph} from '../../../graph'; -// import {OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {BroadcastUtil, ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -// import {WORKGROUP_SIZE} from './common'; -// import {getActicationSnippet, InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; - -// export const matMul: OperatorAsyncImplementation = -// async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: InternalActivationAttributes): -// Promise => { -// validateInputs(inputs); - -// return inferenceHandler.run(createMatmulProgramInfoLoader(inputs, attributes), inputs); -// }; - -// export const parseMatMulAttributes: OperatorInitialization = -// (node: Graph.Node): InternalActivationAttributes => parseInternalActivationAttributes(node.attributes); - -// const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ -// name: 'MatMul', -// inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] : -// [GpuDataType.default, GpuDataType.default], -// cacheHint -// }); - -// function createMatmulProgramInfo( -// metadata: ProgramMetadata, inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfo { -// const aShape = inputs[0].dims; -// const bShape = inputs[1].dims; -// const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); -// if (!outputShape) { -// throw new Error('Can\'t use matmul on the given tensors'); -// } -// const outputSize = ShapeUtil.size(outputShape); -// // TODO: support broadcasting - -// const dataType = 'f32'; // TODO: support other data type -// const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); - -// const M = outputShape[outputShape.length - 2]; -// const K = aShape[aShape.length - 1]; -// const N = outputShape[outputShape.length - 1]; -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// const M: u32 = ${M}u; -// const N: u32 = ${N}u; -// const K: u32 = ${K}u; - -// @group(0) @binding(0) var a : array<${dataType}>; -// @group(0) @binding(1) var b : array<${dataType}>; -// @group(0) @binding(2) var output : array<${dataType}>; - -// ${activationFunction} - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// let stack = global_id.x / (M * N); -// let mn = global_id.x % (M * N); -// let n = global_id.x % N; -// let m = mn / N; - -// let offsetA = stack * (M * K); -// let offsetB = stack * (K * N); - -// var value = ${dataType}(0); -// for (var k: u32 = 0u; k<${K}u; k++) { -// value += a[offsetA + m * K + k] * b[offsetB + k * N + n]; -// } 
-// ${applyActivation} -// output[global_id.x] = value; -// }`; -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// } - -// export function createMatmulProgramInfoLoader( -// inputs: Tensor[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader { -// const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); -// return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; -// } - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 2) { -// throw new Error('MatMul requires 2 inputs.'); -// } - -// if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { -// throw new Error('shared dimension does not match.'); -// } - -// if ((inputs[0].type !== 'float32' && inputs[0].type !== 'float64') || -// (inputs[1].type !== 'float32' && inputs[1].type !== 'float64')) { -// throw new Error('inputs should be float type'); -// } - -// if (inputs[0].type !== inputs[1].type) { -// throw new Error('inputs types should match'); -// } -// }; +import {DataType} from '../../../wasm-core-impl'; +import {TensorView} from '../../tensor'; +import {BroadcastUtil, ShapeUtil} from '../../util'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {ShaderHelper} from './common'; +import {getActicationSnippet, InternalActivationAttributes} from './fuse-utils'; + + +const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ + name: 'MatMul', + inputTypes: hasBias ? 
[GpuDataType.default, GpuDataType.default, GpuDataType.default] : + [GpuDataType.default, GpuDataType.default], + cacheHint +}); + +const createMatmulProgramInfo = + (metadata: ProgramMetadata, inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes): + ProgramInfo => { + const aShape = inputs[0].dims; + const bShape = inputs[1].dims; + const outputShape = BroadcastUtil.calcShape(aShape, bShape, true); + if (!outputShape) { + throw new Error('Can\'t use matmul on the given tensors'); + } + const outputSize = ShapeUtil.size(outputShape); + // TODO: support broadcasting + + const dataType = 'f32'; // TODO: support other data type + const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); + + const M = outputShape[outputShape.length - 2]; + const K = aShape[aShape.length - 1]; + const N = outputShape[outputShape.length - 1]; + const getShaderSource = (shaderHelper: ShaderHelper) => ` + const M: u32 = ${M}u; + const N: u32 = ${N}u; + const K: u32 = ${K}u; + + @group(0) @binding(0) var<storage, read> a : array<${dataType}>; + @group(0) @binding(1) var<storage, read> b : array<${dataType}>; + @group(0) @binding(2) var<storage, read_write> output : array<${dataType}>; + + ${activationFunction} + + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + + let stack = global_idx / (M * N); + let mn = global_idx % (M * N); + let n = global_idx % N; + let m = mn / N; + + let offsetA = stack * (M * K); + let offsetB = stack * (K * N); + + var value = ${dataType}(0); + for (var k: u32 = 0u; k<${K}u; k++) { + value += a[offsetA + m * K + k] * b[offsetB + k * N + n]; + } + ${applyActivation} + output[global_idx] = value; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + getShaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +export const createMatmulProgramInfoLoader = + (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes): ProgramInfoLoader => { + const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); + return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes)}; + }; + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length !== 2) { + throw new Error('MatMul requires 2 inputs.'); + } + + if (inputs[0].dims[inputs[0].dims.length - 1] !== inputs[1].dims[inputs[1].dims.length - 2]) { + throw new Error('shared dimension does not match.'); + } + + if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) { + throw new Error('inputs should be float type'); + } +}; + +export const matMul = (context: ComputeContext): number => { + validateInputs(context.inputs); + + context.compute(createMatmulProgramInfoLoader(context.inputs, {activation: '', activationCacheKey: ''})); + return 0; +}; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index db3b44d995327..1f44125f0ef9f 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -150,6 +150,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, float, Gemm); class
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, float, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool); @@ -252,6 +254,8 @@ std::unique_ptr<KernelRegistry> RegisterKernels() { BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, float, Gemm)>, BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, float, Gemm)>, BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Gemm)>, + BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul)>, + BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul)>, BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool)>, BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>, diff --git a/onnxruntime/core/providers/js/operators/matmul.cc b/onnxruntime/core/providers/js/operators/matmul.cc new file mode 100644 index 0000000000000..19c295ac1c04d --- /dev/null +++ b/onnxruntime/core/providers/js/operators/matmul.cc @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +JSEP_KERNEL_IMPL(MatMul, MatMul) + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 1, 12, kJsExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()), + MatMul); + +ONNX_OPERATOR_KERNEL_EX(MatMul, kOnnxDomain, 13, kJsExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()), + MatMul); + + +} // namespace js +} // namespace onnxruntime From 058de80ce23e4d1cd9c0ef8491fe53cf5bbf1847 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 3 Apr 2023 15:45:43 -0700 Subject: [PATCH 58/81] remove unused stats assignment --- onnxruntime/core/providers/js/allocator.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/onnxruntime/core/providers/js/allocator.cc b/onnxruntime/core/providers/js/allocator.cc index 6345ef5b335f1..67b0536d34ae6 100644 --- a/onnxruntime/core/providers/js/allocator.cc +++ b/onnxruntime/core/providers/js/allocator.cc @@ -13,17 +13,11 @@ void* JsCustomAllocator::Alloc(size_t size) { void* p = EM_ASM_PTR({return Module.jsepAlloc($0);}, size); stats_.num_allocs++; stats_.bytes_in_use += size; - stats_.max_bytes_in_use =std::max(stats_.max_bytes_in_use, stats_.bytes_in_use); - stats_.max_alloc_size = std::max(stats_.max_alloc_size, static_cast<int64_t>(size)); - stats_.num_arena_extensions++; - stats_.num_arena_shrinkages = std::max(stats_.num_arena_shrinkages, stats_.num_arena_extensions); - stats_.total_allocated_bytes += size; return p; } void JsCustomAllocator::Free(void* p) { size_t size = (size_t)(void*)EM_ASM_PTR({return Module.jsepFree($0);}, p); - stats_.num_arena_extensions--; stats_.bytes_in_use -= size; } From 998f16519c928ee9d976603a460e5614f706f2b0 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 13:24:45 -0700 Subject: [PATCH 59/81] replace logging --- onnxruntime/core/providers/js/js_export.cc | 10 ++++------ onnxruntime/core/providers/js/js_kernel.h | 3 ++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index 5c578a0a432f1..aefef9dc39bc9 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++
b/onnxruntime/core/providers/js/js_export.cc @@ -16,13 +16,11 @@ const void * JsepOutput(void * context, int index, void * data) { dims[i] = static_cast<int64_t>(*data_offset++); } -#ifndef NDEBUG - printf("JsepOutput(%d, %s)\n", index, onnxruntime::TensorShape(dims).ToString().c_str()); -#endif + LOGF_DEFAULT(VERBOSE, "JsepOutput(%d, %s)", index, onnxruntime::TensorShape(dims).ToString().c_str()); + auto output = reinterpret_cast<onnxruntime::OpKernelContext*>(context)->Output(index, onnxruntime::TensorShape(dims)); auto r = output->DataRaw(); -#ifndef NDEBUG - printf("JsepOutput -- data=%zu\n", (size_t)(r)); -#endif + + LOGF_DEFAULT(VERBOSE, "JsepOutput -- data=%zu", (size_t)(r)); return r; } diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 47850f59871af..43cea74209cce 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -117,7 +117,8 @@ class JsKernel : public OpKernel { int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_serialized_kernel_context); - // printf("outputs = %d. Y.data=%zu\n", context->OutputCount(), (size_t)(context->Output(0)->DataRaw())); + LOGS_DEFAULT(VERBOSE) << "outputs = " << context->OutputCount() << ". Y.data=" + << (size_t)(context->Output(0)->DataRaw()) << "."; alloc->Free(p_serialized_kernel_context); From 442d222ce17326ec736e8de722360ccd46d29f53 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 14:59:58 -0700 Subject: [PATCH 60/81] use ort log --- onnxruntime/core/providers/js/js_kernel.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 43cea74209cce..3187b132f47bd 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -5,6 +5,10 @@ #include <emscripten.h> +#ifndef NDEBUG +#include <sstream> +#endif + #include "core/framework/op_kernel.h" #include "core/providers/js/js_execution_provider.h" @@ -99,11 +103,13 @@ class JsKernel : public OpKernel { } #ifndef NDEBUG - printf("temp data size: %zu. Data: ", temp_data_size); - for (int i=0; i < (int)temp_data_size/4;i++) { - printf("%u ", p_serialized_kernel_context[i]); + std::ostringstream os; + os << "temp data size: " << temp_data_size << ". Data:"; + size_t temp_data_count = temp_data_size >> 2; + for (size_t i = 0; i < temp_data_count; i++) { + os << " " << p_serialized_kernel_context[i]; } - printf("\n"); + LOGS_DEFAULT(VERBOSE) << os.str(); #endif return p_serialized_kernel_context; From 613f37c68f527ab31997c2c15023d0cb6b698357 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:13:01 -0700 Subject: [PATCH 61/81] add comments for asyncify --- cmake/onnxruntime_webassembly.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 7600be8e55e0c..80a44ffb3fa63 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -220,6 +220,10 @@ else() ) if (onnxruntime_USE_JS) + # NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU + # This flag allows async functions to be called from sync functions, at the cost of binary size and + # build time. See https://emscripten.org/docs/porting/asyncify.html for more details.
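To make the ASYNCIFY note above concrete: every CPU read-back of a WebGPU buffer goes through GPUBuffer.mapAsync(), which returns a Promise, while the C++ kernel that needs the data runs synchronously. ASYNCIFY bridges the two by unwinding the wasm stack at the await and rewinding it once the Promise settles. A minimal TypeScript sketch of such a read-back follows; the downloadData name and its wiring are illustrative assumptions, not the actual JSEP API.

// A sketch only (assumes '@webgpu/types'); 'downloadData' is a hypothetical helper.
const downloadData = async (device: GPUDevice, source: GPUBuffer, size: number): Promise<Uint8Array> => {
  // Stage the GPU data into a buffer that can be mapped for CPU reads.
  const staging = device.createBuffer({size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ});
  const encoder = device.createCommandEncoder();
  encoder.copyBufferToBuffer(source, 0, staging, 0, size);
  device.queue.submit([encoder.finish()]);

  // mapAsync() returns a Promise. When this point is reached from a synchronous
  // wasm call path, ASYNCIFY suspends the wasm stack here and resumes it on resolve.
  await staging.mapAsync(GPUMapMode.READ);
  const result = new Uint8Array(staging.getMappedRange().slice(0));
  staging.unmap();
  staging.destroy();
  return result;
};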
+ target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JS=1) target_link_options(onnxruntime_webassembly PRIVATE --pre-js "${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js" From 1b63fe2780a057606df3490bdec01cb64181ee88 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:28:01 -0700 Subject: [PATCH 62/81] fix typo --- js/web/lib/wasm/jsep/backend-webgpu.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 323d3cdb2c48b..68e80c4fa0369 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -191,7 +191,7 @@ export class WebGpuBackend { (typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() : (program as ProgramInfo)); - // check ouput indices + // check output indices const validatedOutputIndices = outputIndices.length === 0 ? programInfo.outputs.map((_, i) => i) : outputIndices; if (validatedOutputIndices.length !== programInfo.outputs.length) { throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${programInfo.outputs.length}.`); From 40a1ab8a54a6e0e02855b32c054b3bc2dda570a6 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 18:06:03 -0700 Subject: [PATCH 63/81] clean up code --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 93 ++++--------- .../webgpu/ops/3rd-party/activation_util.ts | 35 +---- .../jsep/webgpu/ops/3rd-party/conv_util.ts | 126 ------------------ js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 6 +- 4 files changed, 30 insertions(+), 230 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 28092c37f6db8..10875b2abddc1 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -2,96 +2,55 @@ // Licensed under the MIT License. 
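The op-resolve-rules.ts hunk here prunes the table down to the operators that actually have WebGPU implementations. Each surviving entry is an OperatorImplementation: a run function plus an optional attribute parser. As a rough sketch of how such a map gets consumed (the dispatchOperator helper is an assumption for illustration, not code from this patch):

const dispatchOperator = (opType: string, context: ComputeContext, attributeRaw: unknown): number => {
  const rule = WEBGPU_OP_RESOLVE_RULES.get(opType);
  if (!rule) {
    throw new Error(`operator '${opType}' is not supported by the WebGPU backend`);
  }
  const [run, parseAttribute] = rule;
  // When a parser is present, it turns the raw node attributes into a typed,
  // cache-keyed attribute object before the kernel runs.
  return run(context, parseAttribute ? parseAttribute(attributeRaw) : undefined);
};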
import * as binaryOps from './ops/binary-op'; -// import {concat, parseConcatAttributes} from './ops/concat'; import {conv, parseConvAttributes} from './ops/conv'; -// import {gather, parseGatherAttributes} from './ops/gather'; import {gemm, parseGemmAttributes} from './ops/gemm'; import {matMul} from './ops/matmul'; import * as pool from './ops/pool'; -// import {sum} from -// './ops/reduce-tensors'; import {reshape} from './ops/reshape'; import {shape} from './ops/shape'; -// import {parseSliceAttributes, slice, sliceV10} from './ops/slice'; -// import {parseSqueezeAttributes, squeeze, squeezeV13} from './ops/squeeze'; import {parseTransposeAttributes, transpose} from './ops/transpose'; import * as unaryOps from './ops/unary-op'; import {ComputeContext} from './types'; -// import {parseUnsqueezeAttributes, unsqueeze, unsqueezeV13} from './ops/unsqueeze'; - export type RunFunction = (context: ComputeContext, attribute?: unknown) => number; export type ParseAttributeFunction = (attributeRaw: unknown) => unknown; export type OperatorImplementation = [RunFunction]|[RunFunction, ParseAttributeFunction]; export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new Map([ - ['Abs', [unaryOps.abs]], ['Acos', [unaryOps.acos]], ['Acosh', [unaryOps.acosh]], ['Add', [binaryOps.add]], - // ['And', '', '7+', binaryOps.and], - ['Asin', [unaryOps.asin]], ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], + ['Abs', [unaryOps.abs]], + ['Acos', [unaryOps.acos]], + ['Acosh', [unaryOps.acosh]], + ['Add', [binaryOps.add]], + ['Asin', [unaryOps.asin]], + ['Asinh', [unaryOps.asinh]], + ['Atan', [unaryOps.atan]], + ['Atanh', [unaryOps.atanh]], // TODO: support new attributes for AveragePool-10 ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]], - // ['BatchNormalization', '', '7+', batchNormalization, parseBatchNormalizationAttributes], - // ['Cast', '', '6+', cast, parseCastAttributes], - ['Ceil', [unaryOps.ceil]], ['ClipV10', [unaryOps.clip]], - ['Clip', [unaryOps.clipV11]], // ['Concat', '', '4+', concat, parseConcatAttributes], - ['Conv', [conv, parseConvAttributes]], ['Cos', [unaryOps.cos]], ['Cosh', [unaryOps.cosh]], ['Div', [binaryOps.div]], - // ['Dropout', '', '7+', unaryOps.identity], - // ['DepthToSpace', '', '1+', depthToSpace, parseDepthToSpaceAttributes], - // ['Equal', '', '7+', binaryOps.equal], - ['Elu', [unaryOps.elu, unaryOps.parseEluAttributes]], ['Erf', [unaryOps.erf]], //['Exp', [unaryOps.exp]], - // ['Flatten', '', '1+', flatten, parseFlattenAttributes], + ['Ceil', [unaryOps.ceil]], + ['ClipV10', [unaryOps.clipV10]], + ['Clip', [unaryOps.clip]], + ['Conv', [conv, parseConvAttributes]], + ['Cos', [unaryOps.cos]], + ['Cosh', [unaryOps.cosh]], + ['Div', [binaryOps.div]], + ['Elu', [unaryOps.elu, unaryOps.parseEluAttributes]], + ['Erf', [unaryOps.erf]], ['Floor', [unaryOps.floor]], - // ['FusedConv', 'com.microsoft', '1+', conv, parseConvAttributes], - //['Gather', '', '1+', gather, parseGatherAttributes], ['Gemm', [gemm, parseGemmAttributes]], ['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]], ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], - // ['Greater', '', '7+', binaryOps.greater], - // ['Identity', '', '1+', unaryOps.identity], - // ['ImageScaler', '', '1+', imageScaler, parseImageScalerAttributes], - // ['InstanceNormalization', '', '6+', instanceNormalization, parseInstanceNormalizationAttributes], - //['LeakyRelu', '', '6+', unaryOps.leakyRelu, unaryOps.parseLeakyReluAttributes],
// ['Less', '', '7+', binaryOps.less], - //['Log', '', '6+', unaryOps.log], ['MatMul', [matMul]], // TODO: support new attributes for MaxPool-8 and MaxPool-10 - ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], - // ['Not', '', '1+', unaryOps.not], - // ['Or', '', '7+', binaryOps.or], - // ['Pad', '', '2-10', padV2, parsePadAttributesV2], - // ['Pad', '', '11+', padV11, parsePadAttributesV11], + ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], + ['Mul', [binaryOps.mul]], + ['Neg', [unaryOps.neg]], ['Pow', [binaryOps.pow]], - // ['PRelu', '', '7+', binaryOps.pRelu], ['Reciprocal', [unaryOps.reciprocal]], - // ['ReduceLogSum', '', '1+', reduceLogSum, parseReduceAttributes], - // ['ReduceMax', '', '1+', reduceMax, parseReduceAttributes], - // ['ReduceMean', '', '1+', reduceMean, parseReduceAttributes], - // ['ReduceMin', '', '1+', reduceMin, parseReduceAttributes], - // ['ReduceProd', '', '1+', reduceProd, parseReduceAttributes], - // ['ReduceSum', '', '1-12', reduceSum, parseReduceAttributes], - // ['ReduceSumSquare', '', '1+', reduceLogSumSquare, parseReduceAttributes], - //['Relu', '', '6+', unaryOps.relu], ['Reshape', '', '5+', reshape], - // ['Resize', '', '10', resize, parseResizeAttributesV10], - // ['Resize', '', '11+', resize, parseResizeAttributesV11], - //['Shape', '', '1+', shape], - ['Sigmoid', [unaryOps.sigmoid]], ['Sin', [unaryOps.sin]], ['Sinh', [unaryOps.sinh]], - //['Slice', '', '10+', sliceV10], // TODO: support 'steps' for Slice-10 - //['Slice', '', '1-9', slice, parseSliceAttributes], - // // The "semantic" meaning of axis has changed in opset-13. - // ['Softmax', '', '1-12', softmax, parseSoftmaxAttributes], - // ['Softmax', '', '13+', softmaxV13, parseSoftmaxAttributesV13], - // // 'Split' operator has an optional attribute 'split' - // // this attribute determines how the specified axis of input data is split. 
- // // When the attribute is missing, we need the count of number of outputs - // // so that we can determine the 'split' attribute from the runtime input to the Operator - // ['Split', '', '2-12', split, parseSplitAttributes], + ['Sigmoid', [unaryOps.sigmoid]], + ['Sin', [unaryOps.sin]], + ['Sinh', [unaryOps.sinh]], ['Sqrt', [unaryOps.sqrt]], - // ['Squeeze', '', '1-12', squeeze, parseSqueezeAttributes], - //['Squeeze', '', '13+', squeezeV13], - ['Sub', [binaryOps.sub]], // ['Sum', '', '6+', sum], - ['Tan', [unaryOps.tan]], ['Tanh', [unaryOps.tanh]], - // ['Tile', '', '6+', tile], + ['Sub', [binaryOps.sub]], + ['Tan', [unaryOps.tan]], + ['Tanh', [unaryOps.tanh]], ['Transpose', [transpose, parseTransposeAttributes]], - // ['Upsample', '', '7-8', upsample, parseUpsampleAttributesV7], - // ['Upsample', '', '9', upsample, parseUpsampleAttributesV9], - //['Unsqueeze', '', '1-12', unsqueeze, parseUnsqueezeAttributes], ['Unsqueeze', '', '13+', unsqueezeV13], - // ['Xor', '', '7+', binaryOps.xor], ]); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts index 3ac290103842e..5345367eadfef 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts @@ -42,41 +42,8 @@ export const activationFnSnippet = return ''; } + // TODO: add implementations return ''; - // let activationOpSnippet = ''; - // if (activation === 'linear') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.LINEAR); - // } else if (activation === 'relu') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.RELU, packed); - // } else if (activation === 'elu') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.ELU, packed); - // } else if (activation === 'relu6') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.RELU6, packed); - // } else if (activation === 'prelu') { - // activationOpSnippet = getBinaryOpString(BinaryOpType.PRELU, packed); - // } else if (activation === 'sigmoid') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.SIGMOID, packed); - // } else if (activation === 'leakyrelu') { - // activationOpSnippet = getUnaryOpString(UnaryOpType.LEAKYRELU, packed); - // } else { - // throw new Error(`Activation ${activation} has not been implemented for the WebGPU backend.`); - // } - // const elementSize = packed ? 
4 : 1; - // const dataType = typeSnippet(elementSize); - // let activationFnSnippet = ''; - // if (hasPreluActivationWeights) { - // activationFnSnippet = ` - // fn activation(a : ${dataType}, coords : vec${coordsLength}) -> ${dataType} { - // let b = getPreluActivationWeightsByOutputCoords(coords); - // ${activationOpSnippet} - // }`; - // } else { - // activationFnSnippet = ` - // fn activation(a : ${dataType}, coords : vec${coordsLength}) -> ${dataType} { - // ${activationOpSnippet} - // }`; - // } - // return activationFnSnippet; }; export const biasActivationSnippet = (hasBias: boolean, activation?: Activation): string => ` diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts index dd79f88ee6880..0ba48a33fbc47 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_util.ts @@ -29,129 +29,3 @@ fn getOutputIndexFromCoords(coords : vec4) -> i32 { outShapeStrides.x, outShapeStrides.y, outShapeStrides.z, 1)); } `; -// type PadType = 'SAME'|'VALID'|'NUMBER'|'EXPLICIT'; - -// export interface PadInfo { -// top: number; -// left: number; -// right: number; -// bottom: number; -// type: PadType; -// } - -// /** -// * Information about the forward pass of a convolution/pooling operation. -// * It includes input and output shape, strides, filter size and padding -// * information. -// */ -// export interface Conv2DInfo { -// batchSize: number; -// inHeight: number; -// inWidth: number; -// inChannels: number; -// outHeight: number; -// outWidth: number; -// outChannels: number; -// isChannelsFirst: boolean; -// strideHeight: number; -// strideWidth: number; -// dilationHeight: number; -// dilationWidth: number; -// filterHeight: number; -// filterWidth: number; -// effectiveFilterHeight: number; -// effectiveFilterWidth: number; -// padInfo: PadInfo; -// inShape: [number, number, number, number]; -// outShape: [number, number, number, number]; -// filterShape: [number, number, number, number]; -// } - -// const parseTupleParam = (param: number|number[]): [number, number, number] => { -// if (typeof param === 'number') { -// return [param, param, param]; -// } -// if (param.length === 2) { -// return [param[0], param[1], 1]; -// } -// return param as [number, number, number]; -// }; - -// /* See https://www.tensorflow.org/api_docs/python/tf/nn/atrous_conv2d -// * Atrous convolution is equivalent to standard convolution with upsampled -// * filters with effective_filter_height = -// * filter_height + (filter_height - 1) * (dilation - 1) -// * and effective_filter_width = -// * filter_width + (filter_width - 1) * (dilation - 1), -// * produced by inserting dilation - 1 zeros along consecutive elements across -// * the filters' spatial dimensions. -// * When there is a dilation, this converts a filter dimension to the -// * effective filter dimension, so it can be used in a standard convolution. -// */ -// const getEffectiveFilterSize = (filterSize: number, dilation: number): number => { -// if (dilation <= 1) { -// return filterSize; -// } - -// return filterSize + (filterSize - 1) * (dilation - 1); -// }; - - -// /** -// * Computes the information for a forward pass of a convolution/pooling -// * operation. 
-// */ -// export const computeConv2DInfo = -// (inShape: [number, number, number, number], filterShape: [number, number, number, number], -// strides: number|[number, number], dilations: number|[number, number], -// pad: 'SAME_UPPER'|'SAME_LOWER'|'VALID'|number|[number, number, number, number], -// roundingMode: 'floor'|'round'|'ceil', depthwise: boolean, isChannelsFirst: boolean): Conv2DInfo => { -// let [batchSize, inHeight, inWidth, inChannels] = [-1, -1, -1, -1]; -// if (isChannelsFirst) { -// [batchSize, inChannels, inHeight, inWidth] = inShape; -// } else { -// [batchSize, inHeight, inWidth, inChannels] = inShape; -// } - -// const [filterHeight, filterWidth, , filterChannels] = filterShape; -// const [strideHeight, strideWidth] = parseTupleParam(strides); -// const [dilationHeight, dilationWidth] = parseTupleParam(dilations); - -// const effectiveFilterHeight = getEffectiveFilterSize(filterHeight, dilationHeight); -// const effectiveFilterWidth = getEffectiveFilterSize(filterWidth, dilationWidth); -// const {padInfo, outHeight, outWidth} = getPadAndOutInfo( -// pad, inHeight, inWidth, strideHeight, strideWidth, effectiveFilterHeight, effectiveFilterWidth, -// roundingMode, dataFormat); - -// const outChannels = depthwise ? filterChannels * inChannels : filterChannels; - -// let outShape: [number, number, number, number]; -// if (dataFormat === 'channelsFirst') { -// outShape = [batchSize, outChannels, outHeight, outWidth]; -// } else if (dataFormat === 'channelsLast') { -// outShape = [batchSize, outHeight, outWidth, outChannels]; -// } - -// return { -// batchSize, -// dataFormat, -// inHeight, -// inWidth, -// inChannels, -// outHeight, -// outWidth, -// outChannels, -// padInfo, -// strideHeight, -// strideWidth, -// filterHeight, -// filterWidth, -// effectiveFilterHeight, -// effectiveFilterWidth, -// dilationHeight, -// dilationWidth, -// inShape, -// outShape, -// filterShape -// }; -// } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index b432da1247d6b..7334dbd601f41 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -97,7 +97,7 @@ export interface ClipAttributes extends AttributeWithCacheKey { readonly max: number; } -export const clip = (context: ComputeContext, attributes: ClipAttributes): number => { +export const clipV10 = (context: ComputeContext, attributes: ClipAttributes): number => { context.compute( createElementwiseProgramInfoLoader( context.inputs[0], 'Clip', a => `clamp(${a}, clip_min_, clip_max_)`, ` @@ -114,9 +114,9 @@ const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAt return createAttributeWithCacheKey({min, max}); }; -export const clipV11 = (context: ComputeContext): number => { +export const clip = (context: ComputeContext): number => { const attributes = generateClipAttributesFromInputs(context.inputs); - return clip(context, attributes); + return clipV10(context, attributes); }; export const ceil = (context: ComputeContext): number => { From 2e528f63dd2037676809a52d1535cf68450efeca Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 18:21:51 -0700 Subject: [PATCH 64/81] code clean --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 4 +- js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 22 --- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 3 + js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 6 - js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 27 +-- 
js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 132 ------------- .../wasm/jsep/webgpu/ops/reduce-tensors.ts | 87 --------- js/web/lib/wasm/jsep/webgpu/ops/reshape.ts | 22 --- js/web/lib/wasm/jsep/webgpu/ops/slice.ts | 180 ------------------ js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts | 44 ----- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 35 ---- js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts | 43 ----- 12 files changed, 10 insertions(+), 595 deletions(-) delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/gather.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/reshape.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/slice.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts delete mode 100644 js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index b6b8fe91c78b2..1858b140223f1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -78,7 +78,7 @@ const conv2dCommonSnippet = col % outWidth); `; - const xHight = isChannelsLast ? 'xShape[1]' : 'xShape[2]'; + const xHeight = isChannelsLast ? 'xShape[1]' : 'xShape[2]'; const xWidth = isChannelsLast ? 'xShape[2]' : 'xShape[3]'; const row = isChannelsLast ? 'row' : 'col'; const col = isChannelsLast ? 'col' : 'row'; @@ -96,7 +96,7 @@ const conv2dCommonSnippet = var resData = ${typeSnippet(innerElementSizeX)}(0.0); // The bounds checking is always needed since we use it to pad zero for // the 'same' padding type. - if (xRow >= 0 && xRow < ${xHight} && xCol >= 0 && xCol < ${xWidth}) { + if (xRow >= 0 && xRow < ${xHeight} && xCol >= 0 && xCol < ${xWidth}) { ${coordASnippet} let xIndex = getIndexFromCoords4D(coord, xShape); ${getXSnippet(innerElementSizeX)} diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index d56bbd6e4364e..604f4fc66e1ac 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
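The binary-op.ts hunk here drops the commented-out WebGL-era operators (and, equal, greater, less, or, pRelu, xor); what remains are thin wrappers over createBinaryOpProgramInfoLoader. The pattern is small enough to show in full: a hypothetical Mod operator (not part of this patch) could look like the sketch below, and since '%' is defined for f32 in WGSL it needs no scalar/vector helper split like pow's.

export const mod = (context: ComputeContext): number => {
  // The callback receives the (possibly vec4-packed) operand expressions.
  context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Mod', (a, b) => `${a} % ${b}`));
  return 0;
};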
-// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; import {TensorView} from '../../tensor'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; @@ -179,31 +178,16 @@ export const add = (context: ComputeContext): number => { return 0; }; -// export const and = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslAnd(), 'bool'), inputs)]; - export const div = (context: ComputeContext): number => { context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Div', (a, b) => `${a}/${b}`)); return 0; }; -// export const equal = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslEqual(), 'bool'), inputs)]; - -// export const greater = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslGreater(), 'bool'), inputs)]; - -// export const less = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslLess(), 'bool'), inputs)]; - export const mul = (context: ComputeContext): number => { context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Mul', (a, b) => `${a}*${b}`)); return 0; }; -// export const or = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslOr(), 'bool'), inputs)]; - export const pow = (context: ComputeContext): number => { context.compute(createBinaryOpProgramInfoLoader( context.inputs, 'Pow', ({scalar: (a, b) => `pow_f32(${a},${b})`, vector: (a, b) => `pow_vf32(${a},${b})`}), ` @@ -223,13 +207,7 @@ export const pow = (context: ComputeContext): number => { return 0; }; -// export const pRelu = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslPRelu()), inputs)]; - export const sub = (context: ComputeContext): number => { context.compute(createBinaryOpProgramInfoLoader(context.inputs, 'Sub', (a, b) => `${a}-${b}`)); return 0; }; - -// export const xor = (backend: WebGLInferenceHandler, inputs: Tensor[]): -// Tensor[] => [backend.run(createBinaryProgramInfoLoader(backend, inputs, glslXor(), 'bool'), inputs)]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 2022f61d0aa83..7305ab592d4a7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -96,6 +96,9 @@ export const createIndicesHelper = (name: string, shape: readonly number[]): Ind return {o2iImpl, o2iCall, i2oImpl, i2oExpression, indicesVariableDeclaration, iType}; }; +/** + * A ShaderHelper is a helper class for generating WGSL code. + */ export interface ShaderHelper { mainStart(workgroupSize?: number|[number, number, number]): string; guardAgainstOutOfBoundsWorkgroupSizes(size: unknown): string; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 4f4b91b9283c1..0f1381ed6bc21 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
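The conv.ts changes further down lean on calculateOutputShape(), which applies the standard convolution size formula once per spatial axis. Restated as a standalone sketch (an illustration of the arithmetic, not the exact source):

// outSize = floor((inSize + padBegin + padEnd - dilation * (kernel - 1) - 1) / stride) + 1
const convOutSize =
    (inSize: number, kernel: number, dilation: number, padBegin: number, padEnd: number, stride: number): number =>
        Math.floor((inSize + padBegin + padEnd - dilation * (kernel - 1) - 1) / stride) + 1;

// e.g. a 224-wide axis, 7-wide kernel, dilation 1, pads 3+3, stride 2: convOutSize(224, 7, 1, 3, 3, 2) === 112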
-import {DataType} from '../../../wasm-core-impl'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -21,11 +20,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => { const inputType = inputs[0].dataType; const inputDimensionality = inputs[0].dims.length; - // TODO: Support string concat - if (inputType === DataType.string) { - throw new Error('string tensor is not supported yet'); - } - for (const input of inputs) { // make sure types of all inputs match if (input.dataType !== inputType) { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 1b097bcd7d291..192b17696185d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -9,14 +9,9 @@ import {ComputeContext} from '../types'; import {createGroupedConvProgramInfoLoader} from './conv-grouped'; import {createConv2DMatMulProgramInfoLoader} from './conv2d-mm'; -// import {createDotProductProgramInfoLoader} from './dot-product'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; import {createTransposeProgramInfo, TransposeAttributes, transposeProgramMetadata} from './transpose'; -// import {createIm2ColProgramInfoLoader} from './im2col'; -// import {createMatmulProgramInfoLoader} from './matmul'; - - export const calculateOutputShape = (inputShape: readonly number[], kernelShape: readonly number[], dilations: readonly number[], adjustPads: readonly number[], strides: readonly number[], isChannelLast: boolean): number[] => { @@ -173,7 +168,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut attributes.strides[0] === 1 && attributes.strides[1] === 1 && (attributes.autoPad === 'SAME_UPPER' || attributes.autoPad === 'SAME_LOWER' || attributes.autoPad === 'VALID'))) { - // return conv2dByMatMul({x, filter, convInfo, backend, bias, activation, preluActivationWeights, leakyreluAlpha}); + // TODO: implement conv2dByMatMul() context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); return 0; } @@ -183,28 +178,13 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut return 0; } - // const thresholdToIncreaseWorkgroups = 8; - // const workgroupsBy32x32 = batchSize * Math.ceil((outHeight * outWidth) / 32) * Math.ceil(outChannels / 32); - // if (workgroupsBy32x32 <= thresholdToIncreaseWorkgroups) { - // // return conv2dWithIm2Col({x, filter, convInfo, backend, bias, preluActivationWeights, leakyreluAlpha, - // // activation}); - // context.compute(createGroupedConvProgramInfoLoader(context.inputs, adjustedAttributes)); - // return 0; - // } + // TODO: implement conv2dWithIm2Col() const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels; const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; const dimInner = weightHeight * weightWidth * inputChannels; const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; - // const inputs = [context.inputs[0], context.inputs[1]]; - // if (hasBias) { - // if (!isChannelsLast && context.inputs[2].dims.length === 1) { - // inputs.push(context.inputs[2].reshape([context.inputs[2].dims[0], 1, 1])); - // } else { - // inputs.push(context.inputs[2]); - // } - // } // STEP.1: transpose weight const transposedWeight = (context.customData.wT as TensorView | undefined) ?? 
@@ -219,6 +199,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut context.customData.wT = transposedWeight; } + // STEP.2: prepare reshaped inputs const convInputs = [inputs[0], transposedWeight]; if (hasBias) { if (!isChannelsLast && inputs[2].dims.length === 1) { @@ -227,6 +208,8 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut convInputs.push(inputs[2]); } } + + // STEP.3: compute matmul context.compute( createConv2DMatMulProgramInfoLoader( convInputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts deleted file mode 100644 index 9f8a8e55417b5..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {Graph} from '../../../graph'; -// import {NUMBER_TYPES, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; - -// import {createIndicesHelper, WORKGROUP_SIZE} from './common'; - -// interface GatherAttributes extends AttributeWithCacheKey { -// readonly axis: number; -// } - -// export const gather = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: GatherAttributes): Promise => { -// validateInputs(inputs, attributes.axis); -// return inferenceHandler.run(createGatherProgramInfoLoader(inputs, attributes), inputs); -// }; - -// export const parseGatherAttributes: OperatorInitialization = (node: Graph.Node): GatherAttributes -// => -// createAttributeWithCacheKey({axis: node.attributes.getInt('axis', 0)}); - -// const gatherProgramMetadata = { -// name: 'Gather', -// inputTypes: [GpuDataType.default, GpuDataType.default] -// }; - -// const createGatherProgramInfo = -// (metadata: ProgramMetadata, inputs: Tensor[], axis: number, dataType = 'f32'): ProgramInfo => { -// const dataShape = inputs[0].dims.slice(); -// const indicesShape = inputs[1].dims.slice(); -// const outputShape = new Array(dataShape.length + indicesShape.length - 1); - -// axis = ShapeUtil.normalizeAxis(axis, dataShape.length); -// const indexCopyOps: string[] = []; -// if (indicesShape.length > 1) { -// indexCopyOps.push('indicesIdx[0] = 0u;'); -// } else { -// indexCopyOps.push('indicesIdx = 0u;'); -// } -// for (let i = 0; i < outputShape.length; i++) { -// // outputShape is divided into three parts: A, B, C -// // |0 axis| axis + indicesShape.length | end| -// // | A | B | C | -// // -// // dataIdx: [A, inputs[1][B], C] -// const outputIdxLValue = outputShape.length > 1 ? `outputIdx[${i}]` : 'outputIdx'; -// if (i < axis) { // A -// const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i}]` : 'dataIdx'; -// outputShape[i] = dataShape[i]; -// indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); -// } else { -// if (i < axis + indicesShape.length) { // B -// const indicesIdxLValue = indicesShape.length > 1 ? 
`indicesIdx[${i - axis}]` : 'indicesIdx'; -// outputShape[i] = indicesShape[i - axis]; -// indexCopyOps.push(`${indicesIdxLValue} = ${outputIdxLValue};`); -// } else { // C -// const dataIdxLValue = dataShape.length > 1 ? `dataIdx[${i - indicesShape.length + 1}]` : 'dataIdx'; -// outputShape[i] = dataShape[i - indicesShape.length + 1]; // skip 1 for axis -// indexCopyOps.push(`${dataIdxLValue} = ${outputIdxLValue};`); -// } -// } -// } -// const outputSize = ShapeUtil.size(outputShape); -// const outputIndicesHelper = createIndicesHelper('output', outputShape); -// const dataIndicesHelper = createIndicesHelper('data', dataShape); -// const indicesIndicesHelper = createIndicesHelper('indices', indicesShape); - -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - -// @group(0) @binding(0) var data : array<${dataType}>; -// @group(0) @binding(1) var indices : array; -// @group(0) @binding(2) var output : array<${dataType}>; - -// ${outputIndicesHelper.o2iImpl} -// ${indicesIndicesHelper.i2oImpl} -// ${dataIndicesHelper.i2oImpl} - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// ${outputIndicesHelper.indicesVariableDeclaration('outputIdx')} -// ${outputIndicesHelper.o2iCall('global_id.x', 'outputIdx')} -// ${dataIndicesHelper.indicesVariableDeclaration('dataIdx')} -// ${indicesIndicesHelper.indicesVariableDeclaration('indicesIdx')} -// ${indexCopyOps.join('\n ')} -// let idx = indices[${indicesIndicesHelper.i2oExpression('indicesIdx')}]; -// dataIdx${dataShape.length > 1 ? `[${axis}]` : ''} = u32(select(idx, idx + ${dataShape[axis]}, idx < 0)); -// output[global_id.x] = data[${dataIndicesHelper.i2oExpression('dataIdx')}]; -// }`; -// return { -// ...metadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// const createGatherProgramInfoLoader = (inputs: Tensor[], attributes: GatherAttributes): ProgramInfoLoader => { -// const metadata = {...gatherProgramMetadata, cacheHint: attributes.cacheKey}; -// return {...metadata, get: () => createGatherProgramInfo(metadata, inputs, attributes.axis)}; -// }; - -// const validateInputs = (inputs: Tensor[], axis: number): void => { -// if (!inputs || inputs.length !== 2) { -// throw new Error('Gather requires 2 inputs.'); -// } -// const tensorRank = inputs[0].dims.length; -// if (tensorRank < 1) { -// throw new Error('Invalid input shape.'); -// } -// if (axis < -tensorRank || axis > tensorRank - 1) { -// throw new Error('Invalid axis.'); -// } -// if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { -// throw new Error('Invaid input type.'); -// } -// if (inputs[1].type !== 'int32') { -// throw new Error('Invaid input type.'); -// } -// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts deleted file mode 100644 index 48c98766a1ee3..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce-tensors.ts +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; - -// import {WORKGROUP_SIZE} from './common'; - -// export const sum = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { -// validateInputs(inputs); - -// const sumProgramMetadata = {name: 'Sum', inputTypes: new Array(inputs.length).fill(GpuDataType.default)}; - -// return inferenceHandler.run( -// {...sumProgramMetadata, get: () => createSumProgramInfo(inferenceHandler, inputs, sumProgramMetadata)}, -// inputs); -// }; - -// const createSumProgramInfo = -// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], sumProgramMetadata: ProgramMetadata): ProgramInfo -// => { -// const dataType = 'f32'; -// const outputShape = inputs[0].dims; -// const outputSize = ShapeUtil.size(outputShape); - - -// const inputsDeclaration = -// inputs.map((_, i) => `@group(0) @binding(${i}) var input${i} : array<${dataType}>;`); -// const sumLine = inputs.map((_, i) => `input${i}[offset]`).join('+'); -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; - -// ${inputsDeclaration.join('\n')} -// @group(0) @binding(${inputs.length}) var output : array<${dataType}>; - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// let offset = global_id.x; - -// var value = ${dataType}(0); -// value = ${sumLine}; - -// output[offset] = value; -// }`; -// return { -// ...sumProgramMetadata, -// outputs: [{dims: outputShape, type: inputs[0].type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length === 0) { -// throw new Error('Sum requires inputs.'); -// } - -// const length = inputs[0].dims.length; -// for (let i = 1; i < inputs.length; i++) { -// if (length !== inputs[i].dims.length) { -// throw new Error('Input shapes are mismatched. broadcasting not supported yet'); -// } - -// for (let j = 0; j < length; j++) { -// if (inputs[0].dims[j] !== inputs[i].dims[j]) { -// throw new Error('Input shapes are not matched. broadcasting not supported yet'); -// } -// } -// } - -// if (inputs[0].type !== 'float32' && inputs[0].type !== 'float64') { -// throw new Error('Invalid input type.'); -// } -// for (let i = 1; i < inputs.length; i++) { -// if (inputs[0].type !== inputs[i].type) { -// throw new Error('Input types are not matched.'); -// } -// } -// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts b/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts deleted file mode 100644 index 0227ce5ae28eb..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/reshape.ts +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; - -// export const reshape = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { -// validateInputs(inputs); -// const shape = await inputs[1].getData(); -// const reshapedDims = ShapeUtil.calculateReshapedDims(inputs[0].dims, shape as Int32Array); -// return [handler.reshape(inputs[0], reshapedDims)]; -// }; - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 2) { -// throw new Error('Reshape requires 2 inputs.'); -// } -// if (inputs[1].type !== 'int32') { -// throw new Error('Invalid input type.'); -// } -// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts deleted file mode 100644 index c35bf970c5675..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../../../attribute-with-cache-key'; -// import {Graph} from '../../../graph'; -// import {NUMBER_TYPES, OperatorAsyncImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; -// import {GpuDataType, ProgramInfo} from '../types'; - -// import {WORKGROUP_SIZE} from './common'; - -// export interface SliceAttributes extends AttributeWithCacheKey { -// readonly axes: number[]; -// readonly ends: number[]; -// readonly starts: number[]; -// } - -// const sliceProgramMetadata = { -// name: 'Slice', -// inputTypes: [GpuDataType.default] -// }; - -// export const slice: OperatorAsyncImplementation = async( -// inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], attributes: SliceAttributes): Promise => { -// validateInputs(inputs); -// return inferenceHandler.run( -// { -// ...sliceProgramMetadata, -// cacheHint: attributes.cacheKey, -// get: () => createSliceProgramInfo(inputs[0], attributes) -// }, -// inputs); -// }; - -// export const parseSliceAttributes: OperatorInitialization = (node: Graph.Node): SliceAttributes => { -// const starts = node.attributes.getInts('starts'); -// const ends = node.attributes.getInts('ends'); -// const axes = node.attributes.getInts('axes', []); -// return createAttributeWithCacheKey({starts, ends, axes}); -// }; - -// const offsetToIndices = (offset: string, strides: readonly number[], indicesPrefix: string): string => { -// const outputLines: string[] = []; - -// for (let i = 0; i < strides.length - 1; i++) { -// outputLines.push(`var ${indicesPrefix}${i}=${offset}/${strides[i]}u;`); -// outputLines.push(`${offset}%=${strides[i]}u;`); -// } -// outputLines.push(`var ${indicesPrefix}${strides.length - 1}=${offset};`); - -// return outputLines.join('\n'); -// }; - -// const indicesToOffset = (indicesPrefix: string, strides: readonly number[], offset: string): string => { -// const outputLines: string[] = []; - -// for (let i = 0; i < strides.length - 1; i++) { -// outputLines.push(`${offset}+=${indicesPrefix}${i} * ${strides[i]}u;`); -// } -// outputLines.push(`${offset}+=${indicesPrefix}${strides.length - 1};`); - -// return outputLines.join('\n'); -// }; - -// const createSliceProgramInfo = (input: Tensor, attributes: SliceAttributes, dataType = 'f32'): ProgramInfo => { 
-// const axes = (attributes.axes.length === 0) ? input.dims.slice(0).map((val, i) => i) : attributes.axes; -// const normalizedAxes = ShapeUtil.normalizeAxes(axes, input.dims.length); -// const starts = attributes.starts.map((start, i) => { -// if (start > input.dims[normalizedAxes[i]] - 1) { -// return input.dims[normalizedAxes[i]]; -// } -// return ShapeUtil.normalizeAxis(start, input.dims[normalizedAxes[i]]); -// }); -// const ends = attributes.ends.map((end, i) => { -// if (end > input.dims[normalizedAxes[i]] - 1) { -// return input.dims[normalizedAxes[i]]; -// } -// return ShapeUtil.normalizeAxis(end, input.dims[normalizedAxes[i]]); -// }); - -// const outputShape = input.dims.slice(); - -// const sliceOps: string[] = []; -// for (let i = 0; i < normalizedAxes.length; i++) { -// outputShape[normalizedAxes[i]] = ends[i] - starts[i]; -// if (starts[i] > 0) { -// sliceOps.push(`idx_${normalizedAxes[i]} += ${starts[i]}u;`); -// } // else { sliceOps.push(`outputIdx[${normalizedAxes[i]}] += 0;`); } -// } - -// const outputSize = ShapeUtil.size(outputShape); -// const outputStrides = ShapeUtil.computeStrides(outputShape); -// const shaderSource = ` -// const WORKGROUP_SIZE: u32 = ${WORKGROUP_SIZE}u; -// @group(0) @binding(0) var input : array<${dataType}>; -// @group(0) @binding(1) var output : array<${dataType}>; - -// @compute @workgroup_size(WORKGROUP_SIZE) -// fn main(@builtin(global_invocation_id) global_id : vec3) { - -// // Guard against out-of-bounds work group sizes -// if (global_id.x >= ${outputSize}u) { -// return; -// } - -// var offset = global_id.x; -// ${offsetToIndices('offset', outputStrides, 'idx_')} -// ${sliceOps.join('')} -// var offsetInput = 0u; -// ${indicesToOffset('idx_', ShapeUtil.computeStrides(input.dims), 'offsetInput')} -// output[global_id.x] = input[offsetInput]; -// }`; -// return { -// ...sliceProgramMetadata, -// outputs: [{dims: outputShape, type: input.type, gpuDataType: GpuDataType.default}], -// shaderSource, -// dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) -// }; -// }; - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 1) { -// throw new Error('Slice requires 1 input.'); -// } -// if (NUMBER_TYPES.indexOf(inputs[0].type) === -1) { -// throw new Error('Invalid input type.'); -// } -// }; - -// export const sliceV10 = async(inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => { -// validateInputsV10(inputs); -// const attributes = generateSliceAttributesFromInputs(inferenceHandler, inputs); -// return inferenceHandler.run( -// { -// ...sliceProgramMetadata, -// cacheHint: attributes.cacheKey, -// get: () => createSliceProgramInfo(inputs[0], attributes) -// }, -// [inputs[0]]); -// }; - -// const generateSliceAttributesFromInputs = -// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): SliceAttributes => { -// if (!inferenceHandler.session.isInitializer(inputs[1].dataId) || -// !inferenceHandler.session.isInitializer(inputs[2].dataId) || -// (inputs.length >= 4 && !inferenceHandler.session.isInitializer(inputs[3].dataId)) || -// (inputs.length >= 5 && !inferenceHandler.session.isInitializer(inputs[4].dataId))) { -// throw new Error('dynamic slice attributes are not allowed'); -// } - -// if (inputs.length >= 5 && inputs[4].integerData.some((i: number) => i !== 1)) { -// throw new Error('currently non-1 steps is not supported for Slice'); -// } - -// const starts = Array.from(inputs[1].integerData); -// const ends = 
Array.from(inputs[2].integerData); -// const axes = inputs.length >= 4 ? Array.from(inputs[3].integerData) : []; -// const cacheKey = `${axes};${starts};${ends}`; -// return {starts, ends, axes, cacheKey}; -// }; - -// const validateInputsV10 = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length < 3 || inputs.length > 5) { -// throw new Error('Invalid input number.'); -// } -// if (inputs[1].type !== 'int32' || inputs[1].dims.length !== 1) { -// throw new Error('Invalid input type.'); -// } -// if (inputs[2].type !== 'int32' || inputs[2].dims.length !== 1) { -// throw new Error('Invalid input type.'); -// } -// if (inputs.length >= 4 && (inputs[3].type !== 'int32' || inputs[3].dims.length !== 1)) { -// throw new Error('Invalid input type.'); -// } -// if (inputs.length >= 5 && (inputs[4].type !== 'int32' || inputs[4].dims.length !== 1)) { -// throw new Error('Invalid input type.'); -// } -// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts deleted file mode 100644 index f0509c34a06a5..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/squeeze.ts +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// import {Graph} from '../../../graph'; -// import {OperatorImplementation, OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; - -// export const squeeze: OperatorImplementation = -// (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { -// validateInputs(inputs); -// const outputShape = ShapeUtil.squeezeShape(inputs[0].dims, axes); -// const output = inferenceHandler.reshape(inputs[0], outputShape); -// return [output]; -// }; - -// export const squeezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { -// validateInputsV13(inputs); -// return squeeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -// }; - -// export const parseSqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => -// node.attributes.getInts('axes'); - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 1) { -// throw new Error('Squeeze requires 1 input.'); -// } - -// if (inputs[0].type === 'string') { -// throw new Error('invalid input tensor types.'); -// } -// }; - -// const validateInputsV13 = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 2) { -// throw new Error('Squeeze requires 2 inputs.'); -// } - -// if (inputs[1].type !== 'int32') { -// throw new Error('Invalid input type.'); -// } -// }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 7334dbd601f41..93643914609ee 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -174,56 +174,21 @@ export const erf = (context: ComputeContext): number => { return 0; }; -// export const exp = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Exp', 'exp'), inputs); - export const floor = (context: ComputeContext): number => { context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Floor', 'floor')); return 0; }; -// export interface LeakyReluAttributes extends AttributeWithCacheKey { -// readonly alpha: 
number; -// } - -// export const leakyRelu = async(handler: WebGpuInferenceHandler, inputs: Tensor[], attributes: EluAttributes): -// Promise=>handler.run( -// createElementwiseProgramInfoLoader( -// inputs[0], 'LeakyRelu', a => `leaky_relu_vf32(${a})`, ` -// let leaky_relu_alpha_: f32 = f32(${attributes.alpha}); - -// fn leaky_relu_f32(a: f32) -> f32 { -// return select(a, a * leaky_relu_alpha_, a < 0.0); -// } - -// fn leaky_relu_vf32(v: vec4) -> vec4 { -// return vec4(leaky_relu_f32(v.x), leaky_relu_f32(v.y), leaky_relu_f32(v.z), leaky_relu_f32(v.w)); -// }`, -// attributes.cacheKey), -// inputs); - -// export const parseLeakyReluAttributes = (node: Graph.Node): LeakyReluAttributes => -// createAttributeWithCacheKey({alpha: node.attributes.getFloat('alpha', 0.01)}); - -// export const log = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise => -// handler.run(createElementwiseProgramInfoLoader(inputs[0], 'Log', 'log'), inputs); - export const neg = (context: ComputeContext): number => { context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Neg', a => `-${a}`)); return 0; }; -// // export const not = (handler: WebGLInferenceHandler, inputs: Tensor[]): -// // Tensor[] => [handler.run(createElementwiseProgramInfoLoader(handler, inputs[0], glslNot()), inputs)]; - export const reciprocal = (context: ComputeContext): number => { context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Reciprocal', a => `1.0/${a}`)); return 0; }; -// export const relu = async(handler: WebGpuInferenceHandler, inputs: Tensor[]): Promise=>handler.run( -// createElementwiseProgramInfoLoader(inputs[0], 'Relu', a => `max(${a}, vec4(0.0))`), inputs); - export const sigmoid = (context: ComputeContext): number => { context.compute(createElementwiseProgramInfoLoader(context.inputs[0], 'Sigmoid', a => `(1.0 / (1.0 + exp(-${a})))`)); return 0; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts b/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts deleted file mode 100644 index 2cefbe72bc8a5..0000000000000 --- a/js/web/lib/wasm/jsep/webgpu/ops/unsqueeze.ts +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -// import {Graph} from '../../../graph'; -// import {OperatorInitialization} from '../../../operators'; -// import {Tensor} from '../../../tensor'; -// import {ShapeUtil} from '../../../util'; -// import {WebGpuInferenceHandler} from '../inference-handler'; - -// export const unsqueeze = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[], axes: number[]): Tensor[] => { -// validateInputs(inputs); -// const outputShape = ShapeUtil.unsqueezeShape(inputs[0].dims, axes); -// const output = inferenceHandler.reshape(inputs[0], outputShape); -// return [output]; -// }; - -// export const unsqueezeV13 = (inferenceHandler: WebGpuInferenceHandler, inputs: Tensor[]): Tensor[] => { -// validateInputsV13(inputs); -// return unsqueeze(inferenceHandler, [inputs[0]], Array.from(inputs[1].integerData)); -// }; - -// export const parseUnsqueezeAttributes: OperatorInitialization = (node: Graph.Node): number[] => -// node.attributes.getInts('axes'); - -// const validateInputs = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 1) { -// throw new Error('Unsqueeze requires 1 input.'); -// } - -// if (inputs[0].type === 'string') { -// throw new Error('invalid input tensor types.'); -// } -// }; - -// const validateInputsV13 = (inputs: Tensor[]): void => { -// if (!inputs || inputs.length !== 2) { -// throw new Error('Unsqueeze requires 2 inputs.'); -// } - -// if (inputs[1].type !== 'int32') { -// throw new Error('Invalid input type.'); -// } -// }; From ddacc4704e7dd2086771a66043bffd7f1cb7d086 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Apr 2023 18:43:35 -0700 Subject: [PATCH 65/81] code clean --- js/web/lib/wasm/jsep/init.ts | 1 - js/web/lib/wasm/jsep/webgpu/types.ts | 4 ---- 2 files changed, 5 deletions(-) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index f75e6fb730302..003763d0e7374 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -93,7 +93,6 @@ class OpKernelContext implements ComputeContext { } export const init = async(module: OrtWasmModule): Promise => { - // init JSEP if available const init = module.jsepInit; if (init && navigator.gpu) { const backend = new WebGpuBackend(); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index db841b5e890c2..9cf8487fabd04 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -40,9 +40,6 @@ export interface ProgramMetadata { */ name: string; - // inputLayouts: GPUBindGroupLayoutEntry[]; - // outputLayouts: GPUBindGroupLayoutEntry[]; - /** * gpu data types for each input */ @@ -94,7 +91,6 @@ export interface ProgramInfo extends ProgramMetadata { export interface Artifact { programInfo: ProgramInfo; computePipeline: GPUComputePipeline; - // attribLocations: {position: number; textureCoord: number}; } export interface ComputeContextInputsOutputsMapping { From 28fc10a4593844622d11e719b370bce0072e8fbb Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 5 Apr 2023 14:25:58 -0700 Subject: [PATCH 66/81] use verbose log in test runner --- js/web/test/test-runner.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 8f9784de053d4..26ebcbbd6e212 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -502,10 +502,10 @@ export async function runModelTestSet( const feeds: Record = {}; 
testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor); const start = now(); - console.log(`[_BEFORE_SESSION_RUN_] ${start}`); + Logger.verbose('TestRunner', `Timestamp before session run: ${start}`); const outputs = await context.session.run(feeds); const end = now(); - console.log(`[_AFTER_SESSION_RUN_] ${end}`); + Logger.verbose('TestRunner', `Timestamp after session run: ${end}`); if (context.perfData.count === 0) { context.perfData.firstRun = end - start; } else { From b5e6a732363695e6d45ccb3164d7ac3f24cf6e44 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 5 Apr 2023 14:26:44 -0700 Subject: [PATCH 67/81] use OrtMemTypeCPU only for cpu --- onnxruntime/core/providers/js/allocator.h | 17 +++------ .../providers/js/js_execution_provider.cc | 35 ++++++------------- .../core/providers/js/operators/reshape.cc | 6 ++-- .../core/providers/js/operators/shape_op.cc | 6 ++-- .../core/providers/js/operators/unary.cc | 12 +++---- 5 files changed, 27 insertions(+), 49 deletions(-) diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h index 1c57540c24b97..6aa8313c01f38 100644 --- a/onnxruntime/core/providers/js/allocator.h +++ b/onnxruntime/core/providers/js/allocator.h @@ -9,22 +9,13 @@ namespace onnxruntime { namespace js { -class JsCPUInputAllocator : public CPUAllocator { +class JsCPUAllocator : public CPUAllocator { public: - JsCPUInputAllocator() + JsCPUAllocator() : CPUAllocator( - OrtMemoryInfo("JsCPUInputAllocator", OrtAllocatorType::OrtDeviceAllocator, + OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, OrtMemTypeCPUInput)){}; -}; - -class JsCPUOutputAllocator : public CPUAllocator { - public: - JsCPUOutputAllocator() - : CPUAllocator( - OrtMemoryInfo("JsCPUOutputAllocator", OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, OrtMemTypeCPUOutput)){}; + 0, OrtMemTypeCPU)){}; }; class JsCustomAllocator : public IAllocator { diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 1f44125f0ef9f..825ba0e861d75 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -45,7 +45,7 @@ ONNX_OPERATOR_KERNEL_EX( 1, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .InputMemoryType(OrtMemTypeCPUInput, 0) + .InputMemoryType(OrtMemTypeCPU, 0) .ExecQueueId(0) .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), Memcpy); @@ -56,7 +56,7 @@ ONNX_OPERATOR_KERNEL_EX( 1, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .OutputMemoryType(OrtMemTypeCPUOutput, 0) + .OutputMemoryType(OrtMemTypeCPU, 0) .ExecQueueId(1) .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), Memcpy); @@ -290,30 +290,17 @@ JsExecutionProvider::JsExecutionProvider(const JsExecutionProviderInfo& info) // implement RegisterAllocator to test/validate sharing the CPU EP's allocator void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; - auto cpu_input_alloc = GetAllocator(OrtMemTypeCPUInput); - if (!cpu_input_alloc) { - cpu_input_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUInput, cpu_device); - if (!cpu_input_alloc) { - AllocatorCreationInfo 
cpuInputAllocatorCreationInfo([&](int) {
-        return std::make_unique<JsCPUInputAllocator>();
+  auto cpu_alloc = GetAllocator(OrtMemTypeCPU);
+  if (!cpu_alloc) {
+    cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeCPU, cpu_device);
+    if (!cpu_alloc) {
+      AllocatorCreationInfo cpuAllocatorCreationInfo([&](int) {
+        return std::make_unique<JsCPUAllocator>();
       });
-      cpu_input_alloc = CreateAllocator(cpuInputAllocatorCreationInfo);
-      allocator_manager.InsertAllocator(cpu_input_alloc);
+      cpu_alloc = CreateAllocator(cpuAllocatorCreationInfo);
+      allocator_manager.InsertAllocator(cpu_alloc);
     }
-    InsertAllocator(cpu_input_alloc);
-  }
-
-  auto cpu_output_alloc = GetAllocator(OrtMemTypeCPUOutput);
-  if (!cpu_output_alloc) {
-    cpu_output_alloc = allocator_manager.GetAllocator(OrtMemTypeCPUOutput, cpu_device);
-    if (!cpu_output_alloc) {
-      AllocatorCreationInfo cpuOutputAllocatorCreationInfo([&](int) {
-        return std::make_unique<JsCPUOutputAllocator>();
-      });
-      cpu_output_alloc = CreateAllocator(cpuOutputAllocatorCreationInfo);
-      allocator_manager.InsertAllocator(cpu_output_alloc);
-    }
-    InsertAllocator(cpu_output_alloc);
+    InsertAllocator(cpu_alloc);
   }

   OrtDevice custom_device{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0};
diff --git a/onnxruntime/core/providers/js/operators/reshape.cc b/onnxruntime/core/providers/js/operators/reshape.cc
index 023c1cd022abc..d8959c89f3fe7 100644
--- a/onnxruntime/core/providers/js/operators/reshape.cc
+++ b/onnxruntime/core/providers/js/operators/reshape.cc
@@ -15,7 +15,7 @@ ONNX_OPERATOR_KERNEL_EX(
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
         .Alias(0, 0)
-        .InputMemoryType(OrtMemTypeCPUInput, 1),
+        .InputMemoryType(OrtMemTypeCPU, 1),
     Reshape);

 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
@@ -27,7 +27,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
         .Alias(0, 0)
-        .InputMemoryType(OrtMemTypeCPUInput, 1),
+        .InputMemoryType(OrtMemTypeCPU, 1),
     Reshape);

 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
@@ -39,7 +39,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
         .Alias(0, 0)
-        .InputMemoryType(OrtMemTypeCPUInput, 1),
+        .InputMemoryType(OrtMemTypeCPU, 1),
     Reshape);

 }  // namespace js
diff --git a/onnxruntime/core/providers/js/operators/shape_op.cc b/onnxruntime/core/providers/js/operators/shape_op.cc
index 42710d26bb09d..ec0de3c04a11e 100644
--- a/onnxruntime/core/providers/js/operators/shape_op.cc
+++ b/onnxruntime/core/providers/js/operators/shape_op.cc
@@ -14,7 +14,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
     kJsExecutionProvider,
     (*KernelDefBuilder::Create())
         // properly force CPU/GPU synch inside the kernel
-        .OutputMemoryType(OrtMemTypeCPUInput, 0)
+        .OutputMemoryType(OrtMemTypeCPU, 0)
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
     Shape);
@@ -26,7 +26,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
     kJsExecutionProvider,
     (*KernelDefBuilder::Create())
         // properly force CPU/GPU synch inside the kernel
-        .OutputMemoryType(OrtMemTypeCPUInput, 0)
+        .OutputMemoryType(OrtMemTypeCPU, 0)
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
     Shape);
@@ -38,7 +38,7 @@ ONNX_OPERATOR_KERNEL_EX(
     kJsExecutionProvider,
     (*KernelDefBuilder::Create())
         // properly force CPU/GPU synch inside the kernel
-        .OutputMemoryType(OrtMemTypeCPUInput, 0)
+        .OutputMemoryType(OrtMemTypeCPU, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
     Shape);
diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc
index 5ed76972dc363..79f2b074f005a 100644
--- a/onnxruntime/core/providers/js/operators/unary.cc
+++ b/onnxruntime/core/providers/js/operators/unary.cc
@@ -97,20 +97,20 @@ JSEP_KERNEL_IMPL(Clip, Clip)
 ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 11, 11, kJsExecutionProvider,
                                   KernelDefBuilder()
                                       .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
-                                      .InputMemoryType(OrtMemTypeCPUInput, 1)
-                                      .InputMemoryType(OrtMemTypeCPUInput, 2),
+                                      .InputMemoryType(OrtMemTypeCPU, 1)
+                                      .InputMemoryType(OrtMemTypeCPU, 2),
                                   Clip);
 ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 12, 12, kJsExecutionProvider,
                                   KernelDefBuilder()
                                       .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
-                                      .InputMemoryType(OrtMemTypeCPUInput, 1)
-                                      .InputMemoryType(OrtMemTypeCPUInput, 2),
+                                      .InputMemoryType(OrtMemTypeCPU, 1)
+                                      .InputMemoryType(OrtMemTypeCPU, 2),
                                   Clip);
 ONNX_OPERATOR_KERNEL_EX(Clip, kOnnxDomain, 13, kJsExecutionProvider,
                         KernelDefBuilder()
                             .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
-                            .InputMemoryType(OrtMemTypeCPUInput, 1)
-                            .InputMemoryType(OrtMemTypeCPUInput, 2),
+                            .InputMemoryType(OrtMemTypeCPU, 1)
+                            .InputMemoryType(OrtMemTypeCPU, 2),
                         Clip);

 JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(Elu, Elu, alpha, 1.0)

From 75231d9283707fd079769dde1cc963e7b264b01c Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Wed, 5 Apr 2023 16:14:11 -0700
Subject: [PATCH 68/81] use debug logging

---
 js/web/lib/wasm/jsep/backend-webgpu.ts        |  10 +-
 js/web/lib/wasm/jsep/init.ts                  |  29 +---
 js/web/lib/wasm/jsep/log.ts                   |  38 +++++
 .../lib/wasm/jsep/webgpu/gpu-data-manager.ts  |  18 +-
 .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts  |   8 +-
 js/web/lib/wasm/jsep/webgpu/ops/conv.ts       |   2 +-
 js/web/lib/wasm/jsep/webgpu/ops/gemm.ts       |   2 +-
 js/web/lib/wasm/jsep/webgpu/ops/matmul.ts     |   2 +-
 js/web/lib/wasm/jsep/webgpu/ops/pool.ts       |   2 +-
 js/web/lib/wasm/jsep/webgpu/ops/transpose.ts  |   2 +-
 .../lib/wasm/jsep/webgpu/program-manager.ts   |   8 +-
 js/web/lib/wasm/session-handler.ts            |  22 +--
 js/web/lib/wasm/wasm-common.ts                | 158 ++++++++++++++++++
 js/web/lib/wasm/wasm-core-impl.ts             | 125 +-------------
 14 files changed, 227 insertions(+), 199 deletions(-)
 create mode 100644 js/web/lib/wasm/jsep/log.ts
 create mode 100644 js/web/lib/wasm/wasm-common.ts

diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index 68e80c4fa0369..90bfc6c28d5a9 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -3,6 +3,7 @@

 import {env} from 'onnxruntime-common';

+import {LOG_DEBUG} from './log';
 import {TensorView} from './tensor';
 import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager';
 import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
@@ -238,6 +239,10 @@ export class WebGpuBackend {
       this.programManager.setArtifact(key, artifact);
     }

+    LOG_DEBUG(
+        'info',
+        () => `[ProgramManager] run "${programInfo.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
+            normalizedDispatchGroup[1]}x${normalizedDispatchGroup[2]}`);
     this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup);

     return outputTensorViews;
@@ -301,10 +306,7 @@ export class WebGpuBackend {
       attributes[0] = undefined;
     }

-    if (env.debug) {
-      //
eslint-disable-next-line no-console - console.log(`[js] Start to run kernel "${name}"...`); - } + LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "${name}"...`); this.temporaryData = []; try { diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 003763d0e7374..635fec7d1ada7 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,12 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {env} from 'onnxruntime-common'; - import {OrtWasmModule} from '../binding/ort-wasm'; -import {getTensorElementSize} from '../wasm-core-impl'; +import {getTensorElementSize} from '../wasm-common'; import {WebGpuBackend} from './backend-webgpu'; +import {LOG_DEBUG} from './log'; import {TensorView} from './tensor'; import {ShapeUtil} from './util'; import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; @@ -111,17 +110,10 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepCopy(src, dst, size, isSourceGpu) (src: number, dst: number, size: number, isSourceGpu = false) => { if (isSourceGpu) { - if (env.debug) { - // eslint-disable-next-line no-console - console.log(`[js][${performance.now()}] jsepCopyGpuToGpu: src=${src}, dst=${dst}, size=${size}`); - } + LOG_DEBUG('verbose', () => `[WebGPU] jsepCopyGpuToGpu: src=${src}, dst=${dst}, size=${size}`); backend.memcpy(src, dst); } else { - if (env.debug) { - // eslint-disable-next-line no-console - console.log( - `[js][${performance.now()}] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); - } + LOG_DEBUG('verbose', () => `[WebGPU] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); const data = module.HEAPU8.subarray(src, src + size); backend.upload(dst, data); } @@ -132,11 +124,9 @@ export const init = async(module: OrtWasmModule): Promise => { Promise => { const data = module.HEAPU8.subarray(dataOffset, dataOffset + size); - if (env.debug) { - // eslint-disable-next-line no-console - console.log(`[js][${performance.now()}] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${ - dataOffset}, size=${size}`); - } + LOG_DEBUG( + 'verbose', + () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); await backend.download(gpuDataId, data); }, @@ -149,10 +139,7 @@ export const init = async(module: OrtWasmModule): Promise => { // jsepRun (kernel: number, contextDataOffset: number) => { - if (env.debug) { - // eslint-disable-next-line no-console - console.log(`[js][${performance.now()}] jsepRun on ${contextDataOffset}`); - } + LOG_DEBUG('verbose', () => `[WebGPU] jsepRun: kernel=${kernel}, contextDataOffset=${contextDataOffset}`); const context = new OpKernelContext(module, backend, contextDataOffset); return backend.computeKernel(kernel, context); }); diff --git a/js/web/lib/wasm/jsep/log.ts b/js/web/lib/wasm/jsep/log.ts new file mode 100644 index 0000000000000..2e27e4905742e --- /dev/null +++ b/js/web/lib/wasm/jsep/log.ts @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
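A note on the pattern the new log.ts below implements: LOG accepts either a plain string or a zero-argument function, so expensive message construction (shader source dumps, dispatch dimensions, GPU data ids) is deferred and skipped entirely when the configured level filters the message out. A minimal, self-contained sketch of the idea, simplified from the listing that follows (the numeric levels and the expensiveDump helper are illustrative assumptions, not part of this change):

// Deferred log-message evaluation: the thunk only runs when the message passes the filter.
type SketchMessage = string|(() => string);

const sketchLog = (messageLevel: number, configLevel: number, msg: SketchMessage): void => {
  if (messageLevel >= configLevel) {
    // eslint-disable-next-line no-console
    console.log(typeof msg === 'function' ? msg() : msg);
  }
};

// Usage: with the config at 'info' (1), the verbose template string below is never built.
const expensiveDump = (): string => JSON.stringify({dims: [1, 3, 224, 224]});
sketchLog(0 /* verbose */, 1 /* info */, () => `tensor: ${expensiveDump()}`);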
+
+import {env} from 'onnxruntime-common';
+
+import {logLevelStringToEnum} from '../wasm-common';
+
+type LogLevel = NonNullable<typeof env.logLevel>;
+type MessageString = string;
+type MessageFunction = () => string;
+type Message = MessageString|MessageFunction;
+
+const logLevelPrefix = ['V', 'I', 'W', 'E', 'F'];
+
+const doLog = (level: number, message: string): void => {
+  // eslint-disable-next-line no-console
+  console.log(`[${logLevelPrefix[level]},${new Date().toISOString()}]${message}`);
+};
+
+/**
+ * A simple logging utility to log messages to the console.
+ */
+export const LOG = (logLevel: LogLevel, msg: Message): void => {
+  const messageLevel = logLevelStringToEnum(logLevel);
+  const configLevel = logLevelStringToEnum(env.logLevel!);
+  if (messageLevel >= configLevel) {
+    doLog(messageLevel, typeof msg === 'function' ? msg() : msg);
+  }
+};
+
+/**
+ * A simple logging utility to log messages to the console. Only logs when debug is enabled.
+ */
+export const LOG_DEBUG: typeof LOG = (...args: Parameters<typeof LOG>) => {
+  if (env.debug) {
+    LOG(...args);
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
index c3d565e9a47b8..076ec8ca7b5ec 100644
--- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
@@ -1,9 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-import {env} from 'onnxruntime-common';
-
 import {WebGpuBackend} from '../backend-webgpu';
+import {LOG_DEBUG} from '../log';

 import {GpuData, GpuDataId, GpuDataType} from './types';

@@ -115,10 +114,7 @@ class GpuDataManagerImpl implements GpuDataManager {
     this.backend.endComputePass();
     commandEncoder.copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size);

-    if (env.debug) {
-      // eslint-disable-next-line no-console
-      console.log(`[js] GpuDataManager.upload(id=${id})`);
-    }
+    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.upload(id=${id})`);

     this.buffersForUploadingPending.push(gpuBufferForUploading);
   }
@@ -158,10 +154,7 @@ class GpuDataManagerImpl implements GpuDataManager {
     const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer};
     this.storageCache.set(gpuData.id, {gpuData, originalSize: size});

-    if (env.debug) {
-      // eslint-disable-next-line no-console
-      console.log(`[js] GpuDataManager.create(size=${size}) => id=${gpuData.id}`);
-    }
+    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.create(size=${size}) => id=${gpuData.id}`);
     return gpuData;
   }

@@ -175,10 +168,7 @@ class GpuDataManagerImpl implements GpuDataManager {
       throw new Error('releasing data does not exist');
     }

-    if (env.debug) {
-      // eslint-disable-next-line no-console
-      console.log(`[js] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`);
-    }
+    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.release(id=${id}), gpuDataId=${cachedData.gpuData.id}`);

     this.storageCache.delete(id);
     this.buffersPending.push(cachedData.gpuData.buffer);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
index 1858b140223f1..b77e9bea7b871 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -19,8 +19,7 @@
 //
 // modified to fit the needs of the project

-import {env} from 'onnxruntime-common';
-
+import {LOG_DEBUG} from '../../../log';
 import {TensorView} from
'../../../tensor'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; @@ -178,10 +177,7 @@ export const createConv2DMatMulProgramInfo = Math.ceil(batchSize / workGroupSize[2] / elementsPerThread[1]) ]; - if (env.debug) { - // eslint-disable-next-line no-console - console.log(`dispatch = ${dispatch}`); - } + LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`); const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 192b17696185d..f333d44ea499d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-core-impl'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {PoolConvUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 6759c58374001..16327a18503ff 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-core-impl'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {GemmUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index d6225473371b2..e78ecfa53d805 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-core-impl'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index d73b07911e8cf..5c905ce1ce705 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-core-impl'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 7cdd8df26d6a7..24a2d7bf8c0e3 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import {DataType} from '../../../wasm-core-impl'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index 088a1fffcda29..951e76de5449e 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -1,9 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {env} from 'onnxruntime-common'; - import {WebGpuBackend} from '../backend-webgpu'; +import {LOG_DEBUG} from '../log'; import {createShaderHelper} from './ops/common'; import {Artifact, GpuData, ProgramInfo} from './types'; @@ -113,10 +112,7 @@ export class ProgramManager { const code = programInfo.getShaderSource(createShaderHelper(normalizedDispatchGroupSize)); const shaderModule = device.createShaderModule({code}); - if (env.debug) { - // eslint-disable-next-line no-console - console.log(`WebGpuProgram: ${code}`); - } + LOG_DEBUG('verbose', () => `[WebGPU] shader code: ${code}`); const computePipeline = device.createComputePipeline({compute: {module: shaderModule, entryPoint: 'main'}, layout: 'auto'}); diff --git a/js/web/lib/wasm/session-handler.ts b/js/web/lib/wasm/session-handler.ts index a507b09e89315..038a46d82e61a 100644 --- a/js/web/lib/wasm/session-handler.ts +++ b/js/web/lib/wasm/session-handler.ts @@ -7,28 +7,10 @@ import {promisify} from 'util'; import {SerializableModeldata} from './proxy-messages'; import {createSession, createSessionAllocate, createSessionFinalize, endProfiling, initOrt, releaseSession, run} from './proxy-wrapper'; +import {logLevelStringToEnum} from './wasm-common'; let ortInit: boolean; - -const getLogLevel = (logLevel: 'verbose'|'info'|'warning'|'error'|'fatal'): number => { - switch (logLevel) { - case 'verbose': - return 0; - case 'info': - return 1; - case 'warning': - return 2; - case 'error': - return 3; - case 'fatal': - return 4; - default: - throw new Error(`unsupported logging level: ${logLevel}`); - } -}; - - export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { private sessionId: number; @@ -45,7 +27,7 @@ export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { async loadModel(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { if (!ortInit) { - await initOrt(env.wasm.numThreads!, getLogLevel(env.logLevel!)); + await initOrt(env.wasm.numThreads!, logLevelStringToEnum(env.logLevel!)); ortInit = true; } diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts new file mode 100644 index 0000000000000..d0df08419fb5d --- /dev/null +++ b/js/web/lib/wasm/wasm-common.ts @@ -0,0 +1,158 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Tensor} from 'onnxruntime-common'; + +/** + * Copied from ONNX definition. Use this to drop dependency 'onnx_proto' to decrease compiled .js file size. 
+ */ +export const enum DataType { + undefined = 0, + float = 1, + uint8 = 2, + int8 = 3, + uint16 = 4, + int16 = 5, + int32 = 6, + int64 = 7, + string = 8, + bool = 9, + float16 = 10, + double = 11, + uint32 = 12, + uint64 = 13, + complex64 = 14, + complex128 = 15, + bfloat16 = 16 +} + +/** + * Map string tensor data to enum value + */ +export const tensorDataTypeStringToEnum = (type: string): DataType => { + switch (type) { + case 'int8': + return DataType.int8; + case 'uint8': + return DataType.uint8; + case 'bool': + return DataType.bool; + case 'int16': + return DataType.int16; + case 'uint16': + return DataType.uint16; + case 'int32': + return DataType.int32; + case 'uint32': + return DataType.uint32; + case 'float32': + return DataType.float; + case 'float64': + return DataType.double; + case 'string': + return DataType.string; + case 'int64': + return DataType.int64; + case 'uint64': + return DataType.uint64; + + default: + throw new Error(`unsupported data type: ${type}`); + } +}; + +/** + * Map enum value to string tensor data + */ +export const tensorDataTypeEnumToString = (typeProto: DataType): Tensor.Type => { + switch (typeProto) { + case DataType.int8: + return 'int8'; + case DataType.uint8: + return 'uint8'; + case DataType.bool: + return 'bool'; + case DataType.int16: + return 'int16'; + case DataType.uint16: + return 'uint16'; + case DataType.int32: + return 'int32'; + case DataType.uint32: + return 'uint32'; + case DataType.float: + return 'float32'; + case DataType.double: + return 'float64'; + case DataType.string: + return 'string'; + case DataType.int64: + return 'int64'; + case DataType.uint64: + return 'uint64'; + + default: + throw new Error(`unsupported data type: ${typeProto}`); + } +}; + +/** + * get tensor element size in bytes by the given data type + * @returns size in integer or undefined if the data type is not supported + */ +export const getTensorElementSize = (dateType: number): number| + undefined => [undefined, 4, 1, 1, 2, 2, 4, 8, undefined, 1, 2, 8, 4, 8, undefined, undefined, undefined][dateType]; + +/** + * get typed array constructor by the given tensor type + */ +export const tensorTypeToTypedArrayConstructor = (type: Tensor.Type): Float32ArrayConstructor|Uint8ArrayConstructor| + Int8ArrayConstructor|Uint16ArrayConstructor|Int16ArrayConstructor|Int32ArrayConstructor|BigInt64ArrayConstructor| + Uint8ArrayConstructor|Float64ArrayConstructor|Uint32ArrayConstructor|BigUint64ArrayConstructor => { + switch (type) { + case 'float32': + return Float32Array; + case 'uint8': + return Uint8Array; + case 'int8': + return Int8Array; + case 'uint16': + return Uint16Array; + case 'int16': + return Int16Array; + case 'int32': + return Int32Array; + case 'bool': + return Uint8Array; + case 'float64': + return Float64Array; + case 'uint32': + return Uint32Array; + case 'int64': + return BigInt64Array; + case 'uint64': + return BigUint64Array; + default: + throw new Error(`unsupported type: ${type}`); + } + }; + +/** + * Map string log level to integer value + */ +export const logLevelStringToEnum = (logLevel: 'verbose'|'info'|'warning'|'error'|'fatal'): number => { + switch (logLevel) { + case 'verbose': + return 0; + case 'info': + return 1; + case 'warning': + return 2; + case 'error': + return 3; + case 'fatal': + return 4; + default: + throw new Error(`unsupported logging level: ${logLevel}`); + } +}; diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index d0b342996c3e5..02aa88760e954 100644 --- 
a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -7,6 +7,7 @@ import {SerializableModeldata, SerializableSessionMetadata, SerializableTensor} import {setRunOptions} from './run-options'; import {setSessionOptions} from './session-options'; import {allocWasmString} from './string-utils'; +import {tensorDataTypeEnumToString, tensorDataTypeStringToEnum, tensorTypeToTypedArrayConstructor} from './wasm-common'; import {getInstance} from './wasm-factory'; /** @@ -115,128 +116,6 @@ export const releaseSession = (sessionId: number): void => { activeSessions.delete(sessionId); }; -/** - * Copied from ONNX definition. Use this to drop dependency 'onnx_proto' to decrease compiled .js file size. - */ -export const enum DataType { - undefined = 0, - float = 1, - uint8 = 2, - int8 = 3, - uint16 = 4, - int16 = 5, - int32 = 6, - int64 = 7, - string = 8, - bool = 9, - float16 = 10, - double = 11, - uint32 = 12, - uint64 = 13, - complex64 = 14, - complex128 = 15, - bfloat16 = 16 -} - -export const getTensorElementSize = (dateType: number): number| - undefined => [undefined, 4, 1, 1, 2, 2, 4, 8, undefined, 1, 2, 8, 4, 8, undefined, undefined, undefined][dateType]; - - -const tensorDataTypeStringToEnum = (type: string): DataType => { - switch (type) { - case 'int8': - return DataType.int8; - case 'uint8': - return DataType.uint8; - case 'bool': - return DataType.bool; - case 'int16': - return DataType.int16; - case 'uint16': - return DataType.uint16; - case 'int32': - return DataType.int32; - case 'uint32': - return DataType.uint32; - case 'float32': - return DataType.float; - case 'float64': - return DataType.double; - case 'string': - return DataType.string; - case 'int64': - return DataType.int64; - case 'uint64': - return DataType.uint64; - - default: - throw new Error(`unsupported data type: ${type}`); - } -}; - -const tensorDataTypeEnumToString = (typeProto: DataType): Tensor.Type => { - switch (typeProto) { - case DataType.int8: - return 'int8'; - case DataType.uint8: - return 'uint8'; - case DataType.bool: - return 'bool'; - case DataType.int16: - return 'int16'; - case DataType.uint16: - return 'uint16'; - case DataType.int32: - return 'int32'; - case DataType.uint32: - return 'uint32'; - case DataType.float: - return 'float32'; - case DataType.double: - return 'float64'; - case DataType.string: - return 'string'; - case DataType.int64: - return 'int64'; - case DataType.uint64: - return 'uint64'; - - default: - throw new Error(`unsupported data type: ${typeProto}`); - } -}; - -const numericTensorTypeToTypedArray = (type: Tensor.Type): Float32ArrayConstructor|Uint8ArrayConstructor| - Int8ArrayConstructor|Uint16ArrayConstructor|Int16ArrayConstructor|Int32ArrayConstructor|BigInt64ArrayConstructor| - Uint8ArrayConstructor|Float64ArrayConstructor|Uint32ArrayConstructor|BigUint64ArrayConstructor => { - switch (type) { - case 'float32': - return Float32Array; - case 'uint8': - return Uint8Array; - case 'int8': - return Int8Array; - case 'uint16': - return Uint16Array; - case 'int16': - return Int16Array; - case 'int32': - return Int32Array; - case 'bool': - return Uint8Array; - case 'float64': - return Float64Array; - case 'uint32': - return Uint32Array; - case 'int64': - return BigInt64Array; - case 'uint64': - return BigUint64Array; - default: - throw new Error(`unsupported type: ${type}`); - } - }; - /** * perform inference run */ @@ -379,7 +258,7 @@ export const run = async( } output.push([type, dims, stringData]); } else { - const typedArrayConstructor = 
numericTensorTypeToTypedArray(type);
+        const typedArrayConstructor = tensorTypeToTypedArrayConstructor(type);
         const data = new typedArrayConstructor(size);
         new Uint8Array(data.buffer, data.byteOffset, data.byteLength)
             .set(wasm.HEAPU8.subarray(dataOffset, dataOffset + data.byteLength));

From 6f0e42d2cb81e63389f8ab6f52c21184879c62db Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 6 Apr 2023 01:21:41 -0700
Subject: [PATCH 69/81] fix per kernel custom data

---
 js/web/lib/wasm/jsep/backend-webgpu.ts | 27 ++++++++++++++++++++++++--
 js/web/lib/wasm/jsep/init.ts           |  4 +++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index 90bfc6c28d5a9..332a9d86f6646 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -55,11 +55,31 @@ export class WebGpuBackend {
   /**
    * a list of temporary GPU data for the current kernel. should release when the kernel done computation.
    */
-  temporaryData: GpuData[];
+  private temporaryData: GpuData[];
   /**
    * a KernelID -> a GPU data list, which stores persistent GPU data owned by the specific kernel.
    */
-  kernelPersistentData: Map<number, GpuData[]>;
+  private kernelPersistentData: Map<number, GpuData[]>;
+  /**
+   * a KernelID -> a custom data, which stores custom data owned by the specific kernel.
+   */
+  private kernelCustomData: Map<number, {[key: string]: unknown}>;
+  /**
+   * get the custom data of the current kernel
+   */
+  get currentKernelCustomData(): {[key: string]: unknown} {
+    if (this.currentKernelId === null) {
+      throw new Error('currentKernelCustomData(): currentKernelId is null. (should not happen)');
+    }
+
+    let data = this.kernelCustomData.get(this.currentKernelId);
+    if (!data) {
+      data = {};
+      this.kernelCustomData.set(this.currentKernelId, data);
+    }
+
+    return data;
+  }
   /**
    * a KernelID -> kernel info mapping.
value is [ name, run function, [optional] preprocess_attribute_once function ] @@ -105,6 +125,7 @@ export class WebGpuBackend { this.programManager = new ProgramManager(this); this.kernels = new Map(); this.kernelPersistentData = new Map(); + this.kernelCustomData = new Map(); // TODO: set up flags this.device.onuncapturederror = ev => { @@ -286,6 +307,8 @@ export class WebGpuBackend { } this.kernelPersistentData.delete(kernelId); } + + this.kernelCustomData.delete(kernelId); this.kernels.delete(kernelId); } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 635fec7d1ada7..4226a0ef46f57 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -32,7 +32,9 @@ class TensorViewImpl implements TensorView { class OpKernelContext implements ComputeContext { readonly opKernelContext: number; readonly inputs: readonly TensorView[]; - readonly customData: {[key: string]: unknown} = {}; + get customData(): {[key: string]: unknown} { + return this.backend.currentKernelCustomData; + } constructor(private module: OrtWasmModule, private backend: WebGpuBackend, contextDataOffset: number) { const heapU32 = module.HEAPU32; From 21676c22239cd7adeb0624784e4c1ffeb3de215a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 6 Apr 2023 01:24:54 -0700 Subject: [PATCH 70/81] fix eslint --- js/web/lib/wasm/proxy-worker/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/web/lib/wasm/proxy-worker/main.ts b/js/web/lib/wasm/proxy-worker/main.ts index ef44ce416f451..9a247c56189a8 100644 --- a/js/web/lib/wasm/proxy-worker/main.ts +++ b/js/web/lib/wasm/proxy-worker/main.ts @@ -10,7 +10,7 @@ import {initializeWebAssembly} from '../wasm-factory'; self.onmessage = (ev: MessageEvent): void => { switch (ev.data.type) { case 'init-wasm': - initializeWebAssembly(ev.data.in!) + initializeWebAssembly(ev.data.in) .then( () => postMessage({type: 'init-wasm'} as OrtWasmMessage), err => postMessage({type: 'init-wasm', err} as OrtWasmMessage)); From 63d5a6a1a9f0cff478785f20ec0ae9d8cac05b8f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 6 Apr 2023 18:58:40 -0700 Subject: [PATCH 71/81] remove from default backends in test --- js/web/script/test-runner-cli-args.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index 87507acc60d9b..e20c391513c67 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -360,10 +360,15 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs // Option: -b=<...>, --backend=<...> const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack']; + + // TODO: remove this when Chrome support WebGPU. + // we need this for now because Chrome does not support webgpu yet, + // and ChromeCanary is not in CI. + const defaultBrowserBackends = ['webgl', /* 'webgpu', */ 'wasm', 'xnnpack']; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; - const backend = - (typeof backendArgs !== 'string') ? (env === 'node' ? nodejsBackends : browserBackends) : backendArgs.split(','); + const backend = (typeof backendArgs !== 'string') ? (env === 'node' ? 
nodejsBackends : defaultBrowserBackends) : + backendArgs.split(','); for (const b of backend) { if ((env !== 'node' && browserBackends.indexOf(b) === -1) || (env === 'node' && nodejsBackends.indexOf(b) === -1)) { throw new Error(`backend ${b} is not supported in env ${env}`); From 6d03488e12b071e8bf0cf9d1a045fdd2fccef84e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 7 Apr 2023 15:19:27 -0700 Subject: [PATCH 72/81] use macro 'USE_JS' to wrap OrtRun --- onnxruntime/wasm/api.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 69b179ddd6969..9a6582d8e56d1 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -254,7 +254,7 @@ int OrtGetTensorData(OrtValue* tensor, int* data_type, void** data, size_t** dim ONNXType tensor_type; RETURN_ERROR_CODE_IF_ERROR(GetValueType, tensor, &tensor_type); - if ( tensor_type != ONNX_TYPE_TENSOR ) { + if (tensor_type != ONNX_TYPE_TENSOR) { return ORT_FAIL; } @@ -363,9 +363,13 @@ int OrtRun(OrtSession* session, const char** input_names, const ort_tensor_handle_t* inputs, size_t input_count, const char** output_names, size_t output_count, ort_tensor_handle_t* outputs, OrtRunOptions* run_options) { - EM_ASM({ Module["jsepRunPromise"] = new Promise((r) => {Module.jsepRunPromiseResolve = r;}); }); +#if defined(USE_JS) + EM_ASM({ Module["jsepRunPromise"] = new Promise((r) = > { Module.jsepRunPromiseResolve = r; }); }); +#endif auto status_code = CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); +#if defined(USE_JS) EM_ASM({ Module.jsepRunPromiseResolve($0); }, status_code); +#endif return status_code; } From 2e8a039c1292054424418ef1b68e26869a2e6113 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 7 Apr 2023 17:42:21 -0700 Subject: [PATCH 73/81] disable tests Reduce* ops for webgl --- .../lib/onnxjs/backends/webgl/ops/reduce.ts | 3 +- js/web/test/suite-test-list.jsonc | 54 +++++++++---------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/js/web/lib/onnxjs/backends/webgl/ops/reduce.ts b/js/web/lib/onnxjs/backends/webgl/ops/reduce.ts index a61270163f879..1a2bc7422c833 100644 --- a/js/web/lib/onnxjs/backends/webgl/ops/reduce.ts +++ b/js/web/lib/onnxjs/backends/webgl/ops/reduce.ts @@ -98,6 +98,7 @@ const createReduceProgramInfo = }; const validateInputs = (inputs: Tensor[]): void => { + // TODO: support Reduce* operators with 2 inputs. 
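For context on the TODO above: newer opsets move the reduction axes from a node attribute to an optional second int64 tensor input (opset 13 for ReduceSum, opset 18 for the remaining Reduce* operators), which is why these kernels currently reject a second input. A hedged sketch of what the extended validation could look like (hypothetical helper, not part of this change; Tensor is the onnxjs tensor type already used in reduce.ts):

// Sketch only: accept the optional axes tensor of the newer Reduce* signature.
const validateInputsWithAxesInput = (inputs: Tensor[]): void => {
  if (!inputs || inputs.length < 1 || inputs.length > 2) {
    throw new Error('Reduce op requires 1 or 2 inputs.');
  }
  // The second input, when present, carries the axes; onnxjs may surface the
  // int64 axes tensor as 'int32', so both are accepted here (assumption).
  if (inputs.length === 2 && inputs[1].type !== 'int32' && inputs[1].type !== 'int64') {
    throw new Error('Invalid axes input type.');
  }
};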
if (!inputs || inputs.length !== 1) { throw new Error('Reduce op requires 1 input.'); } @@ -174,4 +175,4 @@ export const reduceLogSumSquare: OperatorImplementation = (inferenceHandler: WebGLInferenceHandler, inputs: Tensor[], attributes: ReduceAttributes): Tensor[] => { const reduceOp: ReduceOp = (): string[] => ['float t; value = 0.0;', 't = _A(inputIdx); value += t * t;', '']; return reduce(inferenceHandler, inputs, attributes, 'ReduceLogSumSquare', reduceOp); - }; \ No newline at end of file + }; diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 5714c7b284848..17928899c91b1 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -160,36 +160,36 @@ "test_sum_example", "test_sum_one_input", "test_sum_two_inputs", - "test_reduce_log_sum_asc_axes", - "test_reduce_log_sum_default", - "test_reduce_log_sum_desc_axes", - "test_reduce_max_default_axes_keepdim_example", - "test_reduce_max_default_axes_keepdims_random", - "test_reduce_max_do_not_keepdims_example", - "test_reduce_max_do_not_keepdims_random", - "test_reduce_max_keepdims_example", - "test_reduce_max_keepdims_random", - "test_reduce_mean_default_axes_keepdims_example", - "test_reduce_mean_default_axes_keepdims_random", - "test_reduce_mean_do_not_keepdims_example", - "test_reduce_mean_do_not_keepdims_random", - "test_reduce_mean_keepdims_example", - "test_reduce_mean_keepdims_random", - "test_reduce_min_default_axes_keepdims_example", - "test_reduce_min_default_axes_keepdims_random", - "test_reduce_min_do_not_keepdims_example", - "test_reduce_min_do_not_keepdims_random", - "test_reduce_min_keepdims_example", - "test_reduce_min_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_log_sum_asc_axes", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_log_sum_default", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_log_sum_desc_axes", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_default_axes_keepdim_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_default_axes_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_do_not_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_do_not_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_max_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_default_axes_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_default_axes_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_do_not_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_do_not_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_mean_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_default_axes_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_default_axes_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_do_not_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_do_not_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_keepdims_random", { - "name": "test_reduce_prod_default_axes_keepdims_example", + "name": "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_default_axes_keepdims_example", "condition": 
"^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment }, - "test_reduce_prod_default_axes_keepdims_random", - "test_reduce_prod_do_not_keepdims_example", - "test_reduce_prod_do_not_keepdims_random", - "test_reduce_prod_keepdims_example", - "test_reduce_prod_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_default_axes_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_do_not_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_do_not_keepdims_random", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_keepdims_example", + "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_keepdims_random", "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_example", "opset{7,8,9,10,11,12}/test_reduce_sum_default_axes_keepdims_random", "opset{7,8,9,10,11,12}/test_reduce_sum_do_not_keepdims_example", From 46e62a47551a9560d70b3285fecaf48c9d32a3cc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 12 Apr 2023 16:26:54 -0700 Subject: [PATCH 74/81] fix build break in api.cc --- onnxruntime/wasm/api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 9a6582d8e56d1..728bd1cb39aed 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -364,7 +364,7 @@ int OrtRun(OrtSession* session, const char** output_names, size_t output_count, ort_tensor_handle_t* outputs, OrtRunOptions* run_options) { #if defined(USE_JS) - EM_ASM({ Module["jsepRunPromise"] = new Promise((r) = > { Module.jsepRunPromiseResolve = r; }); }); + EM_ASM({ Module["jsepRunPromise"] = new Promise((r) => { Module.jsepRunPromiseResolve = r; }); }); #endif auto status_code = CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); #if defined(USE_JS) From 57b520e67f55a0a05b28564520e0fefc2d4ed8ea Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:36:45 -0700 Subject: [PATCH 75/81] add jsep in wasm build CI --- .../github/azure-pipelines/templates/web-ci.yml | 16 +++++++++++++--- .../azure-pipelines/templates/win-wasm-ci.yml | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index a60ace38724e4..e036e62d9a7c3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -19,10 +19,19 @@ parameters: displayName: 'Build static library' type: boolean default: false +- name: BuildJsep + displayName: 'Build JSEP' + type: boolean + default: true +- name: ReleaseBuildArgs + displayName: 'Build command line arguments for Release build' + type: string + default: '--skip_tests --enable_wasm_api_exception_catching --disable_rtti' - name: ExtraBuildArgs + displayName: 'Extra build command line arguments' type: string - + default: '--use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON' stages: - stage: Extract_commit @@ -54,7 +63,7 @@ stages: parameters: CommitOverride: true BuildConfig: 'Debug' - ExtraBuildArgs: '--use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON ${{ parameters.ExtraBuildArgs }}' + ExtraBuildArgs: '${{ parameters.ExtraBuildArgs }}' PoolName: ${{ parameters.PoolName }} - 
stage: Build_web_Debug @@ -75,8 +84,9 @@ stages: parameters: CommitOverride: true BuildConfig: 'Release' - ExtraBuildArgs: '--skip_tests --enable_wasm_api_exception_catching --disable_rtti --use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON ${{ parameters.ExtraBuildArgs }}' + ExtraBuildArgs: '${{ parameters.ReleaseBuildArgs }} ${{ parameters.ExtraBuildArgs }}' PoolName: ${{ parameters.PoolName }} + BuildJsep: ${{ parameters.BuildJsep }} - ${{ if eq(parameters.BuildStaticLib, 'true') }}: - stage: Build_wasm_Release_static_library diff --git a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml index 18391e26584a2..4ec339bb0fb81 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml @@ -22,6 +22,10 @@ parameters: - name: TimeoutInMinutes default: 180 +- name: BuildJsep + type: boolean + default: false + jobs: - job: build_WASM pool: ${{ parameters.PoolName }} @@ -95,12 +99,23 @@ jobs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd --enable_wasm_simd' workingDirectory: '$(Build.BinariesDirectory)' + - ${{ if eq(parameters.BuildJsep, true) }}: + - task: PythonScript@0 + displayName: 'Build (simd + JSEP)' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd_jsep --enable_wasm_simd --use_js --target onnxruntime_webassembly' + workingDirectory: '$(Build.BinariesDirectory)' - ${{ if eq(parameters.SkipPublish, false) }}: - script: | copy $(Build.BinariesDirectory)\wasm\${{ parameters.BuildConfig }}\ort-wasm*.* $(Build.ArtifactStagingDirectory) copy $(Build.BinariesDirectory)\wasm_threads\${{ parameters.BuildConfig }}\ort-wasm*.* $(Build.ArtifactStagingDirectory) copy $(Build.BinariesDirectory)\wasm_simd_threads\${{ parameters.BuildConfig }}\ort-wasm*.* $(Build.ArtifactStagingDirectory) copy $(Build.BinariesDirectory)\wasm_simd\${{ parameters.BuildConfig }}\ort-wasm*.* $(Build.ArtifactStagingDirectory) + if exist $(Build.BinariesDirectory)\wasm_simd_jsep ( + copy $(Build.BinariesDirectory)\wasm_simd_jsep\${{ parameters.BuildConfig }}\ort-wasm-simd.wasm $(Build.ArtifactStagingDirectory)\ort-wasm-simd.jsep.wasm + copy $(Build.BinariesDirectory)\wasm_simd_jsep\${{ parameters.BuildConfig }}\ort-wasm-simd.js $(Build.ArtifactStagingDirectory)\ort-wasm-simd.jsep.js + ) displayName: 'Create Artifacts' - ${{ if eq(parameters.SkipPublish, false) }}: - task: PublishPipelineArtifact@0 From 62e6f3d1265ee29be66117085c6742287f6c195f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 14 Apr 2023 14:43:04 -0700 Subject: [PATCH 76/81] resolve comments --- tools/ci_build/github/azure-pipelines/post-merge-jobs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index cda39c572fc58..ae0a78f1d619d 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -23,6 +23,8 @@ stages: PoolName: 'aiinfra-Win-CPU-2022-web-beta' BuildStaticLib: true ExtraBuildArgs: '' + # Test if build works when JSEP is disabled + BuildJsep: false # This stage is to test if the combined build works on 
# o Windows ARM64 From 129efe86ebcf2145e08c2fe2e7ee63537cdfd725 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 14 Apr 2023 14:50:06 -0700 Subject: [PATCH 77/81] update TPN --- ThirdPartyNotices.txt | 221 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 215 insertions(+), 6 deletions(-) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index e925f75090a46..b4d981d42dfb8 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -5422,8 +5422,8 @@ _____ Tencent/rapidjson, https://github.com/Tencent/rapidjson -Tencent is pleased to support the open source community by making RapidJSON available. - +Tencent is pleased to support the open source community by making RapidJSON available. + Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. If you have downloaded a copy of the RapidJSON binary from Tencent, please note that the RapidJSON binary is licensed under the MIT License. @@ -5435,13 +5435,13 @@ Other dependencies and licenses: Open Source Software Licensed Under the BSD License: -------------------------------------------------------------------- -The msinttypes r29 -Copyright (c) 2006-2013 Alexander Chemeris +The msinttypes r29 +Copyright (c) 2006-2013 Alexander Chemeris All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -5450,7 +5450,7 @@ THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY EXPR Open Source Software Licensed Under the JSON License: -------------------------------------------------------------------- -json.org +json.org Copyright (c) 2002 JSON.org All Rights Reserved. @@ -5784,3 +5784,212 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +_____ + +TensorFlow.js + +https://github.com/tensorflow/tfjs + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
From f0ac354a11b0c097b31aa2b42a89ae3e49cf0d03 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 17 Apr 2023 16:50:32 -0700 Subject: [PATCH 78/81] revert extracting build flags from parameters --- .../ci_build/github/azure-pipelines/templates/web-ci.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index e036e62d9a7c3..b3ccb1bc2d6fa 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -24,14 +24,9 @@ parameters: type: boolean default: true -- name: ReleaseBuildArgs - displayName: 'Build command line arguments for Release build' - type: string - default: '--skip_tests --enable_wasm_api_exception_catching --disable_rtti' - name: ExtraBuildArgs displayName: 'Extra build command line arguments' type: string - default: '--use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON' stages: - stage: Extract_commit @@ -63,7 +58,7 @@ stages: parameters: CommitOverride: true BuildConfig: 'Debug' - ExtraBuildArgs: '${{ parameters.ExtraBuildArgs }}' + ExtraBuildArgs: '--use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON ${{ parameters.ExtraBuildArgs }}' PoolName: ${{ parameters.PoolName }} - stage: Build_web_Debug @@ -84,7 +79,7 @@ stages: parameters: CommitOverride: true BuildConfig: 'Release' - ExtraBuildArgs: '${{ parameters.ReleaseBuildArgs }} ${{ parameters.ExtraBuildArgs }}' + ExtraBuildArgs: '--skip_tests --enable_wasm_api_exception_catching --disable_rtti --use_extensions --cmake_extra_defines onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS=ON ${{ parameters.ExtraBuildArgs }}' PoolName: ${{ parameters.PoolName }} BuildJsep: ${{ parameters.BuildJsep }} From 25f38970298dc0ee9f16394a7d32e767d7c0a3df Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 17 Apr 2023 16:53:32 -0700 Subject: [PATCH 79/81] revert post-merge-jobs --- tools/ci_build/github/azure-pipelines/post-merge-jobs.yml | 2 -- tools/ci_build/github/azure-pipelines/templates/web-ci.yml | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 58b3d0a689ff0..c6bde53d242da 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -23,8 +23,6 @@ stages: PoolName: 'aiinfra-Win-CPU-2022-web-beta' BuildStaticLib: true ExtraBuildArgs: '' - # Test if build works when JSEP is disabled - BuildJsep: false # This stage is to test if the combined build works on # o Windows ARM64 diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index b3ccb1bc2d6fa..1b3ec6af24109 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -28,6 +28,7 @@ parameters: displayName: 'Extra build command line arguments' type: string + stages: - stage: Extract_commit jobs: From f3e9dee4dd1c2cf17c2735c728a1dbc25656bf9c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 18 Apr 2023 15:09:20 -0700 Subject: [PATCH 80/81] add a few comments --- 
js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts | 3 +++ js/web/lib/wasm/jsep/webgpu/types.ts | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts b/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts index 6608b00471e77..adba0fb9d022d 100644 --- a/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts +++ b/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts @@ -20,5 +20,8 @@ export interface AttributeWithCacheKey { readonly cacheKey: string; } +/** + * create a new object from the given attribute, and add a cacheKey property to it + */ export const createAttributeWithCacheKey = >(attribute: T): T&AttributeWithCacheKey => new AttributeWithCacheKeyImpl(attribute) as unknown as T & AttributeWithCacheKey; diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 9cf8487fabd04..634e3a167184f 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -123,8 +123,17 @@ export interface ComputeContext { * stores the pointer to OpKernelContext */ readonly opKernelContext: number; + + /** + * a list of inputs, each input is an instance of TensorView + */ readonly inputs: readonly TensorView[]; + + /** + * a custom data object that can be used to store any data that is needed by the kernel + */ readonly customData: {[key: string]: unknown}; + compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[]; output(index: number, dims: readonly number[]): number; From ae5e444d1641670465ed84435d6da07d57851291 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 18 Apr 2023 18:41:25 -0700 Subject: [PATCH 81/81] format code --- onnxruntime/core/providers/js/allocator.cc | 4 +- .../core/providers/js/data_transfer.cc | 2 +- onnxruntime/core/providers/js/data_transfer.h | 4 +- .../providers/js/js_execution_provider.cc | 4 +- .../core/providers/js/js_execution_provider.h | 2 +- onnxruntime/core/providers/js/js_export.cc | 30 +-- onnxruntime/core/providers/js/js_export.h | 2 +- onnxruntime/core/providers/js/js_kernel.h | 175 +++++++++--------- .../core/providers/js/operators/binary.cc | 1 - .../core/providers/js/operators/conv.cc | 2 - .../core/providers/js/operators/conv.h | 89 +++++---- .../core/providers/js/operators/gemm.cc | 2 - .../core/providers/js/operators/gemm.h | 21 +-- .../core/providers/js/operators/matmul.cc | 1 - .../core/providers/js/operators/pool.cc | 16 +- .../core/providers/js/operators/pool.h | 55 +++--- .../core/providers/js/operators/reshape.h | 6 +- .../core/providers/js/operators/transpose.h | 19 +- .../core/providers/js/operators/unary.cc | 31 ++-- onnxruntime/wasm/api.cc | 2 +- 20 files changed, 229 insertions(+), 239 deletions(-) diff --git a/onnxruntime/core/providers/js/allocator.cc b/onnxruntime/core/providers/js/allocator.cc index 67b0536d34ae6..c1d0aa9abbf6b 100644 --- a/onnxruntime/core/providers/js/allocator.cc +++ b/onnxruntime/core/providers/js/allocator.cc @@ -10,14 +10,14 @@ namespace onnxruntime { namespace js { void* JsCustomAllocator::Alloc(size_t size) { - void* p = EM_ASM_PTR({return Module.jsepAlloc($0);}, size); + void* p = EM_ASM_PTR({ return Module.jsepAlloc($0); }, size); stats_.num_allocs++; stats_.bytes_in_use += size; return p; } void JsCustomAllocator::Free(void* p) { - size_t size = (size_t)(void*)EM_ASM_PTR({return Module.jsepFree($0);}, p); + size_t size = (size_t)(void*)EM_ASM_PTR({ return 
Module.jsepFree($0); }, p); stats_.bytes_in_use -= size; } diff --git a/onnxruntime/core/providers/js/data_transfer.cc b/onnxruntime/core/providers/js/data_transfer.cc index 64d71fea8ce54..c62362d90867f 100644 --- a/onnxruntime/core/providers/js/data_transfer.cc +++ b/onnxruntime/core/providers/js/data_transfer.cc @@ -5,7 +5,7 @@ #include "core/providers/js/data_transfer.h" -EM_ASYNC_JS(void, jsepDownload, (const void *src_data, void *dst_data, size_t bytes), { +EM_ASYNC_JS(void, jsepDownload, (const void* src_data, void* dst_data, size_t bytes), { await Module.jsepCopyAsync(src_data, dst_data, bytes); }); diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h index 6a0e8586776a2..3dfb19cfde5ac 100644 --- a/onnxruntime/core/providers/js/data_transfer.h +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -11,8 +11,8 @@ namespace js { class DataTransfer : public IDataTransfer { public: - DataTransfer() {}; - ~DataTransfer() {}; + DataTransfer(){}; + ~DataTransfer(){}; bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 825ba0e861d75..d1308da7f888c 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -164,7 +164,6 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnn class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool); - std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -310,7 +309,8 @@ void JsExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) if (!custom_alloc) { AllocatorCreationInfo customAllocatorCreationInfo([&](int) { return std::make_unique(); - }, 0, false); + }, + 0, false); custom_alloc = CreateAllocator(customAllocatorCreationInfo); allocator_manager.InsertAllocator(custom_alloc); } diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index ac5f20f185288..ce8ec53eca1f6 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -18,7 +18,7 @@ namespace js { template KernelCreateInfo BuildKernelCreateInfo(); -} +} // namespace js // placeholder for future use. 
no options currently struct JsExecutionProviderInfo { diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index aefef9dc39bc9..ca0527a2ef89b 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -5,22 +5,22 @@ #include "core/framework/op_kernel.h" -const void * JsepOutput(void * context, int index, void * data) { - uint32_t * data_offset = reinterpret_cast(data); - uint32_t dim = *data_offset++; - size_t dim_size = static_cast(dim); - std::vector dims; - dims.reserve(dim_size); - dims.resize(dim_size); - for (size_t i = 0; i < dim_size; i++) { - dims[i] = static_cast(*data_offset++); - } +const void* JsepOutput(void* context, int index, void* data) { + uint32_t* data_offset = reinterpret_cast(data); + uint32_t dim = *data_offset++; + size_t dim_size = static_cast(dim); + std::vector dims; + dims.reserve(dim_size); + dims.resize(dim_size); + for (size_t i = 0; i < dim_size; i++) { + dims[i] = static_cast(*data_offset++); + } - LOGF_DEFAULT(VERBOSE, "JsepOutput(%d, %s)", index, onnxruntime::TensorShape(dims).ToString().c_str()); + LOGF_DEFAULT(VERBOSE, "JsepOutput(%d, %s)", index, onnxruntime::TensorShape(dims).ToString().c_str()); - auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); - auto r = output->DataRaw(); + auto output = reinterpret_cast(context)->Output(index, onnxruntime::TensorShape(dims)); + auto r = output->DataRaw(); - LOGF_DEFAULT(VERBOSE, "JsepOutput -- data=%zu", (size_t)(r)); - return r; + LOGF_DEFAULT(VERBOSE, "JsepOutput -- data=%zu", (size_t)(r)); + return r; } diff --git a/onnxruntime/core/providers/js/js_export.h b/onnxruntime/core/providers/js/js_export.h index fa6ec4f9e25f5..bb1eb356cc9d5 100644 --- a/onnxruntime/core/providers/js/js_export.h +++ b/onnxruntime/core/providers/js/js_export.h @@ -10,5 +10,5 @@ // TODO: Move to api.h extern "C" { -const void * EMSCRIPTEN_KEEPALIVE JsepOutput(void * context, int index, void * data); +const void* EMSCRIPTEN_KEEPALIVE JsepOutput(void* context, int index, void* data); }; diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 3187b132f47bd..15fce3727b8b3 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -17,50 +17,55 @@ struct pthreadpool; namespace onnxruntime { namespace js { -#define JSEP_INIT_KERNEL(optype) EM_ASM({ Module.jsepCreateKernel(#optype, $0, undefined); }, this) -#define JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, ...) EM_ASM({ Module.jsepCreateKernel(#optype, $0, attr); }, this, __VA_ARGS__) +// This macro is defined to bypass the code format from clang-format, which will overwrite "=>" into "= >" +// We can use it to write JS inline code with arrow functions. -#define JSEP_KERNEL_IMPL(classname, optype) \ -class classname : public JsKernel { \ -public: \ - classname(const OpKernelInfo& info) : JsKernel(info) { \ - JSEP_INIT_KERNEL(optype); \ - } \ -}; +// clang-format off +#define JS_ARROW => +// clang-format on -#define JSEP_KERNEL_TYPED_IMPL(classname, optype) \ -template \ -class classname : public JsKernel { \ -public: \ - classname(const OpKernelInfo& info) : JsKernel(info) { \ - JSEP_INIT_KERNEL(optype); \ - } \ -}; +#define JSEP_INIT_KERNEL(optype) EM_ASM({ Module.jsepCreateKernel(#optype, $0, undefined); }, this) +#define JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, ...) 
EM_ASM({ Module.jsepCreateKernel(#optype, $0, attr); }, this, __VA_ARGS__) -#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, attr_pre, attr, ...) \ -class classname : public JsKernel { \ -public: \ - classname(const OpKernelInfo& info) : JsKernel(info) { \ - attr_pre \ - JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, __VA_ARGS__); \ - } \ -}; +#define JSEP_KERNEL_IMPL(classname, optype) \ + class classname : public JsKernel { \ + public: \ + classname(const OpKernelInfo& info) : JsKernel(info) { \ + JSEP_INIT_KERNEL(optype); \ + } \ + }; + +#define JSEP_KERNEL_TYPED_IMPL(classname, optype) \ + template \ + class classname : public JsKernel { \ + public: \ + classname(const OpKernelInfo& info) : JsKernel(info) { \ + JSEP_INIT_KERNEL(optype); \ + } \ + }; + +#define JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, attr_pre, attr, ...) \ + class classname : public JsKernel { \ + public: \ + classname(const OpKernelInfo& info) : JsKernel(info) { \ + attr_pre \ + JSEP_INIT_KERNEL_ATTRIBUTE(optype, attr, __VA_ARGS__); \ + } \ + }; #define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(classname, optype, attr_name, default_value, ...) \ - JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name:$1}), static_cast(info.GetAttrOrDefault(#attr_name, default_value))) + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name : $1}), static_cast(info.GetAttrOrDefault(#attr_name, default_value))) #define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_2_DEFAULT(classname, optype, attr_name_1, default_value_1, attr_name_2, default_value_2, ...) \ - JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name_1:$1, #attr_name_2:$2}), \ - static_cast(info.GetAttrOrDefault(#attr_name_1, default_value_1)), \ - static_cast(info.GetAttrOrDefault(#attr_name_2, default_value_2))) - - -#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT(classname, optype, attr_name, ...) \ - JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, \ - float value; \ - ORT_ENFORCE(info.GetAttr(#attr_name, &value)); , \ - , ({#attr_name:$1}), static_cast(value)) + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, , ({#attr_name_1 : $1, #attr_name_2 : $2}), \ + static_cast(info.GetAttrOrDefault(#attr_name_1, default_value_1)), \ + static_cast(info.GetAttrOrDefault(#attr_name_2, default_value_2))) +#define JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT(classname, optype, attr_name, ...) \ + JSEP_CLASS_IMPL_ATTRIBUTE(classname, optype, \ + float value; \ + ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \ + , ({#attr_name : $1}), static_cast(value)) // TODO: // class JsMultiProgramKernel : public OpKernel { /* TBD */ }; @@ -70,73 +75,73 @@ class JsKernel : public OpKernel { explicit JsKernel(const OpKernelInfo& info) : OpKernel(info) {} ~JsKernel() override { - EM_ASM({ Module.jsepReleaseKernel($0); }, this); + EM_ASM({ Module.jsepReleaseKernel($0); }, this); } - void * SerializeKernelContext(OpKernelContext* context, AllocatorPtr alloc) const { - // - // temp_data_format (every item is (u)int32_t): - // context_prt | input_count | [input_data_0] ... [input_data_N-1] - // - // input_data_format: - // type | data_ptr | dim_size | dim[0] ... 
dim[N-1] - // - size_t temp_data_size = sizeof(size_t) * 2; - for (int i = 0; i < context->InputCount(); i++) { - temp_data_size += sizeof(size_t) * (3 + context->Input(i)->Shape().NumDimensions()); - } - uint32_t *p_serialized_kernel_context = reinterpret_cast(alloc->Alloc(temp_data_size)); - if (p_serialized_kernel_context == nullptr) { - return nullptr; - } - - p_serialized_kernel_context[0] = reinterpret_cast(context); - p_serialized_kernel_context[1] = static_cast(context->InputCount()); - size_t index = 2; - for (int i = 0; i < context->InputCount(); i++) { - p_serialized_kernel_context[index++] = static_cast(context->Input(i)->GetElementType()); - p_serialized_kernel_context[index++] = reinterpret_cast(context->Input(i)->DataRaw()); - p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape().NumDimensions()); - for (size_t d = 0; d < context->Input(i)->Shape().NumDimensions(); d++) { - p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape()[d]); - } + void* SerializeKernelContext(OpKernelContext* context, AllocatorPtr alloc) const { + // + // temp_data_format (every item is (u)int32_t): + // context_ptr | input_count | [input_data_0] ... [input_data_N-1] + // + // input_data_format: + // type | data_ptr | dim_size | dim[0] ... dim[N-1] + // + size_t temp_data_size = sizeof(size_t) * 2; + for (int i = 0; i < context->InputCount(); i++) { + temp_data_size += sizeof(size_t) * (3 + context->Input(i)->Shape().NumDimensions()); + } + uint32_t* p_serialized_kernel_context = reinterpret_cast(alloc->Alloc(temp_data_size)); + if (p_serialized_kernel_context == nullptr) { + return nullptr; + } + + p_serialized_kernel_context[0] = reinterpret_cast(context); + p_serialized_kernel_context[1] = static_cast(context->InputCount()); + size_t index = 2; + for (int i = 0; i < context->InputCount(); i++) { + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->GetElementType()); + p_serialized_kernel_context[index++] = reinterpret_cast(context->Input(i)->DataRaw()); + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape().NumDimensions()); + for (size_t d = 0; d < context->Input(i)->Shape().NumDimensions(); d++) { + p_serialized_kernel_context[index++] = static_cast(context->Input(i)->Shape()[d]); } + } #ifndef NDEBUG - std::ostringstream os; - os << "temp data size: " << temp_data_size << ". Data:"; - size_t temp_data_count = temp_data_size >> 2; - for (size_t i = 0; i < temp_data_count; i++) { - os << " " << p_serialized_kernel_context[i]; - } - LOGS_DEFAULT(VERBOSE) << os.str(); + std::ostringstream os; + os << "temp data size: " << temp_data_size << ". 
Data:"; + size_t temp_data_count = temp_data_size >> 2; + for (size_t i = 0; i < temp_data_count; i++) { + os << " " << p_serialized_kernel_context[i]; + } + LOGS_DEFAULT(VERBOSE) << os.str(); #endif - return p_serialized_kernel_context; + return p_serialized_kernel_context; } virtual Status ComputeInternal(OpKernelContext* context) const { - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc)); + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceCPUAllocator(&alloc)); - auto p_serialized_kernel_context = SerializeKernelContext(context, alloc); + auto p_serialized_kernel_context = SerializeKernelContext(context, alloc); - int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_serialized_kernel_context); + int status = EM_ASM_INT({ return Module.jsepRun($0, $1); }, this, p_serialized_kernel_context); - LOGS_DEFAULT(VERBOSE) << "outputs = " << context->OutputCount() << ". Y.data=" - << (size_t)(context->Output(0)->DataRaw()) << "."; + LOGS_DEFAULT(VERBOSE) << "outputs = " << context->OutputCount() << ". Y.data=" + << (size_t)(context->Output(0)->DataRaw()) << "."; - alloc->Free(p_serialized_kernel_context); + alloc->Free(p_serialized_kernel_context); - if (status == 0) { - return Status::OK(); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to run JSEP kernel"); - } + if (status == 0) { + return Status::OK(); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to run JSEP kernel"); + } } Status Compute(OpKernelContext* context) const override { - return ComputeInternal(context); + return ComputeInternal(context); } }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/binary.cc b/onnxruntime/core/providers/js/operators/binary.cc index 030b2803e717f..ffad51f7e5af0 100644 --- a/onnxruntime/core/providers/js/operators/binary.cc +++ b/onnxruntime/core/providers/js/operators/binary.cc @@ -24,7 +24,6 @@ namespace js { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ KERNEL_CLASS); - JSEP_KERNEL_IMPL(Add, Add) REG_ELEMENTWISE_VERSIONED_KERNEL(Add, 7, 12, float, Add); REG_ELEMENTWISE_VERSIONED_KERNEL(Add, 13, 13, float, Add); diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index 78c1385f6ff56..c7c9f7f7c3f0e 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -35,8 +35,6 @@ namespace js { (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Conv); - - REGISTER_KERNEL_TYPED(float) } // namespace js diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 8881faf5b7f5f..22f7721276677 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -13,10 +13,9 @@ template class Conv : public JsKernel { public: Conv(const OpKernelInfo& info) : JsKernel(info), conv_attrs_(info), w_is_const_(false) { - TensorShapeVector kernel_shape; if (conv_attrs_.kernel_shape_specified) { - ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK()); + ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK()); } int64_t channels_last = is_channels_last ? 1 : info.GetAttrOrDefault("channels_last", 0); @@ -26,51 +25,49 @@ class Conv : public JsKernel { (conv_attrs_.kernel_shape_specified && kernel_shape.size() == 1) || conv_attrs_.strides.size() == 1) { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ - "format": $8 ? 
"NHWC" : "NCHW", - "auto_pad": $1, - "dilations": [$2], - "group": $3, - "kernel_shape": [$4], - "pads": [$5, $6], - "strides": [$7], - "w_is_const": () => (!!HEAP8[$9]) - }), - static_cast(conv_attrs_.auto_pad), - static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), - static_cast(conv_attrs_.group), - static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), - static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), - static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), - static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), - static_cast(channels_last), - reinterpret_cast(&w_is_const_) - ); + "format" : $8 ? "NHWC" : "NCHW", + "auto_pad" : $1, + "dilations" : [$2], + "group" : $3, + "kernel_shape" : [$4], + "pads" : [ $5, $6 ], + "strides" : [$7], + "w_is_const" : () JS_ARROW(!!HEAP8[$9]) + }), + static_cast(conv_attrs_.auto_pad), + static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), + static_cast(conv_attrs_.group), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), + static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), + static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), + static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), + static_cast(channels_last), + reinterpret_cast(&w_is_const_)); } else { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ - "format": $13 ? "NHWC" : "NCHW", - "auto_pad": $1, - "dilations": [$2, $3], - "group": $4, - "kernel_shape": [$5, $6], - "pads": [$7, $8, $9, $10], - "strides": [$11, $12], - "w_is_const": () => (!!HEAP8[$14]) - }), - static_cast(conv_attrs_.auto_pad), - static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), - static_cast(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0), - static_cast(conv_attrs_.group), - static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), - static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? kernel_shape[1] : 0), - static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), - static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), - static_cast(conv_attrs_.pads.size() > 2 ? conv_attrs_.pads[2] : 0), - static_cast(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), - static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), - static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), - static_cast(channels_last), - reinterpret_cast(&w_is_const_) - ); + "format" : $13 ? "NHWC" : "NCHW", + "auto_pad" : $1, + "dilations" : [ $2, $3 ], + "group" : $4, + "kernel_shape" : [ $5, $6 ], + "pads" : [ $7, $8, $9, $10 ], + "strides" : [ $11, $12 ], + "w_is_const" : () JS_ARROW(!!HEAP8[$14]) + }), + static_cast(conv_attrs_.auto_pad), + static_cast(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0), + static_cast(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0), + static_cast(conv_attrs_.group), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0), + static_cast(conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? kernel_shape[1] : 0), + static_cast(conv_attrs_.pads.size() > 0 ? conv_attrs_.pads[0] : 0), + static_cast(conv_attrs_.pads.size() > 1 ? conv_attrs_.pads[1] : 0), + static_cast(conv_attrs_.pads.size() > 2 ? 
conv_attrs_.pads[2] : 0), + static_cast(conv_attrs_.pads.size() > 3 ? conv_attrs_.pads[3] : 0), + static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), + static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), + static_cast(channels_last), + reinterpret_cast(&w_is_const_)); } } @@ -94,7 +91,7 @@ class Conv : public JsKernel { protected: ConvAttributes conv_attrs_; bool w_is_const_; - //Tensor w_transposed_; + // Tensor w_transposed_; }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/gemm.cc b/onnxruntime/core/providers/js/operators/gemm.cc index a5cf40055031d..f579d62bdfb5f 100644 --- a/onnxruntime/core/providers/js/operators/gemm.cc +++ b/onnxruntime/core/providers/js/operators/gemm.cc @@ -34,8 +34,6 @@ namespace js { (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Gemm); - - REGISTER_KERNEL_TYPED(float) } // namespace js diff --git a/onnxruntime/core/providers/js/operators/gemm.h b/onnxruntime/core/providers/js/operators/gemm.h index 50042fb298c3e..27c41788ccfbd 100644 --- a/onnxruntime/core/providers/js/operators/gemm.h +++ b/onnxruntime/core/providers/js/operators/gemm.h @@ -12,7 +12,6 @@ template class Gemm : public JsKernel { public: Gemm(const OpKernelInfo& info) : JsKernel(info) { - float alpha = info.GetAttrOrDefault("alpha", 1.0f); float beta = info.GetAttrOrDefault("beta", 1.0f); int64_t transA = info.GetAttrOrDefault("transA", 0); @@ -20,19 +19,17 @@ class Gemm : public JsKernel { // currently only support Conv2D. TODO: support other JSEP_INIT_KERNEL_ATTRIBUTE(Gemm, ({ - "alpha": $1, - "beta": $2, - "transA": $3, - "transB": $4 - }), - static_cast(alpha), - static_cast(beta), - static_cast(transA), - static_cast(transB) - ); + "alpha" : $1, + "beta" : $2, + "transA" : $3, + "transB" : $4 + }), + static_cast(alpha), + static_cast(beta), + static_cast(transA), + static_cast(transB)); } }; - } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/matmul.cc b/onnxruntime/core/providers/js/operators/matmul.cc index 19c295ac1c04d..ddfbb454def07 100644 --- a/onnxruntime/core/providers/js/operators/matmul.cc +++ b/onnxruntime/core/providers/js/operators/matmul.cc @@ -16,6 +16,5 @@ ONNX_OPERATOR_KERNEL_EX(MatMul, kOnnxDomain, 13, kJsExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), MatMul); - } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 0a3491e1a308c..03e6caef7e5b8 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -8,14 +8,14 @@ namespace onnxruntime { namespace js { -#define POOLING_KERNEL(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - domain, \ - since_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ +#define POOLING_KERNEL(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + data_type, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Pool); #define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \ diff --git 
a/onnxruntime/core/providers/js/operators/pool.h b/onnxruntime/core/providers/js/operators/pool.h index 294cbfac1e03f..5dbe5d0b8881d 100644 --- a/onnxruntime/core/providers/js/operators/pool.h +++ b/onnxruntime/core/providers/js/operators/pool.h @@ -10,38 +10,37 @@ namespace onnxruntime { namespace js { #define POOL_ATTRIBUTES_JS_OBJ_MAPPING ({ \ - "format": $15 ? "NHWC" : "NCHW", \ - "auto_pad": $1, \ - "ceil_mode": $2, \ - "count_include_pad": $3, \ - "storage_order": $4, \ - "dilations": [$5, $6], \ - "kernel_shape": [$7, $8], \ - "pads": [$9, $10, $11, $12], \ - "strides": [$13, $14] \ - }) + "format" : $15 ? "NHWC" : "NCHW", \ + "auto_pad" : $1, \ + "ceil_mode" : $2, \ + "count_include_pad" : $3, \ + "storage_order" : $4, \ + "dilations" : [ $5, $6 ], \ + "kernel_shape" : [ $7, $8 ], \ + "pads" : [ $9, $10, $11, $12 ], \ + "strides" : [ $13, $14 ] \ +}) -#define POOL_ATTRIBUTES_PARAM_LIST \ - static_cast(pool_attrs_.auto_pad), \ - static_cast(pool_attrs_.ceil_mode), \ - static_cast(pool_attrs_.count_include_pad), \ - static_cast(pool_attrs_.storage_order), \ - static_cast(pool_attrs_.dilations.size() > 0 ? pool_attrs_.dilations[0] : 0), \ - static_cast(pool_attrs_.dilations.size() > 1 ? pool_attrs_.dilations[1] : 0), \ - static_cast(pool_attrs_.kernel_shape.size() > 0 ? pool_attrs_.kernel_shape[0] : 0), \ - static_cast(pool_attrs_.kernel_shape.size() > 1 ? pool_attrs_.kernel_shape[1] : 0), \ - static_cast(pool_attrs_.pads.size() > 0 ? pool_attrs_.pads[0] : 0), \ - static_cast(pool_attrs_.pads.size() > 1 ? pool_attrs_.pads[1] : 0), \ - static_cast(pool_attrs_.pads.size() > 2 ? pool_attrs_.pads[2] : 0), \ - static_cast(pool_attrs_.pads.size() > 3 ? pool_attrs_.pads[3] : 0), \ - static_cast(pool_attrs_.strides.size() > 0 ? pool_attrs_.strides[0] : 0), \ - static_cast(pool_attrs_.strides.size() > 1 ? pool_attrs_.strides[1] : 0), \ - static_cast(is_channels_last) +#define POOL_ATTRIBUTES_PARAM_LIST \ + static_cast(pool_attrs_.auto_pad), \ + static_cast(pool_attrs_.ceil_mode), \ + static_cast(pool_attrs_.count_include_pad), \ + static_cast(pool_attrs_.storage_order), \ + static_cast(pool_attrs_.dilations.size() > 0 ? pool_attrs_.dilations[0] : 0), \ + static_cast(pool_attrs_.dilations.size() > 1 ? pool_attrs_.dilations[1] : 0), \ + static_cast(pool_attrs_.kernel_shape.size() > 0 ? pool_attrs_.kernel_shape[0] : 0), \ + static_cast(pool_attrs_.kernel_shape.size() > 1 ? pool_attrs_.kernel_shape[1] : 0), \ + static_cast(pool_attrs_.pads.size() > 0 ? pool_attrs_.pads[0] : 0), \ + static_cast(pool_attrs_.pads.size() > 1 ? pool_attrs_.pads[1] : 0), \ + static_cast(pool_attrs_.pads.size() > 2 ? pool_attrs_.pads[2] : 0), \ + static_cast(pool_attrs_.pads.size() > 3 ? pool_attrs_.pads[3] : 0), \ + static_cast(pool_attrs_.strides.size() > 0 ? pool_attrs_.strides[0] : 0), \ + static_cast(pool_attrs_.strides.size() > 1 ? pool_attrs_.strides[1] : 0), \ + static_cast(is_channels_last) -#define GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING ({ "format": $1 ? "NHWC" : "NCHW" }) +#define GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING ({"format" : $1 ? 
"NHWC" : "NCHW"}) #define GLOBAL_POOL_ATTRIBUTES_PARAM_LIST static_cast(is_channels_last) - template class Pool : public JsKernel, public PoolBase { public: diff --git a/onnxruntime/core/providers/js/operators/reshape.h b/onnxruntime/core/providers/js/operators/reshape.h index db919f0021228..97a294163c748 100644 --- a/onnxruntime/core/providers/js/operators/reshape.h +++ b/onnxruntime/core/providers/js/operators/reshape.h @@ -32,7 +32,7 @@ class Reshape final : public JsKernel { Tensor* Y = context->Output(0, TensorShape(shape)); const void* source = X->DataRaw(); void* target = Y->MutableDataRaw(); - //If source and target pointers are not equal (non-inplace operation), we need to copy the data. + // If source and target pointers are not equal (non-inplace operation), we need to copy the data. if (target != source) { ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*X, *Y)); } @@ -40,8 +40,8 @@ class Reshape final : public JsKernel { return Status::OK(); } - private: - bool allow_zero_; + private: + bool allow_zero_; }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index c8ace13bddfd5..f2214438c6fd1 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -15,20 +15,19 @@ class Transpose final : public JsKernel, public TransposeBase { Transpose(const OpKernelInfo& info) : JsKernel(info), TransposeBase(info) { std::vector perm; if (perm_specified_) { - perm.resize(perm_.size()); - perm[0] = gsl::narrow_cast(perm_.size()); - for (size_t i = 0; i < perm_.size(); ++i) { - perm[i] = gsl::narrow_cast(perm_[i]); - } + perm.resize(perm_.size()); + perm[0] = gsl::narrow_cast(perm_.size()); + for (size_t i = 0; i < perm_.size(); ++i) { + perm[i] = gsl::narrow_cast(perm_[i]); + } } // printf("Transpose: perm_specified_ = %d, perm.size() = %d, perm[0] = %d, perm[1] = %d, perm[2] = %d, perm[3] = %d\n", // perm_specified_, static_cast(perm.size()), perm[0], perm[1], perm[2], perm[3]); JSEP_INIT_KERNEL_ATTRIBUTE(Transpose, ({ - "perm": $1 ? Array.from(HEAP32.subarray($2, $2 + $1)) : [] - }), - gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), - reinterpret_cast(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2 - ); + "perm" : $1 ? Array.from(HEAP32.subarray($2, $2 + $1)) : [] + }), + gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), + reinterpret_cast(perm_specified_ && !perm.empty() ? 
perm.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index 79f2b074f005a..df8c9760c1067 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -95,27 +95,26 @@ JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_2_DEFAULT(ClipV10, ClipV10, min, 3.402823e+38f, JSEP_ELEMENTWISE_VERSIONED_KERNEL(Clip, 6, 10, float, ClipV10) JSEP_KERNEL_IMPL(Clip, Clip) ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 11, 11, kJsExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::GetTensorType()) - .InputMemoryType(OrtMemTypeCPU, 1) - .InputMemoryType(OrtMemTypeCPU, 2), - Clip); + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2), + Clip); ONNX_OPERATOR_VERSIONED_KERNEL_EX(Clip, kOnnxDomain, 12, 12, kJsExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::GetTensorType()) - .InputMemoryType(OrtMemTypeCPU, 1) - .InputMemoryType(OrtMemTypeCPU, 2), - Clip); + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2), + Clip); ONNX_OPERATOR_KERNEL_EX(Clip, kOnnxDomain, 13, kJsExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::GetTensorType()) - .InputMemoryType(OrtMemTypeCPU, 1) - .InputMemoryType(OrtMemTypeCPU, 2), - Clip); + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2), + Clip); JSEP_CLASS_IMPL_ATTRIBUTE_FLOAT_DEFAULT(Elu, Elu, alpha, 1.0) JSEP_ELEMENTWISE_KERNEL(Elu, 6, float, Elu) - } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 728bd1cb39aed..47cb578f7e969 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -364,7 +364,7 @@ int OrtRun(OrtSession* session, const char** output_names, size_t output_count, ort_tensor_handle_t* outputs, OrtRunOptions* run_options) { #if defined(USE_JS) - EM_ASM({ Module["jsepRunPromise"] = new Promise((r) => { Module.jsepRunPromiseResolve = r; }); }); + EM_ASM({ Module["jsepRunPromise"] = new Promise(function(r) { Module.jsepRunPromiseResolve = r; }); }); #endif auto status_code = CHECK_STATUS(Run, session, run_options, input_names, inputs, input_count, output_names, output_count, outputs); #if defined(USE_JS)
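
Background on the JS_ARROW macro and the final api.cc hunk above: EM_ASM-family macros embed JavaScript source inside C++, so clang-format tokenizes the JS as C++ and splits an arrow function's "=>" into "= >", which is a JavaScript syntax error at runtime. The series works around this in two ways: spelling the arrow through the object-like JS_ARROW macro (which is expanded to "=>" because the attribute text passes through an extra macro layer before the JS is stringified) and, in api.cc, replacing the arrow function with a classic function expression. The following standalone sketch illustrates both workarounds under stated assumptions: the file name and the RUN_JS_INT helper are invented for the example and are not part of the patch; it would be built with Emscripten, e.g. emcc em_asm_arrow.cc -o em_asm_arrow.js.

    // em_asm_arrow.cc -- minimal sketch of the JS_ARROW workaround; illustrative only.
    #include <emscripten.h>
    #include <cstdio>

    // clang-format would rewrite a literal "=>" inside EM_ASM into "= >".
    // Hiding the token behind an object-like macro keeps the formatter away from it.
    // clang-format off
    #define JS_ARROW =>
    // clang-format on

    // One extra macro layer (hypothetical helper): the code argument is
    // macro-expanded here, turning JS_ARROW into "=>", before EM_ASM_INT
    // stringifies the JavaScript source. Calling EM_ASM_INT directly would
    // stringify the unexpanded token "JS_ARROW" and break at runtime.
    #define RUN_JS_INT(code, arg) EM_ASM_INT(code, arg)

    int main() {
      // Arrow function, spelled via the macro (as in the conv.h/pool.h attributes).
      int doubled = RUN_JS_INT({
        const f = (x) JS_ARROW(x * 2);
        return f($0);
      }, 21);

      // Classic function expression, as used for jsepRunPromise in api.cc:
      // no arrow token, so no special handling is needed.
      int tripled = EM_ASM_INT({
        const g = function(x) { return x * 3; };
        return g($0);
      }, 14);

      printf("%d %d\n", doubled, tripled);  // prints: 42 42
      return 0;
    }

Either form avoids the formatter mangling the embedded JavaScript; the patch uses the macro where an arrow keeps the inline attribute objects compact, and the plain function expression where readability is unaffected.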